diff --git a/caching_compiler_test.go b/caching_compiler_test.go new file mode 100644 index 0000000..7f2c6b1 --- /dev/null +++ b/caching_compiler_test.go @@ -0,0 +1,265 @@ +package expr + +import ( + "testing" + + "github.com/google/cel-go/cel" + "github.com/stretchr/testify/require" +) + +func TestCachingParser_CachesSame(t *testing.T) { + c := cachingCompiler{env: newEnv()} + + a := `event.data.a == "cache"` + b := `event.data.b == "cache"` + + var ( + prevAST *cel.Ast + prevIssues *cel.Issues + prevVars LiftedArgs + ) + + t.Run("With an uncached expression", func(t *testing.T) { + prevAST, prevIssues, prevVars = c.Compile(a) + require.NotNil(t, prevAST) + require.Nil(t, prevIssues) + require.NotNil(t, prevVars) + require.EqualValues(t, 0, c.Hits()) + require.EqualValues(t, 1, c.Misses()) + }) + + t.Run("With a cached expression", func(t *testing.T) { + ast, issues, vars := c.Compile(a) + require.NotNil(t, ast) + require.Nil(t, issues) + + require.Equal(t, prevAST, ast) + require.Equal(t, prevIssues, issues) + require.Equal(t, prevVars, vars) + + require.EqualValues(t, 1, c.Hits()) + require.EqualValues(t, 1, c.Misses()) + }) + + t.Run("With another uncached expression", func(t *testing.T) { + prevAST, prevIssues, prevVars = c.Compile(b) + require.NotNil(t, prevAST) + require.Nil(t, prevIssues) + // This misses the cache, as the vars have changed - not the + // literals. 
+ require.EqualValues(t, 1, c.Hits()) + require.EqualValues(t, 2, c.Misses()) + }) +} + +func TestCachingCompile(t *testing.T) { + c := cachingCompiler{env: newEnv()} + + a := `event.data.a == "literal-a" && event.data.b == "yes-1"` + b := `event.data.a == "literal-b" && event.data.b == "yes-2"` + + var ( + prevAST *cel.Ast + prevIssues *cel.Issues + prevVars LiftedArgs + ) + + t.Run("With an uncached expression", func(t *testing.T) { + prevAST, prevIssues, prevVars = c.Compile(a) + require.NotNil(t, prevAST) + require.Nil(t, prevIssues) + require.EqualValues(t, 0, c.Hits()) + require.EqualValues(t, 1, c.Misses()) + }) + + t.Run("With a cached expression", func(t *testing.T) { + ast, issues, vars := c.Compile(a) + require.NotNil(t, ast) + require.Nil(t, issues) + + require.Equal(t, prevAST, ast) + require.Equal(t, prevIssues, issues) + require.Equal(t, prevVars, vars) + + require.EqualValues(t, 1, c.Hits()) + require.EqualValues(t, 1, c.Misses()) + }) + + t.Run("With a cached expression having different literals ONLY", func(t *testing.T) { + prevAST, prevIssues, _ = c.Compile(b) + require.NotNil(t, prevAST) + require.Nil(t, prevIssues) + // This hits the cache: only the literals differ, so both expressions normalise to the same lifted form. + require.EqualValues(t, 2, c.Hits()) + require.EqualValues(t, 1, c.Misses()) + }) +} + +func TestCachingCompile_IntegerLiteralDedup(t *testing.T) { + c := cachingCompiler{env: newEnv()} + + a := `event.data.id == 1` + b := `event.data.id == 2` + + astA, issA, varsA := c.Compile(a) + require.NotNil(t, astA) + require.Nil(t, issA) + require.EqualValues(t, int64(1), mustGet(t, varsA, "a")) + require.EqualValues(t, 0, c.Hits()) + require.EqualValues(t, 1, c.Misses()) + + // Both normalise to "event.data.id == vars.a"; AST pointer must be identical. 
+ astB, issB, varsB := c.Compile(b) + require.NotNil(t, astB) + require.Nil(t, issB) + require.Equal(t, astA, astB) + require.EqualValues(t, int64(2), mustGet(t, varsB, "a")) + require.EqualValues(t, 1, c.Hits()) + require.EqualValues(t, 1, c.Misses()) +} + +func TestCachingCompile_FloatLiteralDedup(t *testing.T) { + c := cachingCompiler{env: newEnv()} + + a := `event.data.score >= 1.5` + b := `event.data.score >= 99.9` + + astA, issA, varsA := c.Compile(a) + require.NotNil(t, astA) + require.Nil(t, issA) + require.EqualValues(t, float64(1.5), mustGet(t, varsA, "a")) + require.EqualValues(t, 0, c.Hits()) + require.EqualValues(t, 1, c.Misses()) + + astB, issB, varsB := c.Compile(b) + require.NotNil(t, astB) + require.Nil(t, issB) + require.Equal(t, astA, astB) + require.EqualValues(t, float64(99.9), mustGet(t, varsB, "a")) + require.EqualValues(t, 1, c.Hits()) + require.EqualValues(t, 1, c.Misses()) +} + +// Scientific notation must be consumed whole; leaving "e10" produces "vars.ae10" (field access). 
+func TestCachingCompile_ScientificNotationDedup(t *testing.T) { + c := cachingCompiler{env: newEnv()} + + a := `event.data.count > 1e6` + b := `event.data.count > 2.5e3` + + astA, issA, varsA := c.Compile(a) + require.NotNil(t, astA) + require.Nil(t, issA) + require.EqualValues(t, float64(1e6), mustGet(t, varsA, "a")) + require.EqualValues(t, 0, c.Hits()) + require.EqualValues(t, 1, c.Misses()) + + astB, issB, varsB := c.Compile(b) + require.NotNil(t, astB) + require.Nil(t, issB) + require.Equal(t, astA, astB) + require.EqualValues(t, float64(2.5e3), mustGet(t, varsB, "a")) + require.EqualValues(t, 1, c.Hits()) + require.EqualValues(t, 1, c.Misses()) + + // explicit positive sign in exponent + astC, issC, varsC := c.Compile(`event.data.count > 1e+6`) + require.NotNil(t, astC) + require.Nil(t, issC) + require.Equal(t, astA, astC) + require.EqualValues(t, float64(1e+6), mustGet(t, varsC, "a")) + require.EqualValues(t, 2, c.Hits()) + require.EqualValues(t, 1, c.Misses()) +} + +func TestCachingCompile_IdentExprDedup(t *testing.T) { + c := cachingCompiler{env: newEnv()} + + expr := `event.data.id == async.data.id` + + astA, _, _ := c.Compile(expr) + require.EqualValues(t, 0, c.Hits()) + require.EqualValues(t, 1, c.Misses()) + + astB, _, _ := c.Compile(expr) + require.Equal(t, astA, astB) + require.EqualValues(t, 1, c.Hits()) + require.EqualValues(t, 1, c.Misses()) +} + +func TestCachingCompile_MixedLiteralsDedup(t *testing.T) { + c := cachingCompiler{env: newEnv()} + + // Both normalise to: event.name == vars.a && event.data.amount > vars.b + a := `event.name == "order/created" && event.data.amount > 100` + b := `event.name == "item/shipped" && event.data.amount > 9999` + + astA, issA, varsA := c.Compile(a) + require.NotNil(t, astA) + require.Nil(t, issA) + require.EqualValues(t, "order/created", mustGet(t, varsA, "a")) + require.EqualValues(t, int64(100), mustGet(t, varsA, "b")) + require.EqualValues(t, 0, c.Hits()) + require.EqualValues(t, 1, c.Misses()) + + astB, 
issB, varsB := c.Compile(b) + require.NotNil(t, astB) + require.Nil(t, issB) + require.Equal(t, astA, astB) + require.EqualValues(t, "item/shipped", mustGet(t, varsB, "a")) + require.EqualValues(t, int64(9999), mustGet(t, varsB, "b")) + require.EqualValues(t, 1, c.Hits()) + require.EqualValues(t, 1, c.Misses()) +} + +// Unary minus stays in the expression; only the positive digit is lifted. +// "-5" and "-3" both normalise to "-vars.a". +func TestCachingCompile_NegativeIntegerDedup(t *testing.T) { + c := cachingCompiler{env: newEnv()} + + a := `event.data.offset > -5` + b := `event.data.offset > -100` + + astA, issA, varsA := c.Compile(a) + require.NotNil(t, astA) + require.Nil(t, issA) + require.EqualValues(t, int64(5), mustGet(t, varsA, "a")) + require.EqualValues(t, 0, c.Hits()) + require.EqualValues(t, 1, c.Misses()) + + astB, issB, varsB := c.Compile(b) + require.NotNil(t, astB) + require.Nil(t, issB) + require.Equal(t, astA, astB) + require.EqualValues(t, int64(100), mustGet(t, varsB, "a")) + require.EqualValues(t, 1, c.Hits()) + require.EqualValues(t, 1, c.Misses()) +} + +// Digits within an identifier (e.g. "version2") must not be lifted. 
+func TestCachingCompile_IdentifierDigitsNotLifted(t *testing.T) { + c := cachingCompiler{env: newEnv()} + + a := `event.data.version2 == "v1"` + b := `event.data.version2 == "v2"` + + astA, issA, _ := c.Compile(a) + require.NotNil(t, astA) + require.Nil(t, issA) + require.EqualValues(t, 0, c.Hits()) + require.EqualValues(t, 1, c.Misses()) + + astB, issB, _ := c.Compile(b) + require.NotNil(t, astB) + require.Nil(t, issB) + require.Equal(t, astA, astB) + require.EqualValues(t, 1, c.Hits()) + require.EqualValues(t, 1, c.Misses()) +} + +func mustGet(t *testing.T, args LiftedArgs, key string) any { + t.Helper() + val, ok := args.Get(key) + require.True(t, ok, "expected lifted variable %q to be present", key) + return val +} diff --git a/caching_coompiler_test.go b/caching_coompiler_test.go deleted file mode 100644 index f07ab1f..0000000 --- a/caching_coompiler_test.go +++ /dev/null @@ -1,96 +0,0 @@ -package expr - -import ( - "testing" - - "github.com/google/cel-go/cel" - "github.com/stretchr/testify/require" -) - -func TestCachingParser_CachesSame(t *testing.T) { - c := cachingCompiler{env: newEnv()} - - a := `event.data.a == "cache"` - b := `event.data.b == "cache"` - - var ( - prevAST *cel.Ast - prevIssues *cel.Issues - prevVars LiftedArgs - ) - - t.Run("With an uncached expression", func(t *testing.T) { - prevAST, prevIssues, prevVars = c.Compile(a) - require.NotNil(t, prevAST) - require.Nil(t, prevIssues) - require.NotNil(t, prevVars) - require.EqualValues(t, 0, c.Hits()) - require.EqualValues(t, 1, c.Misses()) - }) - - t.Run("With a cached expression", func(t *testing.T) { - ast, issues, vars := c.Compile(a) - require.NotNil(t, ast) - require.Nil(t, issues) - - require.Equal(t, prevAST, ast) - require.Equal(t, prevIssues, issues) - require.Equal(t, prevVars, vars) - - require.EqualValues(t, 1, c.Hits()) - require.EqualValues(t, 1, c.Misses()) - }) - - t.Run("With another uncached expression", func(t *testing.T) { - prevAST, prevIssues, prevVars = c.Compile(b) - 
require.NotNil(t, prevAST) - require.Nil(t, prevIssues) - // This misses the cache, as the vars have changed - not the - // literals. - require.EqualValues(t, 1, c.Hits()) - require.EqualValues(t, 2, c.Misses()) - }) -} - -func TestCachingCompile(t *testing.T) { - c := cachingCompiler{env: newEnv()} - - a := `event.data.a == "literal-a" && event.data.b == "yes-1"` - b := `event.data.a == "literal-b" && event.data.b == "yes-2"` - - var ( - prevAST *cel.Ast - prevIssues *cel.Issues - prevVars LiftedArgs - ) - - t.Run("With an uncached expression", func(t *testing.T) { - prevAST, prevIssues, prevVars = c.Compile(a) - require.NotNil(t, prevAST) - require.Nil(t, prevIssues) - require.EqualValues(t, 0, c.Hits()) - require.EqualValues(t, 1, c.Misses()) - }) - - t.Run("With a cached expression", func(t *testing.T) { - ast, issues, vars := c.Compile(a) - require.NotNil(t, ast) - require.Nil(t, issues) - - require.Equal(t, prevAST, ast) - require.Equal(t, prevIssues, issues) - require.Equal(t, prevVars, vars) - - require.EqualValues(t, 1, c.Hits()) - require.EqualValues(t, 1, c.Misses()) - }) - - t.Run("With a cached expression having different literals ONLY", func(t *testing.T) { - prevAST, prevIssues, _ = c.Compile(b) - require.NotNil(t, prevAST) - require.Nil(t, prevIssues) - // This misses the cache. 
- require.EqualValues(t, 2, c.Hits()) - require.EqualValues(t, 1, c.Misses()) - }) -} diff --git a/expr.go b/expr.go index d950842..950cd14 100644 --- a/expr.go +++ b/expr.go @@ -11,7 +11,7 @@ import ( "sync/atomic" "time" - "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/v2/vfs" "github.com/google/cel-go/common/operators" "github.com/google/uuid" ) diff --git a/go.mod b/go.mod index 8503159..831eda0 100644 --- a/go.mod +++ b/go.mod @@ -3,9 +3,8 @@ module github.com/inngest/expr go 1.24 require ( - github.com/RoaringBitmap/roaring v1.9.4 github.com/cespare/xxhash/v2 v2.2.0 - github.com/cockroachdb/pebble v1.1.5 + github.com/cockroachdb/pebble/v2 v2.1.4 github.com/google/cel-go v0.27.0 github.com/google/uuid v1.6.0 github.com/karlseguin/ccache/v2 v2.0.8 @@ -18,31 +17,33 @@ require ( require ( cel.dev/expr v0.25.1 // indirect - github.com/DataDog/zstd v1.4.5 // indirect + github.com/DataDog/zstd v1.5.7 // indirect + github.com/RaduBerinde/axisds v0.1.0 // indirect + github.com/RaduBerinde/btreemap v0.0.0-20250419174037-3d62b7205d54 // indirect github.com/antlr4-go/antlr/v4 v4.13.1 // indirect github.com/beorn7/perks v1.0.1 // indirect - github.com/bits-and-blooms/bitset v1.12.0 // indirect + github.com/cockroachdb/crlib v0.0.0-20241112164430-1264a2edc35b // indirect github.com/cockroachdb/errors v1.11.3 // indirect - github.com/cockroachdb/fifo v0.0.0-20240606204812-0bbfbd93a7ce // indirect github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b // indirect github.com/cockroachdb/redact v1.1.5 // indirect + github.com/cockroachdb/swiss v0.0.0-20251224182025-b0f6560f979b // indirect github.com/cockroachdb/tokenbucket v0.0.0-20230807174530-cc333fc44b06 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/getsentry/sentry-go v0.27.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/protobuf v1.5.3 // indirect - github.com/golang/snappy v0.0.4 // indirect - github.com/klauspost/compress v1.16.0 // 
indirect + github.com/golang/snappy v0.0.5-0.20231225225746-43d5d4cd4e0e // indirect + github.com/klauspost/compress v1.17.11 // indirect github.com/kr/pretty v0.3.1 // indirect github.com/kr/text v0.2.0 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect - github.com/mschoch/smat v0.2.0 // indirect + github.com/minio/minlz v1.0.1-0.20250507153514-87eb42fe8882 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/client_golang v1.15.0 // indirect + github.com/prometheus/client_golang v1.16.0 // indirect github.com/prometheus/client_model v0.3.0 // indirect github.com/prometheus/common v0.42.0 // indirect - github.com/prometheus/procfs v0.9.0 // indirect + github.com/prometheus/procfs v0.10.1 // indirect github.com/rogpeppe/go-internal v1.9.0 // indirect go.uber.org/atomic v1.7.0 // indirect go.uber.org/multierr v1.9.0 // indirect diff --git a/go.sum b/go.sum index 3fb8e1e..1daef80 100644 --- a/go.sum +++ b/go.sum @@ -1,29 +1,35 @@ cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4= cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= -github.com/DataDog/zstd v1.4.5 h1:EndNeuB0l9syBZhut0wns3gV1hL8zX8LIu6ZiVHWLIQ= -github.com/DataDog/zstd v1.4.5/go.mod h1:1jcaCB/ufaK+sKp1NBhlGmpz41jOoPQ35bpF36t7BBo= -github.com/RoaringBitmap/roaring v1.9.4 h1:yhEIoH4YezLYT04s1nHehNO64EKFTop/wBhxv2QzDdQ= -github.com/RoaringBitmap/roaring v1.9.4/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90= +github.com/DataDog/zstd v1.5.7 h1:ybO8RBeh29qrxIhCA9E8gKY6xfONU9T6G6aP9DTKfLE= +github.com/DataDog/zstd v1.5.7/go.mod h1:g4AWEaM3yOg3HYfnJ3YIawPnVdXJh9QME85blwSAmyw= +github.com/RaduBerinde/axisds v0.1.0 h1:YItk/RmU5nvlsv/awo2Fjx97Mfpt4JfgtEVAGPrLdz8= +github.com/RaduBerinde/axisds v0.1.0/go.mod h1:UHGJonU9z4YYGKJxSaC6/TNcLOBptpmM5m2Cksbnw0Y= +github.com/RaduBerinde/btreemap v0.0.0-20250419174037-3d62b7205d54 
h1:bsU8Tzxr/PNz75ayvCnxKZWEYdLMPDkUgticP4a4Bvk= +github.com/RaduBerinde/btreemap v0.0.0-20250419174037-3d62b7205d54/go.mod h1:0tr7FllbE9gJkHq7CVeeDDFAFKQVy5RnCSSNBOvdqbc= +github.com/aclements/go-perfevent v0.0.0-20240301234650-f7843625020f h1:JjxwchlOepwsUWcQwD2mLUAGE9aCp0/ehy6yCHFBOvo= +github.com/aclements/go-perfevent v0.0.0-20240301234650-f7843625020f/go.mod h1:tMDTce/yLLN/SK8gMOxQfnyeMeCg8KGzp0D1cbECEeo= github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ= github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= -github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA= -github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cockroachdb/datadriven v1.0.3-0.20230413201302-be42291fc80f h1:otljaYPt5hWxV3MUfO5dFPFiOXg9CyG5/kCfayTqsJ4= -github.com/cockroachdb/datadriven v1.0.3-0.20230413201302-be42291fc80f/go.mod h1:a9RdTaap04u637JoCzcUoIcDmvwSUtcUFtT/C3kJlTU= +github.com/cockroachdb/crlib v0.0.0-20241112164430-1264a2edc35b h1:SHlYZ/bMx7frnmeqCu+xm0TCxXLzX3jQIVuFbnFGtFU= +github.com/cockroachdb/crlib v0.0.0-20241112164430-1264a2edc35b/go.mod h1:Gq51ZeKaFCXk6QwuGM0w1dnaOqc/F5zKT2zA9D6Xeac= +github.com/cockroachdb/datadriven v1.0.3-0.20250407164829-2945557346d5 h1:UycK/E0TkisVrQbSoxvU827FwgBBcZ95nRRmpj/12QI= +github.com/cockroachdb/datadriven v1.0.3-0.20250407164829-2945557346d5/go.mod h1:jsaKMvD3RBCATk1/jbUZM8C9idWBJME9+VRZ5+Liq1g= github.com/cockroachdb/errors v1.11.3 h1:5bA+k2Y6r+oz/6Z/RFlNeVCesGARKuC6YymtcDrbC/I= github.com/cockroachdb/errors v1.11.3/go.mod 
h1:m4UIW4CDjx+R5cybPsNrRbreomiFqt8o1h1wUVazSd8= -github.com/cockroachdb/fifo v0.0.0-20240606204812-0bbfbd93a7ce h1:giXvy4KSc/6g/esnpM7Geqxka4WSqI1SZc7sMJFd3y4= -github.com/cockroachdb/fifo v0.0.0-20240606204812-0bbfbd93a7ce/go.mod h1:9/y3cnZ5GKakj/H4y9r9GTjCvAFta7KLgSHPJJYc52M= github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b h1:r6VH0faHjZeQy818SGhaone5OnYfxFR/+AzdY3sf5aE= github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b/go.mod h1:Vz9DsVWQQhf3vs21MhPMZpMGSht7O/2vFW2xusFUVOs= -github.com/cockroachdb/pebble v1.1.5 h1:5AAWCBWbat0uE0blr8qzufZP5tBjkRyy/jWe1QWLnvw= -github.com/cockroachdb/pebble v1.1.5/go.mod h1:17wO9el1YEigxkP/YtV8NtCivQDgoCyBg5c4VR/eOWo= +github.com/cockroachdb/metamorphic v0.0.0-20231108215700-4ba948b56895 h1:XANOgPYtvELQ/h4IrmPAohXqe2pWA8Bwhejr3VQoZsA= +github.com/cockroachdb/metamorphic v0.0.0-20231108215700-4ba948b56895/go.mod h1:aPd7gM9ov9M8v32Yy5NJrDyOcD8z642dqs+F0CeNXfA= +github.com/cockroachdb/pebble/v2 v2.1.4 h1:j9wPgMDbkErFdAKYFGhsoCcvzcjR+6zrJ4jhKtJ6bOk= +github.com/cockroachdb/pebble/v2 v2.1.4/go.mod h1:Reo1RTniv1UjVTAu/Fv74y5i3kJ5gmVrPhO9UtFiKn8= github.com/cockroachdb/redact v1.1.5 h1:u1PMllDkdFfPWaNGMyLD1+so+aq3uUItthCFqzwPJ30= github.com/cockroachdb/redact v1.1.5/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg= +github.com/cockroachdb/swiss v0.0.0-20251224182025-b0f6560f979b h1:VXvSNzmr8hMj8XTuY0PT9Ane9qZGul/p67vGYwl9BFI= +github.com/cockroachdb/swiss v0.0.0-20251224182025-b0f6560f979b/go.mod h1:yBRu/cnL4ks9bgy4vAASdjIW+/xMlFwuHKqtmh3GZQg= github.com/cockroachdb/tokenbucket v0.0.0-20230807174530-cc333fc44b06 h1:zuQyyAKVxetITBuuhv3BI9cMrmStnpT18zmgmTxunpo= github.com/cockroachdb/tokenbucket v0.0.0-20230807174530-cc333fc44b06/go.mod h1:7nc4anLGjupUW/PeY5qiNYsdNXj7zopG+eqsS7To5IQ= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= @@ -32,6 +38,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/getsentry/sentry-go v0.27.0 h1:Pv98CIbtB3LkMWmXi4Joa5OOcwbmnX88sF5qbK3r3Ps= github.com/getsentry/sentry-go v0.27.0/go.mod h1:lc76E2QywIyW8WuBnwl8Lc4bkmQH4+w1gwTf25trprY= +github.com/ghemawat/stream v0.0.0-20171120220530-696b145b53b9 h1:r5GgOLGbza2wVHRzK7aAj6lWZjfbAwiu/RDCVOKjRyM= +github.com/ghemawat/stream v0.0.0-20171120220530-696b145b53b9/go.mod h1:106OIgooyS7OzLDOpUGgm9fA3bQENb/cFSyyBmMoJDs= github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= @@ -41,8 +49,8 @@ github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgj github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= -github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/golang/snappy v0.0.5-0.20231225225746-43d5d4cd4e0e h1:4bw4WeyTYPp0smaXiJZCNnLrvVBqirQVreixayXezGc= +github.com/golang/snappy v0.0.5-0.20231225225746-43d5d4cd4e0e/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/cel-go v0.27.0 h1:e7ih85+4qVrBuqQWTW4FKSqZYokVuc3HnhH5keboFTo= github.com/google/cel-go v0.27.0/go.mod h1:tTJ11FWqnhw5KKpnWpvW9CJC3Y9GK4EIS0WXnBbebzw= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= @@ -56,16 +64,16 @@ github.com/karlseguin/expect v1.0.2-0.20190806010014-778a5f0c6003 h1:vJ0Snvo+SLM github.com/karlseguin/expect v1.0.2-0.20190806010014-778a5f0c6003/go.mod h1:zNBxMY8P21owkeogJELCLeHIt+voOSduHYTFUbwRAV8= github.com/kisielk/errcheck v1.5.0/go.mod 
h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.16.0 h1:iULayQNOReoYUe+1qtKOqw9CwJv3aNQu8ivo7lw1HU4= -github.com/klauspost/compress v1.16.0/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= +github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= -github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= -github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= +github.com/minio/minlz v1.0.1-0.20250507153514-87eb42fe8882 h1:0lgqHvJWHLGW5TuObJrfyEi6+ASTKDBWikGvPqy9Yiw= +github.com/minio/minlz v1.0.1-0.20250507153514-87eb42fe8882/go.mod h1:qT0aEB35q79LLornSzeDH75LBf3aH1MV+jB5w9Wasec= github.com/ohler55/ojg v1.21.0 h1:niqSS6yl3PQZJrqh7pKs/zinl4HebGe8urXEfpvlpYY= github.com/ohler55/ojg v1.21.0/go.mod h1:gQhDVpQLqrmnd2eqGAvJtn+NfKoYJbe/A4Sj3/Vro4o= github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4= @@ -75,21 +83,20 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod 
h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.15.0 h1:5fCgGYogn0hFdhyhLbw7hEsWxufKtY9klyvdNfFlFhM= -github.com/prometheus/client_golang v1.15.0/go.mod h1:e9yaBhRPU2pPNsZwE+JdQl0KEt1N9XgF6zxWmaC0xOk= +github.com/prometheus/client_golang v1.16.0 h1:yk/hx9hDbrGHovbci4BY+pRMfSuuat626eFsHb7tmT8= +github.com/prometheus/client_golang v1.16.0/go.mod h1:Zsulrv/L9oM40tJ7T815tM89lFEugiJ9HzIqaAx4LKc= github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI1YM= github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= -github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= -github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= +github.com/prometheus/procfs v0.10.1 h1:kYK1Va/YMlutzCGazswoHKo//tZVlFpKYh+PymziUAg= +github.com/prometheus/procfs v0.10.1/go.mod h1:nwNm2aOCAYw8uTR/9bWRREkZFxAUcWzPHWJq+XBB/FM= github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/tidwall/btree v1.7.0 
h1:L1fkJH/AuEh5zBnnBbmTwQ5Lt+bRJ5A8EWecslvo9iI= @@ -149,6 +156,5 @@ google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/kvdb.go b/kvdb.go index 7924dc6..c50afc5 100644 --- a/kvdb.go +++ b/kvdb.go @@ -4,8 +4,8 @@ import ( "os" "sync/atomic" - "github.com/cockroachdb/pebble" - "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/v2" + "github.com/cockroachdb/pebble/v2/vfs" "github.com/google/uuid" ) @@ -38,7 +38,15 @@ func NewKV[T Evaluable](o KVOpts[T]) (KV[T], error) { o.FS = vfs.Default } - db, err := pebble.Open(o.Dir, &pebble.Options{FS: o.FS}) + db, err := pebble.Open(o.Dir, &pebble.Options{ + FS: o.FS, + // cockroachdb defaults that should slightly help with faster writes + // https://github.com/cockroachdb/cockroach/blob/5a1f5da5bb3b2d962d8737848a4fca69f915dacb/pkg/storage/pebble.go#L668-L673 + L0CompactionThreshold: 2, + L0StopWritesThreshold: 1000, + MemTableSize: 64 << 20, // 64 MB + MemTableStopWritesThreshold: 4, + }) if err != nil { return nil, err } diff --git a/lift.go b/lift.go index 56a5c68..3942a88 100644 --- a/lift.go +++ b/lift.go @@ -45,7 +45,6 @@ func liftLiterals(expr string) (string, LiftedArgs) { return expr, nil } - // TODO: Lift numeric literals out of expressions. 
lp := liftParser{expr: expr} return lp.lift() } @@ -60,6 +59,12 @@ type liftParser struct { varCounter int vars pointerArgMap + + // prevChar distinguishes a numeric literal start from a digit within an identifier. + prevChar byte + + // bracketDepth: array indices must not be lifted; parseArrayAccess expects integer literals. + bracketDepth int } func (l *liftParser) lift() (string, LiftedArgs) { @@ -91,7 +96,9 @@ func (l *liftParser) lift() (string, LiftedArgs) { comment = true continue } + // prevChar must be '/' so digits immediately after (e.g. x/2) are lifted, not skipped. l.rewritten.WriteByte(char) + l.prevChar = char case '"': // Consume the string arg. val := l.consumeString('"') @@ -100,19 +107,184 @@ func (l *liftParser) lift() (string, LiftedArgs) { case '\'': val := l.consumeString('\'') l.addLiftedVar(val) - default: + case '.': + // Leading-dot float (.5): if we wrote the dot then lifted the digit we'd produce ".vars.a". + if !isIdentChar(l.prevChar) && l.idx < len(l.expr) && l.expr[l.idx] >= '0' && l.expr[l.idx] <= '9' { + l.consumeLeadingDotFloat() + } else { + l.rewritten.WriteByte(char) + l.prevChar = char + } + case '[': + l.bracketDepth++ l.rewritten.WriteByte(char) + l.prevChar = char + case ']': + l.bracketDepth-- + l.rewritten.WriteByte(char) + l.prevChar = char + default: + if char >= '0' && char <= '9' && !isIdentChar(l.prevChar) { + l.consumeNumeric(char) + } else { + l.rewritten.WriteByte(char) + l.prevChar = char + } } } return strings.TrimSpace(l.rewritten.String()), &l.vars } +// isIdentChar returns true if c can be part of an identifier (a-z, A-Z, 0-9, _). +func isIdentChar(c byte) bool { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' +} + +// consumeLeadingDotFloat lifts a leading-dot float literal (.5, .5e2). +// The dot has already been consumed; l.idx points to the first digit. 
+func (l *liftParser) consumeLeadingDotFloat() { + start := l.idx - 1 // include the leading dot + + for l.idx < len(l.expr) && l.expr[l.idx] >= '0' && l.expr[l.idx] <= '9' { + l.idx++ + } + + if l.idx < len(l.expr) && (l.expr[l.idx] == 'e' || l.expr[l.idx] == 'E') { + l.idx++ + if l.idx < len(l.expr) && (l.expr[l.idx] == '+' || l.expr[l.idx] == '-') { + l.idx++ + } + for l.idx < len(l.expr) && l.expr[l.idx] >= '0' && l.expr[l.idx] <= '9' { + l.idx++ + } + } + + numStr := l.expr[start:l.idx] + f, err := strconv.ParseFloat(numStr, 64) + if err != nil { + l.rewritten.WriteString(numStr) + if len(numStr) > 0 { + l.prevChar = numStr[len(numStr)-1] + } + return + } + l.addLiftedVar(argMapValue{parsed: f}) +} + +// consumeNumeric lifts a numeric literal so expressions differing only in value share +// the same CEL cache entry. +func (l *liftParser) consumeNumeric(first byte) { + // Array index — parseArrayAccess expects an integer literal, not vars.X. + if l.bracketDepth > 0 { + l.rewritten.WriteByte(first) + l.prevChar = first + return + } + + start := l.idx - 1 // first was already consumed (l.idx was incremented before the switch) + + // 0x/0b/0o prefix — base-10 parsing would give the wrong value. + // TODO: we can lift those as well but not a priority? + if first == '0' && l.idx < len(l.expr) { + next := l.expr[l.idx] + if (next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z') { + l.rewritten.WriteByte(first) + l.prevChar = first + return + } + } + + for l.idx < len(l.expr) && l.expr[l.idx] >= '0' && l.expr[l.idx] <= '9' { + l.idx++ + } + + // u/U suffix — lifting the digits alone leaves "u" in the expression, producing "vars.au" which is completely wrong + // TODO: we can lift those as well but not a priority? 
+ if l.idx < len(l.expr) && (l.expr[l.idx] == 'u' || l.expr[l.idx] == 'U') { + l.idx++ // consume the suffix as part of the token + numStr := l.expr[start:l.idx] + l.rewritten.WriteString(numStr) + l.prevChar = numStr[len(numStr)-1] + return + } + + // Dot is fractional only when followed by a digit; trailing dot (1.) or field accessor (.field) are not. + isFloat := false + if l.idx < len(l.expr) && l.expr[l.idx] == '.' && + l.idx+1 < len(l.expr) && l.expr[l.idx+1] >= '0' && l.expr[l.idx+1] <= '9' { + isFloat = true + l.idx++ // consume '.' + for l.idx < len(l.expr) && l.expr[l.idx] >= '0' && l.expr[l.idx] <= '9' { + l.idx++ + } + } + + // Consume e/E exponent whole; leaving "e10" would produce "vars.ae10" (field access, not a number) which is wrong as well + if l.idx < len(l.expr) && (l.expr[l.idx] == 'e' || l.expr[l.idx] == 'E') { + l.idx++ + if l.idx < len(l.expr) && (l.expr[l.idx] == '+' || l.expr[l.idx] == '-') { + l.idx++ + } + for l.idx < len(l.expr) && l.expr[l.idx] >= '0' && l.expr[l.idx] <= '9' { + l.idx++ + } + numStr := l.expr[start:l.idx] + f, err := strconv.ParseFloat(numStr, 64) + if err != nil { + l.rewritten.WriteString(numStr) + if len(numStr) > 0 { + l.prevChar = numStr[len(numStr)-1] + } + return + } + l.addLiftedVar(argMapValue{parsed: f}) + return + } + + numStr := l.expr[start:l.idx] + if isFloat { + f, err := strconv.ParseFloat(numStr, 64) + if err != nil { + l.rewritten.WriteString(numStr) + if len(numStr) > 0 { + l.prevChar = numStr[len(numStr)-1] + } + return + } + l.addLiftedVar(argMapValue{parsed: f}) + } else { + n, err := strconv.ParseInt(numStr, 10, 64) + if err != nil { + l.rewritten.WriteString(numStr) + if len(numStr) > 0 { + l.prevChar = numStr[len(numStr)-1] + } + return + } + l.addLiftedVar(argMapValue{parsed: n}) + } +} + func (l *liftParser) addLiftedVar(val argMapValue) { if l.varCounter >= len(replace) { // Do nothing. 
- str := val.get(l.expr) - l.rewritten.WriteString(strconv.Quote(str.(string))) + v := val.get(l.expr) + var s string + switch typed := v.(type) { + case string: + s = strconv.Quote(typed) + case int64: + s = strconv.FormatInt(typed, 10) + case float64: + s = strconv.FormatFloat(typed, 'f', -1, 64) + default: + s = fmt.Sprintf("%v", v) + } + l.rewritten.WriteString(s) + if len(s) > 0 { + l.prevChar = s[len(s)-1] + } return } @@ -122,6 +294,7 @@ func (l *liftParser) addLiftedVar(val argMapValue) { l.varCounter++ l.rewritten.WriteString(VarPrefix + "." + letter) + l.prevChar = letter[0] } func (l *liftParser) consumeString(quoteChar byte) argMapValue { @@ -142,7 +315,7 @@ func (l *liftParser) consumeString(quoteChar byte) argMapValue { // Skip over the end quote. l.idx++ // Return the substring offset/length - return argMapValue{offset, length} + return argMapValue{offset: offset, length: length} } // Grab the next char for evaluation. @@ -191,12 +364,17 @@ func (p pointerArgMap) Get(key string) (any, bool) { return data, true } -// argMapValue represents an offset and length for an argument in an expression string -type argMapValue [2]int +// argMapValue is either a string slice (offset/length into expr) or a pre-parsed numeric (parsed != nil). 
+type argMapValue struct { + offset, length int + parsed any +} func (a argMapValue) get(expr string) any { - data := expr[a[0] : a[0]+a[1]] - return data + if a.parsed != nil { + return a.parsed + } + return expr[a.offset : a.offset+a.length] } type regularArgMap map[string]any diff --git a/lift_test.go b/lift_test.go index 3b0d29e..b6abfea 100644 --- a/lift_test.go +++ b/lift_test.go @@ -51,11 +51,114 @@ func TestLiftLiterals(t *testing.T) { }, }, { - name: "division operator", - expr: `event.ts / 1000 > 1745436368`, - expectedStr: `event.ts / 1000 > 1745436368`, + name: "division operator", + expr: `event.ts / 1000 > 1745436368`, + expectedStr: `event.ts / vars.a > vars.b`, + expectedArgs: map[string]any{ + "a": int64(1000), + "b": int64(1745436368), + }, + }, + { + name: "hex literal not lifted", + expr: `event.data.flags == 0xFF`, + expectedStr: `event.data.flags == 0xFF`, + expectedArgs: map[string]any{}, + }, + { + name: "leading-dot float with signed negative exponent", + expr: `event.data.x > .5e-2`, + expectedStr: `event.data.x > vars.a`, + expectedArgs: map[string]any{ + "a": float64(.5e-2), + }, + }, + { + name: "leading-dot float with signed positive exponent", + expr: `event.data.x > .5e+3`, + expectedStr: `event.data.x > vars.a`, + expectedArgs: map[string]any{ + "a": float64(.5e+3), + }, + }, + { + name: "scientific notation with explicit positive exponent", + expr: `event.data.count > 1e+6`, + expectedStr: `event.data.count > vars.a`, + expectedArgs: map[string]any{ + "a": float64(1e+6), + }, + }, + { + // cel-go parses .5 as a valid float literal (DIGIT* . DIGIT+, zero digits before dot). + // Without special handling, the scanner writes "." then lifts "5" as int64 → + // producing ".vars.a", which CEL rejects as a syntax error. 
+ name: "leading-dot float lifted", + expr: `event.data.x > .5`, + expectedStr: `event.data.x > vars.a`, + expectedArgs: map[string]any{ + "a": float64(0.5), + }, + }, + { + name: "leading-dot float with exponent lifted", + expr: `event.data.x > .5e2`, + expectedStr: `event.data.x > vars.a`, + expectedArgs: map[string]any{ + "a": float64(50), + }, + }, + { + // A dot between ident characters is a field-access separator, not a float. + name: "field access dot not confused with leading-dot float", + expr: `event.data.x == "ok"`, + expectedStr: `event.data.x == vars.a`, + expectedArgs: map[string]any{ + "a": "ok", + }, + }, + { + // u/U suffix makes it an unsigned integer literal in CEL. + // Lifting just the digits would produce "vars.au" — a field access, not a uint. + name: "unsigned integer literal not lifted", + expr: `event.data.count == 42u`, + expectedStr: `event.data.count == 42u`, + expectedArgs: map[string]any{}, + }, + { + name: "unsigned integer literal with capital U not lifted", + expr: `event.data.count == 100U`, + expectedStr: `event.data.count == 100U`, expectedArgs: map[string]any{}, }, + { + // Regular integer literals next to u/U that aren't a suffix (e.g. in a string) + // should still be lifted normally. 
+ name: "integer before unrelated u identifier", + expr: `event.data.id == 5 && event.data.unit == "kg"`, + expectedStr: `event.data.id == vars.a && event.data.unit == vars.b`, + expectedArgs: map[string]any{ + "a": int64(5), + "b": "kg", + }, + }, + { + name: "array index not lifted", + expr: `event.data.ids[1] == "id-b"`, + expectedStr: `event.data.ids[1] == vars.a`, + expectedArgs: map[string]any{ + "a": "id-b", + }, + }, + { + name: "multi-dimensional array index not lifted", + expr: `event.data.ids[1] == "id-b" && event.data.ids[2] == "id-c"`, + expectedStr: `event.data.ids[1] == vars.a && event.data.ids[2] == vars.b`, + expectedArgs: map[string]any{ + "a": "id-b", + "b": "id-c", + }, + }, { name: "ignore comments", expr: `// foo`, diff --git a/vendor/github.com/DataDog/zstd/README.md b/vendor/github.com/DataDog/zstd/README.md index 03f2cf6..8443da4 100644 --- a/vendor/github.com/DataDog/zstd/README.md +++ b/vendor/github.com/DataDog/zstd/README.md @@ -6,8 +6,8 @@ [C Zstd Homepage](https://github.com/facebook/zstd) -The current headers and C files are from *v1.4.4* (Commit -[10f0e699](https://github.com/facebook/zstd/releases/tag/v1.4.4)). +The current headers and C files are from *v1.5.7* (Commit +[f8745da](https://github.com/facebook/zstd/releases/tag/v1.5.7)). ## Usage @@ -19,6 +19,21 @@ There are two main APIs: The compress/decompress APIs mirror that of lz4, while the streaming API was designed to be a drop-in replacement for zlib. +### Building against an external libzstd + +By default, zstd source code is vendored in this repository and the binding will be built with +the vendored source code bundled. + +If you want to build this binding against an external static or shared libzstd library, you can +use the `external_libzstd` build tag. This will look for the libzstd pkg-config file and extract +build and linking parameters from that pkg-config file. + +Note that it requires at least libzstd 1.4.0. 
+ +```bash +go build -tags external_libzstd +``` + ### Simple `Compress/Decompress` @@ -26,7 +41,7 @@ designed to be a drop-in replacement for zlib. // Compress compresses the byte array given in src and writes it to dst. // If you already have a buffer allocated, you can pass it to prevent allocation // If not, you can pass nil as dst. -// If the buffer is too small, it will be reallocated, resized, and returned bu the function +// If the buffer is too small, it will be reallocated, resized, and returned by the function // If dst is nil, this will allocate the worst case size (CompressBound(src)) Compress(dst, src []byte) ([]byte, error) ``` @@ -60,6 +75,9 @@ NewWriterLevelDict(w io.Writer, level int, dict []byte) *Writer // Write compresses the input data and write it to the underlying writer (w *Writer) Write(p []byte) (int, error) +// Flush writes any unwritten data to the underlying writer +(w *Writer) Flush() error + // Close flushes the buffer and frees C zstd objects (w *Writer) Close() error ``` diff --git a/vendor/github.com/DataDog/zstd/ZSTD_LICENSE b/vendor/github.com/DataDog/zstd/ZSTD_LICENSE index a793a80..7580028 100644 --- a/vendor/github.com/DataDog/zstd/ZSTD_LICENSE +++ b/vendor/github.com/DataDog/zstd/ZSTD_LICENSE @@ -2,7 +2,7 @@ BSD License For Zstandard software -Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -14,9 +14,9 @@ are permitted provided that the following conditions are met: this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name Facebook nor the names of its contributors may be used to - endorse or promote products derived from this software without specific - prior written permission. 
+ * Neither the name Facebook, nor Meta, nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED diff --git a/vendor/github.com/DataDog/zstd/allocations.h b/vendor/github.com/DataDog/zstd/allocations.h new file mode 100644 index 0000000..61bc09e --- /dev/null +++ b/vendor/github.com/DataDog/zstd/allocations.h @@ -0,0 +1,58 @@ +#ifndef USE_EXTERNAL_ZSTD +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* This file provides custom allocation primitives + */ + +#define ZSTD_DEPS_NEED_MALLOC +#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ + +#include "compiler.h" /* MEM_STATIC */ +#define ZSTD_STATIC_LINKING_ONLY +#include "zstd.h" /* ZSTD_customMem */ + +#ifndef ZSTD_ALLOCATIONS_H +#define ZSTD_ALLOCATIONS_H + +/* custom memory allocation functions */ + +MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) +{ + if (customMem.customAlloc) + return customMem.customAlloc(customMem.opaque, size); + return ZSTD_malloc(size); +} + +MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) +{ + if (customMem.customAlloc) { + /* calloc implemented as malloc+memset; + * not as efficient as calloc, but next best guess for custom malloc */ + void* const ptr = customMem.customAlloc(customMem.opaque, size); + ZSTD_memset(ptr, 0, size); + return ptr; + } + return ZSTD_calloc(1, size); +} + +MEM_STATIC void ZSTD_customFree(void* 
ptr, ZSTD_customMem customMem) +{ + if (ptr!=NULL) { + if (customMem.customFree) + customMem.customFree(customMem.opaque, ptr); + else + ZSTD_free(ptr); + } +} + +#endif /* ZSTD_ALLOCATIONS_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/bits.h b/vendor/github.com/DataDog/zstd/bits.h new file mode 100644 index 0000000..f0ba8e6 --- /dev/null +++ b/vendor/github.com/DataDog/zstd/bits.h @@ -0,0 +1,208 @@ +#ifndef USE_EXTERNAL_ZSTD +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_BITS_H +#define ZSTD_BITS_H + +#include "mem.h" + +MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) +{ + assert(val != 0); + { + static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, + 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, + 26, 12, 18, 6, 11, 5, 10, 9}; + return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; + } +} + +MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) +{ + assert(val != 0); +#if defined(_MSC_VER) +# if STATIC_BMI2 + return (unsigned)_tzcnt_u32(val); +# else + if (val != 0) { + unsigned long r; + _BitScanForward(&r, val); + return (unsigned)r; + } else { + __assume(0); /* Should not reach this code path */ + } +# endif +#elif defined(__GNUC__) && (__GNUC__ >= 4) + return (unsigned)__builtin_ctz(val); +#elif defined(__ICCARM__) + return (unsigned)__builtin_ctz(val); +#else + return ZSTD_countTrailingZeros32_fallback(val); +#endif +} + +MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) +{ + assert(val != 0); + { + static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, + 11, 14, 16, 18, 22, 25, 
3, 30, + 8, 12, 20, 28, 15, 17, 24, 7, + 19, 27, 23, 6, 26, 5, 4, 31}; + val |= val >> 1; + val |= val >> 2; + val |= val >> 4; + val |= val >> 8; + val |= val >> 16; + return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; + } +} + +MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) +{ + assert(val != 0); +#if defined(_MSC_VER) +# if STATIC_BMI2 + return (unsigned)_lzcnt_u32(val); +# else + if (val != 0) { + unsigned long r; + _BitScanReverse(&r, val); + return (unsigned)(31 - r); + } else { + __assume(0); /* Should not reach this code path */ + } +# endif +#elif defined(__GNUC__) && (__GNUC__ >= 4) + return (unsigned)__builtin_clz(val); +#elif defined(__ICCARM__) + return (unsigned)__builtin_clz(val); +#else + return ZSTD_countLeadingZeros32_fallback(val); +#endif +} + +MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) +{ + assert(val != 0); +#if defined(_MSC_VER) && defined(_WIN64) +# if STATIC_BMI2 + return (unsigned)_tzcnt_u64(val); +# else + if (val != 0) { + unsigned long r; + _BitScanForward64(&r, val); + return (unsigned)r; + } else { + __assume(0); /* Should not reach this code path */ + } +# endif +#elif defined(__GNUC__) && (__GNUC__ >= 4) && defined(__LP64__) + return (unsigned)__builtin_ctzll(val); +#elif defined(__ICCARM__) + return (unsigned)__builtin_ctzll(val); +#else + { + U32 mostSignificantWord = (U32)(val >> 32); + U32 leastSignificantWord = (U32)val; + if (leastSignificantWord == 0) { + return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); + } else { + return ZSTD_countTrailingZeros32(leastSignificantWord); + } + } +#endif +} + +MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) +{ + assert(val != 0); +#if defined(_MSC_VER) && defined(_WIN64) +# if STATIC_BMI2 + return (unsigned)_lzcnt_u64(val); +# else + if (val != 0) { + unsigned long r; + _BitScanReverse64(&r, val); + return (unsigned)(63 - r); + } else { + __assume(0); /* Should not reach this code path */ + } +# endif +#elif defined(__GNUC__) && (__GNUC__ >= 4) + return 
(unsigned)(__builtin_clzll(val)); +#elif defined(__ICCARM__) + return (unsigned)(__builtin_clzll(val)); +#else + { + U32 mostSignificantWord = (U32)(val >> 32); + U32 leastSignificantWord = (U32)val; + if (mostSignificantWord == 0) { + return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); + } else { + return ZSTD_countLeadingZeros32(mostSignificantWord); + } + } +#endif +} + +MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) +{ + if (MEM_isLittleEndian()) { + if (MEM_64bits()) { + return ZSTD_countTrailingZeros64((U64)val) >> 3; + } else { + return ZSTD_countTrailingZeros32((U32)val) >> 3; + } + } else { /* Big Endian CPU */ + if (MEM_64bits()) { + return ZSTD_countLeadingZeros64((U64)val) >> 3; + } else { + return ZSTD_countLeadingZeros32((U32)val) >> 3; + } + } +} + +MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +{ + assert(val != 0); + return 31 - ZSTD_countLeadingZeros32(val); +} + +/* ZSTD_rotateRight_*(): + * Rotates a bitfield to the right by "count" bits. 
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts + */ +MEM_STATIC +U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { + assert(count < 64); + count &= 0x3F; /* for fickle pattern recognition */ + return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +} + +MEM_STATIC +U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { + assert(count < 32); + count &= 0x1F; /* for fickle pattern recognition */ + return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +} + +MEM_STATIC +U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { + assert(count < 16); + count &= 0x0F; /* for fickle pattern recognition */ + return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); +} + +#endif /* ZSTD_BITS_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/bitstream.h b/vendor/github.com/DataDog/zstd/bitstream.h index 1c294b8..0323212 100644 --- a/vendor/github.com/DataDog/zstd/bitstream.h +++ b/vendor/github.com/DataDog/zstd/bitstream.h @@ -1,43 +1,20 @@ +#ifndef USE_EXTERNAL_ZSTD /* ****************************************************************** - bitstream - Part of FSE library - Copyright (C) 2013-present, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * bitstream + * Part of FSE library + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. ****************************************************************** */ #ifndef BITSTREAM_H_MODULE #define BITSTREAM_H_MODULE -#if defined (__cplusplus) -extern "C" { -#endif - /* * This API consists of small unitary functions, which must be inlined for best performance. 
* Since link-time-optimization is not available for all compilers, @@ -48,17 +25,20 @@ extern "C" { * Dependencies ******************************************/ #include "mem.h" /* unaligned access routines */ +#include "compiler.h" /* UNLIKELY() */ #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ #include "error_private.h" /* error codes and messages */ - +#include "bits.h" /* ZSTD_highbit32 */ /*========================================= * Target specific =========================================*/ -#if defined(__BMI__) && defined(__GNUC__) -# include /* support for bextr (experimental) */ -#elif defined(__ICCARM__) -# include +#ifndef ZSTD_NO_INTRINSICS +# if (defined(__BMI__) || defined(__BMI2__)) && defined(__GNUC__) +# include /* support for bextr (experimental)/bzhi */ +# elif defined(__ICCARM__) +# include +# endif #endif #define STREAM_ACCUMULATOR_MIN_32 25 @@ -69,12 +49,13 @@ extern "C" { /*-****************************************** * bitStream encoding API (write forward) ********************************************/ +typedef size_t BitContainerType; /* bitStream can mix input from multiple sources. * A critical property of these streams is that they encode and decode in **reverse** direction. * So the first bit sequence you add will be the last to be read, like a LIFO stack. 
*/ typedef struct { - size_t bitContainer; + BitContainerType bitContainer; unsigned bitPos; char* startPtr; char* ptr; @@ -82,7 +63,7 @@ typedef struct { } BIT_CStream_t; MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity); -MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits); +MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits); MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC); MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); @@ -91,7 +72,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); * `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code. * * bits are first added to a local register. -* Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems. +* Local register is BitContainerType, 64-bits on 64-bits systems, or 32-bits on 32-bits systems. * Writing data into memory is an explicit operation, performed by the flushBits function. * Hence keep track how many bits are potentially stored into local register to avoid register overflow. * After a flushBits, a maximum of 7 bits might still be stored into local register. @@ -108,28 +89,28 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); * bitStream decoding API (read backward) **********************************************/ typedef struct { - size_t bitContainer; + BitContainerType bitContainer; unsigned bitsConsumed; const char* ptr; const char* start; const char* limitPtr; } BIT_DStream_t; -typedef enum { BIT_DStream_unfinished = 0, - BIT_DStream_endOfBuffer = 1, - BIT_DStream_completed = 2, - BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ - /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... 
:( */ +typedef enum { BIT_DStream_unfinished = 0, /* fully refilled */ + BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */ + BIT_DStream_completed = 2, /* bitstream entirely consumed, bit-exact */ + BIT_DStream_overflow = 3 /* user requested more bits than present in bitstream */ + } BIT_DStream_status; /* result of BIT_reloadDStream() */ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); -MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); +MEM_STATIC BitContainerType BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD); MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); /* Start by invoking BIT_initDStream(). * A chunk of the bitStream is then stored into a local register. -* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). +* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType). * You can then retrieve bitFields stored into the local register, **in reverse order**. * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. 
@@ -141,7 +122,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); /*-**************************************** * unsafe API ******************************************/ -MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits); +MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits); /* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); @@ -150,39 +131,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); /* faster, but works only if nbBits >= 1 */ - - -/*-************************************************************** -* Internal functions -****************************************************************/ -MEM_STATIC unsigned BIT_highbit32 (U32 val) -{ - assert(val != 0); - { -# if defined(_MSC_VER) /* Visual */ - unsigned long r=0; - _BitScanReverse ( &r, val ); - return (unsigned) r; -# elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */ - return __builtin_clz (val) ^ 31; -# elif defined(__ICCARM__) /* IAR Intrinsic */ - return 31 - __CLZ(val); -# else /* Software version */ - static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, - 11, 14, 16, 18, 22, 25, 3, 30, - 8, 12, 20, 28, 15, 17, 24, 7, - 19, 27, 23, 6, 26, 5, 4, 31 }; - U32 v = val; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; -# endif - } -} - /*===== Local Constants =====*/ static const unsigned BIT_mask[] = { 0, 1, 3, 7, 0xF, 0x1F, @@ -212,16 +160,31 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, return 0; } +FORCE_INLINE_TEMPLATE BitContainerType BIT_getLowerBits(BitContainerType bitContainer, U32 const nbBits) +{ +#if STATIC_BMI2 && !defined(ZSTD_NO_INTRINSICS) +# if (defined(__x86_64__) || defined(_M_X64)) && 
!defined(__ILP32__) + return _bzhi_u64(bitContainer, nbBits); +# else + DEBUG_STATIC_ASSERT(sizeof(bitContainer) == sizeof(U32)); + return _bzhi_u32(bitContainer, nbBits); +# endif +#else + assert(nbBits < BIT_MASK_SIZE); + return bitContainer & BIT_mask[nbBits]; +#endif +} + /*! BIT_addBits() : * can add up to 31 bits into `bitC`. * Note : does not check for register overflow ! */ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, - size_t value, unsigned nbBits) + BitContainerType value, unsigned nbBits) { - MEM_STATIC_ASSERT(BIT_MASK_SIZE == 32); + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); assert(nbBits < BIT_MASK_SIZE); assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); - bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; + bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; bitC->bitPos += nbBits; } @@ -229,7 +192,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, * works only if `value` is _clean_, * meaning all high bits above nbBits are 0 */ MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, - size_t value, unsigned nbBits) + BitContainerType value, unsigned nbBits) { assert((value>>nbBits) == 0); assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); @@ -276,7 +239,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC) BIT_addBitsFast(bitC, 1, 1); /* endMark */ BIT_flushBits(bitC); if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ - return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); + return (size_t)(bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); } @@ -291,7 +254,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC) */ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize) { - if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); } + if (srcSize < 1) { ZSTD_memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); } bitD->start = (const char*)srcBuffer; bitD->limitPtr = bitD->start + 
sizeof(bitD->bitContainer); @@ -300,35 +263,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); bitD->bitContainer = MEM_readLEST(bitD->ptr); { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; - bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } } else { bitD->ptr = bitD->start; bitD->bitContainer = *(const BYTE*)(bitD->start); switch(srcSize) { - case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); - /* fall-through */ + case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + ZSTD_FALLTHROUGH; - case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); - /* fall-through */ + case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + ZSTD_FALLTHROUGH; - case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); - /* fall-through */ + case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + ZSTD_FALLTHROUGH; - case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; - /* fall-through */ + case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24; + ZSTD_FALLTHROUGH; - case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; - /* fall-through */ + case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16; + ZSTD_FALLTHROUGH; - case 2: bitD->bitContainer += (size_t)(((const 
BYTE*)(srcBuffer))[1]) << 8; - /* fall-through */ + case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) << 8; + ZSTD_FALLTHROUGH; default: break; } { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; - bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; + bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ } bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; @@ -337,23 +300,26 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si return srcSize; } -MEM_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start) +FORCE_INLINE_TEMPLATE BitContainerType BIT_getUpperBits(BitContainerType bitContainer, U32 const start) { return bitContainer >> start; } -MEM_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) +FORCE_INLINE_TEMPLATE BitContainerType BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) { U32 const regMask = sizeof(bitContainer)*8 - 1; /* if start > regMask, bitstream is corrupted, and result is undefined */ assert(nbBits < BIT_MASK_SIZE); + /* x86 transform & ((1 << nbBits) - 1) to bzhi instruction, it is better + * than accessing memory. When bmi2 instruction is not present, we consider + * such cpus old (pre-Haswell, 2013) and their performance is not of that + * importance. + */ +#if defined(__x86_64__) || defined(_M_X64) + return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1); +#else return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; -} - -MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) -{ - assert(nbBits < BIT_MASK_SIZE); - return bitContainer & BIT_mask[nbBits]; +#endif } /*! BIT_lookBits() : @@ -362,7 +328,7 @@ MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) * On 32-bits, maxNbBits==24. 
* On 64-bits, maxNbBits==56. * @return : value extracted */ -MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) +FORCE_INLINE_TEMPLATE BitContainerType BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) { /* arbitrate between double-shift and shift+mask */ #if 1 @@ -378,14 +344,14 @@ MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) /*! BIT_lookBitsFast() : * unsafe version; only works if nbBits >= 1 */ -MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) +MEM_STATIC BitContainerType BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) { U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; assert(nbBits >= 1); return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); } -MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) +FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) { bitD->bitsConsumed += nbBits; } @@ -394,44 +360,77 @@ MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) * Read (consume) next n bits from local register and update. * Pay attention to not read more than nbBits contained into local register. * @return : extracted value. */ -MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) +FORCE_INLINE_TEMPLATE BitContainerType BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) { - size_t const value = BIT_lookBits(bitD, nbBits); + BitContainerType const value = BIT_lookBits(bitD, nbBits); BIT_skipBits(bitD, nbBits); return value; } /*! 
BIT_readBitsFast() : - * unsafe version; only works only if nbBits >= 1 */ -MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + * unsafe version; only works if nbBits >= 1 */ +MEM_STATIC BitContainerType BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) { - size_t const value = BIT_lookBitsFast(bitD, nbBits); + BitContainerType const value = BIT_lookBitsFast(bitD, nbBits); assert(nbBits >= 1); BIT_skipBits(bitD, nbBits); return value; } +/*! BIT_reloadDStream_internal() : + * Simple variant of BIT_reloadDStream(), with two conditions: + * 1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8 + * 2. look window is valid after shifted down : bitD->ptr >= bitD->start + */ +MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD) +{ + assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); + bitD->ptr -= bitD->bitsConsumed >> 3; + assert(bitD->ptr >= bitD->start); + bitD->bitsConsumed &= 7; + bitD->bitContainer = MEM_readLEST(bitD->ptr); + return BIT_DStream_unfinished; +} + +/*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! + * 2. Returns BIT_DStream_overflow when bitD->ptr < bitD->limitPtr, at this + * point you must use BIT_reloadDStream() to reload. + */ +MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) +{ + if (UNLIKELY(bitD->ptr < bitD->limitPtr)) + return BIT_DStream_overflow; + return BIT_reloadDStream_internal(bitD); +} + /*! BIT_reloadDStream() : * Refill `bitD` from buffer previously set in BIT_initDStream() . - * This function is safe, it guarantees it will not read beyond src buffer. + * This function is safe, it guarantees it will not never beyond src buffer. * @return : status of `BIT_DStream_t` internal register. 
* when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ -MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) +FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) { - if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ + /* note : once in overflow mode, a bitstream remains in this mode until it's reset */ + if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) { + static const BitContainerType zeroFilled = 0; + bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */ + /* overflow detected, erroneous scenario or end of stream: no update */ return BIT_DStream_overflow; + } + + assert(bitD->ptr >= bitD->start); if (bitD->ptr >= bitD->limitPtr) { - bitD->ptr -= bitD->bitsConsumed >> 3; - bitD->bitsConsumed &= 7; - bitD->bitContainer = MEM_readLEST(bitD->ptr); - return BIT_DStream_unfinished; + return BIT_reloadDStream_internal(bitD); } if (bitD->ptr == bitD->start) { + /* reached end of bitStream => no update */ if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; return BIT_DStream_completed; } - /* start < ptr < limitPtr */ + /* start < ptr < limitPtr => cautious update */ { U32 nbBytes = bitD->bitsConsumed >> 3; BIT_DStream_status result = BIT_DStream_unfinished; if (bitD->ptr - nbBytes < bitD->start) { @@ -453,8 +452,6 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream) return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8)); } -#if defined (__cplusplus) -} -#endif - #endif /* BITSTREAM_H_MODULE */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/clevels.h b/vendor/github.com/DataDog/zstd/clevels.h new file mode 100644 index 0000000..9215306 --- /dev/null +++ b/vendor/github.com/DataDog/zstd/clevels.h @@ -0,0 +1,137 @@ +#ifndef USE_EXTERNAL_ZSTD +/* + * Copyright (c) Meta 
Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_CLEVELS_H +#define ZSTD_CLEVELS_H + +#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_compressionParameters */ +#include "zstd.h" + +/*-===== Pre-defined compression levels =====-*/ + +#define ZSTD_MAX_CLEVEL 22 + +#ifdef __GNUC__ +__attribute__((__unused__)) +#endif + +static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = { +{ /* "default" - for any srcSize > 256 KB */ + /* W, C, H, S, L, TL, strat */ + { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */ + { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */ + { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */ + { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */ + { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */ + { 21, 18, 19, 3, 5, 2, ZSTD_greedy }, /* level 5 */ + { 21, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6 */ + { 21, 19, 20, 4, 5, 8, ZSTD_lazy }, /* level 7 */ + { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 8 */ + { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ + { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 10 */ + { 22, 21, 22, 6, 5, 16, ZSTD_lazy2 }, /* level 11 */ + { 22, 22, 23, 6, 5, 32, ZSTD_lazy2 }, /* level 12 */ + { 22, 22, 22, 4, 5, 32, ZSTD_btlazy2 }, /* level 13 */ + { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ + { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ + { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */ + { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */ + { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */ + { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */ + { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */ + { 26, 26, 24, 7, 
3,512, ZSTD_btultra2}, /* level 21 */ + { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */ +}, +{ /* for srcSize <= 256 KB */ + /* W, C, H, S, L, T, strat */ + { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ + { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */ + { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */ + { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */ + { 18, 16, 17, 3, 5, 2, ZSTD_greedy }, /* level 4.*/ + { 18, 17, 18, 5, 5, 2, ZSTD_greedy }, /* level 5.*/ + { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/ + { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */ + { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ + { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ + { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ + { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/ + { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/ + { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */ + { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ + { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/ + { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/ + { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/ + { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/ + { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +{ /* for srcSize <= 128 KB */ + /* W, C, H, S, L, T, strat */ + { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ + { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */ + { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */ + { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */ + { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */ + { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */ + { 17, 16, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ + { 17, 16, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 17, 16, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ 
+ { 17, 16, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ + { 17, 16, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ + { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */ + { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */ + { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/ + { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ + { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/ + { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/ + { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/ + { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/ + { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +{ /* for srcSize <= 16 KB */ + /* W, C, H, S, L, T, strat */ + { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ + { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */ + { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */ + { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */ + { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */ + { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/ + { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */ + { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/ + { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/ + { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/ + { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/ + { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/ + { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/ + { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/ + { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/ + { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/ + { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/ + { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/ + { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 14, 15, 15, 8, 3,512, 
ZSTD_btultra2}, /* level 20.*/ + { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +}; + + + +#endif /* ZSTD_CLEVELS_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/compiler.h b/vendor/github.com/DataDog/zstd/compiler.h index 1877a0c..2ab3100 100644 --- a/vendor/github.com/DataDog/zstd/compiler.h +++ b/vendor/github.com/DataDog/zstd/compiler.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,19 +12,23 @@ #ifndef ZSTD_COMPILER_H #define ZSTD_COMPILER_H +#include + +#include "portability_macros.h" + /*-******************************************************* * Compiler specifics *********************************************************/ /* force inlining */ #if !defined(ZSTD_NO_INLINE) -#if defined (__GNUC__) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +#if (defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ # define INLINE_KEYWORD inline #else # define INLINE_KEYWORD #endif -#if defined(__GNUC__) || defined(__ICCARM__) +#if defined(__GNUC__) || defined(__IAR_SYSTEMS_ICC__) # define FORCE_INLINE_ATTR __attribute__((always_inline)) #elif defined(_MSC_VER) # define FORCE_INLINE_ATTR __forceinline @@ -38,12 +43,30 @@ #endif +/** + On MSVC qsort requires that functions passed into it use the __cdecl calling conversion(CC). + This explicitly marks such functions as __cdecl so that the code will still compile + if a CC other than __cdecl has been made the default. 
+*/ +#if defined(_MSC_VER) +# define WIN_CDECL __cdecl +#else +# define WIN_CDECL +#endif + +/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ +#if defined(__GNUC__) || defined(__IAR_SYSTEMS_ICC__) +# define UNUSED_ATTR __attribute__((unused)) +#else +# define UNUSED_ATTR +#endif + /** * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant * parameters. They must be inlined for the compiler to eliminate the constant * branches. */ -#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR +#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR /** * HINT_INLINE is used to help the compiler generate better code. It is *not* * used for "templates", so it can be tweaked based on the compilers @@ -58,85 +81,95 @@ #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 # define HINT_INLINE static INLINE_KEYWORD #else -# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR +# define HINT_INLINE FORCE_INLINE_TEMPLATE #endif -/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ +/* "soft" inline : + * The compiler is free to select if it's a good idea to inline or not. + * The main objective is to silence compiler warnings + * when a defined function in included but not used. + * + * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit. + * Updating the prefix is probably preferable, but requires a fairly large codemod, + * since this name is used everywhere. 
+ */ +#ifndef MEM_STATIC /* already defined in Linux Kernel mem.h */ #if defined(__GNUC__) -# define UNUSED_ATTR __attribute__((unused)) +# define MEM_STATIC static __inline UNUSED_ATTR +#elif defined(__IAR_SYSTEMS_ICC__) +# define MEM_STATIC static inline UNUSED_ATTR +#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define MEM_STATIC static inline +#elif defined(_MSC_VER) +# define MEM_STATIC static __inline #else -# define UNUSED_ATTR +# define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ +#endif #endif /* force no inlining */ #ifdef _MSC_VER # define FORCE_NOINLINE static __declspec(noinline) #else -# if defined(__GNUC__) || defined(__ICCARM__) +# if defined(__GNUC__) || defined(__IAR_SYSTEMS_ICC__) # define FORCE_NOINLINE static __attribute__((__noinline__)) # else # define FORCE_NOINLINE static # endif #endif + /* target attribute */ -#ifndef __has_attribute - #define __has_attribute(x) 0 /* Compatibility with non-clang compilers. */ -#endif -#if defined(__GNUC__) || defined(__ICCARM__) +#if defined(__GNUC__) || defined(__IAR_SYSTEMS_ICC__) # define TARGET_ATTRIBUTE(target) __attribute__((__target__(target))) #else # define TARGET_ATTRIBUTE(target) #endif -/* Enable runtime BMI2 dispatch based on the CPU. - * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default. +/* Target attribute for BMI2 dynamic dispatch. + * Enable lzcnt, bmi, and bmi2. + * We test for bmi1 & bmi2. lzcnt is included in bmi1. 
*/ -#ifndef DYNAMIC_BMI2 - #if ((defined(__clang__) && __has_attribute(__target__)) \ - || (defined(__GNUC__) \ - && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \ - && (defined(__x86_64__) || defined(_M_X86)) \ - && !defined(__BMI2__) - # define DYNAMIC_BMI2 1 - #else - # define DYNAMIC_BMI2 0 - #endif -#endif +#define BMI2_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("lzcnt,bmi,bmi2") /* prefetch * can be disabled, by declaring NO_PREFETCH build macro */ #if defined(NO_PREFETCH) -# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ -# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ +# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ +# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ #else -# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) && !defined(_M_ARM64EC) /* _mm_prefetch() is not defined outside of x86/x64 */ # include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ # define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) # define PREFETCH_L2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1) # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) # define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) # define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) +# elif defined(__aarch64__) +# define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) +# define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) # else -# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ -# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ +# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ +# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ # 
endif #endif /* NO_PREFETCH */ #define CACHELINE_SIZE 64 -#define PREFETCH_AREA(p, s) { \ - const char* const _ptr = (const char*)(p); \ - size_t const _size = (size_t)(s); \ - size_t _pos; \ - for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ - PREFETCH_L2(_ptr + _pos); \ - } \ -} +#define PREFETCH_AREA(p, s) \ + do { \ + const char* const _ptr = (const char*)(p); \ + size_t const _size = (size_t)(s); \ + size_t _pos; \ + for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ + PREFETCH_L2(_ptr + _pos); \ + } \ + } while (0) /* vectorization - * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax */ -#if !defined(__clang__) && defined(__GNUC__) + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, + * and some compilers, like Intel ICC and MCST LCC, do not support it at all. */ +#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__) && !defined(__LCC__) # if (__GNUC__ == 4 && __GNUC_MINOR__ > 3) || (__GNUC__ >= 5) # define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) # else @@ -146,6 +179,25 @@ # define DONT_VECTORIZE #endif +/* Tell the compiler that a branch is likely or unlikely. + * Only use these macros if it causes the compiler to generate better code. + * If you can remove a LIKELY/UNLIKELY annotation without speed changes in gcc + * and clang, please do. 
+ */ +#if defined(__GNUC__) +#define LIKELY(x) (__builtin_expect((x), 1)) +#define UNLIKELY(x) (__builtin_expect((x), 0)) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +#if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) +# define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0) +#else +# define ZSTD_UNREACHABLE do { assert(0); } while (0) +#endif + /* disable warnings */ #ifdef _MSC_VER /* Visual Studio */ # include /* For Visual 2005 */ @@ -156,4 +208,260 @@ # pragma warning(disable : 4324) /* disable: C4324: padded structure */ #endif +/* compile time determination of SIMD support */ +#if !defined(ZSTD_NO_INTRINSICS) +# if defined(__AVX2__) +# define ZSTD_ARCH_X86_AVX2 +# endif +# if defined(__SSE2__) || defined(_M_X64) || (defined (_M_IX86) && defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) +# define ZSTD_ARCH_X86_SSE2 +# endif +# if defined(__ARM_NEON) || defined(_M_ARM64) +# define ZSTD_ARCH_ARM_NEON +# endif +# +# if defined(ZSTD_ARCH_X86_AVX2) +# include +# endif +# if defined(ZSTD_ARCH_X86_SSE2) +# include +# elif defined(ZSTD_ARCH_ARM_NEON) +# include +# endif +#endif + +/* C-language Attributes are added in C23. */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute) +# define ZSTD_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) +#else +# define ZSTD_HAS_C_ATTRIBUTE(x) 0 +#endif + +/* Only use C++ attributes in C++. Some compilers report support for C++ + * attributes when compiling with C. + */ +#if defined(__cplusplus) && defined(__has_cpp_attribute) +# define ZSTD_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +# define ZSTD_HAS_CPP_ATTRIBUTE(x) 0 +#endif + +/* Define ZSTD_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute. 
+ * - C23: https://en.cppreference.com/w/c/language/attributes/fallthrough + * - CPP17: https://en.cppreference.com/w/cpp/language/attributes/fallthrough + * - Else: __attribute__((__fallthrough__)) + */ +#ifndef ZSTD_FALLTHROUGH +# if ZSTD_HAS_C_ATTRIBUTE(fallthrough) +# define ZSTD_FALLTHROUGH [[fallthrough]] +# elif ZSTD_HAS_CPP_ATTRIBUTE(fallthrough) +# define ZSTD_FALLTHROUGH [[fallthrough]] +# elif __has_attribute(__fallthrough__) +/* Leading semicolon is to satisfy gcc-11 with -pedantic. Without the semicolon + * gcc complains about: a label can only be part of a statement and a declaration is not a statement. + */ +# define ZSTD_FALLTHROUGH ; __attribute__((__fallthrough__)) +# else +# define ZSTD_FALLTHROUGH +# endif +#endif + +/*-************************************************************** +* Alignment +*****************************************************************/ + +/* @return 1 if @u is a 2^n value, 0 otherwise + * useful to check a value is valid for alignment restrictions */ +MEM_STATIC int ZSTD_isPower2(size_t u) { + return (u & (u-1)) == 0; +} + +/* this test was initially positioned in mem.h, + * but this file is removed (or replaced) for linux kernel + * so it's now hosted in compiler.h, + * which remains valid for both user & kernel spaces. + */ + +#ifndef ZSTD_ALIGNOF +# if defined(__GNUC__) || defined(_MSC_VER) +/* covers gcc, clang & MSVC */ +/* note : this section must come first, before C11, + * due to a limitation in the kernel source generator */ +# define ZSTD_ALIGNOF(T) __alignof(T) + +# elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) +/* C11 support */ +# include +# define ZSTD_ALIGNOF(T) alignof(T) + +# else +/* No known support for alignof() - imperfect backup */ +# define ZSTD_ALIGNOF(T) (sizeof(void*) < sizeof(T) ? sizeof(void*) : sizeof(T)) + +# endif +#endif /* ZSTD_ALIGNOF */ + +#ifndef ZSTD_ALIGNED +/* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. 
*/ +# if defined(__GNUC__) || defined(__clang__) +# define ZSTD_ALIGNED(a) __attribute__((aligned(a))) +# elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ +# define ZSTD_ALIGNED(a) _Alignas(a) +#elif defined(_MSC_VER) +# define ZSTD_ALIGNED(n) __declspec(align(n)) +# else + /* this compiler will require its own alignment instruction */ +# define ZSTD_ALIGNED(...) +# endif +#endif /* ZSTD_ALIGNED */ + + +/*-************************************************************** +* Sanitizer +*****************************************************************/ + +/** + * Zstd relies on pointer overflow in its decompressor. + * We add this attribute to functions that rely on pointer overflow. + */ +#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +# if __has_attribute(no_sanitize) +# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8 + /* gcc < 8 only has signed-integer-overlow which triggers on pointer overflow */ +# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow"))) +# else + /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */ +# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow"))) +# endif +# else +# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +# endif +#endif + +/** + * Helper function to perform a wrapped pointer difference without triggering + * UBSAN. + * + * @returns lhs - rhs with wrapping + */ +MEM_STATIC +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs) +{ + return lhs - rhs; +} + +/** + * Helper function to perform a wrapped pointer add without triggering UBSAN. + * + * @return ptr + add with wrapping + */ +MEM_STATIC +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add) +{ + return ptr + add; +} + +/** + * Helper function to perform a wrapped pointer subtraction without triggering + * UBSAN. 
+ * + * @return ptr - sub with wrapping + */ +MEM_STATIC +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub) +{ + return ptr - sub; +} + +/** + * Helper function to add to a pointer that works around C's undefined behavior + * of adding 0 to NULL. + * + * @returns `ptr + add` except it defines `NULL + 0 == NULL`. + */ +MEM_STATIC +unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add) +{ + return add > 0 ? ptr + add : ptr; +} + +/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an + * abundance of caution, disable our custom poisoning on mingw. */ +#ifdef __MINGW32__ +#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE +#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 +#endif +#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE +#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 +#endif +#endif + +#if ZSTD_MEMORY_SANITIZER && !defined(ZSTD_MSAN_DONT_POISON_WORKSPACE) +/* Not all platforms that support msan provide sanitizers/msan_interface.h. + * We therefore declare the functions we need ourselves, rather than trying to + * include the header file... */ +#include /* size_t */ +#define ZSTD_DEPS_NEED_STDINT +#include "zstd_deps.h" /* intptr_t */ + +/* Make memory region fully initialized (without changing its contents). */ +void __msan_unpoison(const volatile void *a, size_t size); + +/* Make memory region fully uninitialized (without changing its contents). + This is a legacy interface that does not update origin information. Use + __msan_allocated_memory() instead. */ +void __msan_poison(const volatile void *a, size_t size); + +/* Returns the offset of the first (at least partially) poisoned byte in the + memory range, or -1 if the whole range is good. */ +intptr_t __msan_test_shadow(const volatile void *x, size_t size); + +/* Print shadow and origin for the memory range to stderr in a human-readable + format. 
*/ +void __msan_print_shadow(const volatile void *x, size_t size); +#endif + +#if ZSTD_ADDRESS_SANITIZER && !defined(ZSTD_ASAN_DONT_POISON_WORKSPACE) +/* Not all platforms that support asan provide sanitizers/asan_interface.h. + * We therefore declare the functions we need ourselves, rather than trying to + * include the header file... */ +#include /* size_t */ + +/** + * Marks a memory region ([addr, addr+size)) as unaddressable. + * + * This memory must be previously allocated by your program. Instrumented + * code is forbidden from accessing addresses in this region until it is + * unpoisoned. This function is not guaranteed to poison the entire region - + * it could poison only a subregion of [addr, addr+size) due to ASan + * alignment restrictions. + * + * \note This function is not thread-safe because no two threads can poison or + * unpoison memory in the same memory region simultaneously. + * + * \param addr Start of memory region. + * \param size Size of memory region. */ +void __asan_poison_memory_region(void const volatile *addr, size_t size); + +/** + * Marks a memory region ([addr, addr+size)) as addressable. + * + * This memory must be previously allocated by your program. Accessing + * addresses in this region is allowed until this region is poisoned again. + * This function could unpoison a super-region of [addr, addr+size) due + * to ASan alignment restrictions. + * + * \note This function is not thread-safe because no two threads can + * poison or unpoison memory in the same memory region simultaneously. + * + * \param addr Start of memory region. + * \param size Size of memory region. 
*/ +void __asan_unpoison_memory_region(void const volatile *addr, size_t size); +#endif + #endif /* ZSTD_COMPILER_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/cover.c b/vendor/github.com/DataDog/zstd/cover.c index 2e129dd..7445156 100644 --- a/vendor/github.com/DataDog/zstd/cover.c +++ b/vendor/github.com/DataDog/zstd/cover.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -21,52 +22,79 @@ /*-************************************* * Dependencies ***************************************/ +/* qsort_r is an extension. */ +#if defined(__linux) || defined(__linux__) || defined(linux) || defined(__gnu_linux__) || \ + defined(__CYGWIN__) || defined(__MSYS__) +#if !defined(_GNU_SOURCE) && !defined(__ANDROID__) /* NDK doesn't ship qsort_r(). */ +#define _GNU_SOURCE +#endif +#endif + #include /* fprintf */ -#include /* malloc, free, qsort */ +#include /* malloc, free, qsort_r */ + #include /* memset */ #include /* clock */ -#include "mem.h" /* read */ -#include "pool.h" -#include "threading.h" -#include "cover.h" -#include "zstd_internal.h" /* includes zstd.h */ #ifndef ZDICT_STATIC_LINKING_ONLY -#define ZDICT_STATIC_LINKING_ONLY +# define ZDICT_STATIC_LINKING_ONLY #endif + +#include "mem.h" /* read */ +#include "pool.h" /* POOL_ctx */ +#include "threading.h" /* ZSTD_pthread_mutex_t */ +#include "zstd_internal.h" /* includes zstd.h */ +#include "bits.h" /* ZSTD_highbit32 */ #include "zdict.h" +#include "cover.h" /*-************************************* * Constants ***************************************/ +/** +* There are 32bit indexes used to ref samples, so limit samples size to 4GB +* on 64bit builds. +* For 32bit builds we choose 1 GB. 
+* Most 32bit platforms have 2GB user-mode addressable space and we allocate a large +* contiguous buffer, so 1GB is already a high limit. +*/ #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB)) -#define DEFAULT_SPLITPOINT 1.0 +#define COVER_DEFAULT_SPLITPOINT 1.0 /*-************************************* * Console display ***************************************/ -static int g_displayLevel = 2; +#ifndef LOCALDISPLAYLEVEL +static int g_displayLevel = 0; +#endif +#undef DISPLAY #define DISPLAY(...) \ { \ fprintf(stderr, __VA_ARGS__); \ fflush(stderr); \ } +#undef LOCALDISPLAYLEVEL #define LOCALDISPLAYLEVEL(displayLevel, l, ...) \ if (displayLevel >= l) { \ DISPLAY(__VA_ARGS__); \ } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ +#undef DISPLAYLEVEL #define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__) +#ifndef LOCALDISPLAYUPDATE +static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100; +static clock_t g_time = 0; +#endif +#undef LOCALDISPLAYUPDATE #define LOCALDISPLAYUPDATE(displayLevel, l, ...) \ if (displayLevel >= l) { \ - if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \ + if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) { \ g_time = clock(); \ DISPLAY(__VA_ARGS__); \ } \ } +#undef DISPLAYUPDATE #define DISPLAYUPDATE(l, ...) 
LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__) -static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100; -static clock_t g_time = 0; /*-************************************* * Hash table @@ -120,9 +148,9 @@ static int COVER_map_init(COVER_map_t *map, U32 size) { /** * Internal hash function */ -static const U32 prime4bytes = 2654435761U; +static const U32 COVER_prime4bytes = 2654435761U; static U32 COVER_map_hash(COVER_map_t *map, U32 key) { - return (key * prime4bytes) >> (32 - map->sizeLog); + return (key * COVER_prime4bytes) >> (32 - map->sizeLog); } /** @@ -214,8 +242,10 @@ typedef struct { unsigned d; } COVER_ctx_t; -/* We need a global context for qsort... */ -static COVER_ctx_t *g_ctx = NULL; +#if !defined(_GNU_SOURCE) && !defined(__APPLE__) && !defined(_MSC_VER) +/* C90 only offers qsort() that needs a global context. */ +static COVER_ctx_t *g_coverCtx = NULL; +#endif /*-************************************* * Helper functions @@ -258,11 +288,15 @@ static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) { /** * Same as COVER_cmp() except ties are broken by pointer value - * NOTE: g_ctx must be set to call this function. A global is required because - * qsort doesn't take an opaque pointer. */ +#if (defined(_WIN32) && defined(_MSC_VER)) || defined(__APPLE__) +static int WIN_CDECL COVER_strict_cmp(void* g_coverCtx, const void* lp, const void* rp) { +#elif defined(_GNU_SOURCE) +static int COVER_strict_cmp(const void *lp, const void *rp, void *g_coverCtx) { +#else /* C90 fallback.*/ static int COVER_strict_cmp(const void *lp, const void *rp) { - int result = COVER_cmp(g_ctx, lp, rp); +#endif + int result = COVER_cmp((COVER_ctx_t*)g_coverCtx, lp, rp); if (result == 0) { result = lp < rp ? -1 : 1; } @@ -271,21 +305,58 @@ static int COVER_strict_cmp(const void *lp, const void *rp) { /** * Faster version for d <= 8. 
*/ +#if (defined(_WIN32) && defined(_MSC_VER)) || defined(__APPLE__) +static int WIN_CDECL COVER_strict_cmp8(void* g_coverCtx, const void* lp, const void* rp) { +#elif defined(_GNU_SOURCE) +static int COVER_strict_cmp8(const void *lp, const void *rp, void *g_coverCtx) { +#else /* C90 fallback.*/ static int COVER_strict_cmp8(const void *lp, const void *rp) { - int result = COVER_cmp8(g_ctx, lp, rp); +#endif + int result = COVER_cmp8((COVER_ctx_t*)g_coverCtx, lp, rp); if (result == 0) { result = lp < rp ? -1 : 1; } return result; } +/** + * Abstract away divergence of qsort_r() parameters. + * Hopefully when C11 become the norm, we will be able + * to clean it up. + */ +static void stableSort(COVER_ctx_t *ctx) { +#if defined(__APPLE__) + qsort_r(ctx->suffix, ctx->suffixSize, sizeof(U32), + ctx, + (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp)); +#elif defined(_GNU_SOURCE) + qsort_r(ctx->suffix, ctx->suffixSize, sizeof(U32), + (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp), + ctx); +#elif defined(_WIN32) && defined(_MSC_VER) + qsort_s(ctx->suffix, ctx->suffixSize, sizeof(U32), + (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp), + ctx); +#elif defined(__OpenBSD__) + g_coverCtx = ctx; + mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32), + (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp)); +#else /* C90 fallback.*/ + g_coverCtx = ctx; + /* TODO(cavalcanti): implement a reentrant qsort() when is not available. */ + qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), + (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp)); +#endif +} + /** * Returns the first pointer in [first, last) whose element does not compare * less than value. If no such element exists it returns last. 
*/ -static const size_t *COVER_lower_bound(const size_t *first, const size_t *last, +static const size_t *COVER_lower_bound(const size_t* first, const size_t* last, size_t value) { - size_t count = last - first; + size_t count = (size_t)(last - first); + assert(last >= first); while (count != 0) { size_t step = count / 2; const size_t *ptr = first; @@ -524,14 +595,15 @@ static void COVER_ctx_destroy(COVER_ctx_t *ctx) { /** * Prepare a context for dictionary building. - * The context is only dependent on the parameter `d` and can used multiple + * The context is only dependent on the parameter `d` and can be used multiple * times. * Returns 0 on success or error code on error. * The context must be destroyed with `COVER_ctx_destroy()`. */ static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, - unsigned d, double splitPoint) { + unsigned d, double splitPoint) +{ const BYTE *const samples = (const BYTE *)samplesBuffer; const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples); /* Split samples into testing and training sets */ @@ -600,17 +672,7 @@ static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, for (i = 0; i < ctx->suffixSize; ++i) { ctx->suffix[i] = i; } - /* qsort doesn't take an opaque pointer, so pass as a global. - * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is. - */ - g_ctx = ctx; -#if defined(__OpenBSD__) - mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32), - (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp)); -#else - qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), - (ctx->d <= 8 ? 
&COVER_strict_cmp8 : &COVER_strict_cmp)); -#endif + stableSort(ctx); } DISPLAYLEVEL(2, "Computing frequencies\n"); /* For each dmer group (group of positions with the same first d bytes): @@ -629,7 +691,7 @@ static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel) { - const double ratio = (double)nbDmers / maxDictSize; + const double ratio = (double)nbDmers / (double)maxDictSize; if (ratio >= 10) { return; } @@ -715,7 +777,7 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs, return tail; } -ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( +ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_cover( void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_cover_params_t parameters) @@ -725,7 +787,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( COVER_map_t activeDmers; parameters.splitPoint = 1.0; /* Initialize global data */ - g_displayLevel = parameters.zParams.notificationLevel; + g_displayLevel = (int)parameters.zParams.notificationLevel; /* Checks */ if (!COVER_checkParameters(parameters, dictBufferCapacity)) { DISPLAYLEVEL(1, "Cover parameters incorrect\n"); @@ -889,8 +951,10 @@ void COVER_best_start(COVER_best_t *best) { * Decrements liveJobs and signals any waiting threads if liveJobs == 0. * If this dictionary is the best so far save it and its parameters. 
*/ -void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, - COVER_dictSelection_t selection) { +void COVER_best_finish(COVER_best_t* best, + ZDICT_cover_params_t parameters, + COVER_dictSelection_t selection) +{ void* dict = selection.dictContent; size_t compressedSize = selection.totalCompressedSize; size_t dictSize = selection.dictSize; @@ -933,9 +997,17 @@ void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters, } } +static COVER_dictSelection_t setDictSelection(BYTE* buf, size_t s, size_t csz) +{ + COVER_dictSelection_t ds; + ds.dictContent = buf; + ds.dictSize = s; + ds.totalCompressedSize = csz; + return ds; +} + COVER_dictSelection_t COVER_dictSelectionError(size_t error) { - COVER_dictSelection_t selection = { NULL, 0, error }; - return selection; + return setDictSelection(NULL, 0, error); } unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) { @@ -946,7 +1018,7 @@ void COVER_dictSelectionFree(COVER_dictSelection_t selection){ free(selection.dictContent); } -COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, +COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity, size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) { @@ -954,8 +1026,8 @@ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t largestCompressed = 0; BYTE* customDictContentEnd = customDictContent + dictContentSize; - BYTE * largestDictbuffer = (BYTE *)malloc(dictContentSize); - BYTE * candidateDictBuffer = (BYTE *)malloc(dictContentSize); + BYTE* largestDictbuffer = (BYTE*)malloc(dictBufferCapacity); + BYTE* candidateDictBuffer = (BYTE*)malloc(dictBufferCapacity); double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00; if (!largestDictbuffer || !candidateDictBuffer) { @@ -967,7 
+1039,7 @@ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, /* Initial dictionary size and compressed size */ memcpy(largestDictbuffer, customDictContent, dictContentSize); dictContentSize = ZDICT_finalizeDictionary( - largestDictbuffer, dictContentSize, customDictContent, dictContentSize, + largestDictbuffer, dictBufferCapacity, customDictContent, dictContentSize, samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams); if (ZDICT_isError(dictContentSize)) { @@ -988,9 +1060,8 @@ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, } if (params.shrinkDict == 0) { - COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize }; free(candidateDictBuffer); - return selection; + return setDictSelection(largestDictbuffer, dictContentSize, totalCompressedSize); } largestDict = dictContentSize; @@ -1001,7 +1072,7 @@ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, while (dictContentSize < largestDict) { memcpy(candidateDictBuffer, largestDictbuffer, largestDict); dictContentSize = ZDICT_finalizeDictionary( - candidateDictBuffer, dictContentSize, customDictContentEnd - dictContentSize, dictContentSize, + candidateDictBuffer, dictBufferCapacity, customDictContentEnd - dictContentSize, dictContentSize, samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams); if (ZDICT_isError(dictContentSize)) { @@ -1022,20 +1093,16 @@ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, return COVER_dictSelectionError(totalCompressedSize); } - if (totalCompressedSize <= largestCompressed * regressionTolerance) { - COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize }; + if ((double)totalCompressedSize <= (double)largestCompressed * regressionTolerance) { free(largestDictbuffer); - return selection; + return setDictSelection( candidateDictBuffer, dictContentSize, totalCompressedSize ); } dictContentSize *= 2; } dictContentSize = largestDict; 
totalCompressedSize = largestCompressed; - { - COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize }; - free(candidateDictBuffer); - return selection; - } + free(candidateDictBuffer); + return setDictSelection( largestDictbuffer, dictContentSize, totalCompressedSize ); } /** @@ -1053,18 +1120,19 @@ typedef struct COVER_tryParameters_data_s { * This function is thread safe if zstd is compiled with multithreaded support. * It takes its parameters as an *OWNING* opaque pointer to support threading. */ -static void COVER_tryParameters(void *opaque) { +static void COVER_tryParameters(void *opaque) +{ /* Save parameters as local variables */ - COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque; + COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t*)opaque; const COVER_ctx_t *const ctx = data->ctx; const ZDICT_cover_params_t parameters = data->parameters; size_t dictBufferCapacity = data->dictBufferCapacity; size_t totalCompressedSize = ERROR(GENERIC); /* Allocate space for hash table, dict, and freqs */ COVER_map_t activeDmers; - BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); + BYTE* const dict = (BYTE*)malloc(dictBufferCapacity); COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC)); - U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); + U32* const freqs = (U32*)malloc(ctx->suffixSize * sizeof(U32)); if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) { DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n"); goto _cleanup; @@ -1079,7 +1147,7 @@ static void COVER_tryParameters(void *opaque) { { const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict, dictBufferCapacity, parameters); - selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail, + selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail, ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, 
ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets, totalCompressedSize); @@ -1094,19 +1162,18 @@ static void COVER_tryParameters(void *opaque) { free(data); COVER_map_destroy(&activeDmers); COVER_dictSelectionFree(selection); - if (freqs) { - free(freqs); - } + free(freqs); } -ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( - void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, - const size_t *samplesSizes, unsigned nbSamples, - ZDICT_cover_params_t *parameters) { +ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_cover( + void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, + const size_t* samplesSizes, unsigned nbSamples, + ZDICT_cover_params_t* parameters) +{ /* constants */ const unsigned nbThreads = parameters->nbThreads; const double splitPoint = - parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; + parameters->splitPoint <= 0.0 ? COVER_DEFAULT_SPLITPOINT : parameters->splitPoint; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; @@ -1234,3 +1301,5 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( return dictSize; } } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/cover.h b/vendor/github.com/DataDog/zstd/cover.h index d9e0636..5653b29 100644 --- a/vendor/github.com/DataDog/zstd/cover.h +++ b/vendor/github.com/DataDog/zstd/cover.h @@ -1,14 +1,20 @@ -#include /* fprintf */ -#include /* malloc, free, qsort */ -#include /* memset */ -#include /* clock */ -#include "mem.h" /* read */ -#include "pool.h" -#include "threading.h" -#include "zstd_internal.h" /* includes zstd.h */ +#ifndef USE_EXTERNAL_ZSTD +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + #ifndef ZDICT_STATIC_LINKING_ONLY -#define ZDICT_STATIC_LINKING_ONLY +# define ZDICT_STATIC_LINKING_ONLY #endif + +#include "threading.h" /* ZSTD_pthread_mutex_t */ +#include "mem.h" /* U32, BYTE */ #include "zdict.h" /** @@ -142,6 +148,8 @@ void COVER_dictSelectionFree(COVER_dictSelection_t selection); * smallest dictionary within a specified regression of the compressed size * from the largest dictionary. */ - COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, + COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity, size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples, size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize); + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/cpu.h b/vendor/github.com/DataDog/zstd/cpu.h index 5f0923f..2350a59 100644 --- a/vendor/github.com/DataDog/zstd/cpu.h +++ b/vendor/github.com/DataDog/zstd/cpu.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2018-present, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -16,8 +17,6 @@ * https://github.com/facebook/folly/blob/master/folly/CpuId.h */ -#include - #include "mem.h" #ifdef _MSC_VER @@ -37,6 +36,7 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) { U32 f7b = 0; U32 f7c = 0; #if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) +#if !defined(_M_X64) || !defined(__clang__) || __clang_major__ >= 16 int reg[4]; __cpuid((int*)reg, 0); { @@ -52,6 +52,41 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) { f7c = (U32)reg[2]; } } +#else + /* Clang compiler has a bug (fixed in https://reviews.llvm.org/D101338) in + * which the `__cpuid` intrinsic does not save and restore `rbx` as it needs + * to due to being a reserved register. So in that case, do the `cpuid` + * ourselves. Clang supports inline assembly anyway. + */ + U32 n; + __asm__( + "pushq %%rbx\n\t" + "cpuid\n\t" + "popq %%rbx\n\t" + : "=a"(n) + : "a"(0) + : "rcx", "rdx"); + if (n >= 1) { + U32 f1a; + __asm__( + "pushq %%rbx\n\t" + "cpuid\n\t" + "popq %%rbx\n\t" + : "=a"(f1a), "=c"(f1c), "=d"(f1d) + : "a"(1) + :); + } + if (n >= 7) { + __asm__( + "pushq %%rbx\n\t" + "cpuid\n\t" + "movq %%rbx, %%rax\n\t" + "popq %%rbx" + : "=a"(f7b), "=c"(f7c) + : "a"(7), "c"(0) + : "rdx"); + } +#endif #elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__) /* The following block like the normal cpuid branch below, but gcc * reserves ebx for use of its pic register so we must specially @@ -213,3 +248,5 @@ MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) { #undef X #endif /* ZSTD_COMMON_CPU_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/debug.c b/vendor/github.com/DataDog/zstd/debug.c index 3ebdd1c..1ce10e9 100644 --- a/vendor/github.com/DataDog/zstd/debug.c +++ b/vendor/github.com/DataDog/zstd/debug.c @@ -1,35 +1,16 @@ +#ifndef USE_EXTERNAL_ZSTD /* ****************************************************************** - debug - Part of FSE library - 
Copyright (C) 2013-present, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * debug + * Part of FSE library + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). 
+ * You may select, at your option, one of the above-listed licenses. ****************************************************************** */ @@ -41,4 +22,12 @@ #include "debug.h" +#if !defined(ZSTD_LINUX_KERNEL) || (DEBUGLEVEL>=2) +/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a + * translation unit is empty. So remove this from Linux kernel builds, but + * otherwise just leave it in. + */ int g_debuglevel = DEBUGLEVEL; +#endif + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/debug.h b/vendor/github.com/DataDog/zstd/debug.h index b4fc89d..ad47593 100644 --- a/vendor/github.com/DataDog/zstd/debug.h +++ b/vendor/github.com/DataDog/zstd/debug.h @@ -1,35 +1,16 @@ +#ifndef USE_EXTERNAL_ZSTD /* ****************************************************************** - debug - Part of FSE library - Copyright (C) 2013-present, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * debug + * Part of FSE library + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. ****************************************************************** */ @@ -52,10 +33,6 @@ #ifndef DEBUG_H_12987983217 #define DEBUG_H_12987983217 -#if defined (__cplusplus) -extern "C" { -#endif - /* static assert is triggered at compile time, leaving no runtime artefact. * static assert only works with compile-time constants. @@ -71,15 +48,6 @@ extern "C" { #endif -/* DEBUGFILE can be defined externally, - * typically through compiler command line. - * note : currently useless. 
- * Value must be stderr or stdout */ -#ifndef DEBUGFILE -# define DEBUGFILE stderr -#endif - - /* recommended values for DEBUGLEVEL : * 0 : release mode, no debug, all run-time checks disabled * 1 : enables assert() only, no display @@ -96,7 +64,8 @@ extern "C" { */ #if (DEBUGLEVEL>=1) -# include +# define ZSTD_DEPS_NEED_ASSERT +# include "zstd_deps.h" #else # ifndef assert /* assert may be already defined, due to prior #include */ # define assert(condition) ((void)0) /* disable assert (default) */ @@ -104,7 +73,8 @@ extern "C" { #endif #if (DEBUGLEVEL>=2) -# include +# define ZSTD_DEPS_NEED_IO +# include "zstd_deps.h" extern int g_debuglevel; /* the variable is only declared, it actually lives in debug.c, and is shared by the whole process. @@ -112,23 +82,29 @@ extern int g_debuglevel; /* the variable is only declared, It's useful when enabling very verbose levels on selective conditions (such as position in src) */ -# define RAWLOG(l, ...) { \ - if (l<=g_debuglevel) { \ - fprintf(stderr, __VA_ARGS__); \ - } } -# define DEBUGLOG(l, ...) { \ - if (l<=g_debuglevel) { \ - fprintf(stderr, __FILE__ ": " __VA_ARGS__); \ - fprintf(stderr, " \n"); \ - } } +# define RAWLOG(l, ...) \ + do { \ + if (l<=g_debuglevel) { \ + ZSTD_DEBUG_PRINT(__VA_ARGS__); \ + } \ + } while (0) + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) +#define LINE_AS_STRING TOSTRING(__LINE__) + +# define DEBUGLOG(l, ...) \ + do { \ + if (l<=g_debuglevel) { \ + ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \ + ZSTD_DEBUG_PRINT(" \n"); \ + } \ + } while (0) #else -# define RAWLOG(l, ...) {} /* disabled */ -# define DEBUGLOG(l, ...) {} /* disabled */ -#endif - - -#if defined (__cplusplus) -} +# define RAWLOG(l, ...) do { } while (0) /* disabled */ +# define DEBUGLOG(l, ...) 
do { } while (0) /* disabled */ #endif #endif /* DEBUG_H_12987983217 */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/divsufsort.c b/vendor/github.com/DataDog/zstd/divsufsort.c index ead9220..82e6a80 100644 --- a/vendor/github.com/DataDog/zstd/divsufsort.c +++ b/vendor/github.com/DataDog/zstd/divsufsort.c @@ -1,3 +1,4 @@ +#ifndef USE_EXTERNAL_ZSTD /* * divsufsort.c for libdivsufsort-lite * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. @@ -1576,7 +1577,7 @@ sort_typeBstar(const unsigned char *T, int *SA, /* Construct the inverse suffix array of type B* suffixes using trsort. */ trsort(ISAb, SA, m, 1); - /* Set the sorted order of tyoe B* suffixes. */ + /* Set the sorted order of type B* suffixes. */ for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) { for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { } if(0 <= i) { @@ -1911,3 +1912,5 @@ divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * return pidx; } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/divsufsort.h b/vendor/github.com/DataDog/zstd/divsufsort.h index 5440994..d723a48 100644 --- a/vendor/github.com/DataDog/zstd/divsufsort.h +++ b/vendor/github.com/DataDog/zstd/divsufsort.h @@ -1,3 +1,4 @@ +#ifndef USE_EXTERNAL_ZSTD /* * divsufsort.h for libdivsufsort-lite * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. 
@@ -27,11 +28,6 @@ #ifndef _DIVSUFSORT_H #define _DIVSUFSORT_H 1 -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - - /*- Prototypes -*/ /** @@ -59,9 +55,6 @@ divsufsort(const unsigned char *T, int *SA, int n, int openMP); int divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP); - -#ifdef __cplusplus -} /* extern "C" */ -#endif /* __cplusplus */ - #endif /* _DIVSUFSORT_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/entropy_common.c b/vendor/github.com/DataDog/zstd/entropy_common.c index b12944e..1bfbb52 100644 --- a/vendor/github.com/DataDog/zstd/entropy_common.c +++ b/vendor/github.com/DataDog/zstd/entropy_common.c @@ -1,36 +1,17 @@ -/* - Common functions of New Generation Entropy library - Copyright (C) 2016, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy - - Public forum : https://groups.google.com/forum/#!forum/lz4c -*************************************************************************** */ +#ifndef USE_EXTERNAL_ZSTD +/* ****************************************************************** + * Common functions of New Generation Entropy library + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+****************************************************************** */ /* ************************************* * Dependencies @@ -39,8 +20,8 @@ #include "error_private.h" /* ERR_*, ERROR */ #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ #include "fse.h" -#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ #include "huf.h" +#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ /*=== Version ===*/ @@ -58,8 +39,9 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } /*-************************************************************** * FSE NCount encoding-decoding ****************************************************************/ -size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, - const void* headerBuffer, size_t hbSize) +FORCE_INLINE_TEMPLATE +size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) { const BYTE* const istart = (const BYTE*) headerBuffer; const BYTE* const iend = istart + hbSize; @@ -70,23 +52,23 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t U32 bitStream; int bitCount; unsigned charnum = 0; + unsigned const maxSV1 = *maxSVPtr + 1; int previous0 = 0; - if (hbSize < 4) { - /* This function only works when hbSize >= 4 */ - char buffer[4]; - memset(buffer, 0, sizeof(buffer)); - memcpy(buffer, headerBuffer, hbSize); + if (hbSize < 8) { + /* This function only works when hbSize >= 8 */ + char buffer[8] = {0}; + ZSTD_memcpy(buffer, headerBuffer, hbSize); { size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr, buffer, sizeof(buffer)); if (FSE_isError(countSize)) return countSize; if (countSize > hbSize) return ERROR(corruption_detected); return countSize; } } - assert(hbSize >= 4); + assert(hbSize >= 8); /* init */ - memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0])); /* all symbols not 
present in NCount have a frequency of 0 */ + ZSTD_memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0])); /* all symbols not present in NCount have a frequency of 0 */ bitStream = MEM_readLE32(ip); nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG; /* extract tableLog */ if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge); @@ -97,36 +79,58 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t threshold = 1<1) & (charnum<=*maxSVPtr)) { + for (;;) { if (previous0) { - unsigned n0 = charnum; - while ((bitStream & 0xFFFF) == 0xFFFF) { - n0 += 24; - if (ip < iend-5) { - ip += 2; - bitStream = MEM_readLE32(ip) >> bitCount; + /* Count the number of repeats. Each time the + * 2-bit repeat code is 0b11 there is another + * repeat. + * Avoid UB by setting the high bit to 1. + */ + int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { + ip += 3; } else { - bitStream >>= 16; - bitCount += 16; - } } - while ((bitStream & 3) == 3) { - n0 += 3; - bitStream >>= 2; - bitCount += 2; + bitCount -= (int)(8 * (iend - 7 - ip)); + bitCount &= 31; + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; + repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; } - n0 += bitStream & 3; + charnum += 3 * repeats; + bitStream >>= 2 * repeats; + bitCount += 2 * repeats; + + /* Add the final repeat which isn't 0b11. */ + assert((bitStream & 3) < 3); + charnum += bitStream & 3; bitCount += 2; - if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall); - while (charnum < n0) normalizedCounter[charnum++] = 0; - if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) { + + /* This is an error, but break and return an error + * at the end, because returning out of a loop makes + * it harder for the compiler to optimize. 
+ */ + if (charnum >= maxSV1) break; + + /* We don't need to set the normalized count to 0 + * because we already memset the whole buffer to 0. + */ + + if (LIKELY(ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) { assert((bitCount >> 3) <= 3); /* For first condition to work */ ip += bitCount>>3; bitCount &= 7; - bitStream = MEM_readLE32(ip) >> bitCount; } else { - bitStream >>= 2; - } } - { int const max = (2*threshold-1) - remaining; + bitCount -= (int)(8 * (iend - 4 - ip)); + bitCount &= 31; + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; + } + { + int const max = (2*threshold-1) - remaining; int count; if ((bitStream & (threshold-1)) < (U32)max) { @@ -139,24 +143,43 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t } count--; /* extra accuracy */ - remaining -= count < 0 ? -count : count; /* -1 means +1 */ + /* When it matters (small blocks), this is a + * predictable branch, because we don't use -1. + */ + if (count >= 0) { + remaining -= count; + } else { + assert(count == -1); + remaining += count; + } normalizedCounter[charnum++] = (short)count; previous0 = !count; - while (remaining < threshold) { - nbBits--; - threshold >>= 1; + + assert(threshold > 1); + if (remaining < threshold) { + /* This branch can be folded into the + * threshold update condition because we + * know that threshold > 1. 
+ */ + if (remaining <= 1) break; + nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); } + if (charnum >= maxSV1) break; - if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) { + if (LIKELY(ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) { ip += bitCount>>3; bitCount &= 7; } else { bitCount -= (int)(8 * (iend - 4 - ip)); + bitCount &= 31; ip = iend - 4; } - bitStream = MEM_readLE32(ip) >> (bitCount & 31); - } } /* while ((remaining>1) & (charnum<=*maxSVPtr)) */ + bitStream = MEM_readLE32(ip) >> bitCount; + } } if (remaining != 1) return ERROR(corruption_detected); + /* Only possible when there are too many zeros. */ + if (charnum > maxSV1) return ERROR(maxSymbolValue_tooSmall); if (bitCount > 32) return ERROR(corruption_detected); *maxSVPtr = charnum-1; @@ -164,6 +187,43 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t return ip-istart; } +/* Avoids the FORCE_INLINE of the _body() function. */ +static size_t FSE_readNCount_body_default( + short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +{ + return FSE_readNCount_body(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize); +} + +#if DYNAMIC_BMI2 +BMI2_TARGET_ATTRIBUTE static size_t FSE_readNCount_body_bmi2( + short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +{ + return FSE_readNCount_body(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize); +} +#endif + +size_t FSE_readNCount_bmi2( + short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { + return FSE_readNCount_body_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize); + } +#endif + (void)bmi2; + return FSE_readNCount_body_default(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize); +} + +size_t FSE_readNCount( + short* 
normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +{ + return FSE_readNCount_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize, /* bmi2 */ 0); +} + /*! HUF_readStats() : Read compact Huffman tree, saved by HUF_writeCTable(). @@ -175,6 +235,17 @@ size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* t size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, const void* src, size_t srcSize) +{ + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; + return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); +} + +FORCE_INLINE_TEMPLATE size_t +HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, + int bmi2) { U32 weightTotal; const BYTE* ip = (const BYTE*) src; @@ -183,7 +254,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, if (!srcSize) return ERROR(srcSize_wrong); iSize = ip[0]; - /* memset(huffWeight, 0, hwSize); *//* is not necessary, even though some analyzer complain ... */ + /* ZSTD_memset(huffWeight, 0, hwSize); *//* is not necessary, even though some analyzer complain ... 
*/ if (iSize >= 128) { /* special header */ oSize = iSize - 127; @@ -197,31 +268,31 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, huffWeight[n+1] = ip[n/2] & 15; } } } else { /* header compressed with FSE (normal case) */ - FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)]; /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */ if (iSize+1 > srcSize) return ERROR(srcSize_wrong); - oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, fseWorkspace, 6); /* max (hwSize-1) values decoded, as last one is implied */ + /* max (hwSize-1) values decoded, as last one is implied */ + oSize = FSE_decompress_wksp_bmi2(huffWeight, hwSize-1, ip+1, iSize, 6, workSpace, wkspSize, bmi2); if (FSE_isError(oSize)) return oSize; } /* collect weight stats */ - memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32)); + ZSTD_memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32)); weightTotal = 0; { U32 n; for (n=0; n= HUF_TABLELOG_MAX) return ERROR(corruption_detected); + if (huffWeight[n] > HUF_TABLELOG_MAX) return ERROR(corruption_detected); rankStats[huffWeight[n]]++; weightTotal += (1 << huffWeight[n]) >> 1; } } if (weightTotal == 0) return ERROR(corruption_detected); /* get last non-null symbol weight (implied, total must be 2^n) */ - { U32 const tableLog = BIT_highbit32(weightTotal) + 1; + { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); *tableLogPtr = tableLog; /* determine last weight */ { U32 const total = 1 << tableLog; U32 const rest = total - weightTotal; - U32 const verif = 1 << BIT_highbit32(rest); - U32 const lastWeight = BIT_highbit32(rest) + 1; + U32 const verif = 1 << ZSTD_highbit32(rest); + U32 const lastWeight = ZSTD_highbit32(rest) + 1; if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ huffWeight[oSize] = (BYTE)lastWeight; rankStats[lastWeight]++; @@ -234,3 +305,39 @@ size_t 
HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, *nbSymbolsPtr = (U32)(oSize+1); return iSize+1; } + +/* Avoids the FORCE_INLINE of the _body() function. */ +static size_t HUF_readStats_body_default(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize) +{ + return HUF_readStats_body(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize, 0); +} + +#if DYNAMIC_BMI2 +static BMI2_TARGET_ATTRIBUTE size_t HUF_readStats_body_bmi2(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize) +{ + return HUF_readStats_body(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize, 1); +} +#endif + +size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, + int flags) +{ +#if DYNAMIC_BMI2 + if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +#endif + (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); +} + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/error_private.c b/vendor/github.com/DataDog/zstd/error_private.c index 7c1bb67..62e7413 100644 --- a/vendor/github.com/DataDog/zstd/error_private.c +++ b/vendor/github.com/DataDog/zstd/error_private.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) case PREFIX(version_unsupported): return "Version not supported"; case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; - case PREFIX(corruption_detected): return "Corrupted block detected"; + case PREFIX(corruption_detected): return "Data corruption detected"; case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; + case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; case PREFIX(parameter_unsupported): return "Unsupported parameter"; + case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; case PREFIX(init_missing): return "Context should be init first"; case PREFIX(memory_allocation): return "Allocation error : not enough memory"; @@ -38,17 +41,27 @@ const char* ERR_getErrorString(ERR_enum code) case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; + case PREFIX(cannotProduce_uncompressedBlock): return "This mode cannot generate an uncompressed block"; + case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; case PREFIX(dictionary_wrong): return "Dictionary mismatch"; case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; case PREFIX(srcSize_wrong): return "Src size is 
incorrect"; case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; + case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; + case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; /* following error codes are not stable and may be removed or changed in a future version */ case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; + case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; + case PREFIX(externalSequences_invalid): return "External sequences are not valid"; case PREFIX(maxCode): default: return notErrorCode; } #endif } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/error_private.h b/vendor/github.com/DataDog/zstd/error_private.h index 0d2fa7e..f3c845e 100644 --- a/vendor/github.com/DataDog/zstd/error_private.h +++ b/vendor/github.com/DataDog/zstd/error_private.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -13,17 +14,13 @@ #ifndef ERROR_H_MODULE #define ERROR_H_MODULE -#if defined (__cplusplus) -extern "C" { -#endif - - /* **************************************** * Dependencies ******************************************/ -#include /* size_t */ #include "zstd_errors.h" /* enum list */ - +#include "compiler.h" +#include "debug.h" +#include "zstd_deps.h" /* size_t */ /* **************************************** * Compiler-specific @@ -49,7 +46,7 @@ typedef ZSTD_ErrorCode ERR_enum; /*-**************************************** * Error codes handling ******************************************/ -#undef ERROR /* reported already defined on VS 2015 (Rich Geldreich) */ +#undef ERROR /* already defined on Visual Studio */ #define ERROR(name) ZSTD_ERROR(name) #define ZSTD_ERROR(name) ((size_t)-PREFIX(name)) @@ -57,6 +54,15 @@ ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } +/* check and forward error code */ +#define CHECK_V_F(e, f) \ + size_t const e = f; \ + do { \ + if (ERR_isError(e)) \ + return e; \ + } while (0) +#define CHECK_F(f) do { CHECK_V_F(_var_err__, f); } while (0) + /*-**************************************** * Error Strings @@ -69,8 +75,87 @@ ERR_STATIC const char* ERR_getErrorName(size_t code) return ERR_getErrorString(ERR_getErrorCode(code)); } -#if defined (__cplusplus) +/** + * Ignore: this is an internal helper. + * + * This is a helper function to help force C99-correctness during compilation. + * Under strict compilation modes, variadic macro arguments can't be empty. + * However, variadic function arguments can be. Using a function therefore lets + * us statically check that at least one (string) argument was passed, + * independent of the compilation flags. 
+ */ +static INLINE_KEYWORD UNUSED_ATTR +void _force_has_format_string(const char *format, ...) { + (void)format; } -#endif + +/** + * Ignore: this is an internal helper. + * + * We want to force this function invocation to be syntactically correct, but + * we don't want to force runtime evaluation of its arguments. + */ +#define _FORCE_HAS_FORMAT_STRING(...) \ + do { \ + if (0) { \ + _force_has_format_string(__VA_ARGS__); \ + } \ + } while (0) + +#define ERR_QUOTE(str) #str + +/** + * Return the specified error if the condition evaluates to true. + * + * In debug modes, prints additional information. + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). + */ +#define RETURN_ERROR_IF(cond, err, ...) \ + do { \ + if (cond) { \ + RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ + __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return ERROR(err); \ + } \ + } while (0) + +/** + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +#define RETURN_ERROR(err, ...) \ + do { \ + RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ + __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return ERROR(err); \ + } while(0) + +/** + * If the provided expression evaluates to an error code, returns that error code. + * + * In debug modes, prints additional information. + */ +#define FORWARD_IF_ERROR(err, ...) 
\ + do { \ + size_t const err_code = (err); \ + if (ERR_isError(err_code)) { \ + RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ + __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return err_code; \ + } \ + } while(0) #endif /* ERROR_H_MODULE */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/errors.go b/vendor/github.com/DataDog/zstd/errors.go index 38db0d5..dbeb816 100644 --- a/vendor/github.com/DataDog/zstd/errors.go +++ b/vendor/github.com/DataDog/zstd/errors.go @@ -1,7 +1,6 @@ package zstd /* -#define ZSTD_STATIC_LINKING_ONLY #include "zstd.h" */ import "C" diff --git a/vendor/github.com/DataDog/zstd/external_zstd.go b/vendor/github.com/DataDog/zstd/external_zstd.go new file mode 100644 index 0000000..fc4ceb2 --- /dev/null +++ b/vendor/github.com/DataDog/zstd/external_zstd.go @@ -0,0 +1,14 @@ +//go:build external_libzstd +// +build external_libzstd + +package zstd + +// #cgo CFLAGS: -DUSE_EXTERNAL_ZSTD +// #cgo pkg-config: libzstd +/* +#include +#if ZSTD_VERSION_NUMBER < 10400 +#error "ZSTD version >= 1.4 is required" +#endif +*/ +import "C" diff --git a/vendor/github.com/DataDog/zstd/fastcover.c b/vendor/github.com/DataDog/zstd/fastcover.c index 941bb5a..0d559e2 100644 --- a/vendor/github.com/DataDog/zstd/fastcover.c +++ b/vendor/github.com/DataDog/zstd/fastcover.c @@ -1,3 +1,14 @@ +#ifndef USE_EXTERNAL_ZSTD +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + /*-************************************* * Dependencies ***************************************/ @@ -6,24 +17,33 @@ #include /* memset */ #include /* clock */ +#ifndef ZDICT_STATIC_LINKING_ONLY +# define ZDICT_STATIC_LINKING_ONLY +#endif + #include "mem.h" /* read */ #include "pool.h" #include "threading.h" -#include "cover.h" #include "zstd_internal.h" /* includes zstd.h */ -#ifndef ZDICT_STATIC_LINKING_ONLY -#define ZDICT_STATIC_LINKING_ONLY -#endif +#include "zstd_compress_internal.h" /* ZSTD_hash*() */ #include "zdict.h" +#include "cover.h" /*-************************************* * Constants ***************************************/ +/** +* There are 32bit indexes used to ref samples, so limit samples size to 4GB +* on 64bit builds. +* For 32bit builds we choose 1 GB. +* Most 32bit platforms have 2GB user-mode addressable space and we allocate a large +* contiguous buffer, so 1GB is already a high limit. +*/ #define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB)) #define FASTCOVER_MAX_F 31 #define FASTCOVER_MAX_ACCEL 10 -#define DEFAULT_SPLITPOINT 0.75 +#define FASTCOVER_DEFAULT_SPLITPOINT 0.75 #define DEFAULT_F 20 #define DEFAULT_ACCEL 1 @@ -31,50 +51,50 @@ /*-************************************* * Console display ***************************************/ -static int g_displayLevel = 2; +#ifndef LOCALDISPLAYLEVEL +static int g_displayLevel = 0; +#endif +#undef DISPLAY #define DISPLAY(...) \ { \ fprintf(stderr, __VA_ARGS__); \ fflush(stderr); \ } +#undef LOCALDISPLAYLEVEL #define LOCALDISPLAYLEVEL(displayLevel, l, ...) \ if (displayLevel >= l) { \ DISPLAY(__VA_ARGS__); \ } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ +#undef DISPLAYLEVEL #define DISPLAYLEVEL(l, ...) 
LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__) +#ifndef LOCALDISPLAYUPDATE +static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100; +static clock_t g_time = 0; +#endif +#undef LOCALDISPLAYUPDATE #define LOCALDISPLAYUPDATE(displayLevel, l, ...) \ if (displayLevel >= l) { \ - if ((clock() - g_time > refreshRate) || (displayLevel >= 4)) { \ + if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) { \ g_time = clock(); \ DISPLAY(__VA_ARGS__); \ } \ } +#undef DISPLAYUPDATE #define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__) -static const clock_t refreshRate = CLOCKS_PER_SEC * 15 / 100; -static clock_t g_time = 0; /*-************************************* * Hash Functions ***************************************/ -static const U64 prime6bytes = 227718039650203ULL; -static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } -static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } - -static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; -static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } -static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } - - /** - * Hash the d-byte value pointed to by p and mod 2^f + * Hash the d-byte value pointed to by p and mod 2^f into the frequency vector */ -static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 h, unsigned d) { +static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 f, unsigned d) { if (d == 6) { - return ZSTD_hash6Ptr(p, h) & ((1 << h) - 1); + return ZSTD_hash6Ptr(p, f); } - return ZSTD_hash8Ptr(p, h) & ((1 << h) - 1); + return ZSTD_hash8Ptr(p, f); } @@ -285,7 +305,7 @@ FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx) /** * Prepare a context for dictionary building. 
- * The context is only dependent on the parameter `d` and can used multiple + * The context is only dependent on the parameter `d` and can be used multiple * times. * Returns 0 on success or error code on error. * The context must be destroyed with `FASTCOVER_ctx_destroy()`. @@ -451,20 +471,20 @@ typedef struct FASTCOVER_tryParameters_data_s { * This function is thread safe if zstd is compiled with multithreaded support. * It takes its parameters as an *OWNING* opaque pointer to support threading. */ -static void FASTCOVER_tryParameters(void *opaque) +static void FASTCOVER_tryParameters(void* opaque) { /* Save parameters as local variables */ - FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque; + FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t*)opaque; const FASTCOVER_ctx_t *const ctx = data->ctx; const ZDICT_cover_params_t parameters = data->parameters; size_t dictBufferCapacity = data->dictBufferCapacity; size_t totalCompressedSize = ERROR(GENERIC); /* Initialize array to keep track of frequency of dmer within activeSegment */ - U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16)); + U16* segmentFreqs = (U16*)calloc(((U64)1 << ctx->f), sizeof(U16)); /* Allocate space for hash table, dict, and freqs */ - BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); + BYTE *const dict = (BYTE*)malloc(dictBufferCapacity); COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC)); - U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32)); + U32* freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32)); if (!segmentFreqs || !dict || !freqs) { DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n"); goto _cleanup; @@ -476,7 +496,7 @@ static void FASTCOVER_tryParameters(void *opaque) parameters, segmentFreqs); const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100); - selection = COVER_selectDict(dict + tail, 
dictBufferCapacity - tail, + selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail, ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets, totalCompressedSize); @@ -526,7 +546,7 @@ FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams, } -ZDICTLIB_API size_t +ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, @@ -537,7 +557,7 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity, ZDICT_cover_params_t coverParams; FASTCOVER_accel_t accelParams; /* Initialize global data */ - g_displayLevel = parameters.zParams.notificationLevel; + g_displayLevel = (int)parameters.zParams.notificationLevel; /* Assign splitPoint and f if not provided */ parameters.splitPoint = 1.0; parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f; @@ -595,7 +615,7 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity, } -ZDICTLIB_API size_t +ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_fastCover( void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, @@ -607,7 +627,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover( /* constants */ const unsigned nbThreads = parameters->nbThreads; const double splitPoint = - parameters->splitPoint <= 0.0 ? DEFAULT_SPLITPOINT : parameters->splitPoint; + parameters->splitPoint <= 0.0 ? FASTCOVER_DEFAULT_SPLITPOINT : parameters->splitPoint; const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d; const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d; const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k; @@ -620,7 +640,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover( const unsigned accel = parameters->accel == 0 ? 
DEFAULT_ACCEL : parameters->accel; const unsigned shrinkDict = 0; /* Local variables */ - const int displayLevel = parameters->zParams.notificationLevel; + const int displayLevel = (int)parameters->zParams.notificationLevel; unsigned iteration = 1; unsigned d; unsigned k; @@ -704,7 +724,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover( data->parameters.splitPoint = splitPoint; data->parameters.steps = kSteps; data->parameters.shrinkDict = shrinkDict; - data->parameters.zParams.notificationLevel = g_displayLevel; + data->parameters.zParams.notificationLevel = (unsigned)g_displayLevel; /* Check the parameters */ if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity, data->ctx->f, accel)) { @@ -745,3 +765,5 @@ ZDICT_optimizeTrainFromBuffer_fastCover( } } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/fse.h b/vendor/github.com/DataDog/zstd/fse.h index a7553e3..232d71e 100644 --- a/vendor/github.com/DataDog/zstd/fse.h +++ b/vendor/github.com/DataDog/zstd/fse.h @@ -1,41 +1,17 @@ +#ifndef USE_EXTERNAL_ZSTD /* ****************************************************************** - FSE : Finite State Entropy codec - Public Prototypes declaration - Copyright (C) 2013-2016, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * FSE : Finite State Entropy codec + * Public Prototypes declaration + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
****************************************************************** */ - -#if defined (__cplusplus) -extern "C" { -#endif - #ifndef FSE_H #define FSE_H @@ -43,8 +19,7 @@ extern "C" { /*-***************************************** * Dependencies ******************************************/ -#include /* size_t, ptrdiff_t */ - +#include "zstd_deps.h" /* size_t, ptrdiff_t */ /*-***************************************** * FSE_PUBLIC_API : control library symbols visibility @@ -73,34 +48,6 @@ extern "C" { FSE_PUBLIC_API unsigned FSE_versionNumber(void); /**< library version number; to be used when checking dll version */ -/*-**************************************** -* FSE simple functions -******************************************/ -/*! FSE_compress() : - Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. - 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). - @return : size of compressed data (<= dstCapacity). - Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! - if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. - if FSE_isError(return), compression failed (more details using FSE_getErrorName()) -*/ -FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, - const void* src, size_t srcSize); - -/*! FSE_decompress(): - Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', - into already allocated destination buffer 'dst', of size 'dstCapacity'. - @return : size of regenerated data (<= maxDstSize), - or an error code, which can be tested using FSE_isError() . - - ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! - Why ? : making this distinction requires a header. - Header management is intentionally delegated to the user layer, which can better manage special cases. 
-*/ -FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, - const void* cSrc, size_t cSrcSize); - - /*-***************************************** * Tool functions ******************************************/ @@ -111,20 +58,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ -/*-***************************************** -* FSE advanced functions -******************************************/ -/*! FSE_compress2() : - Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' - Both parameters can be defined as '0' to mean : use default value - @return : size of compressed data - Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! - if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. - if FSE_isError(return), it's an error code. -*/ -FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); - - /*-***************************************** * FSE detailed API ******************************************/ @@ -157,10 +90,16 @@ FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize /*! FSE_normalizeCount(): normalize counts so that sum(count[]) == Power_of_2 (2^tableLog) 'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1). + useLowProbCount is a boolean parameter which trades off compressed size for + faster header decoding. When it is set to 1, the compressed data will be slightly + smaller. And when it is set to 0, FSE_readNCount() and FSE_buildDTable() will be + faster. If you are compressing a small amount of data (< 2 KB) then useLowProbCount=0 + is a good default, since header deserialization makes a big speed difference. 
+ Otherwise, useLowProbCount=1 is a good default, since the speed difference is small. @return : tableLog, or an errorCode, which can be tested using FSE_isError() */ FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, - const unsigned* count, size_t srcSize, unsigned maxSymbolValue); + const unsigned* count, size_t srcSize, unsigned maxSymbolValue, unsigned useLowProbCount); /*! FSE_NCountWriteBound(): Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'. @@ -178,8 +117,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, /*! Constructor and Destructor of FSE_CTable. Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ -FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); -FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); /*! FSE_buildCTable(): Builds `ct`, which must be already allocated, using FSE_createCTable(). @@ -248,23 +185,14 @@ FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize); -/*! Constructor and Destructor of FSE_DTable. - Note that its size depends on 'tableLog' */ +/*! FSE_readNCount_bmi2(): + * Same as FSE_readNCount() but pass bmi2=1 when your CPU supports BMI2 and 0 otherwise. + */ +FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ -FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); -FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); - -/*! FSE_buildDTable(): - Builds 'dt', which must be already allocated, using FSE_createDTable(). 
- return : 0, or an errorCode, which can be tested using FSE_isError() */ -FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); - -/*! FSE_decompress_usingDTable(): - Decompress compressed source `cSrc` of size `cSrcSize` using `dt` - into `dst` which must be already allocated. - @return : size of regenerated data (necessarily <= `dstCapacity`), - or an errorCode, which can be tested using FSE_isError() */ -FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); /*! Tutorial : @@ -296,24 +224,22 @@ If there is an error, the function will return an error code, which can be teste #endif /* FSE_H */ + #if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY) #define FSE_H_FSE_STATIC_LINKING_ONLY - -/* *** Dependency *** */ #include "bitstream.h" - /* ***************************************** * Static allocation *******************************************/ /* FSE buffer bounds */ #define FSE_NCOUNTBOUND 512 -#define FSE_BLOCKBOUND(size) (size + (size>>7) + 4 /* fse states */ + sizeof(size_t) /* bitContainer */) +#define FSE_BLOCKBOUND(size) ((size) + ((size)>>7) + 4 /* fse states */ + sizeof(size_t) /* bitContainer */) #define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ /* It is possible to statically allocate FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using below macros */ -#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2)) -#define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1< 12) ? 
(1 << (maxTableLog - 2)) : 1024) ) -size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); - -size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); -/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ - size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); /**< build a fake FSE_CTable, designed to compress always the same symbolValue */ /* FSE_buildCTable_wksp() : * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`). - * `wkspSize` must be >= `(1<= `FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)` of `unsigned`. + * See FSE_buildCTable_wksp() for breakdown of workspace usage. */ +#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (((maxSymbolValue + 2) + (1ull << (tableLog)))/2 + sizeof(U64)/sizeof(U32) /* additional 8 bytes for potential table overwrite */) +#define FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) (sizeof(unsigned) * FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)) size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); -size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); -/**< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ - -size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); -/**< build a fake FSE_DTable, designed to always generate the same symbolValue */ +#define FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) (sizeof(short) * (maxSymbolValue + 1) + (1ULL << maxTableLog) + 8) +#define FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ((FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) + sizeof(unsigned) - 1) / sizeof(unsigned)) +FSE_PUBLIC_API size_t 
FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +/**< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ -size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog); -/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */ +#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) +#define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. 
+ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ typedef enum { FSE_repeat_none, /**< Cannot use the previous table */ @@ -536,20 +457,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, un FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; const U16* const stateTable = (const U16*)(statePtr->stateTable); U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); - BIT_addBits(bitC, statePtr->value, nbBitsOut); + BIT_addBits(bitC, (BitContainerType)statePtr->value, nbBitsOut); statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; } MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) { - BIT_addBits(bitC, statePtr->value, statePtr->stateLog); + BIT_addBits(bitC, (BitContainerType)statePtr->value, statePtr->stateLog); BIT_flushBits(bitC); } /* FSE_getMaxNbBits() : * Approximate maximum cost of a symbol, in bits. - * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) * note 1 : assume symbolValue is valid (<= maxSymbolValue) * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) @@ -664,6 +585,9 @@ MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) #ifndef FSE_DEFAULT_MEMORY_USAGE # define FSE_DEFAULT_MEMORY_USAGE 13 #endif +#if (FSE_DEFAULT_MEMORY_USAGE > FSE_MAX_MEMORY_USAGE) +# error "FSE_DEFAULT_MEMORY_USAGE must be <= FSE_MAX_MEMORY_USAGE" +#endif /*!FSE_MAX_SYMBOL_VALUE : * Maximum symbol value authorized. 
@@ -697,12 +621,8 @@ MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) # error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported" #endif -#define FSE_TABLESTEP(tableSize) ((tableSize>>1) + (tableSize>>3) + 3) - +#define FSE_TABLESTEP(tableSize) (((tableSize)>>1) + ((tableSize)>>3) + 3) #endif /* FSE_STATIC_LINKING_ONLY */ - -#if defined (__cplusplus) -} -#endif +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/fse_compress.c b/vendor/github.com/DataDog/zstd/fse_compress.c index 68b47e1..b42e848 100644 --- a/vendor/github.com/DataDog/zstd/fse_compress.c +++ b/vendor/github.com/DataDog/zstd/fse_compress.c @@ -1,42 +1,21 @@ +#ifndef USE_EXTERNAL_ZSTD /* ****************************************************************** - FSE : Finite State Entropy encoder - Copyright (C) 2013-present, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy - - Public forum : https://groups.google.com/forum/#!forum/lz4c + * FSE : Finite State Entropy encoder + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. ****************************************************************** */ /* ************************************************************** * Includes ****************************************************************/ -#include /* malloc, free, qsort */ -#include /* memcpy, memset */ #include "compiler.h" #include "mem.h" /* U32, U16, etc. 
*/ #include "debug.h" /* assert, DEBUGLOG */ @@ -45,6 +24,10 @@ #define FSE_STATIC_LINKING_ONLY #include "fse.h" #include "error_private.h" +#define ZSTD_DEPS_NEED_MALLOC +#define ZSTD_DEPS_NEED_MATH64 +#include "zstd_deps.h" /* ZSTD_memset */ +#include "bits.h" /* ZSTD_highbit32 */ /* ************************************************************** @@ -94,41 +77,85 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ; FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); U32 const step = FSE_TABLESTEP(tableSize); - U32 cumul[FSE_MAX_SYMBOL_VALUE+2]; + U32 const maxSV1 = maxSymbolValue+1; + + U16* cumul = (U16*)workSpace; /* size = maxSV1 */ + FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSV1+1)); /* size = tableSize */ - FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)workSpace; U32 highThreshold = tableSize-1; + assert(((size_t)workSpace & 1) == 0); /* Must be 2 bytes-aligned */ + if (FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSize) return ERROR(tableLog_tooLarge); /* CTable header */ - if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize) return ERROR(tableLog_tooLarge); tableU16[-2] = (U16) tableLog; tableU16[-1] = (U16) maxSymbolValue; assert(tableLog < 16); /* required for threshold strategy to work */ /* For explanations on how to distribute symbol values over the table : - * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ #ifdef __clang_analyzer__ - memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ #endif /* symbol start positions */ { U32 u; cumul[0] = 0; - for (u=1; u 
<= maxSymbolValue+1; u++) { + for (u=1; u <= maxSV1; u++) { if (normalizedCounter[u-1]==-1) { /* Low proba symbol */ cumul[u] = cumul[u-1] + 1; tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1); } else { - cumul[u] = cumul[u-1] + normalizedCounter[u-1]; + assert(normalizedCounter[u-1] >= 0); + cumul[u] = cumul[u-1] + (U16)normalizedCounter[u-1]; + assert(cumul[u] >= cumul[u-1]); /* no overflow */ } } - cumul[maxSymbolValue+1] = tableSize+1; + cumul[maxSV1] = (U16)(tableSize+1); } /* Spread symbols */ - { U32 position = 0; + if (highThreshold == tableSize - 1) { + /* Case for no low prob count symbols. Lay down 8 bytes at a time + * to reduce branch misses since we are operating on a small block + */ + BYTE* const spread = tableSymbol + tableSize; /* size = tableSize + 8 (may write beyond tableSize) */ + { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; + for (s=0; s=0); + pos += (size_t)n; + } + } + /* Spread symbols across the table. Lack of lowprob symbols means that + * we don't need variable sized inner loop, so we can unroll the loop and + * reduce branch misses. 
+ */ + { size_t position = 0; + size_t s; + size_t const unroll = 2; /* Experimentally determined optimal unroll */ + assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ + for (s = 0; s < (size_t)tableSize; s += unroll) { + size_t u; + for (u = 0; u < unroll; ++u) { + size_t const uPosition = (position + (u * step)) & tableMask; + tableSymbol[uPosition] = spread[s + u]; + } + position = (position + (unroll * step)) & tableMask; + } + assert(position == 0); /* Must have initialized all positions */ + } + } else { + U32 position = 0; U32 symbol; - for (symbol=0; symbol<=maxSymbolValue; symbol++) { + for (symbol=0; symbol highThreshold) position = (position + step) & tableMask; /* Low proba area */ } } - assert(position==0); /* Must have initialized all positions */ } @@ -161,16 +187,17 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, case -1: case 1: symbolTT[s].deltaNbBits = (tableLog << 16) - (1< 1); + { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; - symbolTT[s].deltaFindState = total - normalizedCounter[s]; - total += normalizedCounter[s]; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); + total += (unsigned)normalizedCounter[s]; } } } } #if 0 /* debug : symbol costs */ @@ -181,31 +208,26 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, symbol, normalizedCounter[symbol], FSE_getMaxNbBits(symbolTT, symbol), (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256); - } - } + } } #endif return 0; } -size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) -{ - FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE]; /* memset() is not necessary, even if static analyzer complain about it */ - return FSE_buildCTable_wksp(ct, normalizedCounter, maxSymbolValue, tableLog, tableSymbol, sizeof(tableSymbol)); -} - - #ifndef 
FSE_COMMONDEFS_ONLY - /*-************************************************************** * FSE NCount encoding ****************************************************************/ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) { - size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 3; + size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one additional bit each */) / 8) + + 1 /* round up to whole nb bytes */ + + 2 /* additional two bytes for bitstream flush */; return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ } @@ -234,7 +256,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, /* Init */ remaining = tableSize+1; /* +1 for extra accuracy */ threshold = tableSize; - nbBits = tableLog+1; + nbBits = (int)tableLog+1; while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ if (previousIs0) { @@ -253,7 +275,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, } while (symbol >= start+3) { start+=3; - bitStream += 3 << bitCount; + bitStream += 3U << bitCount; bitCount += 2; } bitStream += (symbol-start) << bitCount; @@ -273,7 +295,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, count++; /* +1 for extra accuracy */ if (count>=threshold) count += max; /* [0..max[ [max..threshold[ (...) 
[threshold+max 2*threshold[ */ - bitStream += count << bitCount; + bitStream += (U32)count << bitCount; bitCount += nbBits; bitCount -= (count>8); out+= (bitCount+7) /8; - return (out-ostart); + assert(out >= ostart); + return (size_t)(out-ostart); } @@ -322,21 +345,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, * FSE Compression Code ****************************************************************/ -FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) -{ - size_t size; - if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; - size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); - return (FSE_CTable*)malloc(size); -} - -void FSE_freeCTable (FSE_CTable* ct) { free(ct); } - /* provides the minimum logSize to safely represent a distribution */ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) { - U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; - U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; + U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; + U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols; assert(srcSize > 1); /* Not supported, RLE should be used instead */ return minBits; @@ -344,7 +357,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) { - U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; + U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; U32 tableLog = maxTableLog; U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); assert(srcSize > 1); /* Not supported, RLE should be used instead */ @@ -361,11 +374,10 @@ unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxS return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2); } - /* Secondary normalization method. 
To be used when primary method fails. */ -static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue) +static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue, short lowProbCount) { short const NOT_YET_ASSIGNED = -2; U32 s; @@ -382,7 +394,7 @@ static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, continue; } if (count[s] <= lowThreshold) { - norm[s] = -1; + norm[s] = lowProbCount; distributed++; total -= count[s]; continue; @@ -434,7 +446,7 @@ static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, { U64 const vStepLog = 62 - tableLog; U64 const mid = (1ULL << (vStepLog-1)) - 1; - U64 const rStep = ((((U64)1<> scale); @@ -490,7 +502,7 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, } } if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) { /* corner case, need another normalization method */ - size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue); + size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue, lowProbCount); if (FSE_isError(errorCode)) return errorCode; } else normalizedCounter[largest] += (short)stillToDistribute; @@ -513,40 +525,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, return tableLog; } - -/* fake FSE_CTable, for raw (uncompressed) input */ -size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) -{ - const unsigned tableSize = 1 << nbBits; - const unsigned tableMask = tableSize - 1; - const unsigned maxSymbolValue = tableMask; - void* const ptr = ct; - U16* const tableU16 = ( (U16*) ptr) + 2; - void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ - FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); - unsigned s; - - /* Sanity checks */ - if 
(nbBits < 1) return ERROR(GENERIC); /* min size */ - - /* header */ - tableU16[-2] = (U16) nbBits; - tableU16[-1] = (U16) maxSymbolValue; - - /* Build table */ - for (s=0; s not compressible */ - if (maxCount < (srcSize >> 7)) return 0; /* Heuristic : not compressible enough */ - } - - tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue); - CHECK_F( FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue) ); - - /* Write table description header */ - { CHECK_V_F(nc_err, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) ); - op += nc_err; - } - - /* Compress */ - CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, scratchBufferSize) ); - { CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, src, srcSize, CTable) ); - if (cSize == 0) return 0; /* not enough space for compressed data */ - op += cSize; - } - - /* check compressibility */ - if ( (size_t)(op-ostart) >= srcSize-1 ) return 0; - - return op-ostart; -} - -typedef struct { - FSE_CTable CTable_max[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)]; - BYTE scratchBuffer[1 << FSE_MAX_TABLELOG]; -} fseWkspMax_t; - -size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog) -{ - fseWkspMax_t scratchBuffer; - DEBUG_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)); /* compilation failures here means scratchBuffer is not large enough */ - if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); - return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, &scratchBuffer, sizeof(scratchBuffer)); -} - -size_t FSE_compress (void* dst, size_t dstCapacity, const void* src, size_t srcSize) -{ - return FSE_compress2(dst, dstCapacity, src, srcSize, FSE_MAX_SYMBOL_VALUE, FSE_DEFAULT_TABLELOG); -} - - #endif /* FSE_COMMONDEFS_ONLY */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git 
a/vendor/github.com/DataDog/zstd/fse_decompress.c b/vendor/github.com/DataDog/zstd/fse_decompress.c index 4f07378..1d15197 100644 --- a/vendor/github.com/DataDog/zstd/fse_decompress.c +++ b/vendor/github.com/DataDog/zstd/fse_decompress.c @@ -1,48 +1,30 @@ +#ifndef USE_EXTERNAL_ZSTD /* ****************************************************************** - FSE : Finite State Entropy decoder - Copyright (C) 2013-2015, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - You can contact the author at : - - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy - - Public forum : https://groups.google.com/forum/#!forum/lz4c + * FSE : Finite State Entropy decoder + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. ****************************************************************** */ /* ************************************************************** * Includes ****************************************************************/ -#include /* malloc, free, qsort */ -#include /* memcpy, memset */ +#include "debug.h" /* assert */ #include "bitstream.h" #include "compiler.h" #define FSE_STATIC_LINKING_ONLY #include "fse.h" #include "error_private.h" +#include "zstd_deps.h" /* ZSTD_memcpy */ +#include "bits.h" /* ZSTD_highbit32 */ /* ************************************************************** @@ -51,11 +33,6 @@ #define FSE_isError ERR_isError #define FSE_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /* use only *after* variable declarations */ -/* check and forward error code */ -#ifndef CHECK_F -#define CHECK_F(f) { size_t const e = f; if (FSE_isError(e)) return e; } -#endif - /* ************************************************************** * Templates @@ -79,30 +56,19 @@ #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) - -/* Function templates */ -FSE_DTable* FSE_createDTable (unsigned tableLog) -{ - if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; - return (FSE_DTable*)malloc( 
FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); -} - -void FSE_freeDTable (FSE_DTable* dt) -{ - free(dt); -} - -size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) +static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) { void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr); - U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1]; + U16* symbolNext = (U16*)workSpace; + BYTE* spread = (BYTE*)(symbolNext + maxSymbolValue + 1); U32 const maxSV1 = maxSymbolValue + 1; U32 const tableSize = 1 << tableLog; U32 highThreshold = tableSize-1; /* Sanity Checks */ + if (FSE_BUILD_DTABLE_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(maxSymbolValue_tooLarge); if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge); if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); @@ -118,13 +84,57 @@ size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned symbolNext[s] = 1; } else { if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; - symbolNext[s] = normalizedCounter[s]; + symbolNext[s] = (U16)normalizedCounter[s]; } } } - memcpy(dt, &DTableH, sizeof(DTableH)); + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); } /* Spread symbols */ - { U32 const tableMask = tableSize-1; + if (highThreshold == tableSize - 1) { + size_t const tableMask = tableSize-1; + size_t const step = FSE_TABLESTEP(tableSize); + /* First lay down the symbols in order. + * We use a uint64_t to lay down 8 bytes at a time. This reduces branch + * misses since small blocks generally have small table logs, so nearly + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. 
+ */ + { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; + for (s=0; stableLog = 0; - DTableH->fastMode = 0; - - cell->newState = 0; - cell->symbol = symbolValue; - cell->nbBits = 0; - - return 0; -} - - -size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) -{ - void* ptr = dt; - FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; - void* dPtr = dt + 1; - FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; - const unsigned tableSize = 1 << nbBits; - const unsigned tableMask = tableSize - 1; - const unsigned maxSV1 = tableMask+1; - unsigned s; - - /* Sanity checks */ - if (nbBits < 1) return ERROR(GENERIC); /* min size */ - - /* Build Decoding Table */ - DTableH->tableLog = (U16)nbBits; - DTableH->fastMode = 1; - for (s=0; s= ostart); + return (size_t)(op-ostart); } - -size_t FSE_decompress_usingDTable(void* dst, size_t originalSize, - const void* cSrc, size_t cSrcSize, - const FSE_DTable* dt) -{ - const void* ptr = dt; - const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; - const U32 fastMode = DTableH->fastMode; - - /* select fast mode (static) */ - if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); - return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); -} +typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; +} FSE_DecompressWksp; -size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog) +FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + void* dst, size_t dstCapacity, + const void* cSrc, size_t cSrcSize, + unsigned maxLog, void* workSpace, size_t wkspSize, + int bmi2) { const BYTE* const istart = (const BYTE*)cSrc; const BYTE* ip = istart; - short counting[FSE_MAX_SYMBOL_VALUE+1]; unsigned tableLog; unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; + size_t const dtablePos = 
sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable); + FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos; + + FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); + if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC); + + /* correct offset to dtable depends on this property */ + FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0); /* normal FSE decoding mode */ - size_t const NCountLength = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize); - if (FSE_isError(NCountLength)) return NCountLength; - //if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong); /* too small input size; supposed to be already checked in NCountLength, only remaining case : NCountLength==cSrcSize */ - if (tableLog > maxLog) return ERROR(tableLog_tooLarge); - ip += NCountLength; - cSrcSize -= NCountLength; + { size_t const NCountLength = + FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); + if (FSE_isError(NCountLength)) return NCountLength; + if (tableLog > maxLog) return ERROR(tableLog_tooLarge); + assert(NCountLength <= cSrcSize); + ip += NCountLength; + cSrcSize -= NCountLength; + } - CHECK_F( FSE_buildDTable (workSpace, counting, maxSymbolValue, tableLog) ); + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); + assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); + workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); - return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, workSpace); /* always return, even if it is an error code */ -} + CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); + { + const void* ptr = dtable; + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; + const U32 fastMode = DTableH->fastMode; -typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; 
+ /* select fast mode (static) */ + if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); + return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); + } +} -size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize) +/* Avoids the FORCE_INLINE of the _body() function. */ +static size_t FSE_decompress_wksp_body_default(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) { - DTable_max_t dt; /* Static analyzer seems unable to understand this table will be properly initialized later */ - return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, dt, FSE_MAX_TABLELOG); + return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 0); } +#if DYNAMIC_BMI2 +BMI2_TARGET_ATTRIBUTE static size_t FSE_decompress_wksp_body_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +{ + return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 1); +} +#endif +size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { + return FSE_decompress_wksp_body_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } +#endif + (void)bmi2; + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); +} #endif /* FSE_COMMONDEFS_ONLY */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/hist.c b/vendor/github.com/DataDog/zstd/hist.c index 45b7bab..58d37aa 100644 --- a/vendor/github.com/DataDog/zstd/hist.c +++ b/vendor/github.com/DataDog/zstd/hist.c @@ -1,36 +1,17 @@ +#ifndef USE_EXTERNAL_ZSTD /* ****************************************************************** - hist : 
Histogram functions - part of Finite State Entropy project - Copyright (C) 2013-present, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy - - Public forum : https://groups.google.com/forum/#!forum/lz4c + * hist : Histogram functions + * part of Finite State Entropy project + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. ****************************************************************** */ /* --- dependencies --- */ @@ -46,6 +27,16 @@ unsigned HIST_isError(size_t code) { return ERR_isError(code); } /*-************************************************************** * Histogram functions ****************************************************************/ +void HIST_add(unsigned* count, const void* src, size_t srcSize) +{ + const BYTE* ip = (const BYTE*)src; + const BYTE* const end = ip + srcSize; + + while (ip= HIST_WKSP_SIZE_U32. + * `workSpace` must be a U32 table of size >= HIST_WKSP_SIZE_U32. * @return : largest histogram frequency, - * or an error code (notably when histogram would be larger than *maxSymbolValuePtr). 
*/ + * or an error code (notably when histogram's alphabet is larger than *maxSymbolValuePtr) */ static size_t HIST_count_parallel_wksp( unsigned* count, unsigned* maxSymbolValuePtr, const void* source, size_t sourceSize, @@ -91,22 +82,21 @@ static size_t HIST_count_parallel_wksp( { const BYTE* ip = (const BYTE*)source; const BYTE* const iend = ip+sourceSize; - unsigned maxSymbolValue = *maxSymbolValuePtr; + size_t const countSize = (*maxSymbolValuePtr + 1) * sizeof(*count); unsigned max=0; U32* const Counting1 = workSpace; U32* const Counting2 = Counting1 + 256; U32* const Counting3 = Counting2 + 256; U32* const Counting4 = Counting3 + 256; - memset(workSpace, 0, 4*256*sizeof(unsigned)); - /* safety checks */ + assert(*maxSymbolValuePtr <= 255); if (!sourceSize) { - memset(count, 0, maxSymbolValue + 1); + ZSTD_memset(count, 0, countSize); *maxSymbolValuePtr = 0; return 0; } - if (!maxSymbolValue) maxSymbolValue = 255; /* 0 == default */ + ZSTD_memset(workSpace, 0, 4*256*sizeof(unsigned)); /* by stripes of 16 bytes */ { U32 cached = MEM_read32(ip); ip += 4; @@ -138,21 +128,18 @@ static size_t HIST_count_parallel_wksp( /* finish last symbols */ while (ipmaxSymbolValue; s--) { - Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s]; - if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall); - } } - { U32 s; - if (maxSymbolValue > 255) maxSymbolValue = 255; - for (s=0; s<=maxSymbolValue; s++) { - count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s]; - if (count[s] > max) max = count[s]; + for (s=0; s<256; s++) { + Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s]; + if (Counting1[s] > max) max = Counting1[s]; } } - while (!count[maxSymbolValue]) maxSymbolValue--; - *maxSymbolValuePtr = maxSymbolValue; + { unsigned maxSymbolValue = 255; + while (!Counting1[maxSymbolValue]) maxSymbolValue--; + if (check && maxSymbolValue > *maxSymbolValuePtr) return ERROR(maxSymbolValue_tooSmall); + *maxSymbolValuePtr = maxSymbolValue; + ZSTD_memmove(count, 
Counting1, countSize); /* in case count & Counting1 are overlapping */ + } return (size_t)max; } @@ -172,14 +159,6 @@ size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, trustInput, (U32*)workSpace); } -/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */ -size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr, - const void* source, size_t sourceSize) -{ - unsigned tmpCounters[HIST_WKSP_SIZE_U32]; - return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters, sizeof(tmpCounters)); -} - /* HIST_count_wksp() : * Same as HIST_count(), but using an externally provided scratch buffer. * `workSpace` size must be table of >= HIST_WKSP_SIZE_U32 unsigned */ @@ -195,9 +174,21 @@ size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr, return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace, workSpaceSize); } +#ifndef ZSTD_NO_UNUSED_FUNCTIONS +/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */ +size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize) +{ + unsigned tmpCounters[HIST_WKSP_SIZE_U32]; + return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters, sizeof(tmpCounters)); +} + size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize) { unsigned tmpCounters[HIST_WKSP_SIZE_U32]; return HIST_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters, sizeof(tmpCounters)); } +#endif + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/hist.h b/vendor/github.com/DataDog/zstd/hist.h index 8b38935..28cf4fe 100644 --- a/vendor/github.com/DataDog/zstd/hist.h +++ b/vendor/github.com/DataDog/zstd/hist.h @@ -1,40 +1,21 @@ +#ifndef USE_EXTERNAL_ZSTD /* 
****************************************************************** - hist : Histogram functions - part of Finite State Entropy project - Copyright (C) 2013-present, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy - - Public forum : https://groups.google.com/forum/#!forum/lz4c + * hist : Histogram functions + * part of Finite State Entropy project + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. ****************************************************************** */ /* --- dependencies --- */ -#include /* size_t */ +#include "zstd_deps.h" /* size_t */ /* --- simple histogram functions --- */ @@ -93,3 +74,12 @@ size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, */ unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize); + +/*! HIST_add() : + * Lowest level: just add nb of occurrences of characters from @src into @count. + * @count is not reset. @count array is presumed large enough (i.e. 1 KB). + @ This function does not need any additional stack memory. + */ +void HIST_add(unsigned* count, const void* src, size_t srcSize); + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/huf.h b/vendor/github.com/DataDog/zstd/huf.h index 6b572c4..b619deb 100644 --- a/vendor/github.com/DataDog/zstd/huf.h +++ b/vendor/github.com/DataDog/zstd/huf.h @@ -1,144 +1,45 @@ +#ifndef USE_EXTERNAL_ZSTD /* ****************************************************************** - huff0 huffman codec, - part of Finite State Entropy library - Copyright (C) 2013-present, Yann Collet. 
- - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * huff0 huffman codec, + * part of Finite State Entropy library + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
****************************************************************** */ -#if defined (__cplusplus) -extern "C" { -#endif - #ifndef HUF_H_298734234 #define HUF_H_298734234 /* *** Dependencies *** */ -#include /* size_t */ - - -/* *** library symbols visibility *** */ -/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, - * HUF symbols remain "private" (internal symbols for library only). - * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ -#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) -# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) -#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ -# define HUF_PUBLIC_API __declspec(dllexport) -#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) -# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ -#else -# define HUF_PUBLIC_API -#endif - - -/* ========================== */ -/* *** simple functions *** */ -/* ========================== */ - -/** HUF_compress() : - * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. - * 'dst' buffer must be already allocated. - * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). - * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. - * @return : size of compressed data (<= `dstCapacity`). - * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! - * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) - */ -HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, - const void* src, size_t srcSize); - -/** HUF_decompress() : - * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', - * into already allocated buffer 'dst', of minimum size 'dstSize'. 
- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. - * Note : in contrast with FSE, HUF_decompress can regenerate - * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, - * because it knows size to regenerate (originalSize). - * @return : size of regenerated data (== originalSize), - * or an error code, which can be tested using HUF_isError() - */ -HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, - const void* cSrc, size_t cSrcSize); - +#include "zstd_deps.h" /* size_t */ +#include "mem.h" /* U32 */ +#define FSE_STATIC_LINKING_ONLY +#include "fse.h" /* *** Tool functions *** */ -#define HUF_BLOCKSIZE_MAX (128 * 1024) /**< maximum input size for a single block compressed with HUF_compress */ -HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /**< maximum compressed size (worst case) */ +#define HUF_BLOCKSIZE_MAX (128 * 1024) /**< maximum input size for a single block compressed with HUF_compress */ +size_t HUF_compressBound(size_t size); /**< maximum compressed size (worst case) */ /* Error Management */ -HUF_PUBLIC_API unsigned HUF_isError(size_t code); /**< tells if a return value is an error code */ -HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /**< provides error code string (useful for debugging) */ - - -/* *** Advanced function *** */ - -/** HUF_compress2() : - * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. - * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . - * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ -HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned tableLog); - -/** HUF_compress4X_wksp() : - * Same as HUF_compress2(), but uses externally allocated `workSpace`. 
- * `workspace` must have minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */ -#define HUF_WORKSPACE_SIZE (6 << 10) -#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32)) -HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned tableLog, - void* workSpace, size_t wkspSize); - -#endif /* HUF_H_298734234 */ - -/* ****************************************************************** - * WARNING !! - * The following section contains advanced and experimental definitions - * which shall never be used in the context of a dynamic library, - * because they are not guaranteed to remain stable in the future. - * Only consider them in association with static linking. - * *****************************************************************/ -#if defined(HUF_STATIC_LINKING_ONLY) && !defined(HUF_H_HUF_STATIC_LINKING_ONLY) -#define HUF_H_HUF_STATIC_LINKING_ONLY +unsigned HUF_isError(size_t code); /**< tells if a return value is an error code */ +const char* HUF_getErrorName(size_t code); /**< provides error code string (useful for debugging) */ -/* *** Dependencies *** */ -#include "mem.h" /* U32 */ +#define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) +#define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) /* *** Constants *** */ -#define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */ +#define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ #define HUF_TABLELOG_DEFAULT 11 /* default tableLog value when none specified */ #define HUF_SYMBOLVALUE_MAX 255 -#define HUF_TABLELOG_ABSOLUTEMAX 15 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */ +#define HUF_TABLELOG_ABSOLUTEMAX 12 /* absolute limit of HUF_MAX_TABLELOG. 
Beyond that value, code does not work */ #if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX) # error "HUF_TABLELOG_MAX is too large !" #endif @@ -153,12 +54,12 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, #define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ /* static allocation of HUF's Compression Table */ -#define HUF_CTABLE_SIZE_U32(maxSymbolValue) ((maxSymbolValue)+1) /* Use tables of U32, for proper alignment */ -#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_U32(maxSymbolValue) * sizeof(U32)) +/* this is a private definition, just exposed for allocation and strict aliasing purpose. never EVER access its members directly */ +typedef size_t HUF_CElt; /* consider it an incomplete type */ +#define HUF_CTABLE_SIZE_ST(maxSymbolValue) ((maxSymbolValue)+2) /* Use tables of size_t, for proper alignment */ +#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_ST(maxSymbolValue) * sizeof(size_t)) #define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \ - U32 name##hb[HUF_CTABLE_SIZE_U32(maxSymbolValue)]; \ - void* name##hv = &(name##hb); \ - HUF_CElt* name = (HUF_CElt*)(name##hv) /* no final ; */ + HUF_CElt name[HUF_CTABLE_SIZE_ST(maxSymbolValue)] /* no final ; */ /* static allocation of HUF's DTable */ typedef U32 HUF_DTable; @@ -172,25 +73,49 @@ typedef U32 HUF_DTable; /* **************************************** * Advanced decompression functions ******************************************/ -size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ -#ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ -#endif -size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< decodes RLE and uncompressed */ -size_t HUF_decompress4X_hufOnly(HUF_DTable* 
dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */ -size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< considers RLE and uncompressed as errors */ -size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ -size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */ -#ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ -size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */ -#endif +/** + * Huffman flags bitset. + * For all flags, 0 is the default value. + */ +typedef enum { + /** + * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. + * Otherwise: Ignored. + */ + HUF_flags_bmi2 = (1 << 0), + /** + * If set: Test possible table depths to find the one that produces the smallest header + encoded size. + * If unset: Use heuristic to find the table depth. + */ + HUF_flags_optimalDepth = (1 << 1), + /** + * If set: If the previous table can encode the input, always reuse the previous table. + * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. + */ + HUF_flags_preferRepeat = (1 << 2), + /** + * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. + * If unset: Always histogram the entire input. 
+ */ + HUF_flags_suspectUncompressible = (1 << 3), + /** + * If set: Don't use assembly implementations + * If unset: Allow using assembly implementations + */ + HUF_flags_disableAsm = (1 << 4), + /** + * If set: Don't use the fast decoding loop, always use the fallback decoding loop. + * If unset: Use the fast decoding loop when possible. + */ + HUF_flags_disableFast = (1 << 5) +} HUF_flags_e; /* **************************************** * HUF detailed API * ****************************************/ +#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra /*! HUF_compress() does the following: * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") @@ -203,33 +128,38 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, * For example, it's possible to compress several blocks using the same 'CTable', * or to save and regenerate 'CTable' using external methods. */ -unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); -typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */ -size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. 
In which case, CTable will overwrite count content */ -size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); -size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +unsigned HUF_minTableLog(unsigned symbolCardinality); +unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); +unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, + size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ +size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); +size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); +int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); typedef enum { HUF_repeat_none, /**< Cannot use the previous table */ HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ HUF_repeat_valid /**< Can use the previous table and it is assumed to be valid */ } HUF_repeat; + /** HUF_compress4X_repeat() : * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. * If it uses hufTable it does not modify hufTable or repeat. * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. - * If preferRepeat then the old table will always be used if valid. */ + * If preferRepeat then the old table will always be used if valid. 
+ * If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); + HUF_CElt* hufTable, HUF_repeat* repeat, int flags); /** HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buffer. * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. */ -#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) +#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, @@ -244,15 +174,40 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, const void* src, size_t srcSize); +/*! HUF_readStats_wksp() : + * Same as HUF_readStats() but takes an external workspace which must be + * 4-byte aligned and its size must be >= HUF_READ_STATS_WORKSPACE_SIZE. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
+ */ +#define HUF_READ_STATS_WORKSPACE_SIZE_U32 FSE_DECOMPRESS_WKSP_SIZE_U32(6, HUF_TABLELOG_MAX-1) +#define HUF_READ_STATS_WORKSPACE_SIZE (HUF_READ_STATS_WORKSPACE_SIZE_U32 * sizeof(unsigned)) +size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, + int flags); + /** HUF_readCTable() : * Loading a CTable saved with HUF_writeCTable() */ -size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize); +size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights); -/** HUF_getNbBits() : +/** HUF_getNbBitsFromCTable() : * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX - * Note 1 : is not inlined, as HUF_CElt definition is private - * Note 2 : const void* used, so that it can provide a statically allocated table as argument (which uses type U32) */ -U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue); + * Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0 + * Note 2 : is not inlined, as HUF_CElt definition is private + */ +U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); + +typedef struct { + BYTE tableLog; + BYTE maxSymbolValue; + BYTE unused[sizeof(size_t) - 2]; +} HUF_CTableHeader; + +/** HUF_readCTableHeader() : + * @returns The header from the CTable specifying the tableLog and the maxSymbolValue. + */ +HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable); /* * HUF_decompress() does the following: @@ -278,81 +233,48 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); * a required workspace size greater than that specified in the following * macro. 
*/ -#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10) +#define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) -#ifndef HUF_FORCE_DECOMPRESS_X2 -size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); -size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); -#endif -#ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); -size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); -#endif - -size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); -#ifndef HUF_FORCE_DECOMPRESS_X2 -size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); -#endif -#ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); -#endif - /* ====================== */ /* single stream variants */ /* ====================== */ -size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); -size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ -size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); /** HUF_compress1X_repeat() : * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. 
* If it uses hufTable it does not modify hufTable or repeat. * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. - * If preferRepeat then the old table will always be used if valid. */ + * If preferRepeat then the old table will always be used if valid. + * If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); + HUF_CElt* hufTable, HUF_repeat* repeat, int flags); -size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); #ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /**< double-symbols decoder */ #endif -size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); -size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +/* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
+ */ +size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); #ifndef HUF_FORCE_DECOMPRESS_X2 -size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ -size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */ -#endif -#ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ -size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */ +size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); #endif - -size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /**< automatic selection of sing or double symbol decoder, based on DTable */ +size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); #ifndef HUF_FORCE_DECOMPRESS_X2 -size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); #endif #ifndef HUF_FORCE_DECOMPRESS_X1 -size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* 
DTable); +size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); #endif -/* BMI2 variants. - * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. - */ -size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); -#ifndef HUF_FORCE_DECOMPRESS_X2 -size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); -#endif -size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); -size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); - -#endif /* HUF_STATIC_LINKING_ONLY */ +#endif /* HUF_H_298734234 */ -#if defined (__cplusplus) -} -#endif +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/huf_compress.c b/vendor/github.com/DataDog/zstd/huf_compress.c index f074f1e..7df9160 100644 --- a/vendor/github.com/DataDog/zstd/huf_compress.c +++ b/vendor/github.com/DataDog/zstd/huf_compress.c @@ -1,35 +1,16 @@ +#ifndef USE_EXTERNAL_ZSTD /* ****************************************************************** - Huffman encoder, part of New Generation Entropy library - Copyright (C) 2013-2016, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy - - Public forum : https://groups.google.com/forum/#!forum/lz4c + * Huffman encoder, part of New Generation Entropy library + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
****************************************************************** */ /* ************************************************************** @@ -43,16 +24,15 @@ /* ************************************************************** * Includes ****************************************************************/ -#include /* memcpy, memset */ -#include /* printf (debug) */ +#include "zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset */ #include "compiler.h" #include "bitstream.h" #include "hist.h" #define FSE_STATIC_LINKING_ONLY /* FSE_optimalTableLog_internal */ #include "fse.h" /* header compression */ -#define HUF_STATIC_LINKING_ONLY #include "huf.h" #include "error_private.h" +#include "bits.h" /* ZSTD_highbit32 */ /* ************************************************************** @@ -60,29 +40,114 @@ ****************************************************************/ #define HUF_isError ERR_isError #define HUF_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /* use only *after* variable declarations */ -#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e -#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } /* ************************************************************** -* Utils +* Required declarations ****************************************************************/ -unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue) +typedef struct nodeElt_s { + U32 count; + U16 parent; + BYTE byte; + BYTE nbBits; +} nodeElt; + + +/* ************************************************************** +* Debug Traces +****************************************************************/ + +#if DEBUGLEVEL >= 2 + +static size_t showU32(const U32* arr, size_t size) { - return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); + size_t u; + for (u=0; u= add) { + assert(add < align); + assert(((size_t)aligned & mask) == 0); + *workspaceSizePtr -= add; + return aligned; + } else { + *workspaceSizePtr = 0; + return NULL; + } +} + + /* 
HUF_compressWeights() : * Same as FSE_compress(), but dedicated to huff0's weights compression. * The use case needs much less stack memory. * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX. */ #define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6 -static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize) + +typedef struct { + FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)]; + U32 scratchBuffer[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(HUF_TABLELOG_MAX, MAX_FSE_TABLELOG_FOR_HUFF_HEADER)]; + unsigned count[HUF_TABLELOG_MAX+1]; + S16 norm[HUF_TABLELOG_MAX+1]; +} HUF_CompressWeightsWksp; + +static size_t +HUF_compressWeights(void* dst, size_t dstSize, + const void* weightTable, size_t wtSize, + void* workspace, size_t workspaceSize) { BYTE* const ostart = (BYTE*) dst; BYTE* op = ostart; @@ -90,69 +155,125 @@ static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weight unsigned maxSymbolValue = HUF_TABLELOG_MAX; U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER; + HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32)); - FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)]; - BYTE scratchBuffer[1<count, &maxSymbolValue, weightTable, wtSize); /* never fails */ if (maxCount == wtSize) return 1; /* only a single symbol in src : rle */ if (maxCount == 1) return 0; /* each symbol present maximum once => not compressible */ } tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue); - CHECK_F( FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue) ); + CHECK_F( FSE_normalizeCount(wksp->norm, tableLog, wksp->count, wtSize, maxSymbolValue, /* useLowProbCount */ 0) ); /* Write table description header */ - { CHECK_V_F(hSize, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) ); + { CHECK_V_F(hSize, FSE_writeNCount(op, 
(size_t)(oend-op), wksp->norm, maxSymbolValue, tableLog) ); op += hSize; } /* Compress */ - CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, sizeof(scratchBuffer)) ); - { CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, weightTable, wtSize, CTable) ); + CHECK_F( FSE_buildCTable_wksp(wksp->CTable, wksp->norm, maxSymbolValue, tableLog, wksp->scratchBuffer, sizeof(wksp->scratchBuffer)) ); + { CHECK_V_F(cSize, FSE_compress_usingCTable(op, (size_t)(oend - op), weightTable, wtSize, wksp->CTable) ); if (cSize == 0) return 0; /* not enough space for compressed data */ op += cSize; } - return op-ostart; + return (size_t)(op-ostart); +} + +static size_t HUF_getNbBits(HUF_CElt elt) +{ + return elt & 0xFF; +} + +static size_t HUF_getNbBitsFast(HUF_CElt elt) +{ + return elt; +} + +static size_t HUF_getValue(HUF_CElt elt) +{ + return elt & ~(size_t)0xFF; +} + +static size_t HUF_getValueFast(HUF_CElt elt) +{ + return elt; } +static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits) +{ + assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX); + *elt = nbBits; +} + +static void HUF_setValue(HUF_CElt* elt, size_t value) +{ + size_t const nbBits = HUF_getNbBits(*elt); + if (nbBits > 0) { + assert((value >> nbBits) == 0); + *elt |= value << (sizeof(HUF_CElt) * 8 - nbBits); + } +} -struct HUF_CElt_s { - U16 val; - BYTE nbBits; -}; /* typedef'd to HUF_CElt within "huf.h" */ +HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable) +{ + HUF_CTableHeader header; + ZSTD_memcpy(&header, ctable, sizeof(header)); + return header; +} -/*! HUF_writeCTable() : - `CTable` : Huffman tree to save, using huf representation. 
- @return : size of saved CTable */ -size_t HUF_writeCTable (void* dst, size_t maxDstSize, - const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +static void HUF_writeCTableHeader(HUF_CElt* ctable, U32 tableLog, U32 maxSymbolValue) { + HUF_CTableHeader header; + HUF_STATIC_ASSERT(sizeof(ctable[0]) == sizeof(header)); + ZSTD_memset(&header, 0, sizeof(header)); + assert(tableLog < 256); + header.tableLog = (BYTE)tableLog; + assert(maxSymbolValue < 256); + header.maxSymbolValue = (BYTE)maxSymbolValue; + ZSTD_memcpy(ctable, &header, sizeof(header)); +} + +typedef struct { + HUF_CompressWeightsWksp wksp; BYTE bitsToWeight[HUF_TABLELOG_MAX + 1]; /* precomputed conversion table */ BYTE huffWeight[HUF_SYMBOLVALUE_MAX]; +} HUF_WriteCTableWksp; + +size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, + void* workspace, size_t workspaceSize) +{ + HUF_CElt const* const ct = CTable + 1; BYTE* op = (BYTE*)dst; U32 n; + HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32)); + + HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE >= sizeof(HUF_WriteCTableWksp)); + + assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); + assert(HUF_readCTableHeader(CTable).tableLog == huffLog); - /* check conditions */ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); /* convert to weight */ - bitsToWeight[0] = 0; + wksp->bitsToWeight[0] = 0; for (n=1; nbitsToWeight[n] = (BYTE)(huffLog + 1 - n); for (n=0; nhuffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])]; /* attempt weights compression by FSE */ - { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, huffWeight, maxSymbolValue) ); + if (maxDstSize < 1) return ERROR(dstSize_tooSmall); + { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, 
wksp->huffWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) ); if ((hSize>1) & (hSize < maxSymbolValue/2)) { /* FSE compressed */ op[0] = (BYTE)hSize; return hSize+1; @@ -162,45 +283,51 @@ size_t HUF_writeCTable (void* dst, size_t maxDstSize, if (maxSymbolValue > (256-128)) return ERROR(GENERIC); /* should not happen : likely means source cannot be compressed */ if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return ERROR(dstSize_tooSmall); /* not enough space within dst buffer */ op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue-1)); - huffWeight[maxSymbolValue] = 0; /* to be sure it doesn't cause msan issue in final combination */ + wksp->huffWeight[maxSymbolValue] = 0; /* to be sure it doesn't cause msan issue in final combination */ for (n=0; nhuffWeight[n] << 4) + wksp->huffWeight[n+1]); return ((maxSymbolValue+1)/2) + 1; } -size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize) +size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) { BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; /* init not required, even though some static analyzer may complain */ U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */ U32 tableLog = 0; U32 nbSymbols = 0; + HUF_CElt* const ct = CTable + 1; /* get symbol weights */ CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize)); + *hasZeroWeights = (rankVal[0] > 0); /* check result */ if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + *maxSymbolValuePtr = nbSymbols - 1; + + HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); + /* Prepare base value per rank */ { U32 n, nextRankStart = 0; for (n=1; n<=tableLog; n++) { - U32 current = nextRankStart; + U32 curr = nextRankStart; nextRankStart += (rankVal[n] << (n-1)); - 
rankVal[n] = current; + rankVal[n] = curr; } } /* fill nbBits */ { U32 n; for (n=0; nn=tableLog+1 */ U16 valPerRank[HUF_TABLELOG_MAX+2] = {0}; - { U32 n; for (n=0; n>= 1; } } /* assign value within rank, symbol order */ - { U32 n; for (n=0; n HUF_readCTableHeader(CTable).maxSymbolValue) + return 0; + return (U32)HUF_getNbBits(ct[symbolValue]); } -typedef struct nodeElt_s { - U32 count; - U16 parent; - BYTE byte; - BYTE nbBits; -} nodeElt; - -static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) +/** + * HUF_setMaxHeight(): + * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * + * It attempts to convert all nodes with nbBits > @targetNbBits + * to employ @targetNbBits instead. Then it adjusts the tree + * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. + * @post The sum of the ranks of each symbol == 2^largestBits, + * where largestBits is the return value (expected <= targetNbBits). + * + * @param huffNode The Huffman tree modified in place to enforce targetNbBits. + * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. + * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will + * respect targetNbBits. + * @return The maximum number of bits of the Huffman tree after adjustment. + */ +static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) { const U32 largestBits = huffNode[lastNonNull].nbBits; - if (largestBits <= maxNbBits) return largestBits; /* early exit : no elt > maxNbBits */ + /* early exit : no elt > targetNbBits, so the tree is already valid. 
*/ + if (largestBits <= targetNbBits) return largestBits; + + DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); /* there are several too large elements (at least >= 2) */ { int totalCost = 0; - const U32 baseCost = 1 << (largestBits - maxNbBits); - U32 n = lastNonNull; - - while (huffNode[n].nbBits > maxNbBits) { + const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + + /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ + while (huffNode[n].nbBits > targetNbBits) { totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); - huffNode[n].nbBits = (BYTE)maxNbBits; - n --; - } /* n stops at huffNode[n].nbBits <= maxNbBits */ - while (huffNode[n].nbBits == maxNbBits) n--; /* n end at index of smallest symbol using < maxNbBits */ + huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } + /* n stops at huffNode[n].nbBits <= targetNbBits */ + assert(huffNode[n].nbBits <= targetNbBits); + /* n end at index of smallest symbol using < targetNbBits */ + while (huffNode[n].nbBits == targetNbBits) --n; - /* renorm totalCost */ - totalCost >>= (largestBits - maxNbBits); /* note : totalCost is necessarily a multiple of baseCost */ + /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ + assert(((U32)totalCost & (baseCost - 1)) == 0); + totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); /* repay normalized cost */ { U32 const noSymbol = 0xF0F0F0F0; U32 rankLast[HUF_TABLELOG_MAX+2]; - int pos; - /* Get pos of last (smallest) symbol per rank */ - memset(rankLast, 0xF0, sizeof(rankLast)); - { U32 currentNbBits = maxNbBits; + /* Get pos of last (smallest = lowest cum. 
count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); + { U32 currentNbBits = targetNbBits; + int pos; for (pos=n ; pos >= 0; pos--) { if (huffNode[pos].nbBits >= currentNbBits) continue; - currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ - rankLast[maxNbBits-currentNbBits] = pos; + currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ + rankLast[targetNbBits-currentNbBits] = (U32)pos; } } while (totalCost > 0) { - U32 nBitsToDecrease = BIT_highbit32(totalCost) + 1; + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ + U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { - U32 highPos = rankLast[nBitsToDecrease]; - U32 lowPos = rankLast[nBitsToDecrease-1]; + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; if (highPos == noSymbol) continue; + /* Decrease highPos if no symbols of lowPos or if it is + * not cheaper to remove 2 lowPos than highPos. + */ if (lowPos == noSymbol) break; { U32 const highTotal = huffNode[highPos].count; U32 const lowTotal = 2 * huffNode[lowPos].count; if (highTotal <= lowTotal) break; } } /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) */ + assert(rankLast[nBitsToDecrease] != noSymbol || nBitsToDecrease == 1); /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */ while ((nBitsToDecrease<=HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol)) - nBitsToDecrease ++; + nBitsToDecrease++; + assert(rankLast[nBitsToDecrease] != noSymbol); + /* Increase the number of bits to gain back half the rank cost. */ totalCost -= 1 << (nBitsToDecrease-1); + huffNode[rankLast[nBitsToDecrease]].nbBits++; + + /* Fix up the new rank. + * If the new rank was empty, this symbol is now its smallest. + * Otherwise, this symbol will be the largest in the new rank so no adjustment. 
+ */ if (rankLast[nBitsToDecrease-1] == noSymbol) - rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]; /* this rank is no longer empty */ - huffNode[rankLast[nBitsToDecrease]].nbBits ++; + rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]; + /* Fix up the old rank. + * If the symbol was at position 0, meaning it was the highest weight symbol in the tree, + * it must be the only symbol in its rank, so the old rank now has no symbols. + * Otherwise, since the Huffman nodes are sorted by count, the previous position is now + * the smallest node in the rank. If the previous position belongs to a different rank, + * then the rank is now empty. + */ if (rankLast[nBitsToDecrease] == 0) /* special case, reached largest symbol */ rankLast[nBitsToDecrease] = noSymbol; else { rankLast[nBitsToDecrease]--; - if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) + if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ - } } /* while (totalCost > 0) */ - + } + } /* while (totalCost > 0) */ + + /* If we've removed too much weight, then we have to add it back. + * To avoid overshooting again, we only adjust the smallest rank. + * We take the largest nodes from the lowest rank 0 and move them + * to rank 1. There's guaranteed to be enough rank 0 symbols because + * TODO. + */ while (totalCost < 0) { /* Sometimes, cost correction overshoot */ - if (rankLast[1] == noSymbol) { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */ - while (huffNode[n].nbBits == maxNbBits) n--; + /* special case : no rank 1 symbol (using targetNbBits-1); + * let's create one from largest rank 0 (using targetNbBits). 
+ */ + if (rankLast[1] == noSymbol) { + while (huffNode[n].nbBits == targetNbBits) n--; huffNode[n+1].nbBits--; - rankLast[1] = n+1; + assert(n >= 0); + rankLast[1] = (U32)(n+1); totalCost++; continue; } huffNode[ rankLast[1] + 1 ].nbBits--; rankLast[1]++; totalCost ++; - } } } /* there are several too large elements (at least >= 2) */ + } + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ - return maxNbBits; + return targetNbBits; } - typedef struct { - U32 base; - U32 current; + U16 base; + U16 curr; } rankPos; -static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue) -{ - rankPos rank[32]; +typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + +/* Number of buckets available for HUF_sort() */ +#define RANK_POSITION_TABLE_SIZE 192 + +typedef struct { + huffNodeTable huffNodeTbl; + rankPos rankPosition[RANK_POSITION_TABLE_SIZE]; +} HUF_buildCTable_wksp_tables; + +/* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing. + * Strategy is to use as many buckets as possible for representing distinct + * counts while using the remainder to represent all "large" counts. + * + * To satisfy this requirement for 192 buckets, we can do the following: + * Let buckets 0-166 represent distinct counts of [0, 166] + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ +#define RANK_POSITION_MAX_COUNT_LOG 32 +#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) +#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + +/* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. 
+ */ +static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? count + : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; +} + +/* Helper swap function for HUF_quickSortPartition() */ +static void HUF_swapNodes(nodeElt* a, nodeElt* b) { + nodeElt tmp = *a; + *a = *b; + *b = tmp; +} + +/* Returns 0 if the huffNode array is not sorted by descending count */ +MEM_STATIC int HUF_isSorted(nodeElt huffNode[], U32 const maxSymbolValue1) { + U32 i; + for (i = 1; i < maxSymbolValue1; ++i) { + if (huffNode[i].count > huffNode[i-1].count) { + return 0; + } + } + return 1; +} + +/* Insertion sort by descending order */ +HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int const high) { + int i; + int const size = high-low+1; + huffNode += low; + for (i = 1; i < size; ++i) { + nodeElt const key = huffNode[i]; + int j = i - 1; + while (j >= 0 && huffNode[j].count < key.count) { + huffNode[j + 1] = huffNode[j]; + j--; + } + huffNode[j + 1] = key; + } +} + +/* Pivot helper function for quicksort. */ +static int HUF_quickSortPartition(nodeElt arr[], int const low, int const high) { + /* Simply select rightmost element as pivot. "Better" selectors like + * median-of-three don't experimentally appear to have any benefit. + */ + U32 const pivot = arr[high].count; + int i = low - 1; + int j = low; + for ( ; j < high; j++) { + if (arr[j].count > pivot) { + i++; + HUF_swapNodes(&arr[i], &arr[j]); + } + } + HUF_swapNodes(&arr[i + 1], &arr[high]); + return i + 1; +} + +/* Classic quicksort by descending with partially iterative calls + * to reduce worst case callstack size. 
+ */ +static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) { + int const kInsertionSortThreshold = 8; + if (high - low < kInsertionSortThreshold) { + HUF_insertionSort(arr, low, high); + return; + } + while (low < high) { + int const idx = HUF_quickSortPartition(arr, low, high); + if (idx - low < high - idx) { + HUF_simpleQuickSort(arr, low, idx - 1); + low = idx + 1; + } else { + HUF_simpleQuickSort(arr, idx + 1, high); + high = idx - 1; + } + } +} + +/** + * HUF_sort(): + * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order. + * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket. + * + * @param[out] huffNode Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled. + * Must have (maxSymbolValue + 1) entries. + * @param[in] count Histogram of the symbols. + * @param[in] maxSymbolValue Maximum symbol value. + * @param rankPosition This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries. + */ +static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) { U32 n; + U32 const maxSymbolValue1 = maxSymbolValue+1; + + /* Compute base and set curr to base. + * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1. + * See HUF_getIndex to see bucketing strategy. + * We attribute each symbol to lowerRank's base value, because we want to know where + * each rank begins in the output, so for rank R we want to count ranks R+1 and above. 
+ */ + ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE); + for (n = 0; n < maxSymbolValue1; ++n) { + U32 lowerRank = HUF_getIndex(count[n]); + assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1); + rankPosition[lowerRank].base++; + } - memset(rank, 0, sizeof(rank)); - for (n=0; n<=maxSymbolValue; n++) { - U32 r = BIT_highbit32(count[n] + 1); - rank[r].base ++; + assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0); + /* Set up the rankPosition table */ + for (n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) { + rankPosition[n-1].base += rankPosition[n].base; + rankPosition[n-1].curr = rankPosition[n-1].base; } - for (n=30; n>0; n--) rank[n-1].base += rank[n].base; - for (n=0; n<32; n++) rank[n].current = rank[n].base; - for (n=0; n<=maxSymbolValue; n++) { + + /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */ + for (n = 0; n < maxSymbolValue1; ++n) { U32 const c = count[n]; - U32 const r = BIT_highbit32(c+1) + 1; - U32 pos = rank[r].current++; - while ((pos > rank[r].base) && (c > huffNode[pos-1].count)) { - huffNode[pos] = huffNode[pos-1]; - pos--; - } + U32 const r = HUF_getIndex(c) + 1; + U32 const pos = rankPosition[r].curr++; + assert(pos < maxSymbolValue1); huffNode[pos].count = c; huffNode[pos].byte = (BYTE)n; } + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { + int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); + HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1); + } + } + + assert(HUF_isSorted(huffNode, maxSymbolValue1)); } /** HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buffer. - * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of HUF_CTABLE_WORKSPACE_SIZE_U32 unsigned. 
+ * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). */ #define STARTNODE (HUF_SYMBOLVALUE_MAX+1) -typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; -size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize) + +/* HUF_buildTree(): + * Takes the huffNode array sorted by HUF_sort() and builds an unlimited-depth Huffman tree. + * + * @param huffNode The array sorted by HUF_sort(). Builds the Huffman tree in this array. + * @param maxSymbolValue The maximum symbol value. + * @return The smallest node in the Huffman tree (by count). + */ +static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) { - nodeElt* const huffNode0 = (nodeElt*)workSpace; - nodeElt* const huffNode = huffNode0+1; - U32 n, nonNullRank; + nodeElt* const huffNode0 = huffNode - 1; + int nonNullRank; int lowS, lowN; - U16 nodeNb = STARTNODE; - U32 nodeRoot; - - /* safety checks */ - if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ - if (wkspSize < sizeof(huffNodeTable)) return ERROR(workSpace_tooSmall); - if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; - if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); - memset(huffNode0, 0, sizeof(huffNodeTable)); - - /* sort, decreasing order */ - HUF_sort(huffNode, count, maxSymbolValue); - + int nodeNb = STARTNODE; + int n, nodeRoot; + DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); /* init for parents */ - nonNullRank = maxSymbolValue; + nonNullRank = (int)maxSymbolValue; while(huffNode[nonNullRank].count == 0) nonNullRank--; lowS = nonNullRank; nodeRoot = nodeNb + lowS - 1; lowN = nodeNb; huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count; - huffNode[lowS].parent = huffNode[lowS-1].parent = nodeNb; + huffNode[lowS].parent = huffNode[lowS-1].parent = (U16)nodeNb; 
nodeNb++; lowS-=2; for (n=nodeNb; n<=nodeRoot; n++) huffNode[n].count = (U32)(1U<<30); huffNode0[0].count = (U32)(1U<<31); /* fake entry, strong barrier */ /* create parents */ while (nodeNb <= nodeRoot) { - U32 n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++; - U32 n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++; + int const n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++; + int const n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++; huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count; - huffNode[n1].parent = huffNode[n2].parent = nodeNb; + huffNode[n1].parent = huffNode[n2].parent = (U16)nodeNb; nodeNb++; } @@ -392,126 +713,414 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbo for (n=0; n<=nonNullRank; n++) huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; - /* enforce maxTableLog */ - maxNbBits = HUF_setMaxHeight(huffNode, nonNullRank, maxNbBits); - - /* fill result into tree (val, nbBits) */ - { U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0}; - U16 valPerRank[HUF_TABLELOG_MAX+1] = {0}; - if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ - for (n=0; n<=nonNullRank; n++) - nbPerRank[huffNode[n].nbBits]++; - /* determine stating value per rank */ - { U16 min = 0; - for (n=maxNbBits; n>0; n--) { - valPerRank[n] = min; /* get starting value within each rank */ - min += nbPerRank[n]; - min >>= 1; - } } - for (n=0; n<=maxSymbolValue; n++) - tree[huffNode[n].byte].nbBits = huffNode[n].nbBits; /* push nbBits per symbol, symbol order */ - for (n=0; n<=maxSymbolValue; n++) - tree[n].val = valPerRank[tree[n].nbBits]++; /* assign value within rank, symbol order */ - } + DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); - return maxNbBits; + return nonNullRank; } -/** HUF_buildCTable() : - * @return : maxNbBits - * Note : count is used 
before tree is written, so they can safely overlap +/** + * HUF_buildCTableFromTree(): + * Build the CTable given the Huffman tree in huffNode. + * + * @param[out] CTable The output Huffman CTable. + * @param huffNode The Huffman tree. + * @param nonNullRank The last and smallest node in the Huffman tree. + * @param maxSymbolValue The maximum symbol value. + * @param maxNbBits The exact maximum number of bits used in the Huffman tree. */ -size_t HUF_buildCTable (HUF_CElt* tree, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits) +static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits) { - huffNodeTable nodeTable; - return HUF_buildCTable_wksp(tree, count, maxSymbolValue, maxNbBits, nodeTable, sizeof(nodeTable)); + HUF_CElt* const ct = CTable + 1; + /* fill result into ctable (val, nbBits) */ + int n; + U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0}; + U16 valPerRank[HUF_TABLELOG_MAX+1] = {0}; + int const alphabetSize = (int)(maxSymbolValue + 1); + for (n=0; n<=nonNullRank; n++) + nbPerRank[huffNode[n].nbBits]++; + /* determine starting value per rank */ + { U16 min = 0; + for (n=(int)maxNbBits; n>0; n--) { + valPerRank[n] = min; /* get starting value within each rank */ + min += nbPerRank[n]; + min >>= 1; + } } + for (n=0; nhuffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + + HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); + + DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); + + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) + return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) + return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); + DEBUGLOG(6, 
"sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + + /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + + HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits); + + return maxNbBits; +} + +size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) +{ + HUF_CElt const* ct = CTable + 1; size_t nbBits = 0; int s; for (s = 0; s <= (int)maxSymbolValue; ++s) { - nbBits += CTable[s].nbBits * count[s]; + nbBits += HUF_getNbBits(ct[s]) * count[s]; } return nbBits >> 3; } -static int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { - int bad = 0; - int s; - for (s = 0; s <= (int)maxSymbolValue; ++s) { - bad |= (count[s] != 0) & (CTable[s].nbBits == 0); - } - return !bad; +int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { + HUF_CTableHeader header = HUF_readCTableHeader(CTable); + HUF_CElt const* ct = CTable + 1; + int bad = 0; + int s; + + assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); + + if (header.maxSymbolValue < maxSymbolValue) + return 0; + + for (s = 0; s <= (int)maxSymbolValue; ++s) { + bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); + } + return !bad; } size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } +/** HUF_CStream_t: + * Huffman uses its own BIT_CStream_t implementation. + * There are three major differences from BIT_CStream_t: + * 1. HUF_addBits() takes a HUF_CElt (size_t) which is + * the pair (nbBits, value) in the format: + * format: + * - Bits [0, 4) = nbBits + * - Bits [4, 64 - nbBits) = 0 + * - Bits [64 - nbBits, 64) = value + * 2. 
The bitContainer is built from the upper bits and + * right shifted. E.g. to add a new value of N bits + * you right shift the bitContainer by N, then or in + * the new value into the N upper bits. + * 3. The bitstream has two bit containers. You can add + * bits to the second container and merge them into + * the first container. + */ + +#define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8) + +typedef struct { + size_t bitContainer[2]; + size_t bitPos[2]; + + BYTE* startPtr; + BYTE* ptr; + BYTE* endPtr; +} HUF_CStream_t; + +/**! HUF_initCStream(): + * Initializes the bitstream. + * @returns 0 or an error code. + */ +static size_t HUF_initCStream(HUF_CStream_t* bitC, + void* startPtr, size_t dstCapacity) +{ + ZSTD_memset(bitC, 0, sizeof(*bitC)); + bitC->startPtr = (BYTE*)startPtr; + bitC->ptr = bitC->startPtr; + bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]); + if (dstCapacity <= sizeof(bitC->bitContainer[0])) return ERROR(dstSize_tooSmall); + return 0; +} + +/*! HUF_addBits(): + * Adds the symbol stored in HUF_CElt elt to the bitstream. + * + * @param elt The element we're adding. This is a (nbBits, value) pair. + * See the HUF_CStream_t docs for the format. + * @param idx Insert into the bitstream at this idx. + * @param kFast This is a template parameter. If the bitstream is guaranteed + * to have at least 4 unused bits after this call it may be 1, + * otherwise it must be 0. HUF_addBits() is faster when fast is set. + */ +FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast) +{ + assert(idx <= 1); + assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX); + /* This is efficient on x86-64 with BMI2 because shrx + * only reads the low 6 bits of the register. The compiler + * knows this and elides the mask. When fast is set, + * every operation can use the same value loaded from elt. + */ + bitC->bitContainer[idx] >>= HUF_getNbBits(elt); + bitC->bitContainer[idx] |= kFast ? 
HUF_getValueFast(elt) : HUF_getValue(elt); + /* We only read the low 8 bits of bitC->bitPos[idx] so it + * doesn't matter that the high bits have noise from the value. + */ + bitC->bitPos[idx] += HUF_getNbBitsFast(elt); + assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER); + /* The last 4-bits of elt are dirty if fast is set, + * so we must not be overwriting bits that have already been + * inserted into the bit container. + */ +#if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); + size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. */ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); + /* We didn't overwrite any bits in the bit container. */ + assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER); + (void)dirtyBits; + } +#endif +} + +FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC) +{ + bitC->bitContainer[1] = 0; + bitC->bitPos[1] = 0; +} + +/*! HUF_mergeIndex1() : + * Merges the bit container @ index 1 into the bit container @ index 0 + * and zeros the bit container @ index 1. + */ +FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC) +{ + assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER); + bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF); + bitC->bitContainer[0] |= bitC->bitContainer[1]; + bitC->bitPos[0] += bitC->bitPos[1]; + assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER); +} + +/*! HUF_flushBits() : +* Flushes the bits in the bit container @ index 0. +* +* @post bitPos will be < 8. +* @param kFast If kFast is set then we must know a-priori that +* the bit container will not overflow. +*/ +FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast) +{ + /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */ + size_t const nbBits = bitC->bitPos[0] & 0xFF; + size_t const nbBytes = nbBits >> 3; + /* The top nbBits bits of bitContainer are the ones we need. 
*/ + size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits); + /* Mask bitPos to account for the bytes we consumed. */ + bitC->bitPos[0] &= 7; + assert(nbBits > 0); + assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8); + assert(bitC->ptr <= bitC->endPtr); + MEM_writeLEST(bitC->ptr, bitContainer); + bitC->ptr += nbBytes; + assert(!kFast || bitC->ptr <= bitC->endPtr); + if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr; + /* bitContainer doesn't need to be modified because the leftover + * bits are already the top bitPos bits. And we don't care about + * noise in the lower values. + */ +} + +/*! HUF_endMark() + * @returns The Huffman stream end mark: A 1-bit value = 1. + */ +static HUF_CElt HUF_endMark(void) +{ + HUF_CElt endMark; + HUF_setNbBits(&endMark, 1); + HUF_setValue(&endMark, 1); + return endMark; +} + +/*! HUF_closeCStream() : + * @return Size of CStream, in bytes, + * or 0 if it could not fit into dstBuffer */ +static size_t HUF_closeCStream(HUF_CStream_t* bitC) +{ + HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0); + HUF_flushBits(bitC, /* kFast */ 0); + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ + return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } +} + FORCE_INLINE_TEMPLATE void -HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable) +HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast) { - BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); + HUF_addBits(bitCPtr, CTable[symbol], idx, fast); } -#define HUF_FLUSHBITS(s) BIT_flushBits(s) +FORCE_INLINE_TEMPLATE void +HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC, + const BYTE* ip, size_t srcSize, + const HUF_CElt* ct, + int kUnroll, int kFastFlush, int kLastFast) +{ + /* Join to kUnroll */ + int n = (int)srcSize; + int rem = n % kUnroll; + if (rem > 0) { + for (; 
rem > 0; --rem) { + HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0); + } + HUF_flushBits(bitC, kFastFlush); + } + assert(n % kUnroll == 0); + + /* Join to 2 * kUnroll */ + if (n % (2 * kUnroll)) { + int u; + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast); + HUF_flushBits(bitC, kFastFlush); + n -= kUnroll; + } + assert(n % (2 * kUnroll) == 0); + + for (; n>0; n-= 2 * kUnroll) { + /* Encode kUnroll symbols into the bitstream @ index 0. */ + int u; + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast); + HUF_flushBits(bitC, kFastFlush); + /* Encode kUnroll symbols into the bitstream @ index 1. + * This allows us to start filling the bit container + * without any data dependencies. + */ + HUF_zeroIndex1(bitC); + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast); + /* Merge bitstream @ index 1 into the bitstream @ index 0 */ + HUF_mergeIndex1(bitC); + HUF_flushBits(bitC, kFastFlush); + } + assert(n == 0); + +} -#define HUF_FLUSHBITS_1(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream) +/** + * Returns a tight upper bound on the output space needed by Huffman + * with 8 bytes buffer to handle over-writes. If the output is at least + * this large we don't need to do bounds checks during Huffman encoding. 
+ */ +static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog) +{ + return ((srcSize * tableLog) >> 3) + 8; +} -#define HUF_FLUSHBITS_2(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream) FORCE_INLINE_TEMPLATE size_t HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) { + U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; + HUF_CElt const* ct = CTable + 1; const BYTE* ip = (const BYTE*) src; BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstSize; - BYTE* op = ostart; - size_t n; - BIT_CStream_t bitC; + HUF_CStream_t bitC; /* init */ if (dstSize < 8) return 0; /* not enough space to compress */ - { size_t const initErr = BIT_initCStream(&bitC, op, oend-op); + { BYTE* op = ostart; + size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); if (HUF_isError(initErr)) return 0; } - n = srcSize & ~3; /* join to mod 4 */ - switch (srcSize & 3) - { - case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); - HUF_FLUSHBITS_2(&bitC); - /* fall-through */ - case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); - HUF_FLUSHBITS_1(&bitC); - /* fall-through */ - case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); - HUF_FLUSHBITS(&bitC); - /* fall-through */ - case 0 : /* fall-through */ - default: break; - } - - for (; n>0; n-=4) { /* note : n&3==0 at this stage */ - HUF_encodeSymbol(&bitC, ip[n- 1], CTable); - HUF_FLUSHBITS_1(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 2], CTable); - HUF_FLUSHBITS_2(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 3], CTable); - HUF_FLUSHBITS_1(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 4], CTable); - HUF_FLUSHBITS(&bitC); + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 
2 : 4, /* kFast */ 0, /* kLastFast */ 0); + else { + if (MEM_32bits()) { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: ZSTD_FALLTHROUGH; + case 9: ZSTD_FALLTHROUGH; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 7: ZSTD_FALLTHROUGH; + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } else { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 9: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 7: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 6: ZSTD_FALLTHROUGH; + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } } + assert(bitC.ptr <= bitC.endPtr); - return BIT_closeCStream(&bitC); + return HUF_closeCStream(&bitC); } #if DYNAMIC_BMI2 -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) @@ -530,9 +1139,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t 
dstSize, static size_t HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, const void* src, size_t srcSize, - const HUF_CElt* CTable, const int bmi2) + const HUF_CElt* CTable, const int flags) { - if (bmi2) { + if (flags & HUF_flags_bmi2) { return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); } return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); @@ -543,24 +1152,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, static size_t HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, const void* src, size_t srcSize, - const HUF_CElt* CTable, const int bmi2) + const HUF_CElt* CTable, const int flags) { - (void)bmi2; + (void)flags; return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); } #endif -size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) { - return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); + return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); } - static size_t HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, const void* src, size_t srcSize, - const HUF_CElt* CTable, int bmi2) + const HUF_CElt* CTable, int flags) { size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ const BYTE* ip = (const BYTE*) src; @@ -573,41 +1181,43 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, if (srcSize < 12) return 0; /* no saving possible : too small input */ op += 6; /* jumpTable */ - { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) ); - if (cSize==0) return 0; - assert(cSize <= 65535); + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), 
ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart, (U16)cSize); op += cSize; } ip += segmentSize; - { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) ); - if (cSize==0) return 0; - assert(cSize <= 65535); + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart+2, (U16)cSize); op += cSize; } ip += segmentSize; - { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) ); - if (cSize==0) return 0; - assert(cSize <= 65535); + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart+4, (U16)cSize); op += cSize; } ip += segmentSize; - { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, iend-ip, CTable, bmi2) ); - if (cSize==0) return 0; + assert(op <= oend); + assert(ip <= iend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; op += cSize; } - return op-ostart; + return (size_t)(op-ostart); } -size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) { - return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); + return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); } typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; @@ -615,44 +1225,129 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; static size_t 
HUF_compressCTable_internal( BYTE* const ostart, BYTE* op, BYTE* const oend, const void* src, size_t srcSize, - HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) + HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) { size_t const cSize = (nbStreams==HUF_singleStream) ? - HUF_compress1X_usingCTable_internal(op, oend - op, src, srcSize, CTable, bmi2) : - HUF_compress4X_usingCTable_internal(op, oend - op, src, srcSize, CTable, bmi2); + HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : + HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); if (HUF_isError(cSize)) { return cSize; } if (cSize==0) { return 0; } /* uncompressible */ op += cSize; /* check compressibility */ + assert(op >= ostart); if ((size_t)(op-ostart) >= srcSize-1) { return 0; } - return op-ostart; + return (size_t)(op-ostart); } typedef struct { unsigned count[HUF_SYMBOLVALUE_MAX + 1]; - HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1]; - huffNodeTable nodeTable; + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)]; + union { + HUF_buildCTable_wksp_tables buildCTable_wksp; + HUF_WriteCTableWksp writeCTable_wksp; + U32 hist_wksp[HIST_WKSP_SIZE_U32]; + } wksps; } HUF_compress_tables_t; +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + +unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) +{ + unsigned cardinality = 0; + unsigned i; + + for (i = 0; i < maxSymbolValue + 1; i++) { + if (count[i] != 0) cardinality += 1; + } + + return cardinality; +} + +unsigned HUF_minTableLog(unsigned symbolCardinality) +{ + U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; + return minBitsSymbols; +} + +unsigned HUF_optimalTableLog( + unsigned maxTableLog, + size_t srcSize, + unsigned maxSymbolValue, + void* workSpace, size_t wkspSize, + HUF_CElt* table, + const unsigned* count, + int flags) +{ + assert(srcSize > 1); 
/* Not supported, RLE should be used instead */ + assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); + + if (!(flags & HUF_flags_optimalDepth)) { + /* cheap evaluation, based on FSE */ + return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); + } + + { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); + size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); + size_t hSize, newSize; + const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); + const unsigned minTableLog = HUF_minTableLog(symbolCardinality); + size_t optSize = ((size_t) ~0) - 1; + unsigned optLog = maxTableLog, optLogGuess; + + DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); + + /* Search until size increases */ + for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { + DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); + + { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); + if (ERR_isError(maxBits)) continue; + + if (maxBits < optLogGuess && optLogGuess > minTableLog) break; + + hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); + } + + if (ERR_isError(hSize)) continue; + + newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; + + if (newSize > optSize + 1) { + break; + } + + if (newSize < optSize) { + optSize = newSize; + optLog = optLogGuess; + } + } + assert(optLog <= HUF_TABLELOG_MAX); + return optLog; + } +} + /* HUF_compress_internal() : - * `workSpace` must a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ static size_t HUF_compress_internal (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, HUF_nbStreams_e nbStreams, void* workSpace, size_t wkspSize, - 
HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, - const int bmi2) + HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) { - HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace; + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstSize; BYTE* op = ostart; + DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + /* checks & inits */ - if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ - if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall); + if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall); if (!srcSize) return 0; /* Uncompressed */ if (!dstSize) return 0; /* cannot fit anything within dst budget */ if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */ @@ -662,17 +1357,34 @@ HUF_compress_internal (void* dst, size_t dstSize, if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; /* Heuristic : If old table is valid, use it for small inputs */ - if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { + if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, - nbStreams, oldHufTable, bmi2); + nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); + if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; + DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple 
(table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; + } + { unsigned maxSymbolValueEnd = maxSymbolValue; + CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestEnd; + } + if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0; /* heuristic : probably not compressible enough */ } /* Scan input and build symbol stats */ - { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace, wkspSize) ); + { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)) ); if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ } + DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); /* Check validity of previous table */ if ( repeat @@ -681,26 +1393,25 @@ HUF_compress_internal (void* dst, size_t dstSize, *repeat = HUF_repeat_none; } /* Heuristic : use existing table for small inputs */ - if (preferRepeat && repeat && *repeat != HUF_repeat_none) { + if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, - nbStreams, oldHufTable, bmi2); + nbStreams, oldHufTable, flags); } /* Build Huffman Tree */ - huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, maxSymbolValue, huffLog, - table->nodeTable, 
sizeof(table->nodeTable)); + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); CHECK_F(maxBits); huffLog = (U32)maxBits; - /* Zero unused symbols in CTable, so we can check it for validity */ - memset(table->CTable + (maxSymbolValue + 1), 0, - sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt))); + DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); } /* Write table description header */ - { CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, table->CTable, maxSymbolValue, huffLog) ); + { CHECK_V_F(hSize, HUF_writeCTable_wksp(op, dstSize, table->CTable, maxSymbolValue, huffLog, + &table->wksps.writeCTable_wksp, sizeof(table->wksps.writeCTable_wksp)) ); /* Check if using previous huffman table is beneficial */ if (repeat && *repeat != HUF_repeat_none) { size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue); @@ -708,7 +1419,7 @@ HUF_compress_internal (void* dst, size_t dstSize, if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, - nbStreams, oldHufTable, bmi2); + nbStreams, oldHufTable, flags); } } /* Use the new huffman table */ @@ -716,83 +1427,41 @@ HUF_compress_internal (void* dst, size_t dstSize, op += hSize; if (repeat) { *repeat = HUF_repeat_none; } if (oldHufTable) - memcpy(oldHufTable, table->CTable, sizeof(table->CTable)); /* Save new table */ + ZSTD_memcpy(oldHufTable, table->CTable, sizeof(table->CTable)); /* Save new table */ } return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, - nbStreams, table->CTable, bmi2); -} - - -size_t HUF_compress1X_wksp (void* dst, size_t dstSize, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned huffLog, - void* workSpace, size_t wkspSize) -{ - return HUF_compress_internal(dst, dstSize, src, srcSize, - maxSymbolValue, huffLog, HUF_singleStream, - workSpace, wkspSize, - NULL, NULL, 0, 0 
/*bmi2*/); + nbStreams, table->CTable, flags); } size_t HUF_compress1X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) + HUF_CElt* hufTable, HUF_repeat* repeat, int flags) { + DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_singleStream, workSpace, wkspSize, hufTable, - repeat, preferRepeat, bmi2); -} - -size_t HUF_compress1X (void* dst, size_t dstSize, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned huffLog) -{ - unsigned workSpace[HUF_WORKSPACE_SIZE_U32]; - return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace)); -} - -/* HUF_compress4X_repeat(): - * compress input using 4 streams. - * provide workspace to generate compression tables */ -size_t HUF_compress4X_wksp (void* dst, size_t dstSize, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned huffLog, - void* workSpace, size_t wkspSize) -{ - return HUF_compress_internal(dst, dstSize, src, srcSize, - maxSymbolValue, huffLog, HUF_fourStreams, - workSpace, wkspSize, - NULL, NULL, 0, 0 /*bmi2*/); + repeat, flags); } /* HUF_compress4X_repeat(): * compress input using 4 streams. 
- * re-use an existing huffman compression table */ + * consider skipping quickly + * reuse an existing huffman compression table */ size_t HUF_compress4X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) + HUF_CElt* hufTable, HUF_repeat* repeat, int flags) { + DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_fourStreams, workSpace, wkspSize, - hufTable, repeat, preferRepeat, bmi2); + hufTable, repeat, flags); } -size_t HUF_compress2 (void* dst, size_t dstSize, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned huffLog) -{ - unsigned workSpace[HUF_WORKSPACE_SIZE_U32]; - return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace)); -} - -size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize) -{ - return HUF_compress2(dst, maxDstSize, src, srcSize, 255, HUF_TABLELOG_DEFAULT); -} +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/huf_decompress.c b/vendor/github.com/DataDog/zstd/huf_decompress.c index bb2d0a9..1589ef1 100644 --- a/vendor/github.com/DataDog/zstd/huf_decompress.c +++ b/vendor/github.com/DataDog/zstd/huf_decompress.c @@ -1,52 +1,46 @@ +#ifndef USE_EXTERNAL_ZSTD /* ****************************************************************** - huff0 huffman decoder, - part of Finite State Entropy library - Copyright (C) 2013-present, Yann Collet. 
- - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * huff0 huffman decoder, + * part of Finite State Entropy library + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). 
+ * You may select, at your option, one of the above-listed licenses. ****************************************************************** */ /* ************************************************************** * Dependencies ****************************************************************/ -#include /* memcpy, memset */ +#include "zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset */ #include "compiler.h" #include "bitstream.h" /* BIT_* */ #include "fse.h" /* to compress headers */ -#define HUF_STATIC_LINKING_ONLY #include "huf.h" #include "error_private.h" +#include "zstd_internal.h" +#include "bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + +/* ************************************************************** +* Constants +****************************************************************/ + +#define HUF_DECODER_FAST_TABLELOG 11 /* ************************************************************** * Macros ****************************************************************/ +#ifdef HUF_DISABLE_FAST_DECODE +# define HUF_ENABLE_FAST_DECODE 0 +#else +# define HUF_ENABLE_FAST_DECODE 1 +#endif + /* These two optional macros force the use one way or another of the two * Huffman decompression implementations. You can't force in both directions * at the same time. @@ -56,14 +50,33 @@ #error "Cannot force the use of the X1 and X2 decoders at the same time!" #endif +/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is + * supported at runtime, so we can add the BMI2 target attribute. + * When it is disabled, we will still get BMI2 if it is enabled statically. 
+ */ +#if DYNAMIC_BMI2 +# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE +#else +# define HUF_FAST_BMI2_ATTRS +#endif + +#ifdef __cplusplus +# define HUF_EXTERN_C extern "C" +#else +# define HUF_EXTERN_C +#endif +#define HUF_ASM_DECL HUF_EXTERN_C + +#if DYNAMIC_BMI2 +# define HUF_NEED_BMI2_FUNCTION 1 +#else +# define HUF_NEED_BMI2_FUNCTION 0 +#endif /* ************************************************************** * Error Management ****************************************************************/ #define HUF_isError ERR_isError -#ifndef CHECK_F -#define CHECK_F(f) { size_t const err_ = (f); if (HUF_isError(err_)) return err_; } -#endif /* ************************************************************** @@ -76,6 +89,11 @@ /* ************************************************************** * BMI2 Variant Wrappers ****************************************************************/ +typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, + const void *cSrc, + size_t cSrcSize, + const HUF_DTable *DTable); + #if DYNAMIC_BMI2 #define HUF_DGEN(fn) \ @@ -88,7 +106,7 @@ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ } \ \ - static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \ + static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \ void* dst, size_t dstSize, \ const void* cSrc, size_t cSrcSize, \ const HUF_DTable* DTable) \ @@ -97,9 +115,9 @@ } \ \ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ - size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ + size_t cSrcSize, HUF_DTable const* DTable, int flags) \ { \ - if (bmi2) { \ + if (flags & HUF_flags_bmi2) { \ return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ } \ return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ @@ -109,9 +127,9 @@ #define HUF_DGEN(fn) \ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ - size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ + size_t cSrcSize, HUF_DTable const* DTable, int flags) \ { \ - (void)bmi2; \ + (void)flags; \ 
return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ } @@ -126,82 +144,381 @@ typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) { DTableDesc dtd; - memcpy(&dtd, table, sizeof(dtd)); + ZSTD_memcpy(&dtd, table, sizeof(dtd)); return dtd; } +static size_t HUF_initFastDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; + size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); + assert(sizeof(size_t) == 8); + return value << bitsConsumed; +} + + +/** + * The input/output arguments to the Huffman fast decoding loop: + * + * ip [in/out] - The input pointers, must be updated to reflect what is consumed. + * op [in/out] - The output pointers, must be updated to reflect what is written. + * bits [in/out] - The bitstream containers, must be updated to reflect the current state. + * dt [in] - The decoding table. + * ilowest [in] - The beginning of the valid range of the input. Decoders may read + * down to this pointer. It may be below iend[0]. + * oend [in] - The end of the output stream. op[3] must not cross oend. + * iend [in] - The end of each input stream. ip[i] may cross iend[i], + * as long as it is above ilowest, but that indicates corruption. + */ +typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; + BYTE const* ilowest; + BYTE* oend; + BYTE const* iend[4]; +} HUF_DecompressFastArgs; + +typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + +/** + * Initializes args for the fast decoding loop. + * @returns 1 on success + * 0 if the fallback implementation should be used. + * Or an error code on failure. 
+ */ +static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) +{ + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + + const BYTE* const istart = (const BYTE*)src; + + BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); + + /* The fast decoding loop assumes 64-bit little-endian. + * This condition is false on x32. + */ + if (!MEM_isLittleEndian() || MEM_32bits()) + return 0; + + /* Avoid nullptr addition */ + if (dstSize == 0) + return 0; + assert(dst != NULL); + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) + return ERROR(corruption_detected); + + /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers. + * If table log is not correct at this point, fallback to the old decoder. + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) + return 0; + + /* Read the jump table. */ + { + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = srcSize - (length1 + length2 + length3 + 6); + args->iend[0] = istart + 6; /* jumpTable */ + args->iend[1] = args->iend[0] + length1; + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + + /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. + */ + if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) + return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. 
*/ + args->ip[0] = args->iend[1] - sizeof(U64); + args->ip[1] = args->iend[2] - sizeof(U64); + args->ip[2] = args->iend[3] - sizeof(U64); + args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64); + + /* op[] contains the output pointers. */ + args->op[0] = (BYTE*)dst; + args->op[1] = args->op[0] + (dstSize+3)/4; + args->op[2] = args->op[1] + (dstSize+3)/4; + args->op[3] = args->op[2] + (dstSize+3)/4; + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) + return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. + * It is shifted left as it is read, and zeros are + * shifted in. After the lowest valid bit a 1 is + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ + args->bits[0] = HUF_initFastDStream(args->ip[0]); + args->bits[1] = HUF_initFastDStream(args->ip[1]); + args->bits[2] = HUF_initFastDStream(args->ip[2]); + args->bits[3] = HUF_initFastDStream(args->ip[3]); + + /* The decoders must be sure to never read beyond ilowest. + * This is lower than iend[0], but allowing decoders to read + * down to ilowest can allow an extra iteration or two in the + * fast loop. + */ + args->ilowest = istart; + + args->oend = oend; + args->dt = dt; + + return 1; +} + +static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) +{ + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) + return ERROR(corruption_detected); + /* Validate that we haven't read beyond iend[]. + * Note that ip[] may be < iend[] because the MSB is + * the next bit to read, and we may have consumed 100% + * of the stream, so down to iend[i] - 8 is valid. + */ + if (args->ip[stream] < args->iend[stream] - 8) + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. 
*/ + assert(sizeof(size_t) == 8); + bit->bitContainer = MEM_readLEST(args->ip[stream]); + bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); + bit->start = (const char*)args->ilowest; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; +} + +/* Calls X(N) for each stream 0, 1, 2, 3. */ +#define HUF_4X_FOR_EACH_STREAM(X) \ + do { \ + X(0); \ + X(1); \ + X(2); \ + X(3); \ + } while (0) + +/* Calls X(N, var) for each stream 0, 1, 2, 3. */ +#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ + do { \ + X(0, (var)); \ + X(1, (var)); \ + X(2, (var)); \ + X(3, (var)); \ + } while (0) + #ifndef HUF_FORCE_DECOMPRESS_X2 /*-***************************/ /* single-symbol decoding */ /*-***************************/ -typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */ +typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */ + +/** + * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at + * a time. + */ +static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { + D4 = (U64)((symbol << 8) + nbBits); + } else { + D4 = (U64)(symbol + (nbBits << 8)); + } + assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; +} + +/** + * Increase the tableLog to targetTableLog and rescales the stats. + * If tableLog > targetTableLog this is a no-op. + * @returns New tableLog + */ +static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog) +{ + if (tableLog > targetTableLog) + return tableLog; + if (tableLog < targetTableLog) { + U32 const scale = targetTableLog - tableLog; + U32 s; + /* Increase the weight for all non-zero probability symbols by scale. */ + for (s = 0; s < nbSymbols; ++s) { + huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale); + } + /* Update rankVal to reflect the new weights. 
+ * All weights except 0 get moved to weight + scale. + * Weights [1, scale] are empty. + */ + for (s = targetTableLog; s > scale; --s) { + rankVal[s] = rankVal[s - scale]; + } + for (s = scale; s > 0; --s) { + rankVal[s] = 0; + } + } + return targetTableLog; +} + +typedef struct { + U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; + U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1]; + U32 statsWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; + BYTE symbols[HUF_SYMBOLVALUE_MAX + 1]; + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; +} HUF_ReadDTableX1_Workspace; -size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) { U32 tableLog = 0; U32 nbSymbols = 0; size_t iSize; void* const dtPtr = DTable + 1; HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr; + HUF_ReadDTableX1_Workspace* wksp = (HUF_ReadDTableX1_Workspace*)workSpace; - U32* rankVal; - BYTE* huffWeight; - size_t spaceUsed32 = 0; - - rankVal = (U32 *)workSpace + spaceUsed32; - spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1; - huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32); - spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; - - if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge); + DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp)); + if (sizeof(*wksp) > wkspSize) return ERROR(tableLog_tooLarge); DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); - /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... 
*/ - iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize); + iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); if (HUF_isError(iSize)) return iSize; + /* Table header */ { DTableDesc dtd = HUF_getDTableDesc(DTable); + U32 const maxTableLog = dtd.maxTableLog + 1; + U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG); + tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog); if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */ dtd.tableType = 0; dtd.tableLog = (BYTE)tableLog; - memcpy(DTable, &dtd, sizeof(dtd)); + ZSTD_memcpy(DTable, &dtd, sizeof(dtd)); } - /* Calculate starting value for each rank */ - { U32 n, nextRankStart = 0; - for (n=1; n> 1; - U32 u; - HUF_DEltX1 D; - D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w); - for (u = rankVal[w]; u < rankVal[w] + length; u++) - dt[u] = D; - rankVal[w] += length; - } } + /* Compute symbols and rankStart given rankVal: + * + * rankVal already contains the number of values of each weight. + * + * symbols contains the symbols ordered by weight. First are the rankVal[0] + * weight 0 symbols, followed by the rankVal[1] weight 1 symbols, and so on. + * symbols[0] is filled (but unused) to avoid a branch. + * + * rankStart contains the offset where each rank belongs in the DTable. + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. 
+ */ + { int n; + U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { + U32 const curr = nextRankStart; + nextRankStart += wksp->rankVal[n]; + wksp->rankStart[n] = curr; + } + for (n=0; n < nLimit; n += unroll) { + int u; + for (u=0; u < unroll; ++u) { + size_t const w = wksp->huffWeight[n+u]; + wksp->symbols[wksp->rankStart[w]++] = (BYTE)(n+u); + } + } + for (; n < (int)nbSymbols; ++n) { + size_t const w = wksp->huffWeight[n]; + wksp->symbols[wksp->rankStart[w]++] = (BYTE)n; + } + } + /* fill DTable + * We fill all entries of each weight in order. + * That way length is a constant for each iteration of the outer loop. + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. + */ + { U32 w; + int symbol = wksp->rankVal[0]; + int rankStart = 0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; + int uStart = rankStart; + BYTE const nbBits = (BYTE)(tableLog + 1 - w); + int s; + int u; + switch (length) { + case 1: + for (s=0; ssymbols[symbol + s]; + D.nbBits = nbBits; + dt[uStart] = D; + uStart += 1; + } + break; + case 2: + for (s=0; ssymbols[symbol + s]; + D.nbBits = nbBits; + dt[uStart+0] = D; + dt[uStart+1] = D; + uStart += 2; + } + break; + case 4: + for (s=0; ssymbols[symbol + s], nbBits); + MEM_write64(dt + uStart, D4); + uStart += 4; + } + break; + case 8: + for (s=0; ssymbols[symbol + s], nbBits); + MEM_write64(dt + uStart, D4); + MEM_write64(dt + uStart + 4, D4); + uStart += 8; + } + break; + default: + for (s=0; ssymbols[symbol + s], nbBits); + for (u=0; u < length; u += 16) { + MEM_write64(dt + uStart + u + 0, D4); + MEM_write64(dt + uStart + u + 4, D4); + MEM_write64(dt + uStart + u + 8, D4); + MEM_write64(dt + uStart + u + 12, D4); + } + assert(u == length); + uStart += length; + } + break; + } + symbol += symbolCount; + rankStart += symbolCount * length; + } + } return iSize; } -size_t 
HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize) -{ - U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; - return HUF_readDTableX1_wksp(DTable, src, srcSize, - workSpace, sizeof(workSpace)); -} - FORCE_INLINE_TEMPLATE BYTE HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog) { @@ -212,15 +529,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog } #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ - *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) + do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) -#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ - if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ - HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) +#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ + do { \ + if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ + HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ + } while (0) -#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ - if (MEM_64bits()) \ - HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) +#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ + do { \ + if (MEM_64bits()) \ + HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ + } while (0) HINT_INLINE size_t HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) @@ -228,11 +549,15 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons BYTE* const pStart = p; /* up to 4 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { - HUF_DECODE_SYMBOLX1_2(p, bitDPtr); - HUF_DECODE_SYMBOLX1_1(p, bitDPtr); - HUF_DECODE_SYMBOLX1_2(p, bitDPtr); - HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + if ((pEnd - p) > 3) { + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_1(p, bitDPtr); + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + } + } else { + BIT_reloadDStream(bitDPtr); } /* [0-3] 
symbols remaining */ @@ -244,7 +569,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons while (p < pEnd) HUF_DECODE_SYMBOLX1_0(p, bitDPtr); - return pEnd-pStart; + return (size_t)(pEnd-pStart); } FORCE_INLINE_TEMPLATE size_t @@ -254,7 +579,7 @@ HUF_decompress1X1_usingDTable_internal_body( const HUF_DTable* DTable) { BYTE* op = (BYTE*)dst; - BYTE* const oend = op + dstSize; + BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); const void* dtPtr = DTable + 1; const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; BIT_DStream_t bitD; @@ -270,6 +595,10 @@ HUF_decompress1X1_usingDTable_internal_body( return dstSize; } +/* HUF_decompress4X1_usingDTable_internal_body(): + * Conditions : + * @dstSize >= 6 + */ FORCE_INLINE_TEMPLATE size_t HUF_decompress4X1_usingDTable_internal_body( void* dst, size_t dstSize, @@ -278,10 +607,12 @@ HUF_decompress4X1_usingDTable_internal_body( { /* Check */ if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ { const BYTE* const istart = (const BYTE*) cSrc; BYTE* const ostart = (BYTE*) dst; BYTE* const oend = ostart + dstSize; + BYTE* const olimit = oend - 3; const void* const dtPtr = DTable + 1; const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; @@ -306,39 +637,42 @@ HUF_decompress4X1_usingDTable_internal_body( BYTE* op2 = opStart2; BYTE* op3 = opStart3; BYTE* op4 = opStart4; - U32 endSignal = BIT_DStream_unfinished; DTableDesc const dtd = HUF_getDTableDesc(DTable); U32 const dtLog = dtd.tableLog; + U32 endSignal = 1; if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ + assert(dstSize >= 6); /* validated above */ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); CHECK_F( BIT_initDStream(&bitD3, istart3, 
length3) ); CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */ - endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); - while ( (endSignal==BIT_DStream_unfinished) && (op4<(oend-3)) ) { - HUF_DECODE_SYMBOLX1_2(op1, &bitD1); - HUF_DECODE_SYMBOLX1_2(op2, &bitD2); - HUF_DECODE_SYMBOLX1_2(op3, &bitD3); - HUF_DECODE_SYMBOLX1_2(op4, &bitD4); - HUF_DECODE_SYMBOLX1_1(op1, &bitD1); - HUF_DECODE_SYMBOLX1_1(op2, &bitD2); - HUF_DECODE_SYMBOLX1_1(op3, &bitD3); - HUF_DECODE_SYMBOLX1_1(op4, &bitD4); - HUF_DECODE_SYMBOLX1_2(op1, &bitD1); - HUF_DECODE_SYMBOLX1_2(op2, &bitD2); - HUF_DECODE_SYMBOLX1_2(op3, &bitD3); - HUF_DECODE_SYMBOLX1_2(op4, &bitD4); - HUF_DECODE_SYMBOLX1_0(op1, &bitD1); - HUF_DECODE_SYMBOLX1_0(op2, &bitD2); - HUF_DECODE_SYMBOLX1_0(op3, &bitD3); - HUF_DECODE_SYMBOLX1_0(op4, &bitD4); - BIT_reloadDStream(&bitD1); - BIT_reloadDStream(&bitD2); - BIT_reloadDStream(&bitD3); - BIT_reloadDStream(&bitD4); + if ((size_t)(oend - op4) >= sizeof(size_t)) { + for ( ; (endSignal) & (op4 < olimit) ; ) { + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_1(op1, &bitD1); + HUF_DECODE_SYMBOLX1_1(op2, &bitD2); + HUF_DECODE_SYMBOLX1_1(op3, &bitD3); + HUF_DECODE_SYMBOLX1_1(op4, &bitD4); + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_0(op1, &bitD1); + HUF_DECODE_SYMBOLX1_0(op2, &bitD2); + HUF_DECODE_SYMBOLX1_0(op3, &bitD3); + HUF_DECODE_SYMBOLX1_0(op4, &bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= 
BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; + } } /* check corruption */ @@ -364,99 +698,248 @@ HUF_decompress4X1_usingDTable_internal_body( } } +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif -typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, - const void *cSrc, - size_t cSrcSize, - const HUF_DTable *DTable); +static +size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} -HUF_DGEN(HUF_decompress1X1_usingDTable_internal) -HUF_DGEN(HUF_decompress4X1_usingDTable_internal) +#if ZSTD_ENABLE_ASM_X86_64_BMI2 +HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; +#endif -size_t HUF_decompress1X1_usingDTable( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) +static HUF_FAST_BMI2_ATTRS +void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) { - DTableDesc dtd = HUF_getDTableDesc(DTable); - if (dtd.tableType != 0) return ERROR(GENERIC); - return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); + U64 bits[4]; + BYTE const* ip[4]; + BYTE* op[4]; + U16 const* const dtable = (U16 const*)args->dt; + BYTE* const oend = args->oend; + BYTE const* const ilowest = args->ilowest; + + /* Copy the arguments to local variables */ + ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); + ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); + ZSTD_memcpy(&op, &args->op, sizeof(op)); + + assert(MEM_isLittleEndian()); + assert(!MEM_32bits()); + + for 
(;;) { + BYTE* olimit; + int stream; + + /* Assert loop preconditions */ +#ifndef NDEBUG + for (stream = 0; stream < 4; ++stream) { + assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); + assert(ip[stream] >= ilowest); + } +#endif + /* Compute olimit */ + { + /* Each iteration produces 5 output symbols per stream */ + size_t const oiters = (size_t)(oend - op[3]) / 5; + /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes + * per stream. + */ + size_t const iiters = (size_t)(ip[0] - ilowest) / 7; + /* We can safely run iters iterations before running bounds checks */ + size_t const iters = MIN(oiters, iiters); + size_t const symbols = iters * 5; + + /* We can simply check that op[3] < olimit, instead of checking all + * of our bounds, since we can't hit the other bounds until we've run + * iters iterations, which only happens when op[3] == olimit. + */ + olimit = op[3] + symbols; + + /* Exit fast decoding loop once we reach the end. */ + if (op[3] == olimit) + break; + + /* Exit the decoding loop if any input pointer has crossed the + * previous one. This indicates corruption, and a precondition + * to our loop is that ip[i] >= ip[0]. 
+ */ + for (stream = 1; stream < 4; ++stream) { + if (ip[stream] < ip[stream - 1]) + goto _out; + } + } + +#ifndef NDEBUG + for (stream = 1; stream < 4; ++stream) { + assert(ip[stream] >= ip[stream - 1]); + } +#endif + +#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ + do { \ + int const index = (int)(bits[(_stream)] >> 53); \ + int const entry = (int)dtable[index]; \ + bits[(_stream)] <<= (entry & 0x3F); \ + op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ + } while (0) + +#define HUF_4X1_RELOAD_STREAM(_stream) \ + do { \ + int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ + int const nbBits = ctz & 7; \ + int const nbBytes = ctz >> 3; \ + op[(_stream)] += 5; \ + ip[(_stream)] -= nbBytes; \ + bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ + bits[(_stream)] <<= nbBits; \ + } while (0) + + /* Manually unroll the loop because compilers don't consistently + * unroll the inner loops, which destroys performance. + */ + do { + /* Decode 5 symbols in each of the 4 streams */ + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); + + /* Reload each of the 4 the bitstreams */ + HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); + } while (op[3] < olimit); + +#undef HUF_4X1_DECODE_SYMBOL +#undef HUF_4X1_RELOAD_STREAM + } + +_out: + + /* Save the final values of each of the state variables back to args. 
*/ + ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); + ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); + ZSTD_memcpy(&args->op, &op, sizeof(op)); } -size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - void* workSpace, size_t wkspSize) +/** + * @returns @p dstSize on success (>= 6) + * 0 if the fallback implementation should be used + * An error if an error occurred + */ +static HUF_FAST_BMI2_ATTRS +size_t +HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable, + HUF_DecompressFastLoopFn loopFn) { - const BYTE* ip = (const BYTE*) cSrc; + void const* dt = DTable + 1; + BYTE const* const ilowest = (BYTE const*)cSrc; + BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); + HUF_DecompressFastArgs args; + { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); + if (ret == 0) + return 0; + } - size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); - if (HUF_isError(hSize)) return hSize; - if (hSize >= cSrcSize) return ERROR(srcSize_wrong); - ip += hSize; cSrcSize -= hSize; + assert(args.ip[0] >= args.ilowest); + loopFn(&args); + + /* Our loop guarantees that ip[] >= ilowest and that we haven't + * overwritten any op[]. + */ + assert(args.ip[0] >= ilowest); + assert(args.ip[0] >= ilowest); + assert(args.ip[1] >= ilowest); + assert(args.ip[2] >= ilowest); + assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); + + assert(ilowest == args.ilowest); + assert(ilowest + 6 == args.iend[0]); + (void)ilowest; + + /* finish bit streams one by one. 
*/ + { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <= (size_t)(oend - segmentEnd)) + segmentEnd += segmentSize; + else + segmentEnd = oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); + /* Decompress and validate that we've produced exactly the expected length. */ + args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] != segmentEnd) return ERROR(corruption_detected); + } + } - return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); + /* decoded size */ + assert(dstSize != 0); + return dstSize; } +HUF_DGEN(HUF_decompress1X1_usingDTable_internal) -size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize) +static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int flags) { - U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; - return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, - workSpace, sizeof(workSpace)); -} + HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; + HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; -size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) -{ - HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX); - return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize); -} +#if DYNAMIC_BMI2 + if (flags & HUF_flags_bmi2) { + fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; +# if ZSTD_ENABLE_ASM_X86_64_BMI2 + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; + } +# endif + } else { + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +#endif 
-size_t HUF_decompress4X1_usingDTable( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - DTableDesc dtd = HUF_getDTableDesc(DTable); - if (dtd.tableType != 0) return ERROR(GENERIC); - return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; + } +#endif + + if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { + size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); + if (ret != 0) + return ret; + } + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); } -static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, +static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, - void* workSpace, size_t wkspSize, int bmi2) + void* workSpace, size_t wkspSize, int flags) { const BYTE* ip = (const BYTE*) cSrc; - size_t const hSize = HUF_readDTableX1_wksp (dctx, cSrc, cSrcSize, - workSpace, wkspSize); + size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); if (HUF_isError(hSize)) return hSize; if (hSize >= cSrcSize) return ERROR(srcSize_wrong); ip += hSize; cSrcSize -= hSize; - return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); -} - -size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - void* workSpace, size_t wkspSize) -{ - return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); -} - - -size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) -{ - U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; - return 
HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, - workSpace, sizeof(workSpace)); -} -size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) -{ - HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX); - return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); + return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); } #endif /* HUF_FORCE_DECOMPRESS_X2 */ @@ -469,209 +952,322 @@ size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cS /* *************************/ typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */ -typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t; +typedef struct { BYTE symbol; } sortedSymbol_t; typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; +/** + * Constructs a HUF_DEltX2 in a U32. + */ +static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level) +{ + U32 seq; + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3); + DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32)); + if (MEM_isLittleEndian()) { + seq = level == 1 ? symbol : (baseSeq + (symbol << 8)); + return seq + (nbBits << 16) + ((U32)level << 24); + } else { + seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol); + return (seq << 16) + (nbBits << 8) + (U32)level; + } +} -/* HUF_fillDTableX2Level2() : - * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ -static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed, - const U32* rankValOrigin, const int minWeight, - const sortedSymbol_t* sortedSymbols, const U32 sortedListSize, - U32 nbBitsBaseline, U16 baseSeq) +/** + * Constructs a HUF_DEltX2. 
+ */ +static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level) { HUF_DEltX2 DElt; - U32 rankVal[HUF_TABLELOG_MAX + 1]; + U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val)); + ZSTD_memcpy(&DElt, &val, sizeof(val)); + return DElt; +} - /* get pre-calculated rankVal */ - memcpy(rankVal, rankValOrigin, sizeof(rankVal)); +/** + * Constructs 2 HUF_DEltX2s and packs them into a U64. + */ +static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level) +{ + U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + return (U64)DElt + ((U64)DElt << 32); +} - /* fill skipped values */ +/** + * Fills the DTable rank with all the symbols from [begin, end) that are each + * nbBits long. + * + * @param DTableRank The start of the rank in the DTable. + * @param begin The first symbol to fill (inclusive). + * @param end The last symbol to fill (exclusive). + * @param nbBits Each symbol is nbBits long. + * @param tableLog The table log. + * @param baseSeq If level == 1 { 0 } else { the first level symbol } + * @param level The level in the table. Must be 1 or 2. 
+ */ +static void HUF_fillDTableX2ForWeight( + HUF_DEltX2* DTableRank, + sortedSymbol_t const* begin, sortedSymbol_t const* end, + U32 nbBits, U32 tableLog, + U16 baseSeq, int const level) +{ + U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */); + const sortedSymbol_t* ptr; + assert(level >= 1 && level <= 2); + switch (length) { + case 1: + for (ptr = begin; ptr != end; ++ptr) { + HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); + *DTableRank++ = DElt; + } + break; + case 2: + for (ptr = begin; ptr != end; ++ptr) { + HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); + DTableRank[0] = DElt; + DTableRank[1] = DElt; + DTableRank += 2; + } + break; + case 4: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + DTableRank += 4; + } + break; + case 8: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + DTableRank += 8; + } + break; + default: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + HUF_DEltX2* const DTableRankEnd = DTableRank + length; + for (; DTableRank != DTableRankEnd; DTableRank += 8) { + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + } + } + break; + } +} + +/* HUF_fillDTableX2Level2() : + * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 
*/ +static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits, + const U32* rankVal, const int minWeight, const int maxWeight1, + const sortedSymbol_t* sortedSymbols, U32 const* rankStart, + U32 nbBitsBaseline, U16 baseSeq) +{ + /* Fill skipped values (all positions up to rankVal[minWeight]). + * These are positions only get a single symbol because the combined weight + * is too large. + */ if (minWeight>1) { - U32 i, skipSize = rankVal[minWeight]; - MEM_writeLE16(&(DElt.sequence), baseSeq); - DElt.nbBits = (BYTE)(consumed); - DElt.length = 1; - for (i = 0; i < skipSize; i++) - DTable[i] = DElt; + U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */); + U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1); + int const skipSize = rankVal[minWeight]; + assert(length > 1); + assert((U32)skipSize < length); + switch (length) { + case 2: + assert(skipSize == 1); + ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2)); + break; + case 4: + assert(skipSize <= 4); + ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2)); + break; + default: + { + int i; + for (i = 0; i < skipSize; i += 8) { + ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2)); + } + } + } } - /* fill DTable */ - { U32 s; for (s=0; s= 1 */ - - rankVal[weight] += length; - } } + /* Fill each of the second level symbols by weight. 
*/ + { + int w; + for (w = minWeight; w < maxWeight1; ++w) { + int const begin = rankStart[w]; + int const end = rankStart[w+1]; + U32 const nbBits = nbBitsBaseline - w; + U32 const totalBits = nbBits + consumedBits; + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedSymbols + begin, sortedSymbols + end, + totalBits, targetLog, + baseSeq, /* level */ 2); + } + } } - static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, - const sortedSymbol_t* sortedList, const U32 sortedListSize, - const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, + const sortedSymbol_t* sortedList, + const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, const U32 nbBitsBaseline) { - U32 rankVal[HUF_TABLELOG_MAX + 1]; + U32* const rankVal = rankValOrigin[0]; const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ const U32 minBits = nbBitsBaseline - maxWeight; - U32 s; - - memcpy(rankVal, rankValOrigin, sizeof(rankVal)); - - /* fill DTable */ - for (s=0; s= minBits) { /* enough room for a second symbol */ - U32 sortedRank; + int w; + int const wEnd = (int)maxWeight + 1; + + /* Fill DTable in order of weight. */ + for (w = 1; w < wEnd; ++w) { + int const begin = (int)rankStart[w]; + int const end = (int)rankStart[w+1]; + U32 const nbBits = nbBitsBaseline - w; + + if (targetLog-nbBits >= minBits) { + /* Enough room for a second symbol. */ + int start = rankVal[w]; + U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */); int minWeight = nbBits + scaleLog; + int s; if (minWeight < 1) minWeight = 1; - sortedRank = rankStart[minWeight]; - HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits, - rankValOrigin[nbBits], minWeight, - sortedList+sortedRank, sortedListSize-sortedRank, - nbBitsBaseline, symbol); + /* Fill the DTable for every symbol of weight w. + * These symbols get at least 1 second symbol. 
+ */ + for (s = begin; s != end; ++s) { + HUF_fillDTableX2Level2( + DTable + start, targetLog, nbBits, + rankValOrigin[nbBits], minWeight, wEnd, + sortedList, rankStart, + nbBitsBaseline, sortedList[s].symbol); + start += length; + } } else { - HUF_DEltX2 DElt; - MEM_writeLE16(&(DElt.sequence), symbol); - DElt.nbBits = (BYTE)(nbBits); - DElt.length = 1; - { U32 const end = start + length; - U32 u; - for (u = start; u < end; u++) DTable[u] = DElt; - } } - rankVal[weight] += length; + /* Only a single symbol. */ + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedList + begin, sortedList + end, + nbBits, targetLog, + /* baseSeq */ 0, /* level */ 1); + } } } +typedef struct { + rankValCol_t rankVal[HUF_TABLELOG_MAX]; + U32 rankStats[HUF_TABLELOG_MAX + 1]; + U32 rankStart0[HUF_TABLELOG_MAX + 3]; + sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1]; + BYTE weightList[HUF_SYMBOLVALUE_MAX + 1]; + U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +} HUF_ReadDTableX2_Workspace; + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, - void* workSpace, size_t wkspSize) + void* workSpace, size_t wkspSize, int flags) { - U32 tableLog, maxW, sizeOfSort, nbSymbols; + U32 tableLog, maxW, nbSymbols; DTableDesc dtd = HUF_getDTableDesc(DTable); - U32 const maxTableLog = dtd.maxTableLog; + U32 maxTableLog = dtd.maxTableLog; size_t iSize; void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */ HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; U32 *rankStart; - rankValCol_t* rankVal; - U32* rankStats; - U32* rankStart0; - sortedSymbol_t* sortedSymbol; - BYTE* weightList; - size_t spaceUsed32 = 0; - - rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32); - spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2; - rankStats = (U32 *)workSpace + spaceUsed32; - spaceUsed32 += HUF_TABLELOG_MAX + 1; - rankStart0 = (U32 *)workSpace + spaceUsed32; - spaceUsed32 += HUF_TABLELOG_MAX + 2; - sortedSymbol = (sortedSymbol_t *)workSpace + 
(spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t); - spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2; - weightList = (BYTE *)((U32 *)workSpace + spaceUsed32); - spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; - - if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge); - - rankStart = rankStart0 + 1; - memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1)); + HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace; + + if (sizeof(*wksp) > wkspSize) return ERROR(GENERIC); + + rankStart = wksp->rankStart0 + 1; + ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats)); + ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0)); DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */ if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); - /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */ + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ - iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize); + iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); if (HUF_isError(iSize)) return iSize; /* check result */ if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */ + if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG; /* find maxWeight */ - for (maxW = tableLog; rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */ + for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */ /* Get start index of each weight */ { U32 w, nextRankStart = 0; for (w=1; wrankStats[w]; + rankStart[w] = curr; } rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/ - sizeOfSort = nextRankStart; + rankStart[maxW+1] = nextRankStart; } /* sort symbols by weight */ { U32 s; for (s=0; sweightList[s]; U32 const r = rankStart[w]++; - sortedSymbol[r].symbol = (BYTE)s; - sortedSymbol[r].weight = (BYTE)w; + wksp->sortedSymbol[r].symbol = (BYTE)s; } rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */ } /* Build rankVal */ - { U32* const rankVal0 = rankVal[0]; + { U32* const rankVal0 = wksp->rankVal[0]; { int const rescale = (maxTableLog-tableLog) - 1; /* tableLog <= maxTableLog */ U32 nextRankVal = 0; U32 w; for (w=1; wrankStats[w] << (w+rescale); + rankVal0[w] = curr; } } { U32 const minBits = tableLog+1 - maxW; U32 consumed; for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) { - U32* const rankValPtr = rankVal[consumed]; + U32* const rankValPtr = wksp->rankVal[consumed]; U32 w; for (w = 1; w < maxW+1; w++) { rankValPtr[w] = rankVal0[w] >> consumed; } } } } HUF_fillDTableX2(dt, maxTableLog, - sortedSymbol, sizeOfSort, 
- rankStart0, rankVal, maxW, + wksp->sortedSymbol, + wksp->rankStart0, wksp->rankVal, maxW, tableLog+1); dtd.tableLog = (BYTE)maxTableLog; dtd.tableType = 1; - memcpy(DTable, &dtd, sizeof(dtd)); + ZSTD_memcpy(DTable, &dtd, sizeof(dtd)); return iSize; } -size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize) -{ - U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; - return HUF_readDTableX2_wksp(DTable, src, srcSize, - workSpace, sizeof(workSpace)); -} - FORCE_INLINE_TEMPLATE U32 HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) { size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ - memcpy(op, dt+val, 2); + ZSTD_memcpy(op, &dt[val].sequence, 2); BIT_skipBits(DStream, dt[val].nbBits); return dt[val].length; } @@ -680,28 +1276,34 @@ FORCE_INLINE_TEMPLATE U32 HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) { size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ - memcpy(op, dt+val, 1); - if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits); - else { + ZSTD_memcpy(op, &dt[val].sequence, 1); + if (dt[val].length==1) { + BIT_skipBits(DStream, dt[val].nbBits); + } else { if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { BIT_skipBits(DStream, dt[val].nbBits); if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) /* ugly hack; works only because it's the last symbol. 
Note : can't easily extract nbBits from just this symbol */ DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); - } } + } + } return 1; } #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ - ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) + do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) -#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ - if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ - ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) +#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ + do { \ + if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ + } while (0) -#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ - if (MEM_64bits()) \ - ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) +#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ + do { \ + if (MEM_64bits()) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ + } while (0) HINT_INLINE size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, @@ -710,19 +1312,37 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, BYTE* const pStart = p; /* up to 8 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); - HUF_DECODE_SYMBOLX2_1(p, bitDPtr); - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) { + if (dtLog <= 11 && MEM_64bits()) { + /* up to 10 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) { + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } else { + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < 
pEnd-(sizeof(bitDPtr->bitContainer)-1))) { + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } + } else { + BIT_reloadDStream(bitDPtr); } /* closer to end : up to 2 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + if ((size_t)(pEnd - p) >= 2) { + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); - while (p <= pEnd-2) - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ + while (p <= pEnd-2) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ + } if (p < pEnd) p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog); @@ -743,7 +1363,7 @@ HUF_decompress1X2_usingDTable_internal_body( /* decode */ { BYTE* const ostart = (BYTE*) dst; - BYTE* const oend = ostart + dstSize; + BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; DTableDesc const dtd = HUF_getDTableDesc(DTable); @@ -757,7 +1377,10 @@ HUF_decompress1X2_usingDTable_internal_body( return dstSize; } - +/* HUF_decompress4X2_usingDTable_internal_body(): + * Conditions: + * @dstSize >= 6 + */ FORCE_INLINE_TEMPLATE size_t HUF_decompress4X2_usingDTable_internal_body( void* dst, size_t dstSize, @@ -765,10 +1388,12 @@ HUF_decompress4X2_usingDTable_internal_body( const HUF_DTable* DTable) { if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ { const BYTE* const istart = (const BYTE*) cSrc; BYTE* const ostart = (BYTE*) dst; BYTE* const oend = ostart + dstSize; + BYTE* const olimit = oend - (sizeof(size_t)-1); 
const void* const dtPtr = DTable+1; const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; @@ -793,37 +1418,66 @@ HUF_decompress4X2_usingDTable_internal_body( BYTE* op2 = opStart2; BYTE* op3 = opStart3; BYTE* op4 = opStart4; - U32 endSignal; + U32 endSignal = 1; DTableDesc const dtd = HUF_getDTableDesc(DTable); U32 const dtLog = dtd.tableLog; - if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ + assert(dstSize >= 6 /* validated above */); CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); /* 16-32 symbols per loop (4-8 symbols per stream) */ - endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); - for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) { - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_1(op1, &bitD1); - HUF_DECODE_SYMBOLX2_1(op2, &bitD2); - HUF_DECODE_SYMBOLX2_1(op3, &bitD3); - HUF_DECODE_SYMBOLX2_1(op4, &bitD4); - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_0(op1, &bitD1); - HUF_DECODE_SYMBOLX2_0(op2, &bitD2); - HUF_DECODE_SYMBOLX2_0(op3, &bitD3); - HUF_DECODE_SYMBOLX2_0(op4, &bitD4); - - endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + if ((size_t)(oend - op4) >= sizeof(size_t)) { + for ( ; (endSignal) & (op4 < olimit); ) { +#if defined(__clang__) && (defined(__x86_64__) || 
defined(__i386__)) + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; +#else + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal = (U32)LIKELY((U32) + (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); +#endif + } } /* check corruption */ @@ -847,94 +1501,285 @@ HUF_decompress4X2_usingDTable_internal_body( } } -HUF_DGEN(HUF_decompress1X2_usingDTable_internal) 
-HUF_DGEN(HUF_decompress4X2_usingDTable_internal) +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif -size_t HUF_decompress1X2_usingDTable( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - DTableDesc dtd = HUF_getDTableDesc(DTable); - if (dtd.tableType != 1) return ERROR(GENERIC); - return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +static +size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); } -size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, - void* workSpace, size_t wkspSize) +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; + +#endif + +static HUF_FAST_BMI2_ATTRS +void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) { - const BYTE* ip = (const BYTE*) cSrc; + U64 bits[4]; + BYTE const* ip[4]; + BYTE* op[4]; + BYTE* oend[4]; + HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; + BYTE const* const ilowest = args->ilowest; + + /* Copy the arguments to local registers. 
*/ + ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); + ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); + ZSTD_memcpy(&op, &args->op, sizeof(op)); + + oend[0] = op[1]; + oend[1] = op[2]; + oend[2] = op[3]; + oend[3] = args->oend; + + assert(MEM_isLittleEndian()); + assert(!MEM_32bits()); + + for (;;) { + BYTE* olimit; + int stream; + + /* Assert loop preconditions */ +#ifndef NDEBUG + for (stream = 0; stream < 4; ++stream) { + assert(op[stream] <= oend[stream]); + assert(ip[stream] >= ilowest); + } +#endif + /* Compute olimit */ + { + /* Each loop does 5 table lookups for each of the 4 streams. + * Each table lookup consumes up to 11 bits of input, and produces + * up to 2 bytes of output. + */ + /* We can consume up to 7 bytes of input per iteration per stream. + * We also know that each input pointer is >= ip[0]. So we can run + * iters loops before running out of input. + */ + size_t iters = (size_t)(ip[0] - ilowest) / 7; + /* Each iteration can produce up to 10 bytes of output per stream. + * Each output stream my advance at different rates. So take the + * minimum number of safe iterations among all the output streams. + */ + for (stream = 0; stream < 4; ++stream) { + size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; + iters = MIN(iters, oiters); + } + + /* Each iteration produces at least 5 output symbols. So until + * op[3] crosses olimit, we know we haven't executed iters + * iterations yet. This saves us maintaining an iters counter, + * at the expense of computing the remaining # of iterations + * more frequently. + */ + olimit = op[3] + (iters * 5); + + /* Exit the fast decoding loop once we reach the end. */ + if (op[3] == olimit) + break; + + /* Exit the decoding loop if any input pointer has crossed the + * previous one. This indicates corruption, and a precondition + * to our loop is that ip[i] >= ip[0]. 
+ */ + for (stream = 1; stream < 4; ++stream) { + if (ip[stream] < ip[stream - 1]) + goto _out; + } + } - size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, - workSpace, wkspSize); - if (HUF_isError(hSize)) return hSize; - if (hSize >= cSrcSize) return ERROR(srcSize_wrong); - ip += hSize; cSrcSize -= hSize; +#ifndef NDEBUG + for (stream = 1; stream < 4; ++stream) { + assert(ip[stream] >= ip[stream - 1]); + } +#endif - return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); -} +#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ + do { \ + if ((_decode3) || (_stream) != 3) { \ + int const index = (int)(bits[(_stream)] >> 53); \ + HUF_DEltX2 const entry = dtable[index]; \ + MEM_write16(op[(_stream)], entry.sequence); \ + bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ + op[(_stream)] += (entry.length); \ + } \ + } while (0) + +#define HUF_4X2_RELOAD_STREAM(_stream) \ + do { \ + HUF_4X2_DECODE_SYMBOL(3, 1); \ + { \ + int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ + int const nbBits = ctz & 7; \ + int const nbBytes = ctz >> 3; \ + ip[(_stream)] -= nbBytes; \ + bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ + bits[(_stream)] <<= nbBits; \ + } \ + } while (0) + + /* Manually unroll the loop because compilers don't consistently + * unroll the inner loops, which destroys performance. + */ + do { + /* Decode 5 symbols from each of the first 3 streams. + * The final stream will be decoded during the reload phase + * to reduce register pressure. + */ + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); + + /* Decode one symbol from the final stream */ + HUF_4X2_DECODE_SYMBOL(3, 1); + + /* Decode 4 symbols from the final stream & reload bitstreams. 
+ * The final stream is reloaded last, meaning that all 5 symbols + * are decoded from the final stream before it is reloaded. + */ + HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); + } while (op[3] < olimit); + } +#undef HUF_4X2_DECODE_SYMBOL +#undef HUF_4X2_RELOAD_STREAM -size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize) -{ - U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; - return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, - workSpace, sizeof(workSpace)); -} +_out: -size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) -{ - HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); - return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); + /* Save the final values of each of the state variables back to args. */ + ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); + ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); + ZSTD_memcpy(&args->op, &op, sizeof(op)); } -size_t HUF_decompress4X2_usingDTable( + +static HUF_FAST_BMI2_ATTRS size_t +HUF_decompress4X2_usingDTable_internal_fast( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) + const HUF_DTable* DTable, + HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; + const BYTE* const ilowest = (const BYTE*)cSrc; + BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); + HUF_DecompressFastArgs args; + { + size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); + if (ret == 0) + return 0; + } + + assert(args.ip[0] >= args.ilowest); + loopFn(&args); + + /* note : op4 already verified within main loop */ + assert(args.ip[0] >= ilowest); + assert(args.ip[1] >= ilowest); + assert(args.ip[2] >= ilowest); + assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); + + assert(ilowest == args.ilowest); + assert(ilowest + 6 == args.iend[0]); + (void)ilowest; 
+ + /* finish bitStreams one by one */ + { + size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <= (size_t)(oend - segmentEnd)) + segmentEnd += segmentSize; + else + segmentEnd = oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); + args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] != segmentEnd) + return ERROR(corruption_detected); + } + } + + /* decoded size */ + return dstSize; +} + +static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int flags) { - DTableDesc dtd = HUF_getDTableDesc(DTable); - if (dtd.tableType != 1) return ERROR(GENERIC); - return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); + HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; + HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; + +#if DYNAMIC_BMI2 + if (flags & HUF_flags_bmi2) { + fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; +# if ZSTD_ENABLE_ASM_X86_64_BMI2 + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; + } +# endif + } else { + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; + } +#endif + + if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { + size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); + if (ret != 0) + return ret; + } + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); } -static size_t 
HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, +HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, - void* workSpace, size_t wkspSize, int bmi2) + void* workSpace, size_t wkspSize, int flags) { const BYTE* ip = (const BYTE*) cSrc; - size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, - workSpace, wkspSize); + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, + workSpace, wkspSize, flags); if (HUF_isError(hSize)) return hSize; if (hSize >= cSrcSize) return ERROR(srcSize_wrong); ip += hSize; cSrcSize -= hSize; - return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); } -size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, - void* workSpace, size_t wkspSize) + void* workSpace, size_t wkspSize, int flags) { - return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); -} - + const BYTE* ip = (const BYTE*) cSrc; -size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize) -{ - U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; - return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, - workSpace, sizeof(workSpace)); -} + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, + workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; -size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) -{ - HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); - return HUF_decompress4X2_DCtx(DTable, 
dst, dstSize, cSrc, cSrcSize); + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); } #endif /* HUF_FORCE_DECOMPRESS_X1 */ @@ -944,66 +1789,28 @@ size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cS /* Universal decompression selectors */ /* ***********************************/ -size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - DTableDesc const dtd = HUF_getDTableDesc(DTable); -#if defined(HUF_FORCE_DECOMPRESS_X1) - (void)dtd; - assert(dtd.tableType == 0); - return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -#elif defined(HUF_FORCE_DECOMPRESS_X2) - (void)dtd; - assert(dtd.tableType == 1); - return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -#else - return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : - HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -#endif -} - -size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, - const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) -{ - DTableDesc const dtd = HUF_getDTableDesc(DTable); -#if defined(HUF_FORCE_DECOMPRESS_X1) - (void)dtd; - assert(dtd.tableType == 0); - return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -#elif defined(HUF_FORCE_DECOMPRESS_X2) - (void)dtd; - assert(dtd.tableType == 1); - return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -#else - return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : - HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -#endif -} - #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; -static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] = +static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] = { /* single, double, quad */ - {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */ - {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */ - {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */ - {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */ - {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */ - {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */ - {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */ - {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */ - {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */ - {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */ - {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */ - {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */ - {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */ - {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */ - {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */ - {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */ + {{0,0}, {1,1}}, /* Q==0 : impossible */ + {{0,0}, {1,1}}, /* Q==1 : impossible */ + {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */ + {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */ + {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */ + {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */ + {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */ + {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */ + {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */ + {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */ 
+ {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */ + {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */ + {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */ + {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */ + {{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */ + {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */ }; #endif @@ -1030,188 +1837,92 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) U32 const D256 = (U32)(dstSize >> 8); U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); - DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */ + DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */ return DTime1 < DTime0; } #endif } - -typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); - -size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) -{ -#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) - static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 }; -#endif - - /* validation checks */ - if (dstSize == 0) return ERROR(dstSize_tooSmall); - if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ - if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ - if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ - - { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); -#if defined(HUF_FORCE_DECOMPRESS_X1) - (void)algoNb; - assert(algoNb == 0); - return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize); -#elif defined(HUF_FORCE_DECOMPRESS_X2) - (void)algoNb; - assert(algoNb == 1); - return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize); -#else - return decompress[algoNb](dst, dstSize, cSrc, cSrcSize); -#endif - } -} - -size_t HUF_decompress4X_DCtx 
(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) -{ - /* validation checks */ - if (dstSize == 0) return ERROR(dstSize_tooSmall); - if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ - if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ - if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ - - { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); -#if defined(HUF_FORCE_DECOMPRESS_X1) - (void)algoNb; - assert(algoNb == 0); - return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize); -#elif defined(HUF_FORCE_DECOMPRESS_X2) - (void)algoNb; - assert(algoNb == 1); - return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize); -#else - return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) : - HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ; -#endif - } -} - -size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) -{ - U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; - return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize, - workSpace, sizeof(workSpace)); -} - - -size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, - size_t dstSize, const void* cSrc, - size_t cSrcSize, void* workSpace, - size_t wkspSize) -{ - /* validation checks */ - if (dstSize == 0) return ERROR(dstSize_tooSmall); - if (cSrcSize == 0) return ERROR(corruption_detected); - - { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); -#if defined(HUF_FORCE_DECOMPRESS_X1) - (void)algoNb; - assert(algoNb == 0); - return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); -#elif defined(HUF_FORCE_DECOMPRESS_X2) - (void)algoNb; - assert(algoNb == 1); - return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); -#else - return algoNb ? 
HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, - cSrcSize, workSpace, wkspSize): - HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); -#endif - } -} - size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, - void* workSpace, size_t wkspSize) + void* workSpace, size_t wkspSize, int flags) { /* validation checks */ if (dstSize == 0) return ERROR(dstSize_tooSmall); if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ - if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ - if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ + if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ + if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); #if defined(HUF_FORCE_DECOMPRESS_X1) (void)algoNb; assert(algoNb == 0); return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, - cSrcSize, workSpace, wkspSize); + cSrcSize, workSpace, wkspSize, flags); #elif defined(HUF_FORCE_DECOMPRESS_X2) (void)algoNb; assert(algoNb == 1); return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, - cSrcSize, workSpace, wkspSize); + cSrcSize, workSpace, wkspSize, flags); #else return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, - cSrcSize, workSpace, wkspSize): + cSrcSize, workSpace, wkspSize, flags): HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, - cSrcSize, workSpace, wkspSize); + cSrcSize, workSpace, wkspSize, flags); #endif } } -size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize) -{ - U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; - return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, - workSpace, sizeof(workSpace)); -} - -size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) +size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) { DTableDesc const dtd = HUF_getDTableDesc(DTable); #if defined(HUF_FORCE_DECOMPRESS_X1) (void)dtd; assert(dtd.tableType == 0); - return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); + return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); #elif defined(HUF_FORCE_DECOMPRESS_X2) (void)dtd; assert(dtd.tableType == 1); - return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); + return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); #else - return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : - HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); + return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : + HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); #endif } #ifndef HUF_FORCE_DECOMPRESS_X2 -size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) +size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) { const BYTE* ip = (const BYTE*) cSrc; - size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize); + size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); if (HUF_isError(hSize)) return hSize; if (hSize >= cSrcSize) return ERROR(srcSize_wrong); ip += hSize; cSrcSize -= hSize; - return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); } #endif -size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) +size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) { DTableDesc const dtd = HUF_getDTableDesc(DTable); #if defined(HUF_FORCE_DECOMPRESS_X1) (void)dtd; assert(dtd.tableType == 0); - return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); + return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); #elif defined(HUF_FORCE_DECOMPRESS_X2) (void)dtd; assert(dtd.tableType == 1); - return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); + return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); #else - return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : - HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); + return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : + HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); #endif } -size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) { /* validation checks */ if (dstSize == 0) return ERROR(dstSize_tooSmall); @@ -1221,14 +1932,16 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds #if defined(HUF_FORCE_DECOMPRESS_X1) (void)algoNb; assert(algoNb == 0); - return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); + return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); #elif defined(HUF_FORCE_DECOMPRESS_X2) (void)algoNb; assert(algoNb == 1); - return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); + return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); #else - return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : - HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); + return algoNb ? 
HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : + HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); #endif } } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/huf_decompress_amd64.S b/vendor/github.com/DataDog/zstd/huf_decompress_amd64.S new file mode 100644 index 0000000..1178b11 --- /dev/null +++ b/vendor/github.com/DataDog/zstd/huf_decompress_amd64.S @@ -0,0 +1,602 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include "portability_macros.h" + +#if defined(__ELF__) && defined(__GNUC__) +/* Stack marking + * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart + */ +.section .note.GNU-stack,"",%progbits + +#if defined(__aarch64__) +/* Mark that this assembly supports BTI & PAC, because it is empty for aarch64. 
+ * See: https://github.com/facebook/zstd/issues/3841 + * See: https://gcc.godbolt.org/z/sqr5T4ffK + * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/ + * See: https://reviews.llvm.org/D62609 + */ +.pushsection .note.gnu.property, "a" +.p2align 3 +.long 4 /* size of the name - "GNU\0" */ +.long 0x10 /* size of descriptor */ +.long 0x5 /* NT_GNU_PROPERTY_TYPE_0 */ +.asciz "GNU" +.long 0xc0000000 /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */ +.long 4 /* pr_datasz - 4 bytes */ +.long 3 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */ +.p2align 3 /* pr_padding - bring everything to 8 byte alignment */ +.popsection +#endif + +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +/* Calling convention: + * + * %rdi (or %rcx on Windows) contains the first argument: HUF_DecompressAsmArgs*. + * %rbp isn't maintained (no frame pointer). + * %rsp contains the stack pointer that grows down. + * No red-zone is assumed, only addresses >= %rsp are used. + * All register contents are preserved. + */ + +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop) +.global HUF_decompress4X1_usingDTable_internal_fast_asm_loop +.global HUF_decompress4X2_usingDTable_internal_fast_asm_loop +.global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop +.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop +.text + +/* Sets up register mappings for clarity. + * op[], bits[], dtable & ip[0] each get their own register. + * ip[1,2,3] & olimit alias var[]. + * %rax is a scratch register. 
+ */ + +#define op0 rsi +#define op1 rbx +#define op2 rcx +#define op3 rdi + +#define ip0 r8 +#define ip1 r9 +#define ip2 r10 +#define ip3 r11 + +#define bits0 rbp +#define bits1 rdx +#define bits2 r12 +#define bits3 r13 +#define dtable r14 +#define olimit r15 + +/* var[] aliases ip[1,2,3] & olimit + * ip[1,2,3] are saved every iteration. + * olimit is only used in compute_olimit. + */ +#define var0 r15 +#define var1 r9 +#define var2 r10 +#define var3 r11 + +/* 32-bit var registers */ +#define vard0 r15d +#define vard1 r9d +#define vard2 r10d +#define vard3 r11d + +/* Calls X(N) for each stream 0, 1, 2, 3. */ +#define FOR_EACH_STREAM(X) \ + X(0); \ + X(1); \ + X(2); \ + X(3) + +/* Calls X(N, idx) for each stream 0, 1, 2, 3. */ +#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ + X(0, idx); \ + X(1, idx); \ + X(2, idx); \ + X(3, idx) + +/* Define both _HUF_* & HUF_* symbols because MacOS + * C symbols are prefixed with '_' & Linux symbols aren't. + */ +_HUF_decompress4X1_usingDTable_internal_fast_asm_loop: +HUF_decompress4X1_usingDTable_internal_fast_asm_loop: + ZSTD_CET_ENDBRANCH + /* Save all registers - even if they are callee saved for simplicity. 
*/ + push %rax + push %rbx + push %rcx + push %rdx + push %rbp + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + push %r12 + push %r13 + push %r14 + push %r15 + + /* Read HUF_DecompressAsmArgs* args from %rax */ +#if defined(_WIN32) + movq %rcx, %rax +#else + movq %rdi, %rax +#endif + movq 0(%rax), %ip0 + movq 8(%rax), %ip1 + movq 16(%rax), %ip2 + movq 24(%rax), %ip3 + movq 32(%rax), %op0 + movq 40(%rax), %op1 + movq 48(%rax), %op2 + movq 56(%rax), %op3 + movq 64(%rax), %bits0 + movq 72(%rax), %bits1 + movq 80(%rax), %bits2 + movq 88(%rax), %bits3 + movq 96(%rax), %dtable + push %rax /* argument */ + push 104(%rax) /* ilowest */ + push 112(%rax) /* oend */ + push %olimit /* olimit space */ + + subq $24, %rsp + +.L_4X1_compute_olimit: + /* Computes how many iterations we can do safely + * %r15, %rax may be clobbered + * rbx, rdx must be saved + * op3 & ip0 mustn't be clobbered + */ + movq %rbx, 0(%rsp) + movq %rdx, 8(%rsp) + + movq 32(%rsp), %rax /* rax = oend */ + subq %op3, %rax /* rax = oend - op3 */ + + /* r15 = (oend - op3) / 5 */ + movabsq $-3689348814741910323, %rdx + mulq %rdx + movq %rdx, %r15 + shrq $2, %r15 + + movq %ip0, %rax /* rax = ip0 */ + movq 40(%rsp), %rdx /* rdx = ilowest */ + subq %rdx, %rax /* rax = ip0 - ilowest */ + movq %rax, %rbx /* rbx = ip0 - ilowest */ + + /* rdx = (ip0 - ilowest) / 7 */ + movabsq $2635249153387078803, %rdx + mulq %rdx + subq %rdx, %rbx + shrq %rbx + addq %rbx, %rdx + shrq $2, %rdx + + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 + + /* r15 = r15 * 5 */ + leaq (%r15, %r15, 4), %r15 + + /* olimit = op3 + r15 */ + addq %op3, %olimit + + movq 8(%rsp), %rdx + movq 0(%rsp), %rbx + + /* If (op3 + 20 > olimit) */ + movq %op3, %rax /* rax = op3 */ + cmpq %rax, %olimit /* op3 == olimit */ + je .L_4X1_exit + + /* If (ip1 < ip0) go to exit */ + cmpq %ip0, %ip1 + jb .L_4X1_exit + + /* If (ip2 < ip1) go to exit */ + cmpq %ip1, %ip2 + jb .L_4X1_exit + + /* If (ip3 < ip2) go to exit */ + cmpq 
%ip2, %ip3 + jb .L_4X1_exit + +/* Reads top 11 bits from bits[n] + * Loads dt[bits[n]] into var[n] + */ +#define GET_NEXT_DELT(n) \ + movq $53, %var##n; \ + shrxq %var##n, %bits##n, %var##n; \ + movzwl (%dtable,%var##n,2),%vard##n + +/* var[n] must contain the DTable entry computed with GET_NEXT_DELT + * Moves var[n] to %rax + * bits[n] <<= var[n] & 63 + * op[n][idx] = %rax >> 8 + * %ah is a way to access bits [8, 16) of %rax + */ +#define DECODE_FROM_DELT(n, idx) \ + movq %var##n, %rax; \ + shlxq %var##n, %bits##n, %bits##n; \ + movb %ah, idx(%op##n) + +/* Assumes GET_NEXT_DELT has been called. + * Calls DECODE_FROM_DELT then GET_NEXT_DELT + */ +#define DECODE_AND_GET_NEXT(n, idx) \ + DECODE_FROM_DELT(n, idx); \ + GET_NEXT_DELT(n) \ + +/* // ctz & nbBytes is stored in bits[n] + * // nbBits is stored in %rax + * ctz = CTZ[bits[n]] + * nbBits = ctz & 7 + * nbBytes = ctz >> 3 + * op[n] += 5 + * ip[n] -= nbBytes + * // Note: x86-64 is little-endian ==> no bswap + * bits[n] = MEM_readST(ip[n]) | 1 + * bits[n] <<= nbBits + */ +#define RELOAD_BITS(n) \ + bsfq %bits##n, %bits##n; \ + movq %bits##n, %rax; \ + andq $7, %rax; \ + shrq $3, %bits##n; \ + leaq 5(%op##n), %op##n; \ + subq %bits##n, %ip##n; \ + movq (%ip##n), %bits##n; \ + orq $1, %bits##n; \ + shlx %rax, %bits##n, %bits##n + + /* Store clobbered variables on the stack */ + movq %olimit, 24(%rsp) + movq %ip1, 0(%rsp) + movq %ip2, 8(%rsp) + movq %ip3, 16(%rsp) + + /* Call GET_NEXT_DELT for each stream */ + FOR_EACH_STREAM(GET_NEXT_DELT) + + .p2align 6 + +.L_4X1_loop_body: + /* Decode 5 symbols in each of the 4 streams (20 total) + * Must have called GET_NEXT_DELT for each stream + */ + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3) + FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4) + + /* Load ip[1,2,3] from stack (var[] aliases them) + * ip[] is needed 
for RELOAD_BITS + * Each will be stored back to the stack after RELOAD + */ + movq 0(%rsp), %ip1 + movq 8(%rsp), %ip2 + movq 16(%rsp), %ip3 + + /* Reload each stream & fetch the next table entry + * to prepare for the next iteration + */ + RELOAD_BITS(0) + GET_NEXT_DELT(0) + + RELOAD_BITS(1) + movq %ip1, 0(%rsp) + GET_NEXT_DELT(1) + + RELOAD_BITS(2) + movq %ip2, 8(%rsp) + GET_NEXT_DELT(2) + + RELOAD_BITS(3) + movq %ip3, 16(%rsp) + GET_NEXT_DELT(3) + + /* If op3 < olimit: continue the loop */ + cmp %op3, 24(%rsp) + ja .L_4X1_loop_body + + /* Reload ip[1,2,3] from stack */ + movq 0(%rsp), %ip1 + movq 8(%rsp), %ip2 + movq 16(%rsp), %ip3 + + /* Re-compute olimit */ + jmp .L_4X1_compute_olimit + +#undef GET_NEXT_DELT +#undef DECODE_FROM_DELT +#undef DECODE +#undef RELOAD_BITS +.L_4X1_exit: + addq $24, %rsp + + /* Restore stack (oend & olimit) */ + pop %rax /* olimit */ + pop %rax /* oend */ + pop %rax /* ilowest */ + pop %rax /* arg */ + + /* Save ip / op / bits */ + movq %ip0, 0(%rax) + movq %ip1, 8(%rax) + movq %ip2, 16(%rax) + movq %ip3, 24(%rax) + movq %op0, 32(%rax) + movq %op1, 40(%rax) + movq %op2, 48(%rax) + movq %op3, 56(%rax) + movq %bits0, 64(%rax) + movq %bits1, 72(%rax) + movq %bits2, 80(%rax) + movq %bits3, 88(%rax) + + /* Restore registers */ + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rbp + pop %rdx + pop %rcx + pop %rbx + pop %rax + ret + +_HUF_decompress4X2_usingDTable_internal_fast_asm_loop: +HUF_decompress4X2_usingDTable_internal_fast_asm_loop: + ZSTD_CET_ENDBRANCH + /* Save all registers - even if they are callee saved for simplicity. 
*/ + push %rax + push %rbx + push %rcx + push %rdx + push %rbp + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + push %r12 + push %r13 + push %r14 + push %r15 + + /* Read HUF_DecompressAsmArgs* args from %rax */ +#if defined(_WIN32) + movq %rcx, %rax +#else + movq %rdi, %rax +#endif + movq 0(%rax), %ip0 + movq 8(%rax), %ip1 + movq 16(%rax), %ip2 + movq 24(%rax), %ip3 + movq 32(%rax), %op0 + movq 40(%rax), %op1 + movq 48(%rax), %op2 + movq 56(%rax), %op3 + movq 64(%rax), %bits0 + movq 72(%rax), %bits1 + movq 80(%rax), %bits2 + movq 88(%rax), %bits3 + movq 96(%rax), %dtable + push %rax /* argument */ + push %rax /* olimit */ + push 104(%rax) /* ilowest */ + + movq 112(%rax), %rax + push %rax /* oend3 */ + + movq %op3, %rax + push %rax /* oend2 */ + + movq %op2, %rax + push %rax /* oend1 */ + + movq %op1, %rax + push %rax /* oend0 */ + + /* Scratch space */ + subq $8, %rsp + +.L_4X2_compute_olimit: + /* Computes how many iterations we can do safely + * %r15, %rax may be clobbered + * rdx must be saved + * op[1,2,3,4] & ip0 mustn't be clobbered + */ + movq %rdx, 0(%rsp) + + /* We can consume up to 7 input bytes each iteration. 
*/ + movq %ip0, %rax /* rax = ip0 */ + movq 40(%rsp), %rdx /* rdx = ilowest */ + subq %rdx, %rax /* rax = ip0 - ilowest */ + movq %rax, %r15 /* r15 = ip0 - ilowest */ + + /* rdx = rax / 7 */ + movabsq $2635249153387078803, %rdx + mulq %rdx + subq %rdx, %r15 + shrq %r15 + addq %r15, %rdx + shrq $2, %rdx + + /* r15 = (ip0 - ilowest) / 7 */ + movq %rdx, %r15 + + /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */ + movq 8(%rsp), %rax /* rax = oend0 */ + subq %op0, %rax /* rax = oend0 - op0 */ + movq 16(%rsp), %rdx /* rdx = oend1 */ + subq %op1, %rdx /* rdx = oend1 - op1 */ + + cmpq %rax, %rdx + cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ + + movq 24(%rsp), %rax /* rax = oend2 */ + subq %op2, %rax /* rax = oend2 - op2 */ + + cmpq %rax, %rdx + cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ + + movq 32(%rsp), %rax /* rax = oend3 */ + subq %op3, %rax /* rax = oend3 - op3 */ + + cmpq %rax, %rdx + cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ + + movabsq $-3689348814741910323, %rax + mulq %rdx + shrq $3, %rdx /* rdx = rdx / 10 */ + + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 + + /* olimit = op3 + 5 * r15 */ + movq %r15, %rax + leaq (%op3, %rax, 4), %olimit + addq %rax, %olimit + + movq 0(%rsp), %rdx + + /* If (op3 + 10 > olimit) */ + movq %op3, %rax /* rax = op3 */ + cmpq %rax, %olimit /* op3 == olimit */ + je .L_4X2_exit + + /* If (ip1 < ip0) go to exit */ + cmpq %ip0, %ip1 + jb .L_4X2_exit + + /* If (ip2 < ip1) go to exit */ + cmpq %ip1, %ip2 + jb .L_4X2_exit + + /* If (ip3 < ip2) go to exit */ + cmpq %ip2, %ip3 + jb .L_4X2_exit + +#define DECODE(n, idx) \ + movq %bits##n, %rax; \ + shrq $53, %rax; \ + movzwl 0(%dtable,%rax,4),%r8d; \ + movzbl 2(%dtable,%rax,4),%r15d; \ + movzbl 3(%dtable,%rax,4),%eax; \ + movw %r8w, (%op##n); \ + shlxq %r15, %bits##n, %bits##n; \ + addq %rax, %op##n + +#define RELOAD_BITS(n) \ + bsfq %bits##n, %bits##n; \ + movq %bits##n, %rax; \ + shrq $3, %bits##n; \ + andq $7, %rax; \ + subq 
%bits##n, %ip##n; \ + movq (%ip##n), %bits##n; \ + orq $1, %bits##n; \ + shlxq %rax, %bits##n, %bits##n + + + movq %olimit, 48(%rsp) + + .p2align 6 + +.L_4X2_loop_body: + /* We clobber r8, so store it on the stack */ + movq %r8, 0(%rsp) + + /* Decode 5 symbols from each of the 4 streams (20 symbols total). */ + FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) + + /* Reload r8 */ + movq 0(%rsp), %r8 + + FOR_EACH_STREAM(RELOAD_BITS) + + cmp %op3, 48(%rsp) + ja .L_4X2_loop_body + jmp .L_4X2_compute_olimit + +#undef DECODE +#undef RELOAD_BITS +.L_4X2_exit: + addq $8, %rsp + /* Restore stack (oend & olimit) */ + pop %rax /* oend0 */ + pop %rax /* oend1 */ + pop %rax /* oend2 */ + pop %rax /* oend3 */ + pop %rax /* ilowest */ + pop %rax /* olimit */ + pop %rax /* arg */ + + /* Save ip / op / bits */ + movq %ip0, 0(%rax) + movq %ip1, 8(%rax) + movq %ip2, 16(%rax) + movq %ip3, 24(%rax) + movq %op0, 32(%rax) + movq %op1, 40(%rax) + movq %op2, 48(%rax) + movq %op3, 56(%rax) + movq %bits0, 64(%rax) + movq %bits1, 72(%rax) + movq %bits2, 80(%rax) + movq %bits3, 88(%rax) + + /* Restore registers */ + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rbp + pop %rdx + pop %rcx + pop %rbx + pop %rax + ret + +#endif diff --git a/vendor/github.com/DataDog/zstd/mem.h b/vendor/github.com/DataDog/zstd/mem.h index 530d30c..ba12a74 100644 --- a/vendor/github.com/DataDog/zstd/mem.h +++ b/vendor/github.com/DataDog/zstd/mem.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -11,15 +12,13 @@ #ifndef MEM_H_MODULE #define MEM_H_MODULE -#if defined (__cplusplus) -extern "C" { -#endif - /*-**************************************** * Dependencies ******************************************/ -#include /* size_t, ptrdiff_t */ -#include /* memcpy */ +#include /* size_t, ptrdiff_t */ +#include "compiler.h" /* __has_builtin */ +#include "debug.h" /* DEBUG_STATIC_ASSERT */ +#include "zstd_deps.h" /* ZSTD_memcpy */ /*-**************************************** @@ -28,105 +27,22 @@ extern "C" { #if defined(_MSC_VER) /* Visual Studio */ # include /* _byteswap_ulong */ # include /* _byteswap_* */ -#endif -#if defined(__GNUC__) -# define MEM_STATIC static __inline __attribute__((unused)) -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# define MEM_STATIC static inline -#elif defined(_MSC_VER) -# define MEM_STATIC static __inline -#else -# define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ -#endif - -#ifndef __has_builtin -# define __has_builtin(x) 0 /* compat. with non-clang compilers */ -#endif - -/* code only tested on 32 and 64 bits systems */ -#define MEM_STATIC_ASSERT(c) { enum { MEM_static_assert = 1/(int)(!!(c)) }; } -MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); } - -/* detects whether we are being compiled under msan */ -#if defined (__has_feature) -# if __has_feature(memory_sanitizer) -# define MEMORY_SANITIZER 1 -# endif -#endif - -#if defined (MEMORY_SANITIZER) -/* Not all platforms that support msan provide sanitizers/msan_interface.h. - * We therefore declare the functions we need ourselves, rather than trying to - * include the header file... */ - -#include /* intptr_t */ - -/* Make memory region fully initialized (without changing its contents). 
*/ -void __msan_unpoison(const volatile void *a, size_t size); - -/* Make memory region fully uninitialized (without changing its contents). - This is a legacy interface that does not update origin information. Use - __msan_allocated_memory() instead. */ -void __msan_poison(const volatile void *a, size_t size); - -/* Returns the offset of the first (at least partially) poisoned byte in the - memory range, or -1 if the whole range is good. */ -intptr_t __msan_test_shadow(const volatile void *x, size_t size); -#endif - -/* detects whether we are being compiled under asan */ -#if defined (__has_feature) -# if __has_feature(address_sanitizer) -# define ADDRESS_SANITIZER 1 -# endif -#elif defined(__SANITIZE_ADDRESS__) -# define ADDRESS_SANITIZER 1 -#endif - -#if defined (ADDRESS_SANITIZER) -/* Not all platforms that support asan provide sanitizers/asan_interface.h. - * We therefore declare the functions we need ourselves, rather than trying to - * include the header file... */ - -/** - * Marks a memory region ([addr, addr+size)) as unaddressable. - * - * This memory must be previously allocated by your program. Instrumented - * code is forbidden from accessing addresses in this region until it is - * unpoisoned. This function is not guaranteed to poison the entire region - - * it could poison only a subregion of [addr, addr+size) due to ASan - * alignment restrictions. - * - * \note This function is not thread-safe because no two threads can poison or - * unpoison memory in the same memory region simultaneously. - * - * \param addr Start of memory region. - * \param size Size of memory region. */ -void __asan_poison_memory_region(void const volatile *addr, size_t size); - -/** - * Marks a memory region ([addr, addr+size)) as addressable. - * - * This memory must be previously allocated by your program. Accessing - * addresses in this region is allowed until this region is poisoned again. 
- * This function could unpoison a super-region of [addr, addr+size) due - * to ASan alignment restrictions. - * - * \note This function is not thread-safe because no two threads can - * poison or unpoison memory in the same memory region simultaneously. - * - * \param addr Start of memory region. - * \param size Size of memory region. */ -void __asan_unpoison_memory_region(void const volatile *addr, size_t size); +#elif defined(__ICCARM__) +# include #endif - /*-************************************************************** * Basic Types *****************************************************************/ #if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include +# if defined(_AIX) +# include +# else +# include /* intptr_t */ +# endif typedef uint8_t BYTE; + typedef uint8_t U8; + typedef int8_t S8; typedef uint16_t U16; typedef int16_t S16; typedef uint32_t U32; @@ -139,6 +55,8 @@ void __asan_unpoison_memory_region(void const volatile *addr, size_t size); # error "this implementation requires char to be exactly 8-bit type" #endif typedef unsigned char BYTE; + typedef unsigned char U8; + typedef signed char S8; #if USHRT_MAX != 65535 # error "this implementation requires short to be exactly 16-bit type" #endif @@ -155,27 +73,64 @@ void __asan_unpoison_memory_region(void const volatile *addr, size_t size); typedef signed long long S64; #endif +/*-************************************************************** +* Memory I/O API +*****************************************************************/ +/*=== Static platform detection ===*/ +MEM_STATIC unsigned MEM_32bits(void); +MEM_STATIC unsigned MEM_64bits(void); +MEM_STATIC unsigned MEM_isLittleEndian(void); + +/*=== Native unaligned read/write ===*/ +MEM_STATIC U16 MEM_read16(const void* memPtr); +MEM_STATIC U32 MEM_read32(const void* memPtr); +MEM_STATIC U64 MEM_read64(const void* memPtr); +MEM_STATIC size_t MEM_readST(const void* memPtr); + 
+MEM_STATIC void MEM_write16(void* memPtr, U16 value); +MEM_STATIC void MEM_write32(void* memPtr, U32 value); +MEM_STATIC void MEM_write64(void* memPtr, U64 value); + +/*=== Little endian unaligned read/write ===*/ +MEM_STATIC U16 MEM_readLE16(const void* memPtr); +MEM_STATIC U32 MEM_readLE24(const void* memPtr); +MEM_STATIC U32 MEM_readLE32(const void* memPtr); +MEM_STATIC U64 MEM_readLE64(const void* memPtr); +MEM_STATIC size_t MEM_readLEST(const void* memPtr); + +MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val); +MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val); +MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32); +MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64); +MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val); + +/*=== Big endian unaligned read/write ===*/ +MEM_STATIC U32 MEM_readBE32(const void* memPtr); +MEM_STATIC U64 MEM_readBE64(const void* memPtr); +MEM_STATIC size_t MEM_readBEST(const void* memPtr); + +MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32); +MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64); +MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val); + +/*=== Byteswap ===*/ +MEM_STATIC U32 MEM_swap32(U32 in); +MEM_STATIC U64 MEM_swap64(U64 in); +MEM_STATIC size_t MEM_swapST(size_t in); + /*-************************************************************** -* Memory I/O +* Memory I/O Implementation *****************************************************************/ -/* MEM_FORCE_MEMORY_ACCESS : - * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. - * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. - * The below switch allow to select different access method for improved performance. - * Method 0 (default) : use `memcpy()`. Safe and portable. - * Method 1 : `__packed` statement. It depends on compiler extension (i.e., not portable). 
- * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. +/* MEM_FORCE_MEMORY_ACCESS : For accessing unaligned memory: + * Method 0 : always use `memcpy()`. Safe and portable. + * Method 1 : Use compiler extension to set unaligned access. * Method 2 : direct access. This method is portable but violate C standard. * It can generate buggy code on targets depending on alignment. - * In some circumstances, it's the only known way to get the most performance (i.e. GCC + ARMv6) - * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. - * Prefer these methods in priority order (0 > 1 > 2) + * Default : method 1 if supported, else method 0 */ #ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ -# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) -# define MEM_FORCE_MEMORY_ACCESS 2 -# elif defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__) +# ifdef __GNUC__ # define MEM_FORCE_MEMORY_ACCESS 1 # endif #endif @@ -185,8 +140,24 @@ MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; } MEM_STATIC unsigned MEM_isLittleEndian(void) { +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + return 1; +#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + return 0; +#elif defined(__clang__) && __LITTLE_ENDIAN__ + return 1; +#elif defined(__clang__) && __BIG_ENDIAN__ + return 0; +#elif defined(_MSC_VER) && (_M_X64 || _M_IX86) + return 1; +#elif defined(__DMC__) && defined(_M_IX86) + return 1; +#elif defined(__IAR_SYSTEMS_ICC__) && __LITTLE_ENDIAN__ + return 1; +#else const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ return one.c[0]; 
+#endif } #if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2) @@ -204,30 +175,19 @@ MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; } #elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1) -/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ -/* currently only defined for gcc and icc */ -#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32)) - __pragma( pack(push, 1) ) - typedef struct { U16 v; } unalign16; - typedef struct { U32 v; } unalign32; - typedef struct { U64 v; } unalign64; - typedef struct { size_t v; } unalignArch; - __pragma( pack(pop) ) -#else - typedef struct { U16 v; } __attribute__((packed)) unalign16; - typedef struct { U32 v; } __attribute__((packed)) unalign32; - typedef struct { U64 v; } __attribute__((packed)) unalign64; - typedef struct { size_t v; } __attribute__((packed)) unalignArch; -#endif +typedef __attribute__((aligned(1))) U16 unalign16; +typedef __attribute__((aligned(1))) U32 unalign32; +typedef __attribute__((aligned(1))) U64 unalign64; +typedef __attribute__((aligned(1))) size_t unalignArch; -MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; } -MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign32*)ptr)->v; } -MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign64*)ptr)->v; } -MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalignArch*)ptr)->v; } +MEM_STATIC U16 MEM_read16(const void* ptr) { return *(const unalign16*)ptr; } +MEM_STATIC U32 MEM_read32(const void* ptr) { return *(const unalign32*)ptr; } +MEM_STATIC U64 MEM_read64(const void* ptr) { return *(const unalign64*)ptr; } +MEM_STATIC size_t MEM_readST(const void* ptr) { return *(const unalignArch*)ptr; } -MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign16*)memPtr)->v = value; } -MEM_STATIC void MEM_write32(void* memPtr, U32 value) { 
((unalign32*)memPtr)->v = value; } -MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign64*)memPtr)->v = value; } +MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(unalign16*)memPtr = value; } +MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(unalign32*)memPtr = value; } +MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(unalign64*)memPtr = value; } #else @@ -236,41 +196,49 @@ MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign64*)memPtr)->v = MEM_STATIC U16 MEM_read16(const void* memPtr) { - U16 val; memcpy(&val, memPtr, sizeof(val)); return val; + U16 val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val; } MEM_STATIC U32 MEM_read32(const void* memPtr) { - U32 val; memcpy(&val, memPtr, sizeof(val)); return val; + U32 val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val; } MEM_STATIC U64 MEM_read64(const void* memPtr) { - U64 val; memcpy(&val, memPtr, sizeof(val)); return val; + U64 val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val; } MEM_STATIC size_t MEM_readST(const void* memPtr) { - size_t val; memcpy(&val, memPtr, sizeof(val)); return val; + size_t val; ZSTD_memcpy(&val, memPtr, sizeof(val)); return val; } MEM_STATIC void MEM_write16(void* memPtr, U16 value) { - memcpy(memPtr, &value, sizeof(value)); + ZSTD_memcpy(memPtr, &value, sizeof(value)); } MEM_STATIC void MEM_write32(void* memPtr, U32 value) { - memcpy(memPtr, &value, sizeof(value)); + ZSTD_memcpy(memPtr, &value, sizeof(value)); } MEM_STATIC void MEM_write64(void* memPtr, U64 value) { - memcpy(memPtr, &value, sizeof(value)); + ZSTD_memcpy(memPtr, &value, sizeof(value)); } #endif /* MEM_FORCE_MEMORY_ACCESS */ +MEM_STATIC U32 MEM_swap32_fallback(U32 in) +{ + return ((in << 24) & 0xff000000 ) | + ((in << 8) & 0x00ff0000 ) | + ((in >> 8) & 0x0000ff00 ) | + ((in >> 24) & 0x000000ff ); +} + MEM_STATIC U32 MEM_swap32(U32 in) { #if defined(_MSC_VER) /* Visual Studio */ @@ -278,23 +246,16 @@ MEM_STATIC U32 MEM_swap32(U32 in) #elif (defined (__GNUC__) 
&& (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ || (defined(__clang__) && __has_builtin(__builtin_bswap32)) return __builtin_bswap32(in); +#elif defined(__ICCARM__) + return __REV(in); #else - return ((in << 24) & 0xff000000 ) | - ((in << 8) & 0x00ff0000 ) | - ((in >> 8) & 0x0000ff00 ) | - ((in >> 24) & 0x000000ff ); + return MEM_swap32_fallback(in); #endif } -MEM_STATIC U64 MEM_swap64(U64 in) +MEM_STATIC U64 MEM_swap64_fallback(U64 in) { -#if defined(_MSC_VER) /* Visual Studio */ - return _byteswap_uint64(in); -#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ - || (defined(__clang__) && __has_builtin(__builtin_bswap64)) - return __builtin_bswap64(in); -#else - return ((in << 56) & 0xff00000000000000ULL) | + return ((in << 56) & 0xff00000000000000ULL) | ((in << 40) & 0x00ff000000000000ULL) | ((in << 24) & 0x0000ff0000000000ULL) | ((in << 8) & 0x000000ff00000000ULL) | @@ -302,6 +263,17 @@ MEM_STATIC U64 MEM_swap64(U64 in) ((in >> 24) & 0x0000000000ff0000ULL) | ((in >> 40) & 0x000000000000ff00ULL) | ((in >> 56) & 0x00000000000000ffULL); +} + +MEM_STATIC U64 MEM_swap64(U64 in) +{ +#if defined(_MSC_VER) /* Visual Studio */ + return _byteswap_uint64(in); +#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ + || (defined(__clang__) && __has_builtin(__builtin_bswap64)) + return __builtin_bswap64(in); +#else + return MEM_swap64_fallback(in); #endif } @@ -338,7 +310,7 @@ MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val) MEM_STATIC U32 MEM_readLE24(const void* memPtr) { - return MEM_readLE16(memPtr) + (((const BYTE*)memPtr)[2] << 16); + return (U32)MEM_readLE16(memPtr) + ((U32)(((const BYTE*)memPtr)[2]) << 16); } MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val) @@ -445,9 +417,9 @@ MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val) MEM_writeBE64(memPtr, (U64)val); } - -#if defined (__cplusplus) -} -#endif +/* code only tested on 32 and 64 bits systems */ +MEM_STATIC void MEM_check(void) { 
DEBUG_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); } #endif /* MEM_H_MODULE */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/pool.c b/vendor/github.com/DataDog/zstd/pool.c index f575935..943c20a 100644 --- a/vendor/github.com/DataDog/zstd/pool.c +++ b/vendor/github.com/DataDog/zstd/pool.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -10,9 +11,9 @@ /* ====== Dependencies ======= */ -#include /* size_t */ +#include "allocations.h" /* ZSTD_customCalloc, ZSTD_customFree */ +#include "zstd_deps.h" /* size_t */ #include "debug.h" /* assert */ -#include "zstd_internal.h" /* ZSTD_malloc, ZSTD_free */ #include "pool.h" /* ====== Compiler specifics ====== */ @@ -86,7 +87,7 @@ static void* POOL_thread(void* opaque) { { POOL_job const job = ctx->queue[ctx->queueHead]; ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize; ctx->numThreadsBusy++; - ctx->queueEmpty = ctx->queueHead == ctx->queueTail; + ctx->queueEmpty = (ctx->queueHead == ctx->queueTail); /* Unlock the mutex, signal a pusher, and run the job */ ZSTD_pthread_cond_signal(&ctx->queuePushCond); ZSTD_pthread_mutex_unlock(&ctx->queueMutex); @@ -96,33 +97,37 @@ static void* POOL_thread(void* opaque) { /* If the intended queue size was 0, signal after finishing job */ ZSTD_pthread_mutex_lock(&ctx->queueMutex); ctx->numThreadsBusy--; - if (ctx->queueSize == 1) { - ZSTD_pthread_cond_signal(&ctx->queuePushCond); - } + ZSTD_pthread_cond_signal(&ctx->queuePushCond); ZSTD_pthread_mutex_unlock(&ctx->queueMutex); } } /* for (;;) */ assert(0); /* Unreachable */ } +/* ZSTD_createThreadPool() : public access point */ +POOL_ctx* ZSTD_createThreadPool(size_t numThreads) { + return POOL_create (numThreads, 0); +} + POOL_ctx* POOL_create(size_t numThreads, size_t 
queueSize) { return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem); } POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, - ZSTD_customMem customMem) { + ZSTD_customMem customMem) +{ POOL_ctx* ctx; /* Check parameters */ if (!numThreads) { return NULL; } /* Allocate the context and zero initialize */ - ctx = (POOL_ctx*)ZSTD_calloc(sizeof(POOL_ctx), customMem); + ctx = (POOL_ctx*)ZSTD_customCalloc(sizeof(POOL_ctx), customMem); if (!ctx) { return NULL; } /* Initialize the job queue. * It needs one extra space since one space is wasted to differentiate * empty and full queues. */ ctx->queueSize = queueSize + 1; - ctx->queue = (POOL_job*)ZSTD_malloc(ctx->queueSize * sizeof(POOL_job), customMem); + ctx->queue = (POOL_job*)ZSTD_customCalloc(ctx->queueSize * sizeof(POOL_job), customMem); ctx->queueHead = 0; ctx->queueTail = 0; ctx->numThreadsBusy = 0; @@ -136,7 +141,7 @@ POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, } ctx->shutdown = 0; /* Allocate space for the thread handles */ - ctx->threads = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), customMem); + ctx->threads = (ZSTD_pthread_t*)ZSTD_customCalloc(numThreads * sizeof(ZSTD_pthread_t), customMem); ctx->threadCapacity = 0; ctx->customMem = customMem; /* Check for errors */ @@ -169,7 +174,7 @@ static void POOL_join(POOL_ctx* ctx) { /* Join all of the threads */ { size_t i; for (i = 0; i < ctx->threadCapacity; ++i) { - ZSTD_pthread_join(ctx->threads[i], NULL); /* note : could fail */ + ZSTD_pthread_join(ctx->threads[i]); /* note : could fail */ } } } @@ -179,14 +184,27 @@ void POOL_free(POOL_ctx *ctx) { ZSTD_pthread_mutex_destroy(&ctx->queueMutex); ZSTD_pthread_cond_destroy(&ctx->queuePushCond); ZSTD_pthread_cond_destroy(&ctx->queuePopCond); - ZSTD_free(ctx->queue, ctx->customMem); - ZSTD_free(ctx->threads, ctx->customMem); - ZSTD_free(ctx, ctx->customMem); + ZSTD_customFree(ctx->queue, ctx->customMem); + ZSTD_customFree(ctx->threads, 
ctx->customMem); + ZSTD_customFree(ctx, ctx->customMem); } +/*! POOL_joinJobs() : + * Waits for all queued jobs to finish executing. + */ +void POOL_joinJobs(POOL_ctx* ctx) { + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + while(!ctx->queueEmpty || ctx->numThreadsBusy > 0) { + ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex); + } + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); +} +void ZSTD_freeThreadPool (ZSTD_threadPool* pool) { + POOL_free (pool); +} -size_t POOL_sizeof(POOL_ctx *ctx) { +size_t POOL_sizeof(const POOL_ctx* ctx) { if (ctx==NULL) return 0; /* supports sizeof NULL */ return sizeof(*ctx) + ctx->queueSize * sizeof(POOL_job) @@ -203,11 +221,11 @@ static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads) return 0; } /* numThreads > threadCapacity */ - { ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem); + { ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_customCalloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem); if (!threadPool) return 1; /* replace existing thread pool */ - memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(*threadPool)); - ZSTD_free(ctx->threads, ctx->customMem); + ZSTD_memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(ZSTD_pthread_t)); + ZSTD_customFree(ctx->threads, ctx->customMem); ctx->threads = threadPool; /* Initialize additional threads */ { size_t threadId; @@ -251,9 +269,12 @@ static int isQueueFull(POOL_ctx const* ctx) { } -static void POOL_add_internal(POOL_ctx* ctx, POOL_function function, void *opaque) +static void +POOL_add_internal(POOL_ctx* ctx, POOL_function function, void *opaque) { - POOL_job const job = {function, opaque}; + POOL_job job; + job.function = function; + job.opaque = opaque; assert(ctx != NULL); if (ctx->shutdown) return; @@ -301,21 +322,28 @@ int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) struct POOL_ctx_s { int dummy; }; -static POOL_ctx g_ctx; +static 
POOL_ctx g_poolCtx; POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) { return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem); } -POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) { +POOL_ctx* +POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) +{ (void)numThreads; (void)queueSize; (void)customMem; - return &g_ctx; + return &g_poolCtx; } void POOL_free(POOL_ctx* ctx) { - assert(!ctx || ctx == &g_ctx); + assert(!ctx || ctx == &g_poolCtx); + (void)ctx; +} + +void POOL_joinJobs(POOL_ctx* ctx){ + assert(!ctx || ctx == &g_poolCtx); (void)ctx; } @@ -335,10 +363,12 @@ int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) { return 1; } -size_t POOL_sizeof(POOL_ctx* ctx) { +size_t POOL_sizeof(const POOL_ctx* ctx) { if (ctx==NULL) return 0; /* supports sizeof NULL */ - assert(ctx == &g_ctx); + assert(ctx == &g_poolCtx); return sizeof(*ctx); } #endif /* ZSTD_MULTITHREAD */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/pool.h b/vendor/github.com/DataDog/zstd/pool.h index 458d37f..df22f4d 100644 --- a/vendor/github.com/DataDog/zstd/pool.h +++ b/vendor/github.com/DataDog/zstd/pool.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,12 +12,8 @@ #ifndef POOL_H #define POOL_H -#if defined (__cplusplus) -extern "C" { -#endif - -#include /* size_t */ +#include "zstd_deps.h" #define ZSTD_STATIC_LINKING_ONLY /* ZSTD_customMem */ #include "zstd.h" @@ -38,10 +35,16 @@ POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, */ void POOL_free(POOL_ctx* ctx); + +/*! POOL_joinJobs() : + * Waits for all queued jobs to finish executing. + */ +void POOL_joinJobs(POOL_ctx* ctx); + /*! 
POOL_resize() : * Expands or shrinks pool's number of threads. * This is more efficient than releasing + creating a new context, - * since it tries to preserve and re-use existing threads. + * since it tries to preserve and reuse existing threads. * `numThreads` must be at least 1. * @return : 0 when resize was successful, * !0 (typically 1) if there is an error. @@ -53,7 +56,7 @@ int POOL_resize(POOL_ctx* ctx, size_t numThreads); * @return threadpool memory usage * note : compatible with NULL (returns 0 in this case) */ -size_t POOL_sizeof(POOL_ctx* ctx); +size_t POOL_sizeof(const POOL_ctx* ctx); /*! POOL_function : * The function type that can be added to a thread pool. @@ -70,15 +73,12 @@ void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque); /*! POOL_tryAdd() : - * Add the job `function(opaque)` to thread pool _if_ a worker is available. + * Add the job `function(opaque)` to thread pool _if_ a queue slot is available. * Returns immediately even if not (does not block). * @return : 1 if successful, 0 if not. */ int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque); - -#if defined (__cplusplus) -} #endif -#endif +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/portability_macros.h b/vendor/github.com/DataDog/zstd/portability_macros.h new file mode 100644 index 0000000..2001c24 --- /dev/null +++ b/vendor/github.com/DataDog/zstd/portability_macros.h @@ -0,0 +1,174 @@ +#ifndef USE_EXTERNAL_ZSTD +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef ZSTD_PORTABILITY_MACROS_H +#define ZSTD_PORTABILITY_MACROS_H + +/** + * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. It MUST not contain any C code. + * + * This header ONLY defines macros to detect platforms/feature support. + * + */ + + +/* compat. with non-clang compilers */ +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif + +/* compat. with non-clang compilers */ +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +/* compat. with non-clang compilers */ +#ifndef __has_feature +# define __has_feature(x) 0 +#endif + +/* detects whether we are being compiled under msan */ +#ifndef ZSTD_MEMORY_SANITIZER +# if __has_feature(memory_sanitizer) +# define ZSTD_MEMORY_SANITIZER 1 +# else +# define ZSTD_MEMORY_SANITIZER 0 +# endif +#endif + +/* detects whether we are being compiled under asan */ +#ifndef ZSTD_ADDRESS_SANITIZER +# if __has_feature(address_sanitizer) +# define ZSTD_ADDRESS_SANITIZER 1 +# elif defined(__SANITIZE_ADDRESS__) +# define ZSTD_ADDRESS_SANITIZER 1 +# else +# define ZSTD_ADDRESS_SANITIZER 0 +# endif +#endif + +/* detects whether we are being compiled under dfsan */ +#ifndef ZSTD_DATAFLOW_SANITIZER +# if __has_feature(dataflow_sanitizer) +# define ZSTD_DATAFLOW_SANITIZER 1 +# else +# define ZSTD_DATAFLOW_SANITIZER 0 +# endif +#endif + +/* Mark the internal assembly functions as hidden */ +#ifdef __ELF__ +# define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func +#elif defined(__APPLE__) +# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func +#else +# define ZSTD_HIDE_ASM_FUNCTION(func) +#endif + +/* Compile time determination of BMI2 support */ +#ifndef STATIC_BMI2 +# if defined(__BMI2__) +# define STATIC_BMI2 1 +# elif defined(_MSC_VER) && defined(__AVX2__) +# define STATIC_BMI2 1 /* MSVC does not have a BMI2 specific flag, but every CPU that supports AVX2 also supports BMI2 */ +# endif 
+#endif + +#ifndef STATIC_BMI2 +# define STATIC_BMI2 0 +#endif + +/* Enable runtime BMI2 dispatch based on the CPU. + * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default. + */ +#ifndef DYNAMIC_BMI2 +# if ((defined(__clang__) && __has_attribute(__target__)) \ + || (defined(__GNUC__) \ + && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \ + && (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)) \ + && !defined(__BMI2__) +# define DYNAMIC_BMI2 1 +# else +# define DYNAMIC_BMI2 0 +# endif +#endif + +/** + * Only enable assembly for GNU C compatible compilers, + * because other platforms may not support GAS assembly syntax. + * + * Only enable assembly for Linux / MacOS / Win32, other platforms may + * work, but they haven't been tested. This could likely be + * extended to BSD systems. + * + * Disable assembly when MSAN is enabled, because MSAN requires + * 100% of code to be instrumented to work. + */ +#if defined(__GNUC__) +# if defined(__linux__) || defined(__linux) || defined(__APPLE__) || defined(_WIN32) +# if ZSTD_MEMORY_SANITIZER +# define ZSTD_ASM_SUPPORTED 0 +# elif ZSTD_DATAFLOW_SANITIZER +# define ZSTD_ASM_SUPPORTED 0 +# else +# define ZSTD_ASM_SUPPORTED 1 +# endif +# else +# define ZSTD_ASM_SUPPORTED 0 +# endif +#else +# define ZSTD_ASM_SUPPORTED 0 +#endif + +/** + * Determines whether we should enable assembly for x86-64 + * with BMI2. 
+ * + * Enable if all of the following conditions hold: + * - ASM hasn't been explicitly disabled by defining ZSTD_DISABLE_ASM + * - Assembly is supported + * - We are compiling for x86-64 and either: + * - DYNAMIC_BMI2 is enabled + * - BMI2 is supported at compile time + */ +#if !defined(ZSTD_DISABLE_ASM) && \ + ZSTD_ASM_SUPPORTED && \ + defined(__x86_64__) && \ + (DYNAMIC_BMI2 || defined(__BMI2__)) +# define ZSTD_ENABLE_ASM_X86_64_BMI2 1 +#else +# define ZSTD_ENABLE_ASM_X86_64_BMI2 0 +#endif + +/* + * For x86 ELF targets, add .note.gnu.property section for Intel CET in + * assembly sources when CET is enabled. + * + * Additionally, any function that may be called indirectly must begin + * with ZSTD_CET_ENDBRANCH. + */ +#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ + && defined(__has_include) +# if __has_include() +# include +# define ZSTD_CET_ENDBRANCH _CET_ENDBR +# endif +#endif + +#ifndef ZSTD_CET_ENDBRANCH +# define ZSTD_CET_ENDBRANCH +#endif + +#endif /* ZSTD_PORTABILITY_MACROS_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/threading.c b/vendor/github.com/DataDog/zstd/threading.c index 482664b..01908a6 100644 --- a/vendor/github.com/DataDog/zstd/threading.c +++ b/vendor/github.com/DataDog/zstd/threading.c @@ -1,13 +1,15 @@ +#ifndef USE_EXTERNAL_ZSTD /** * Copyright (c) 2016 Tino Reichardt * All rights reserved. * + * You can contact the author at: + * - zstdmt source repository: https://github.com/mcmilk/zstdmt + * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). - * - * You can contact the author at: - * - zstdmt source repository: https://github.com/mcmilk/zstdmt + * You may select, at your option, one of the above-listed licenses. 
*/ /** @@ -22,8 +24,7 @@ int g_ZSTD_threading_useless_symbol; #if defined(ZSTD_MULTITHREAD) && defined(_WIN32) /** - * Windows minimalist Pthread Wrapper, based on : - * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html + * Windows minimalist Pthread Wrapper */ @@ -34,37 +35,94 @@ int g_ZSTD_threading_useless_symbol; /* === Implementation === */ +typedef struct { + void* (*start_routine)(void*); + void* arg; + int initialized; + ZSTD_pthread_cond_t initialized_cond; + ZSTD_pthread_mutex_t initialized_mutex; +} ZSTD_thread_params_t; + static unsigned __stdcall worker(void *arg) { - ZSTD_pthread_t* const thread = (ZSTD_pthread_t*) arg; - thread->arg = thread->start_routine(thread->arg); + void* (*start_routine)(void*); + void* thread_arg; + + /* Initialized thread_arg and start_routine and signal main thread that we don't need it + * to wait any longer. + */ + { + ZSTD_thread_params_t* thread_param = (ZSTD_thread_params_t*)arg; + thread_arg = thread_param->arg; + start_routine = thread_param->start_routine; + + /* Signal main thread that we are running and do not depend on its memory anymore */ + ZSTD_pthread_mutex_lock(&thread_param->initialized_mutex); + thread_param->initialized = 1; + ZSTD_pthread_cond_signal(&thread_param->initialized_cond); + ZSTD_pthread_mutex_unlock(&thread_param->initialized_mutex); + } + + start_routine(thread_arg); + return 0; } int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused, void* (*start_routine) (void*), void* arg) { + ZSTD_thread_params_t thread_param; (void)unused; - thread->arg = arg; - thread->start_routine = start_routine; - thread->handle = (HANDLE) _beginthreadex(NULL, 0, worker, thread, 0, NULL); - if (!thread->handle) + if (thread==NULL) return -1; + *thread = NULL; + + thread_param.start_routine = start_routine; + thread_param.arg = arg; + thread_param.initialized = 0; + + /* Setup thread initialization synchronization */ + if(ZSTD_pthread_cond_init(&thread_param.initialized_cond, NULL)) { + /* Should never 
happen on Windows */ + return -1; + } + if(ZSTD_pthread_mutex_init(&thread_param.initialized_mutex, NULL)) { + /* Should never happen on Windows */ + ZSTD_pthread_cond_destroy(&thread_param.initialized_cond); + return -1; + } + + /* Spawn thread */ + *thread = (HANDLE)_beginthreadex(NULL, 0, worker, &thread_param, 0, NULL); + if (*thread==NULL) { + ZSTD_pthread_mutex_destroy(&thread_param.initialized_mutex); + ZSTD_pthread_cond_destroy(&thread_param.initialized_cond); return errno; - else - return 0; + } + + /* Wait for thread to be initialized */ + ZSTD_pthread_mutex_lock(&thread_param.initialized_mutex); + while(!thread_param.initialized) { + ZSTD_pthread_cond_wait(&thread_param.initialized_cond, &thread_param.initialized_mutex); + } + ZSTD_pthread_mutex_unlock(&thread_param.initialized_mutex); + ZSTD_pthread_mutex_destroy(&thread_param.initialized_mutex); + ZSTD_pthread_cond_destroy(&thread_param.initialized_cond); + + return 0; } -int ZSTD_pthread_join(ZSTD_pthread_t thread, void **value_ptr) +int ZSTD_pthread_join(ZSTD_pthread_t thread) { DWORD result; - if (!thread.handle) return 0; + if (!thread) return 0; + + result = WaitForSingleObject(thread, INFINITE); + CloseHandle(thread); - result = WaitForSingleObject(thread.handle, INFINITE); switch (result) { case WAIT_OBJECT_0: - if (value_ptr) *value_ptr = thread.arg; return 0; case WAIT_ABANDONED: return EINVAL; @@ -77,11 +135,13 @@ int ZSTD_pthread_join(ZSTD_pthread_t thread, void **value_ptr) #if defined(ZSTD_MULTITHREAD) && DEBUGLEVEL >= 1 && !defined(_WIN32) -#include +#define ZSTD_DEPS_NEED_MALLOC +#include "zstd_deps.h" int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr) { - *mutex = (pthread_mutex_t*)malloc(sizeof(pthread_mutex_t)); + assert(mutex != NULL); + *mutex = (pthread_mutex_t*)ZSTD_malloc(sizeof(pthread_mutex_t)); if (!*mutex) return 1; return pthread_mutex_init(*mutex, attr); @@ -89,18 +149,20 @@ int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, 
pthread_mutexattr_t con int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex) { + assert(mutex != NULL); if (!*mutex) return 0; { int const ret = pthread_mutex_destroy(*mutex); - free(*mutex); + ZSTD_free(*mutex); return ret; } } int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr) { - *cond = (pthread_cond_t*)malloc(sizeof(pthread_cond_t)); + assert(cond != NULL); + *cond = (pthread_cond_t*)ZSTD_malloc(sizeof(pthread_cond_t)); if (!*cond) return 1; return pthread_cond_init(*cond, attr); @@ -108,13 +170,16 @@ int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond) { + assert(cond != NULL); if (!*cond) return 0; { int const ret = pthread_cond_destroy(*cond); - free(*cond); + ZSTD_free(*cond); return ret; } } #endif + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/threading.h b/vendor/github.com/DataDog/zstd/threading.h index 3193ca7..7e13c8e 100644 --- a/vendor/github.com/DataDog/zstd/threading.h +++ b/vendor/github.com/DataDog/zstd/threading.h @@ -1,13 +1,15 @@ +#ifndef USE_EXTERNAL_ZSTD /** * Copyright (c) 2016 Tino Reichardt * All rights reserved. * + * You can contact the author at: + * - zstdmt source repository: https://github.com/mcmilk/zstdmt + * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). - * - * You can contact the author at: - * - zstdmt source repository: https://github.com/mcmilk/zstdmt + * You may select, at your option, one of the above-listed licenses. 
*/ #ifndef THREADING_H_938743 @@ -15,15 +17,10 @@ #include "debug.h" -#if defined (__cplusplus) -extern "C" { -#endif - #if defined(ZSTD_MULTITHREAD) && defined(_WIN32) /** - * Windows minimalist Pthread Wrapper, based on : - * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html + * Windows minimalist Pthread Wrapper */ #ifdef WINVER # undef WINVER @@ -61,22 +58,17 @@ extern "C" { #define ZSTD_pthread_cond_broadcast(a) WakeAllConditionVariable((a)) /* ZSTD_pthread_create() and ZSTD_pthread_join() */ -typedef struct { - HANDLE handle; - void* (*start_routine)(void*); - void* arg; -} ZSTD_pthread_t; +typedef HANDLE ZSTD_pthread_t; int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused, void* (*start_routine) (void*), void* arg); -int ZSTD_pthread_join(ZSTD_pthread_t thread, void** value_ptr); +int ZSTD_pthread_join(ZSTD_pthread_t thread); /** * add here more wrappers as required */ - #elif defined(ZSTD_MULTITHREAD) /* posix assumed ; need a better detection method */ /* === POSIX Systems === */ # include @@ -98,7 +90,7 @@ int ZSTD_pthread_join(ZSTD_pthread_t thread, void** value_ptr); #define ZSTD_pthread_t pthread_t #define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d)) -#define ZSTD_pthread_join(a, b) pthread_join((a),(b)) +#define ZSTD_pthread_join(a) pthread_join((a),NULL) #else /* DEBUGLEVEL >= 1 */ @@ -123,7 +115,7 @@ int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond); #define ZSTD_pthread_t pthread_t #define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d)) -#define ZSTD_pthread_join(a, b) pthread_join((a),(b)) +#define ZSTD_pthread_join(a) pthread_join((a),NULL) #endif @@ -147,8 +139,7 @@ typedef int ZSTD_pthread_cond_t; #endif /* ZSTD_MULTITHREAD */ -#if defined (__cplusplus) -} -#endif #endif /* THREADING_H_938743 */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/travis_test_32.sh b/vendor/github.com/DataDog/zstd/travis_test_32.sh index 4a0debc..264ca06 100644 --- 
a/vendor/github.com/DataDog/zstd/travis_test_32.sh +++ b/vendor/github.com/DataDog/zstd/travis_test_32.sh @@ -1,6 +1,8 @@ #!/bin/bash # Get utilities -yum -y -q -e 0 install wget tar unzip gcc +#yum -y -q -e 0 install wget tar unzip gcc +apt-get update +apt-get -y install wget tar unzip gcc # Get Go wget -q https://dl.google.com/go/go1.13.linux-386.tar.gz @@ -13,5 +15,5 @@ unzip mr.zip # Build and run tests go build -PAYLOAD=$(pwd)/mr go test -v -PAYLOAD=$(pwd)/mr go test -bench . +DISABLE_BIG_TESTS=1 PAYLOAD=$(pwd)/mr go test -v +DISABLE_BIG_TESTS=1 PAYLOAD=$(pwd)/mr go test -bench . diff --git a/vendor/github.com/DataDog/zstd/xxhash.c b/vendor/github.com/DataDog/zstd/xxhash.c index 99d2459..1096adc 100644 --- a/vendor/github.com/DataDog/zstd/xxhash.c +++ b/vendor/github.com/DataDog/zstd/xxhash.c @@ -1,882 +1,21 @@ +#ifndef USE_EXTERNAL_ZSTD /* -* xxHash - Fast Hash algorithm -* Copyright (C) 2012-2016, Yann Collet -* -* BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) -* -* Redistribution and use in source and binary forms, with or without -* modification, are permitted provided that the following conditions are -* met: -* -* * Redistributions of source code must retain the above copyright -* notice, this list of conditions and the following disclaimer. -* * Redistributions in binary form must reproduce the above -* copyright notice, this list of conditions and the following disclaimer -* in the documentation and/or other materials provided with the -* distribution. -* -* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -* A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -* -* You can contact the author at : -* - xxHash homepage: http://www.xxhash.com -* - xxHash source repository : https://github.com/Cyan4973/xxHash -*/ - - -/* ************************************* -* Tuning parameters -***************************************/ -/*!XXH_FORCE_MEMORY_ACCESS : - * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. - * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. - * The below switch allow to select different access method for improved performance. - * Method 0 (default) : use `memcpy()`. Safe and portable. - * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). - * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. - * Method 2 : direct access. This method doesn't depend on compiler but violate C standard. - * It can generate buggy code on targets which do not support unaligned memory accesses. - * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) - * See http://stackoverflow.com/a/32095106/646947 for details. 
- * Prefer these methods in priority order (0 > 1 > 2) - */ -#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ -# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) -# define XXH_FORCE_MEMORY_ACCESS 2 -# elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \ - (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) || \ - defined(__ICCARM__) -# define XXH_FORCE_MEMORY_ACCESS 1 -# endif -#endif - -/*!XXH_ACCEPT_NULL_INPUT_POINTER : - * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer. - * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input. - * By default, this option is disabled. To enable it, uncomment below define : - */ -/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */ - -/*!XXH_FORCE_NATIVE_FORMAT : - * By default, xxHash library provides endian-independent Hash values, based on little-endian convention. - * Results are therefore identical for little-endian and big-endian CPU. - * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. - * Should endian-independence be of no importance for your application, you may set the #define below to 1, - * to improve speed for Big-endian CPU. - * This option has no impact on Little_Endian CPU. + * xxHash - Extremely Fast Hash algorithm + * Copyright (c) Yann Collet - Meta Platforms, Inc + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). 
+ * You may select, at your option, one of the above-listed licenses. */ -#ifndef XXH_FORCE_NATIVE_FORMAT /* can be defined externally */ -# define XXH_FORCE_NATIVE_FORMAT 0 -#endif -/*!XXH_FORCE_ALIGN_CHECK : - * This is a minor performance trick, only useful with lots of very small keys. - * It means : check for aligned/unaligned input. - * The check costs one initial branch per hash; set to 0 when the input data - * is guaranteed to be aligned. +/* + * xxhash.c instantiates functions defined in xxhash.h */ -#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ -# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) -# define XXH_FORCE_ALIGN_CHECK 0 -# else -# define XXH_FORCE_ALIGN_CHECK 1 -# endif -#endif - -/* ************************************* -* Includes & Memory related functions -***************************************/ -/* Modify the local functions below should you wish to use some other memory routines */ -/* for malloc(), free() */ -#include -#include /* size_t */ -static void* XXH_malloc(size_t s) { return malloc(s); } -static void XXH_free (void* p) { free(p); } -/* for memcpy() */ -#include -static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); } +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ +#define XXH_IMPLEMENTATION /* access definitions */ -#ifndef XXH_STATIC_LINKING_ONLY -# define XXH_STATIC_LINKING_ONLY -#endif #include "xxhash.h" - -/* ************************************* -* Compiler Specific Options -***************************************/ -#if defined (__GNUC__) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ -# define INLINE_KEYWORD inline -#else -# define INLINE_KEYWORD -#endif - -#if defined(__GNUC__) || defined(__ICCARM__) -# define FORCE_INLINE_ATTR __attribute__((always_inline)) -#elif defined(_MSC_VER) -# define FORCE_INLINE_ATTR __forceinline -#else -# define FORCE_INLINE_ATTR 
-#endif - -#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR - - -#ifdef _MSC_VER -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ -#endif - - -/* ************************************* -* Basic Types -***************************************/ -#ifndef MEM_MODULE -# define MEM_MODULE -# if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include - typedef uint8_t BYTE; - typedef uint16_t U16; - typedef uint32_t U32; - typedef int32_t S32; - typedef uint64_t U64; -# else - typedef unsigned char BYTE; - typedef unsigned short U16; - typedef unsigned int U32; - typedef signed int S32; - typedef unsigned long long U64; /* if your compiler doesn't support unsigned long long, replace by another 64-bit type here. Note that xxhash.h will also need to be updated. */ -# endif -#endif - - -#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) - -/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ -static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; } -static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; } - -#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) - -/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ -/* currently only defined for gcc and icc */ -typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign; - -static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } -static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; } - -#else - -/* portable and safe solution. Generally efficient. 
- * see : http://stackoverflow.com/a/32095106/646947 - */ - -static U32 XXH_read32(const void* memPtr) -{ - U32 val; - memcpy(&val, memPtr, sizeof(val)); - return val; -} - -static U64 XXH_read64(const void* memPtr) -{ - U64 val; - memcpy(&val, memPtr, sizeof(val)); - return val; -} - -#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ - - -/* **************************************** -* Compiler-specific Functions and Macros -******************************************/ -#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) - -/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */ -#if defined(_MSC_VER) -# define XXH_rotl32(x,r) _rotl(x,r) -# define XXH_rotl64(x,r) _rotl64(x,r) -#else -#if defined(__ICCARM__) -# include -# define XXH_rotl32(x,r) __ROR(x,(32 - r)) -#else -# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) -#endif -# define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r))) -#endif - -#if defined(_MSC_VER) /* Visual Studio */ -# define XXH_swap32 _byteswap_ulong -# define XXH_swap64 _byteswap_uint64 -#elif GCC_VERSION >= 403 -# define XXH_swap32 __builtin_bswap32 -# define XXH_swap64 __builtin_bswap64 -#else -static U32 XXH_swap32 (U32 x) -{ - return ((x << 24) & 0xff000000 ) | - ((x << 8) & 0x00ff0000 ) | - ((x >> 8) & 0x0000ff00 ) | - ((x >> 24) & 0x000000ff ); -} -static U64 XXH_swap64 (U64 x) -{ - return ((x << 56) & 0xff00000000000000ULL) | - ((x << 40) & 0x00ff000000000000ULL) | - ((x << 24) & 0x0000ff0000000000ULL) | - ((x << 8) & 0x000000ff00000000ULL) | - ((x >> 8) & 0x00000000ff000000ULL) | - ((x >> 24) & 0x0000000000ff0000ULL) | - ((x >> 40) & 0x000000000000ff00ULL) | - ((x >> 56) & 0x00000000000000ffULL); -} -#endif - - -/* ************************************* -* Architecture Macros -***************************************/ -typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; - -/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ -#ifndef 
XXH_CPU_LITTLE_ENDIAN - static const int g_one = 1; -# define XXH_CPU_LITTLE_ENDIAN (*(const char*)(&g_one)) -#endif - - -/* *************************** -* Memory reads -*****************************/ -typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; - -FORCE_INLINE_TEMPLATE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) -{ - if (align==XXH_unaligned) - return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); - else - return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr); -} - -FORCE_INLINE_TEMPLATE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) -{ - return XXH_readLE32_align(ptr, endian, XXH_unaligned); -} - -static U32 XXH_readBE32(const void* ptr) -{ - return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); -} - -FORCE_INLINE_TEMPLATE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) -{ - if (align==XXH_unaligned) - return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); - else - return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr); -} - -FORCE_INLINE_TEMPLATE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) -{ - return XXH_readLE64_align(ptr, endian, XXH_unaligned); -} - -static U64 XXH_readBE64(const void* ptr) -{ - return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); -} - - -/* ************************************* -* Macros -***************************************/ -#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */ - - -/* ************************************* -* Constants -***************************************/ -static const U32 PRIME32_1 = 2654435761U; -static const U32 PRIME32_2 = 2246822519U; -static const U32 PRIME32_3 = 3266489917U; -static const U32 PRIME32_4 = 668265263U; -static const U32 PRIME32_5 = 374761393U; - -static const U64 PRIME64_1 = 11400714785074694791ULL; -static const U64 PRIME64_2 = 14029467366897019727ULL; -static const U64 PRIME64_3 = 1609587929392839161ULL; -static const U64 PRIME64_4 = 9650029242287828579ULL; -static const U64 PRIME64_5 = 2870177450012600261ULL; - -XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } - - -/* ************************** -* Utils -****************************/ -XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dstState, const XXH32_state_t* restrict srcState) -{ - memcpy(dstState, srcState, sizeof(*dstState)); -} - -XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dstState, const XXH64_state_t* restrict srcState) -{ - memcpy(dstState, srcState, sizeof(*dstState)); -} - - -/* *************************** -* Simple Hash Functions -*****************************/ - -static U32 XXH32_round(U32 seed, U32 input) -{ - seed += input * PRIME32_2; - seed = XXH_rotl32(seed, 13); - seed *= PRIME32_1; - return seed; -} - -FORCE_INLINE_TEMPLATE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align) -{ - const BYTE* p = (const BYTE*)input; - const BYTE* bEnd = p + len; - U32 h32; -#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) - -#ifdef XXH_ACCEPT_NULL_INPUT_POINTER - if (p==NULL) { - len=0; - bEnd=p=(const BYTE*)(size_t)16; - } -#endif - - if (len>=16) 
{ - const BYTE* const limit = bEnd - 16; - U32 v1 = seed + PRIME32_1 + PRIME32_2; - U32 v2 = seed + PRIME32_2; - U32 v3 = seed + 0; - U32 v4 = seed - PRIME32_1; - - do { - v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4; - v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4; - v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4; - v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4; - } while (p<=limit); - - h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); - } else { - h32 = seed + PRIME32_5; - } - - h32 += (U32) len; - - while (p+4<=bEnd) { - h32 += XXH_get32bits(p) * PRIME32_3; - h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; - p+=4; - } - - while (p> 15; - h32 *= PRIME32_2; - h32 ^= h32 >> 13; - h32 *= PRIME32_3; - h32 ^= h32 >> 16; - - return h32; -} - - -XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed) -{ -#if 0 - /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ - XXH32_CREATESTATE_STATIC(state); - XXH32_reset(state, seed); - XXH32_update(state, input, len); - return XXH32_digest(state); -#else - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - - if (XXH_FORCE_ALIGN_CHECK) { - if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); - else - return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); - } } - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); - else - return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); -#endif -} - - -static U64 XXH64_round(U64 acc, U64 input) -{ - acc += input * PRIME64_2; - acc = XXH_rotl64(acc, 31); - acc *= PRIME64_1; - return acc; -} - -static U64 XXH64_mergeRound(U64 acc, U64 val) -{ - val = 
XXH64_round(0, val); - acc ^= val; - acc = acc * PRIME64_1 + PRIME64_4; - return acc; -} - -FORCE_INLINE_TEMPLATE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align) -{ - const BYTE* p = (const BYTE*)input; - const BYTE* const bEnd = p + len; - U64 h64; -#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) - -#ifdef XXH_ACCEPT_NULL_INPUT_POINTER - if (p==NULL) { - len=0; - bEnd=p=(const BYTE*)(size_t)32; - } -#endif - - if (len>=32) { - const BYTE* const limit = bEnd - 32; - U64 v1 = seed + PRIME64_1 + PRIME64_2; - U64 v2 = seed + PRIME64_2; - U64 v3 = seed + 0; - U64 v4 = seed - PRIME64_1; - - do { - v1 = XXH64_round(v1, XXH_get64bits(p)); p+=8; - v2 = XXH64_round(v2, XXH_get64bits(p)); p+=8; - v3 = XXH64_round(v3, XXH_get64bits(p)); p+=8; - v4 = XXH64_round(v4, XXH_get64bits(p)); p+=8; - } while (p<=limit); - - h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); - h64 = XXH64_mergeRound(h64, v1); - h64 = XXH64_mergeRound(h64, v2); - h64 = XXH64_mergeRound(h64, v3); - h64 = XXH64_mergeRound(h64, v4); - - } else { - h64 = seed + PRIME64_5; - } - - h64 += (U64) len; - - while (p+8<=bEnd) { - U64 const k1 = XXH64_round(0, XXH_get64bits(p)); - h64 ^= k1; - h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; - p+=8; - } - - if (p+4<=bEnd) { - h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; - h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; - p+=4; - } - - while (p> 33; - h64 *= PRIME64_2; - h64 ^= h64 >> 29; - h64 *= PRIME64_3; - h64 ^= h64 >> 32; - - return h64; -} - - -XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed) -{ -#if 0 - /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ - XXH64_CREATESTATE_STATIC(state); - XXH64_reset(state, seed); - XXH64_update(state, input, len); - return XXH64_digest(state); -#else - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - - if 
(XXH_FORCE_ALIGN_CHECK) { - if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); - else - return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); - } } - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); - else - return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); -#endif -} - - -/* ************************************************** -* Advanced Hash Functions -****************************************************/ - -XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) -{ - return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); -} -XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) -{ - XXH_free(statePtr); - return XXH_OK; -} - -XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) -{ - return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); -} -XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) -{ - XXH_free(statePtr); - return XXH_OK; -} - - -/*** Hash feed ***/ - -XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed) -{ - XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ - memset(&state, 0, sizeof(state)-4); /* do not write into reserved, for future removal */ - state.v1 = seed + PRIME32_1 + PRIME32_2; - state.v2 = seed + PRIME32_2; - state.v3 = seed + 0; - state.v4 = seed - PRIME32_1; - memcpy(statePtr, &state, sizeof(state)); - return XXH_OK; -} - - -XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed) -{ - XXH64_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ - memset(&state, 0, sizeof(state)-8); /* do not write into reserved, for 
future removal */ - state.v1 = seed + PRIME64_1 + PRIME64_2; - state.v2 = seed + PRIME64_2; - state.v3 = seed + 0; - state.v4 = seed - PRIME64_1; - memcpy(statePtr, &state, sizeof(state)); - return XXH_OK; -} - - -FORCE_INLINE_TEMPLATE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) -{ - const BYTE* p = (const BYTE*)input; - const BYTE* const bEnd = p + len; - -#ifdef XXH_ACCEPT_NULL_INPUT_POINTER - if (input==NULL) return XXH_ERROR; -#endif - - state->total_len_32 += (unsigned)len; - state->large_len |= (len>=16) | (state->total_len_32>=16); - - if (state->memsize + len < 16) { /* fill in tmp buffer */ - XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); - state->memsize += (unsigned)len; - return XXH_OK; - } - - if (state->memsize) { /* some data left from previous update */ - XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); - { const U32* p32 = state->mem32; - state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++; - state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++; - state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++; - state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); p32++; - } - p += 16-state->memsize; - state->memsize = 0; - } - - if (p <= bEnd-16) { - const BYTE* const limit = bEnd - 16; - U32 v1 = state->v1; - U32 v2 = state->v2; - U32 v3 = state->v3; - U32 v4 = state->v4; - - do { - v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4; - v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4; - v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4; - v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4; - } while (p<=limit); - - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; - } - - if (p < bEnd) { - XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); - state->memsize = (unsigned)(bEnd-p); - } - - return XXH_OK; -} - -XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* 
state_in, const void* input, size_t len) -{ - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH32_update_endian(state_in, input, len, XXH_littleEndian); - else - return XXH32_update_endian(state_in, input, len, XXH_bigEndian); -} - - - -FORCE_INLINE_TEMPLATE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian) -{ - const BYTE * p = (const BYTE*)state->mem32; - const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize; - U32 h32; - - if (state->large_len) { - h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); - } else { - h32 = state->v3 /* == seed */ + PRIME32_5; - } - - h32 += state->total_len_32; - - while (p+4<=bEnd) { - h32 += XXH_readLE32(p, endian) * PRIME32_3; - h32 = XXH_rotl32(h32, 17) * PRIME32_4; - p+=4; - } - - while (p> 15; - h32 *= PRIME32_2; - h32 ^= h32 >> 13; - h32 *= PRIME32_3; - h32 ^= h32 >> 16; - - return h32; -} - - -XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in) -{ - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH32_digest_endian(state_in, XXH_littleEndian); - else - return XXH32_digest_endian(state_in, XXH_bigEndian); -} - - - -/* **** XXH64 **** */ - -FORCE_INLINE_TEMPLATE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian) -{ - const BYTE* p = (const BYTE*)input; - const BYTE* const bEnd = p + len; - -#ifdef XXH_ACCEPT_NULL_INPUT_POINTER - if (input==NULL) return XXH_ERROR; -#endif - - state->total_len += len; - - if (state->memsize + len < 32) { /* fill in tmp buffer */ - XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); - state->memsize += (U32)len; - return XXH_OK; - } - - if (state->memsize) { /* tmp buffer is full */ - 
XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize); - state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian)); - state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian)); - state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian)); - state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian)); - p += 32-state->memsize; - state->memsize = 0; - } - - if (p+32 <= bEnd) { - const BYTE* const limit = bEnd - 32; - U64 v1 = state->v1; - U64 v2 = state->v2; - U64 v3 = state->v3; - U64 v4 = state->v4; - - do { - v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8; - v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8; - v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8; - v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8; - } while (p<=limit); - - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; - } - - if (p < bEnd) { - XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); - state->memsize = (unsigned)(bEnd-p); - } - - return XXH_OK; -} - -XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len) -{ - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH64_update_endian(state_in, input, len, XXH_littleEndian); - else - return XXH64_update_endian(state_in, input, len, XXH_bigEndian); -} - - - -FORCE_INLINE_TEMPLATE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian) -{ - const BYTE * p = (const BYTE*)state->mem64; - const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize; - U64 h64; - - if (state->total_len >= 32) { - U64 const v1 = state->v1; - U64 const v2 = state->v2; - U64 const v3 = state->v3; - U64 const v4 = state->v4; - - h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); - h64 = XXH64_mergeRound(h64, v1); - h64 = XXH64_mergeRound(h64, v2); - h64 = 
XXH64_mergeRound(h64, v3); - h64 = XXH64_mergeRound(h64, v4); - } else { - h64 = state->v3 + PRIME64_5; - } - - h64 += (U64) state->total_len; - - while (p+8<=bEnd) { - U64 const k1 = XXH64_round(0, XXH_readLE64(p, endian)); - h64 ^= k1; - h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; - p+=8; - } - - if (p+4<=bEnd) { - h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1; - h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; - p+=4; - } - - while (p> 33; - h64 *= PRIME64_2; - h64 ^= h64 >> 29; - h64 *= PRIME64_3; - h64 ^= h64 >> 32; - - return h64; -} - - -XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in) -{ - XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; - - if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) - return XXH64_digest_endian(state_in, XXH_littleEndian); - else - return XXH64_digest_endian(state_in, XXH_bigEndian); -} - - -/* ************************** -* Canonical representation -****************************/ - -/*! Default XXH result types are basic unsigned 32 and 64 bits. -* The canonical representation follows human-readable write convention, aka big-endian (large digits first). -* These functions allow transformation of hash result into and from its canonical format. -* This way, hash values can be written into a file or buffer, and remain comparable across different systems and programs. 
-*/ - -XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) -{ - XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); - if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); - memcpy(dst, &hash, sizeof(*dst)); -} - -XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) -{ - XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); - if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); - memcpy(dst, &hash, sizeof(*dst)); -} - -XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) -{ - return XXH_readBE32(src); -} - -XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) -{ - return XXH_readBE64(src); -} +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/xxhash.h b/vendor/github.com/DataDog/zstd/xxhash.h index 9bad1f5..e7d9abc 100644 --- a/vendor/github.com/DataDog/zstd/xxhash.h +++ b/vendor/github.com/DataDog/zstd/xxhash.h @@ -1,100 +1,320 @@ +#ifndef USE_EXTERNAL_ZSTD /* - xxHash - Extremely Fast Hash algorithm - Header File - Copyright (C) 2012-2016, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - xxHash source repository : https://github.com/Cyan4973/xxHash -*/ - -/* Notice extracted from xxHash homepage : - -xxHash is an extremely fast Hash algorithm, running at RAM speed limits. -It also successfully passes all tests from the SMHasher suite. - -Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) - -Name Speed Q.Score Author -xxHash 5.4 GB/s 10 -CrapWow 3.2 GB/s 2 Andrew -MumurHash 3a 2.7 GB/s 10 Austin Appleby -SpookyHash 2.0 GB/s 10 Bob Jenkins -SBox 1.4 GB/s 9 Bret Mulvey -Lookup3 1.2 GB/s 9 Bob Jenkins -SuperFastHash 1.2 GB/s 1 Paul Hsieh -CityHash64 1.05 GB/s 10 Pike & Alakuijala -FNV 0.55 GB/s 5 Fowler, Noll, Vo -CRC32 0.43 GB/s 9 -MD5-32 0.33 GB/s 10 Ronald L. Rivest -SHA1-32 0.28 GB/s 10 - -Q.Score is a measure of quality of the hash function. -It depends on successfully passing SMHasher test set. -10 is a perfect score. - -A 64-bits version, named XXH64, is available since r35. -It offers much better speed, but for 64-bits applications only. 
-Name Speed on 64 bits Speed on 32 bits -XXH64 13.8 GB/s 1.9 GB/s -XXH32 6.8 GB/s 6.0 GB/s -*/ + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (c) Yann Collet - Meta Platforms, Inc + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ -#if defined (__cplusplus) -extern "C" { +/* Local adaptations for Zstandard */ + +#ifndef XXH_NO_XXH3 +# define XXH_NO_XXH3 #endif -#ifndef XXHASH_H_5627135585666179 -#define XXHASH_H_5627135585666179 1 +#ifndef XXH_NAMESPACE +# define XXH_NAMESPACE ZSTD_ +#endif +/*! + * @mainpage xxHash + * + * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed + * limits. + * + * It is proposed in four flavors, in three families: + * 1. @ref XXH32_family + * - Classic 32-bit hash function. Simple, compact, and runs on almost all + * 32-bit and 64-bit systems. + * 2. @ref XXH64_family + * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most + * 64-bit systems (but _not_ 32-bit systems). + * 3. @ref XXH3_family + * - Modern 64-bit and 128-bit hash function family which features improved + * strength and performance across the board, especially on smaller data. + * It benefits greatly from SIMD and 64-bit without requiring it. + * + * Benchmarks + * --- + * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. + * The open source benchmark program is compiled with clang v10.0 using -O3 flag. 
+ * + * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity | + * | -------------------- | ------- | ----: | ---------------: | ------------------: | + * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 | + * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 | + * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 | + * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 | + * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 | + * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 | + * | RAM sequential read | | N/A | 28.0 GB/s | N/A | + * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 | + * | City64 | | 64 | 22.0 GB/s | 76.6 | + * | T1ha2 | | 64 | 22.0 GB/s | 99.0 | + * | City128 | | 128 | 21.7 GB/s | 57.7 | + * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 | + * | XXH64() | | 64 | 19.4 GB/s | 71.0 | + * | SpookyHash | | 64 | 19.3 GB/s | 53.2 | + * | Mum | | 64 | 18.0 GB/s | 67.0 | + * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 | + * | XXH32() | | 32 | 9.7 GB/s | 71.9 | + * | City32 | | 32 | 9.1 GB/s | 66.0 | + * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 | + * | Murmur3 | | 32 | 3.9 GB/s | 56.1 | + * | SipHash* | | 64 | 3.0 GB/s | 43.2 | + * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 | + * | HighwayHash | | 64 | 1.4 GB/s | 6.0 | + * | FNV64 | | 64 | 1.2 GB/s | 62.7 | + * | Blake2* | | 256 | 1.1 GB/s | 5.1 | + * | SHA1* | | 160 | 0.8 GB/s | 5.6 | + * | MD5* | | 128 | 0.6 GB/s | 7.8 | + * @note + * - Hashes which require a specific ISA extension are noted. SSE2 is also noted, + * even though it is mandatory on x64. + * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic + * by modern standards. + * - Small data velocity is a rough average of algorithm's efficiency for small + * data. For more accurate information, see the wiki. + * - More benchmarks and strength tests are found on the wiki: + * https://github.com/Cyan4973/xxHash/wiki + * + * Usage + * ------ + * All xxHash variants use a similar API. 
Changing the algorithm is a trivial + * substitution. + * + * @pre + * For functions which take an input and length parameter, the following + * requirements are assumed: + * - The range from [`input`, `input + length`) is valid, readable memory. + * - The only exception is if the `length` is `0`, `input` may be `NULL`. + * - For C++, the objects must have the *TriviallyCopyable* property, as the + * functions access bytes directly as if it was an array of `unsigned char`. + * + * @anchor single_shot_example + * **Single Shot** + * + * These functions are stateless functions which hash a contiguous block of memory, + * immediately returning the result. They are the easiest and usually the fastest + * option. + * + * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits() + * + * @code{.c} + * #include + * #include "xxhash.h" + * + * // Example for a function which hashes a null terminated string with XXH32(). + * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed) + * { + * // NULL pointers are only valid if the length is zero + * size_t length = (string == NULL) ? 0 : strlen(string); + * return XXH32(string, length, seed); + * } + * @endcode + * + * + * @anchor streaming_example + * **Streaming** + * + * These groups of functions allow incremental hashing of unknown size, even + * more than what would fit in a size_t. + * + * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset() + * + * @code{.c} + * #include + * #include + * #include "xxhash.h" + * // Example for a function which hashes a FILE incrementally with XXH3_64bits(). + * XXH64_hash_t hashFile(FILE* f) + * { + * // Allocate a state struct. Do not just use malloc() or new. + * XXH3_state_t* state = XXH3_createState(); + * assert(state != NULL && "Out of memory!"); + * // Reset the state to start a new hashing session. 
+ * XXH3_64bits_reset(state); + * char buffer[4096]; + * size_t count; + * // Read the file in chunks + * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { + * // Run update() as many times as necessary to process the data + * XXH3_64bits_update(state, buffer, count); + * } + * // Retrieve the finalized hash. This will not change the state. + * XXH64_hash_t result = XXH3_64bits_digest(state); + * // Free the state. Do not use free(). + * XXH3_freeState(state); + * return result; + * } + * @endcode + * + * Streaming functions generate the xxHash value from an incremental input. + * This method is slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * An XXH state must first be allocated using `XXH*_createState()`. + * + * Start a new hash by initializing the state with a seed using `XXH*_reset()`. + * + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. + * + * The function returns an error code, with 0 meaning OK, and any other value + * meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a + * digest, and generate new hash values later on by invoking `XXH*_digest()`. + * + * When done, release the state using `XXH*_freeState()`. + * + * + * @anchor canonical_representation_example + * **Canonical Representation** + * + * The default return values from XXH functions are unsigned 32, 64 and 128 bit + * integers. + * This the simplest and fastest format for further post-processing. + * + * However, this leaves open the question of what is the order on the byte level, + * since little and big endian conventions will store the same number differently. 
+ * + * The canonical representation settles this issue by mandating big-endian + * convention, the same convention as human-readable numbers (large digits first). + * + * When writing hash values to storage, sending them over a network, or printing + * them, it's highly recommended to use the canonical representation to ensure + * portability across a wider range of systems, present and future. + * + * The following functions allow transformation of hash values to and from + * canonical format. + * + * XXH32_canonicalFromHash(), XXH32_hashFromCanonical(), + * XXH64_canonicalFromHash(), XXH64_hashFromCanonical(), + * XXH128_canonicalFromHash(), XXH128_hashFromCanonical(), + * + * @code{.c} + * #include + * #include "xxhash.h" + * + * // Example for a function which prints XXH32_hash_t in human readable format + * void printXxh32(XXH32_hash_t hash) + * { + * XXH32_canonical_t cano; + * XXH32_canonicalFromHash(&cano, hash); + * size_t i; + * for(i = 0; i < sizeof(cano.digest); ++i) { + * printf("%02x", cano.digest[i]); + * } + * printf("\n"); + * } + * + * // Example for a function which converts XXH32_canonical_t to XXH32_hash_t + * XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano) + * { + * XXH32_hash_t hash = XXH32_hashFromCanonical(&cano); + * return hash; + * } + * @endcode + * + * + * @file xxhash.h + * xxHash prototypes and implementation + */ /* **************************** -* Definitions -******************************/ -#include /* size_t */ -typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; + * INLINE mode + ******************************/ +/*! + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Gives access to internal state declaration, required for static allocation. + * + * Incompatible with dynamic linking, due to risks of ABI changes. 
+ * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #include "xxhash.h" + * @endcode + */ +# define XXH_STATIC_LINKING_ONLY +/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */ +/*! + * @brief Gives access to internal definitions. + * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #define XXH_IMPLEMENTATION + * #include "xxhash.h" + * @endcode + */ +# define XXH_IMPLEMENTATION +/* Do not undef XXH_IMPLEMENTATION for Doxygen */ -/* **************************** -* API modifier -******************************/ -/** XXH_PRIVATE_API -* This is useful if you want to include xxhash functions in `static` mode -* in order to inline them, and remove their symbol from the public list. -* Methodology : -* #define XXH_PRIVATE_API -* #include "xxhash.h" -* `xxhash.c` is automatically included. -* It's not useful to compile and link it as a separate module anymore. -*/ -#ifdef XXH_PRIVATE_API -# ifndef XXH_STATIC_LINKING_ONLY -# define XXH_STATIC_LINKING_ONLY -# endif +/*! + * @brief Exposes the implementation and marks all functions as `inline`. + * + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * @code{.c} + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * @endcode + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +# define XXH_INLINE_ALL +# undef XXH_INLINE_ALL +/*! + * @brief Exposes the implementation without marking functions as inline. + */ +# define XXH_PRIVATE_API +# undef XXH_PRIVATE_API +/*! + * @brief Emulate a namespace by transparently prefixing all symbols. 
+ * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. + */ +# define XXH_NAMESPACE /* YOUR NAME HERE */ +# undef XXH_NAMESPACE +#endif + +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API # if defined(__GNUC__) # define XXH_PUBLIC_API static __inline __attribute__((unused)) # elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) @@ -102,204 +322,6776 @@ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; # elif defined(_MSC_VER) # define XXH_PUBLIC_API static __inline # else -# define XXH_PUBLIC_API static /* this version may generate warnings for unused static functions; disable the relevant warning */ + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static # endif -#else -# define XXH_PUBLIC_API /* do nothing */ -#endif /* XXH_PRIVATE_API */ -/*!XXH_NAMESPACE, aka Namespace Emulation : + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, + * such as part of some previously included *.h header file. 
+ * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ + /* Before that, we unconditionally #undef all symbols, + * in case they were already defined with XXH_NAMESPACE. + * They will then be redefined for XXH_INLINE_ALL + */ +# undef XXH_versionNumber + /* XXH32 */ +# undef XXH32 +# undef XXH32_createState +# undef XXH32_freeState +# undef XXH32_reset +# undef XXH32_update +# undef XXH32_digest +# undef XXH32_copyState +# undef XXH32_canonicalFromHash +# undef XXH32_hashFromCanonical + /* XXH64 */ +# undef XXH64 +# undef XXH64_createState +# undef XXH64_freeState +# undef XXH64_reset +# undef XXH64_update +# undef XXH64_digest +# undef XXH64_copyState +# undef XXH64_canonicalFromHash +# undef XXH64_hashFromCanonical + /* XXH3_64bits */ +# undef XXH3_64bits +# undef XXH3_64bits_withSecret +# undef XXH3_64bits_withSeed +# undef XXH3_64bits_withSecretandSeed +# undef XXH3_createState +# undef XXH3_freeState +# undef XXH3_copyState +# undef XXH3_64bits_reset +# undef XXH3_64bits_reset_withSeed +# undef XXH3_64bits_reset_withSecret +# undef XXH3_64bits_update +# undef XXH3_64bits_digest +# undef XXH3_generateSecret + /* XXH3_128bits */ +# undef XXH128 +# undef XXH3_128bits +# undef XXH3_128bits_withSeed +# undef XXH3_128bits_withSecret +# undef XXH3_128bits_reset +# undef XXH3_128bits_reset_withSeed +# undef XXH3_128bits_reset_withSecret +# undef XXH3_128bits_reset_withSecretandSeed +# undef XXH3_128bits_update +# undef XXH3_128bits_digest +# undef XXH128_isEqual +# undef XXH128_cmp +# undef XXH128_canonicalFromHash +# undef XXH128_hashFromCanonical + /* Finally, free the namespace itself */ +# undef XXH_NAMESPACE + + /* employ the namespace for XXH_INLINE_ALL */ +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not 
symbols, + * but they must nonetheless be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and has a more dispersed impact. + * Meanwhile, renaming can be achieved in a single place. + */ +# define XXH_IPREF(Id) XXH_NAMESPACE ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ -If you want to include _and expose_ xxHash functions from within your own library, -but also want to avoid symbol collisions with another library which also includes xxHash, +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 -you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library -with the value of XXH_NAMESPACE (so avoid to keep it NULL and avoid numeric values). +/*! @brief Marks a global symbol. 
*/ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif -Note that no change is required within the calling program as long as it includes `xxhash.h` : -regular symbol name will be automatically translated by this header. -*/ #ifdef XXH_NAMESPACE # define XXH_CAT(A,B) A##B # define XXH_NAME2(A,B) XXH_CAT(A,B) -# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) -# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) # define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +/* XXH32 */ +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) # define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) -# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) # define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) -# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) # define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) -# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) # define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) -# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) # define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) -# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) # define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) -# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) # define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) -# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) # define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +/* XXH64 */ +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, 
XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) # define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +/* XXH3_64bits */ +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) +# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed) +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) +# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed) +/* XXH3_128bits */ +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, 
XXH3_128bits_withSecret) +# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed) +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + + +/* ************************************* +* Compiler specifics +***************************************/ + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif #endif +#if defined (__GNUC__) +# define XXH_CONSTF __attribute__((const)) +# define XXH_PUREF __attribute__((pure)) +# define XXH_MALLOCF __attribute__((malloc)) +#else +# define XXH_CONSTF /* disable */ +# define XXH_PUREF +# define XXH_MALLOCF +#endif /* ************************************* * Version ***************************************/ #define XXH_VERSION_MAJOR 0 -#define XXH_VERSION_MINOR 6 +#define XXH_VERSION_MINOR 8 #define XXH_VERSION_RELEASE 2 +/*! 
@brief Version number, encoded as two digits each */ #define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) -XXH_PUBLIC_API unsigned XXH_versionNumber (void); +#if defined (__cplusplus) +extern "C" { +#endif +/*! + * @brief Obtains the xxHash version. + * + * This is mostly useful when xxHash is compiled as a shared library, + * since the returned value comes from the library, as opposed to header file. + * + * @return @ref XXH_VERSION_NUMBER of the invoked library. + */ +XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void); + +#if defined (__cplusplus) +} +#endif /* **************************** -* Simple Hash Functions +* Common basic types ******************************/ -typedef unsigned int XXH32_hash_t; -typedef unsigned long long XXH64_hash_t; +#include /* size_t */ +/*! + * @brief Exit code for the streaming API. + */ +typedef enum { + XXH_OK = 0, /*!< OK */ + XXH_ERROR /*!< Error */ +} XXH_errorcode; -XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed); -XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed); +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* Don't show include */ /*! -XXH32() : - Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input". - The memory between input & input+length must be valid (allocated and read-accessible). - "seed" can be used to alter the result predictably. - Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s -XXH64() : - Calculate the 64-bits hash of sequence of length "len" stored at memory address "input". - "seed" can be used to alter the result predictably. - This function runs 2x faster on 64-bits systems, but slower on 32-bits systems (see benchmark). 
-*/ + * @brief An unsigned 32-bit integer. + * + * Not necessarily defined to `uint32_t` but functionally equivalent. + */ +typedef uint32_t XXH32_hash_t; +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# ifdef _AIX +# include +# else +# include +# endif + typedef uint32_t XXH32_hash_t; -/* **************************** -* Streaming Hash Functions -******************************/ -typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ -typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +#else +# include +# if UINT_MAX == 0xFFFFFFFFUL + typedef unsigned int XXH32_hash_t; +# elif ULONG_MAX == 0xFFFFFFFFUL + typedef unsigned long XXH32_hash_t; +# else +# error "unsupported platform: need a 32-bit type" +# endif +#endif + +#if defined (__cplusplus) +extern "C" { +#endif -/*! State allocation, compatible with dynamic libraries */ +/*! + * @} + * + * @defgroup XXH32_family XXH32 family + * @ingroup public + * Contains functions used in the classic 32-bit xxHash algorithm. + * + * @note + * XXH32 is useful for older platforms, with no or poor 64-bit performance. + * Note that the @ref XXH3_family provides competitive speed for both 32-bit + * and 64-bit systems, and offers true 64/128 bit hash results. + * + * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families + * @see @ref XXH32_impl for implementation details + * @{ + */ -XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); -XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +/*! + * @brief Calculates the 32-bit hash of @p input using xxHash32. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 32-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. 
However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 32-bit xxHash32 value. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); -XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); -XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +#ifndef XXH_NO_STREAM +/*! + * @typedef struct XXH32_state_s XXH32_state_t + * @brief The opaque state struct for the XXH32 streaming API. + * + * @see XXH32_state_s for details. + */ +typedef struct XXH32_state_s XXH32_state_t; +/*! + * @brief Allocates an @ref XXH32_state_t. + * + * @return An allocated pointer of @ref XXH32_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH32_freeState(). + */ +XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void); +/*! + * @brief Frees an @ref XXH32_state_t. + * + * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState(). + * + * @return @ref XXH_OK. + * + * @note @p statePtr must be allocated with XXH32_createState(). + * + */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +/*! + * @brief Copies one @ref XXH32_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); -/* hash streaming */ +/*! + * @brief Resets an @ref XXH32_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 32-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. 
+ * + * @note This function resets and seeds a state. Call it before @ref XXH32_update(). + */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); -XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned int seed); +/*! + * @brief Consumes a block of @p input to an @ref XXH32_state_t. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + */ XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); -XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); -XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed); -XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); -XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); +/*! + * @brief Returns the calculated hash value from an @ref XXH32_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated 32-bit xxHash32 value from that state. + * + * @note + * Calling XXH32_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! 
+ * @brief Canonical (big endian) representation of @ref XXH32_hash_t. + */ +typedef struct { + unsigned char digest[4]; /*!< Hash bytes, big endian */ +} XXH32_canonical_t; + +/*! + * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. + * + * @param dst The @ref XXH32_canonical_t pointer to be stored to. + * @param hash The @ref XXH32_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); + +/*! + * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t. + * + * @param src The @ref XXH32_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + + +/*! @cond Doxygen ignores this part */ +#ifdef __has_attribute +# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x) +#else +# define XXH_HAS_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ +/*! @cond Doxygen ignores this part */ /* -These functions generate the xxHash of an input provided in multiple segments. -Note that, for small input, they are slower than single-call functions, due to state management. -For small input, prefer `XXH32()` and `XXH64()` . + * C23 __STDC_VERSION__ number hasn't been specified yet. For now + * leave as `201711L` (C17 + 1). + * TODO: Update to correct value when its been specified. + */ +#define XXH_C23_VN 201711L +/*! @endcond */ -XXH state must first be allocated, using XXH*_createState() . +/*! @cond Doxygen ignores this part */ +/* C-language Attributes are added in C23. 
*/ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute) +# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) +#else +# define XXH_HAS_C_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ -Start a new hash by initializing state with a seed, using XXH*_reset(). +/*! @cond Doxygen ignores this part */ +#if defined(__cplusplus) && defined(__has_cpp_attribute) +# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +# define XXH_HAS_CPP_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ -Then, feed the hash state by calling XXH*_update() as many times as necessary. -Obviously, input must be allocated and read accessible. -The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. +/*! @cond Doxygen ignores this part */ +/* + * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute + * introduced in CPP17 and C23. + * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough + * C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough + */ +#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough) +# define XXH_FALLTHROUGH [[fallthrough]] +#elif XXH_HAS_ATTRIBUTE(__fallthrough__) +# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__)) +#else +# define XXH_FALLTHROUGH /* fallthrough */ +#endif +/*! @endcond */ -Finally, a hash value can be produced anytime, by using XXH*_digest(). -This function returns the nn-bits hash as an int or long long. +/*! @cond Doxygen ignores this part */ +/* + * Define XXH_NOESCAPE for annotated pointers in public API. + * https://clang.llvm.org/docs/AttributeReference.html#noescape + * As of writing this, only supported by clang. + */ +#if XXH_HAS_ATTRIBUTE(noescape) +# define XXH_NOESCAPE __attribute__((noescape)) +#else +# define XXH_NOESCAPE +#endif +/*! 
@endcond */ -It's still possible to continue inserting input into the hash state after a digest, -and generate some new hashes later on, by calling again XXH*_digest(). +#if defined (__cplusplus) +} /* end of extern "C" */ +#endif -When done, free XXH state space if it was allocated dynamically. -*/ +/*! + * @} + * @ingroup public + * @{ + */ +#ifndef XXH_NO_LONG_LONG +/*-********************************************************************** +* 64-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* don't include */ +/*! + * @brief An unsigned 64-bit integer. + * + * Not necessarily defined to `uint64_t` but functionally equivalent. + */ +typedef uint64_t XXH64_hash_t; +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# ifdef _AIX +# include +# else +# include +# endif + typedef uint64_t XXH64_hash_t; +#else +# include +# if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL + /* LP64 ABI says uint64_t is unsigned long */ + typedef unsigned long XXH64_hash_t; +# else + /* the following type must have a width of 64-bit */ + typedef unsigned long long XXH64_hash_t; +# endif +#endif -/* ************************** -* Utils -****************************/ -#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* ! C99 */ -# define restrict /* disable restrict */ +#if defined (__cplusplus) +extern "C" { #endif +/*! + * @} + * + * @defgroup XXH64_family XXH64 family + * @ingroup public + * @{ + * Contains functions used in the classic 64-bit xxHash algorithm. + * + * @note + * XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. + * It provides better speed for systems with vector processing capabilities. + */ + +/*! + * @brief Calculates the 64-bit hash of @p input using xxHash64. 
+ * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit xxHash64 value. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); -XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dst_state, const XXH32_state_t* restrict src_state); -XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dst_state, const XXH64_state_t* restrict src_state); +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/*! + * @brief The opaque state struct for the XXH64 streaming API. + * + * @see XXH64_state_s for details. + */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +/*! + * @brief Allocates an @ref XXH64_state_t. + * + * @return An allocated pointer of @ref XXH64_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH64_freeState(). + */ +XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); -/* ************************** -* Canonical representation -****************************/ -/* Default result type for XXH functions are primitive unsigned 32 and 64 bits. -* The canonical representation uses human-readable write convention, aka big-endian (large digits first). -* These functions allow transformation of hash result into and from its canonical format. -* This way, hash values can be written into a file / memory, and remain comparable on different systems and programs. 
-*/ -typedef struct { unsigned char digest[4]; } XXH32_canonical_t; -typedef struct { unsigned char digest[8]; } XXH64_canonical_t; +/*! + * @brief Frees an @ref XXH64_state_t. + * + * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState(). + * + * @return @ref XXH_OK. + * + * @note @p statePtr must be allocated with XXH64_createState(). + */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); -XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); -XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +/*! + * @brief Copies one @ref XXH64_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +/*! + * @brief Resets an @ref XXH64_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note This function resets and seeds a state. Call it before @ref XXH64_update(). + */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH64_state_t. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. 
In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + */ +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated hash value from an @ref XXH64_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated 64-bit xxHash64 value from that state. + * + * @note + * Calling XXH64_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ +/******* Canonical representation *******/ + +/*! + * @brief Canonical (big endian) representation of @ref XXH64_hash_t. + */ +typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; + +/*! + * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t. + * + * @param dst The @ref XXH64_canonical_t pointer to be stored to. + * @param hash The @ref XXH64_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash); + +/*! + * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t. + * + * @param src The @ref XXH64_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src); + +#ifndef XXH_NO_XXH3 + +/*! 
+ * @} + * ************************************************************************ + * @defgroup XXH3_family XXH3 family + * @ingroup public + * @{ + * + * XXH3 is a more recent hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * Compared to XXH64, expect XXH3 to run approximately + * ~2x faster on large inputs and >3x faster on small ones, + * exact differences vary depending on platform. + * + * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, + * but does not require it. + * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 + * at competitive speeds, even without vector support. Further details are + * explained in the implementation. + * + * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD + * implementations for many common platforms: + * - AVX512 + * - AVX2 + * - SSE2 + * - ARM NEON + * - WebAssembly SIMD128 + * - POWER8 VSX + * - s390x ZVector + * This can be controlled via the @ref XXH_VECTOR macro, but it automatically + * selects the best version according to predefined macros. For the x86 family, an + * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c. + * + * XXH3 implementation is portable: + * it has a generic C90 formulation that can be compiled on any platform, + * all implementations generate exactly the same hash value on all platforms. + * Starting from v0.8.0, it's also labelled "stable", meaning that + * any future version will also generate the same hash value. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * + * When only 64 bits are needed, prefer invoking the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. 
+ * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ +/*-********************************************************************** +* XXH3 64-bit variant +************************************************************************/ + +/*! + * @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @note + * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see + * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Calculates 64-bit seeded variant of XXH3 hash of @p input. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @note + * seed == 0 produces the same results as @ref XXH3_64bits(). 
+ * + * This variant generates a custom secret on the fly based on default secret + * altered using the @p seed value. + * + * While this operation is decently fast, note that it's not completely free. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); + +/*! + * The bare minimum size for a custom secret. + * + * @see + * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), + * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). + */ +#define XXH3_SECRET_SIZE_MIN 136 + +/*! + * @brief Calculates 64-bit variant of XXH3 with a custom "secret". + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @pre + * The memory between @p data and @p data + @p len must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p data may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing @ref XXH3_generateSecret() instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. 
+ * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); + + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + */ + +/*! + * @brief The opaque state struct for the XXH3 streaming API. + * + * @see XXH3_state_s for details. + */ +typedef struct XXH3_state_s XXH3_state_t; +XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); + +/*! + * @brief Copies one @ref XXH3_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state); + +/*! + * @brief Resets an @ref XXH3_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret with default parameters. + * - Call this function before @ref XXH3_64bits_update(). + * - Digest will be equivalent to `XXH3_64bits()`. 
+ * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); + +/*! + * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret from `seed`. + * - Call this function before @ref XXH3_64bits_update(). + * - Digest will be equivalent to `XXH3_64bits_withSeed()`. + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); + +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * `secret` is referenced, it _must outlive_ the hash streaming session. + * + * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); + +/*! + * @brief Consumes a block of @p input to an @ref XXH3_state_t. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. 
+ * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated XXH3 64-bit hash value from that state. + * + * @note + * Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* note : canonical representation of XXH3 is the same as XXH64 + * since they both produce XXH64_hash_t values */ + + +/*-********************************************************************** +* XXH3 128-bit variant +************************************************************************/ + +/*! + * @brief The return value from 128-bit hashes. + * + * Stored in little endian order, although the fields themselves are in native + * endianness. + */ +typedef struct { + XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */ + XXH64_hash_t high64; /*!< `value >> 64` */ +} XXH128_hash_t; + +/*! + * @brief Calculates 128-bit unseeded variant of XXH3 of @p data. + * + * @param data The block of data to be hashed, at least @p length bytes in size. + * @param len The length of @p data, in bytes. + * + * @return The calculated 128-bit variant of XXH3 value. 
+ * + * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead + * for shorter inputs. + * + * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len); +/*! @brief Calculates 128-bit seeded variant of XXH3 hash of @p data. + * + * @param data The block of data to be hashed, at least @p length bytes in size. + * @param len The length of @p data, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @return The calculated 128-bit variant of XXH3 value. + * + * @note + * seed == 0 produces the same results as @ref XXH3_64bits(). + * + * This variant generates a custom secret on the fly based on default secret + * altered using the @p seed value. + * + * While this operation is decently fast, note that it's not completely free. + * + * @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); +/*! + * @brief Calculates 128-bit variant of XXH3 with a custom "secret". + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @return The calculated 128-bit variant of XXH3 value. + * + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. 
+ * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing @ref XXH3_generateSecret() instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. + * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + * + * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits(). + * Use already declared XXH3_createState() and XXH3_freeState(). + * + * All reset and streaming functions have same meaning as their 64-bit counterpart. + */ + +/*! + * @brief Resets an @ref XXH3_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret with default parameters. 
+ * - Call it before @ref XXH3_128bits_update(). + * - Digest will be equivalent to `XXH3_128bits()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); + +/*! + * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret from `seed`. + * - Call it before @ref XXH3_128bits_update(). + * - Digest will be equivalent to `XXH3_128bits_withSeed()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * `secret` is referenced, it _must outlive_ the hash streaming session. + * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); + +/*! + * @brief Consumes a block of @p input to an @ref XXH3_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. 
+ * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated XXH3 128-bit hash value from that state. + * + * @note + * Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* Following helper functions make it possible to compare XXH128_hast_t values. + * Since XXH128_hash_t is a structure, this capability is not offered by the language. + * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * @brief Check equality of two XXH128_hash_t values + * + * @param h1 The 128-bit hash value. + * @param h2 Another 128-bit hash value. + * + * @return `1` if `h1` and `h2` are equal. + * @return `0` if they are not. + */ +XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * @brief Compares two @ref XXH128_hash_t + * + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. 
+ * + * @param h128_1 Left-hand side value + * @param h128_2 Right-hand side value + * + * @return >0 if @p h128_1 > @p h128_2 + * @return =0 if @p h128_1 == @p h128_2 + * @return <0 if @p h128_1 < @p h128_2 + */ +XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2); -XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); -XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); +/******* Canonical representation *******/ +typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t; + + +/*! + * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t. + * + * @param dst The @ref XXH128_canonical_t pointer to be stored to. + * @param hash The @ref XXH128_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash); + +/*! + * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t. + * + * @param src The @ref XXH128_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src); + + +#endif /* !XXH_NO_XXH3 */ + +#if defined (__cplusplus) +} /* extern "C" */ +#endif + +#endif /* XXH_NO_LONG_LONG */ + +/*! + * @} + */ #endif /* XXHASH_H_5627135585666179 */ -/* ================================================================================================ - This section contains definitions which are not guaranteed to remain stable. - They may change in future versions, becoming incompatible with a different version of the library. 
- They shall only be used with static linking. - Never use these definitions in association with dynamic linking ! -=================================================================================================== */ -#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXH_STATIC_H_3543687687345) -#define XXH_STATIC_H_3543687687345 - -/* These definitions are only meant to allow allocation of XXH state - statically, on stack, or in a struct for example. - Do not use members directly. */ - - struct XXH32_state_s { - unsigned total_len_32; - unsigned large_len; - unsigned v1; - unsigned v2; - unsigned v3; - unsigned v4; - unsigned mem32[4]; /* buffer defined as U32 for alignment */ - unsigned memsize; - unsigned reserved; /* never read nor write, will be removed in a future version */ - }; /* typedef'd to XXH32_state_t */ - - struct XXH64_state_s { - unsigned long long total_len; - unsigned long long v1; - unsigned long long v2; - unsigned long long v3; - unsigned long long v4; - unsigned long long mem64[4]; /* buffer defined as U64 for alignment */ - unsigned memsize; - unsigned reserved[2]; /* never read nor write, will be removed in a future version */ - }; /* typedef'd to XXH64_state_t */ - - -# ifdef XXH_PRIVATE_API -# include "xxhash.c" /* include xxhash functions as `static`, for inlining */ -# endif +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! 
+ ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation + * of XXH states, on stack or in a struct, for example. + * Never **ever** access their members directly. + */ + +/*! + * @internal + * @brief Structure for XXH32 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH32_state_t. + * Do not access the members of this struct directly. + * @see XXH64_state_s, XXH3_state_s + */ +struct XXH32_state_s { + XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ + XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */ + XXH32_hash_t v[4]; /*!< Accumulator lanes */ + XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */ + XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +/*! + * @internal + * @brief Structure for XXH64 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH64_state_t. + * Do not access the members of this struct directly. + * @see XXH32_state_s, XXH3_state_s + */ +struct XXH64_state_s { + XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */ + XXH64_hash_t v[4]; /*!< Accumulator lanes */ + XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. 
*/ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */ + XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/ + XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. */ +}; /* typedef'd to XXH64_state_t */ + +#ifndef XXH_NO_XXH3 + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */ +# include +# define XXH_ALIGN(n) alignas(n) +#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */ +/* In C++ alignas() is a keyword */ +# define XXH_ALIGN(n) alignas(n) +#elif defined(__GNUC__) +# define XXH_ALIGN(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +# define XXH_ALIGN(n) __declspec(align(n)) +#else +# define XXH_ALIGN(n) /* disabled */ +#endif + +/* Old GCC versions only accept the attribute after the type in structures. */ +#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \ + && defined(__GNUC__) +# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) +#else +# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type +#endif + +/*! + * @brief The size of the internal XXH3 buffer. + * + * This is the optimal update size for incremental hashing. + * + * @see XXH3_64b_update(), XXH3_128b_update(). + */ +#define XXH3_INTERNALBUFFER_SIZE 256 + +/*! + * @internal + * @brief Default size of the secret buffer (and @ref XXH3_kSecret). + * + * This is the size used in @ref XXH3_kSecret and the seeded functions. + * + * Not to be confused with @ref XXH3_SECRET_SIZE_MIN. + */ +#define XXH3_SECRET_DEFAULT_SIZE 192 + +/*! + * @internal + * @brief Structure for XXH3 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. + * Otherwise it is an opaque type. + * Never use this definition in combination with dynamic library. + * This allows fields to safely be changed in the future. 
+ * + * @note ** This structure has a strict alignment requirement of 64 bytes!! ** + * Do not allocate this with `malloc()` or `new`, + * it will not be sufficiently aligned. + * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation. + * + * Typedef'd to @ref XXH3_state_t. + * Do never access the members of this struct directly. + * + * @see XXH3_INITSTATE() for stack initialization. + * @see XXH3_createState(), XXH3_freeState(). + * @see XXH32_state_s, XXH64_state_s + */ +struct XXH3_state_s { + XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); + /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */ + XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); + /*!< Used to store a custom secret generated from a seed. */ + XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); + /*!< The internal buffer. @see XXH32_state_s::mem32 */ + XXH32_hash_t bufferedSize; + /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */ + XXH32_hash_t useSeed; + /*!< Reserved field. Needed for padding on 64-bit. */ + size_t nbStripesSoFar; + /*!< Number or stripes processed. */ + XXH64_hash_t totalLen; + /*!< Total length hashed. 64-bit even on 32-bit targets. */ + size_t nbStripesPerBlock; + /*!< Number of stripes per block. */ + size_t secretLimit; + /*!< Size of @ref customSecret or @ref extSecret */ + XXH64_hash_t seed; + /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */ + XXH64_hash_t reserved64; + /*!< Reserved field. */ + const unsigned char* extSecret; + /*!< Reference to an external secret for the _withSecret variants, NULL + * for other variants. */ + /* note: there may be some padding at the end due to alignment on 64 bytes */ +}; /* typedef'd to XXH3_state_t */ + +#undef XXH_ALIGN_MEMBER -#endif /* XXH_STATIC_LINKING_ONLY && XXH_STATIC_H_3543687687345 */ +/*! + * @brief Initializes a stack-allocated `XXH3_state_s`. 
+ * + * When the @ref XXH3_state_t structure is merely emplaced on stack, + * it should be initialized with XXH3_INITSTATE() or a memset() + * in case its first reset uses XXH3_NNbits_reset_withSeed(). + * This init can be omitted if the first reset uses default or _withSecret mode. + * This operation isn't necessary when the state is created with XXH3_createState(). + * Note that this doesn't prepare the state for a streaming operation, + * it's still necessary to use XXH3_NNbits_reset*() afterwards. + */ +#define XXH3_INITSTATE(XXH3_state_ptr) \ + do { \ + XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \ + tmp_xxh3_state_ptr->seed = 0; \ + tmp_xxh3_state_ptr->extSecret = NULL; \ + } while(0) #if defined (__cplusplus) -} +extern "C" { +#endif + +/*! + * @brief Calculates the 128-bit hash of @p data using XXH3. + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p data and @p data + @p len must be valid, + * readable, contiguous memory. However, if @p len is `0`, @p data may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 128-bit XXH3 value. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); + + +/* === Experimental API === */ +/* Symbols defined below must be considered tied to a specific library version. */ + +/*! + * @brief Derive a high-entropy secret from any user-defined content, named customSeed. + * + * @param secretBuffer A writable buffer for derived high-entropy secret data. + * @param secretSize Size of secretBuffer, in bytes. Must be >= XXH3_SECRET_DEFAULT_SIZE. + * @param customSeed A user-defined content. + * @param customSeedSize Size of customSeed, in bytes. 
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * The generated secret can be used in combination with `*_withSecret()` functions.
+ * The `_withSecret()` variants are useful to provide a higher level of protection
+ * than 64-bit seed, as it becomes much more difficult for an external actor to
+ * guess how to impact the calculation logic.
+ *
+ * The function accepts as input a custom seed of any length and any content,
+ * and derives from it a high-entropy secret of length @p secretSize into an
+ * already allocated buffer @p secretBuffer.
+ *
+ * The generated secret can then be used with any `*_withSecret()` variant.
+ * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
+ * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
+ * are part of this list. They all accept a `secret` parameter
+ * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
+ * _and_ feature very high entropy (consist of random-looking bytes).
+ * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
+ * be employed to ensure proper quality.
+ *
+ * @p customSeed can be anything. It can have any size, even small ones,
+ * and its content can be anything, even "poor entropy" sources such as a bunch
+ * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
+ *
+ * @pre
+ * - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
+ * - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
+ *
+ * Example code:
+ * @code{.c}
+ * #include <stdio.h>
+ * #include <string.h>
+ * #include <stdlib.h>
+ * #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ * #include "xxhash.h"
+ * // Hashes argv[2] using the entropy from argv[1].
+ * int main(int argc, char* argv[])
+ * {
+ *     char secret[XXH3_SECRET_SIZE_MIN];
+ *     if (argc != 3) { return 1; }
+ *     XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
+ *     XXH64_hash_t h = XXH3_64bits_withSecret(
+ *          argv[2], strlen(argv[2]),
+ *          secret, sizeof(secret)
+ *     );
+ *     printf("%016llx\n", (unsigned long long) h);
+ * }
+ * @endcode
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
+
+/*!
+ * @brief Generate the same secret as the _withSeed() variants.
+ *
+ * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes
+ * @param seed The 64-bit seed to alter the hash result predictably.
+ *
+ * The generated secret can be used in combination with
+ * `*_withSecret()` and `_withSecretandSeed()` variants.
+ *
+ * Example C++ `std::string` hash class:
+ * @code{.cpp}
+ * #include <string>
+ * #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ * #include "xxhash.h"
+ * // Slow, seeds each time
+ * class HashSlow {
+ *     XXH64_hash_t seed;
+ * public:
+ *     HashSlow(XXH64_hash_t s) : seed{s} {}
+ *     size_t operator()(const std::string& x) const {
+ *         return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
+ *     }
+ * };
+ * // Fast, caches the seeded secret for future uses.
+ * class HashFast {
+ *     unsigned char secret[XXH3_SECRET_SIZE_MIN];
+ * public:
+ *     HashFast(XXH64_hash_t s) {
+ *         XXH3_generateSecret_fromSeed(secret, s);
+ *     }
+ *     size_t operator()(const std::string& x) const {
+ *         return size_t{
+ *             XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
+ *         };
+ *     }
+ * };
+ * @endcode
+ */
+XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
+
+/*!
+ * @brief Calculates 64/128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param data The block of data to be hashed, at least @p len bytes in size.
+ * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * These variants generate hash values using either + * @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes) + * or @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX). + * + * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`. + * `_withSeed()` has to generate the secret on the fly for "large" keys. + * It's fast, but can be perceptible for "not so large" keys (< 1 KB). + * `_withSecret()` has to generate the masks on the fly for "small" keys, + * which requires more instructions than _withSeed() variants. + * Therefore, _withSecretandSeed variant combines the best of both worlds. + * + * When @p secret has been generated by XXH3_generateSecret_fromSeed(), + * this variant produces *exactly* the same results as `_withSeed()` variant, + * hence offering only a pure speed benefit on "large" input, + * by skipping the need to regenerate the secret for every large input. + * + * Another usage scenario is to hash the secret to a 64-bit hash value, + * for example with XXH3_64bits(), which then becomes the seed, + * and then employ both the seed and the secret in _withSecretandSeed(). + * On top of speed, an added benefit is that each bit in the secret + * has a 50% chance to swap each bit in the output, via its impact to the seed. + * + * This is not guaranteed when using the secret directly in "small data" scenarios, + * because only portions of the secret are employed for small data. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t +XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed); +/*! + * @brief Calculates 128-bit seeded variant of XXH3 hash of @p data. 
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param secret The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64 The 64-bit seed to alter the hash result predictably.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * @see XXH3_64bits_withSecretandSeed()
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
+                               XXH_NOESCAPE const void* secret, size_t secretSize,
+                               XXH64_hash_t seed64);
+#ifndef XXH_NO_STREAM
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ * @param secret The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64 The 64-bit seed to alter the hash result predictably.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @see XXH3_64bits_withSecretandSeed()
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                    XXH_NOESCAPE const void* secret, size_t secretSize,
+                                    XXH64_hash_t seed64);
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ * @param secret The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64 The 64-bit seed to alter the hash result predictably.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ * + * @see XXH3_64bits_withSecretandSeed() + */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed64); +#endif /* !XXH_NO_STREAM */ + +#if defined (__cplusplus) +} /* extern "C" */ +#endif + +#endif /* !XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ + +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# define XXH_IMPLEMENTATION #endif + +#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ + + +/* ======================================================================== */ +/* ======================================================================== */ +/* ======================================================================== */ + + +/*-********************************************************************** + * xxHash implementation + *-********************************************************************** + * xxHash's implementation used to be hosted inside xxhash.c. + * + * However, inlining requires implementation to be visible to the compiler, + * hence be included alongside the header. + * Previously, implementation was hosted inside xxhash.c, + * which was then #included when inlining was activated. + * This construction created issues with a few build and install systems, + * as it required xxhash.c to be stored in /include directory. + * + * xxHash implementation is now directly integrated within xxhash.h. + * As a consequence, xxhash.c is no longer needed in /include. + * + * xxhash.c is still available and is still useful. + * In a "normal" setup, when xxhash is not inlined, + * xxhash.h only exposes the prototypes and public symbols, + * while xxhash.c can be built into an object file xxhash.o + * which can then be linked into the final binary. 
+ ************************************************************************/ + +#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ + || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) +# define XXH_IMPLEM_13a8737387 + +/* ************************************* +* Tuning parameters +***************************************/ + +/*! + * @defgroup tuning Tuning parameters + * @{ + * + * Various macros to control xxHash's behavior. + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Define this to disable 64-bit code. + * + * Useful if only using the @ref XXH32_family and you have a strict C90 compiler. + */ +# define XXH_NO_LONG_LONG +# undef XXH_NO_LONG_LONG /* don't actually */ +/*! + * @brief Controls how unaligned memory is accessed. + * + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. + * + * Unfortunately, on some target/compiler combinations, the generated assembly + * is sub-optimal. + * + * The below switch allow selection of a different access method + * in the search for improved performance. + * + * @par Possible options: + * + * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy` + * @par + * Use `memcpy()`. Safe and portable. Note that most modern compilers will + * eliminate the function call and treat it as an unaligned access. + * + * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))` + * @par + * Depends on compiler extensions and is therefore not portable. + * This method is safe _if_ your compiler supports it, + * and *generally* as fast or faster than `memcpy`. + * + * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast + * @par + * Casts directly and dereferences. This method doesn't depend on the + * compiler, but it violates the C standard as it directly dereferences an + * unaligned pointer. It can generate buggy code on targets which do not + * support unaligned memory accesses, but in some circumstances, it's the + * only known way to get the most performance. 
+ * + * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift + * @par + * Also portable. This can generate the best code on old compilers which don't + * inline small `memcpy()` calls, and it might also be faster on big-endian + * systems which lack a native byteswap instruction. However, some compilers + * will emit literal byteshifts even if the target supports unaligned access. + * + * + * @warning + * Methods 1 and 2 rely on implementation-defined behavior. Use these with + * care, as what works on one compiler/platform/optimization level may cause + * another to read garbage data or even crash. + * + * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details. + * + * Prefer these methods in priority order (0 > 3 > 1 > 2) + */ +# define XXH_FORCE_MEMORY_ACCESS 0 + +/*! + * @def XXH_SIZE_OPT + * @brief Controls how much xxHash optimizes for size. + * + * xxHash, when compiled, tends to result in a rather large binary size. This + * is mostly due to heavy usage to forced inlining and constant folding of the + * @ref XXH3_family to increase performance. + * + * However, some developers prefer size over speed. This option can + * significantly reduce the size of the generated code. When using the `-Os` + * or `-Oz` options on GCC or Clang, this is defined to 1 by default, + * otherwise it is defined to 0. + * + * Most of these size optimizations can be controlled manually. + * + * This is a number from 0-2. + * - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed + * comes first. + * - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more + * conservative and disables hacks that increase code size. It implies the + * options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0, + * and @ref XXH3_NEON_LANES == 8 if they are not already defined. + * - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible. + * Performance may cry. 
For example, the single shot functions just use the + * streaming API. + */ +# define XXH_SIZE_OPT 0 + +/*! + * @def XXH_FORCE_ALIGN_CHECK + * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32() + * and XXH64() only). + * + * This is an important performance trick for architectures without decent + * unaligned memory access performance. + * + * It checks for input alignment, and when conditions are met, uses a "fast + * path" employing direct 32-bit/64-bit reads, resulting in _dramatically + * faster_ read speed. + * + * The check costs one initial branch per hash, which is generally negligible, + * but not zero. + * + * Moreover, it's not useful to generate an additional code path if memory + * access uses the same instruction for both aligned and unaligned + * addresses (e.g. x86 and aarch64). + * + * In these cases, the alignment check can be removed by setting this macro to 0. + * Then the code will always use unaligned memory access. + * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips + * which are platforms known to offer good unaligned memory accesses performance. + * + * It is also disabled by default when @ref XXH_SIZE_OPT >= 1. + * + * This option does not affect XXH3 (only XXH32 and XXH64). + */ +# define XXH_FORCE_ALIGN_CHECK 0 + +/*! + * @def XXH_NO_INLINE_HINTS + * @brief When non-zero, sets all functions to `static`. + * + * By default, xxHash tries to force the compiler to inline almost all internal + * functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary which + * might not be favorable. + * + * Additionally, sometimes the forced inlining can be detrimental to performance, + * depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control on whether to inline or not. 
+ * + * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if + * @ref XXH_SIZE_OPT >= 1, this will automatically be defined. + */ +# define XXH_NO_INLINE_HINTS 0 + +/*! + * @def XXH3_INLINE_SECRET + * @brief Determines whether to inline the XXH3 withSecret code. + * + * When the secret size is known, the compiler can improve the performance + * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret(). + * + * However, if the secret size is not known, it doesn't have any benefit. This + * happens when xxHash is compiled into a global symbol. Therefore, if + * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0. + * + * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers + * that are *sometimes* force inline on -Og, and it is impossible to automatically + * detect this optimization level. + */ +# define XXH3_INLINE_SECRET 0 + +/*! + * @def XXH32_ENDJMP + * @brief Whether to use a jump for `XXH32_finalize`. + * + * For performance, `XXH32_finalize` uses multiple branches in the finalizer. + * This is generally preferable for performance, + * but depending on exact architecture, a jmp may be preferable. + * + * This setting is only possibly making a difference for very small inputs. + */ +# define XXH32_ENDJMP 0 + +/*! + * @internal + * @brief Redefines old internal names. + * + * For compatibility with code that uses xxHash's internals before the names + * were changed to improve namespacing. There is no other reason to use this. + */ +# define XXH_OLD_NAMES +# undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ + +/*! + * @def XXH_NO_STREAM + * @brief Disables the streaming API. + * + * When xxHash is not inlined and the streaming functions are not used, disabling + * the streaming functions can improve code size significantly, especially with + * the @ref XXH3_family which tends to make constant folded copies of itself. 
+ */ +# define XXH_NO_STREAM +# undef XXH_NO_STREAM /* don't actually */ +#endif /* XXH_DOXYGEN */ +/*! + * @} + */ + +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ + /* prefer __packed__ structures (method 1) for GCC + * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy + * which for some reason does unaligned loads. */ +# if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED)) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +#ifndef XXH_SIZE_OPT + /* default to 1 for -Os or -Oz */ +# if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__) +# define XXH_SIZE_OPT 1 +# else +# define XXH_SIZE_OPT 0 +# endif +#endif + +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ + /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */ +# if XXH_SIZE_OPT >= 1 || \ + defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \ + || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */ +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + +#ifndef XXH_NO_INLINE_HINTS +# if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__) /* -O0, -fno-inline */ +# define XXH_NO_INLINE_HINTS 1 +# else +# define XXH_NO_INLINE_HINTS 0 +# endif +#endif + +#ifndef XXH3_INLINE_SECRET +# if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \ + || !defined(XXH_INLINE_ALL) +# define XXH3_INLINE_SECRET 0 +# else +# define XXH3_INLINE_SECRET 1 +# endif +#endif + +#ifndef XXH32_ENDJMP +/* generally preferable for performance */ +# define XXH32_ENDJMP 0 +#endif + +/*! 
+ * @defgroup impl Implementation
+ * @{
+ */
+
+/* *************************************
+* Includes & Memory related functions
+***************************************/
+#include <string.h> /* memcmp, memcpy */
+#include <limits.h> /* ULLONG_MAX */
+
+#if defined(XXH_NO_STREAM)
+/* nothing */
+#elif defined(XXH_NO_STDLIB)
+
+/* When requesting to disable any mention of stdlib,
+ * the library loses the ability to invoke malloc / free.
+ * In practice, it means that functions like `XXH*_createState()`
+ * will always fail, and return NULL.
+ * This flag is useful in situations where
+ * xxhash.h is integrated into some kernel, embedded or limited environment
+ * without access to dynamic allocation.
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }
+static void XXH_free(void* p) { (void)p; }
+
+#if defined (__cplusplus)
+} /* extern "C" */
+#endif
+
+#else
+
+/*
+ * Modify the local functions below should you wish to use
+ * different memory routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than malloc().
+ */
+static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than free().
+ */
+static void XXH_free(void* p) { free(p); }
+
+#if defined (__cplusplus)
+} /* extern "C" */
+#endif
+
+#endif /* XXH_NO_STDLIB */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than memcpy().
+ */ +static void* XXH_memcpy(void* dest, const void* src, size_t size) +{ + return memcpy(dest,src,size); +} + +#if defined (__cplusplus) +} /* extern "C" */ +#endif + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio warning fix */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + +#if XXH_NO_INLINE_HINTS /* disable inlining hints */ +# if defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __attribute__((unused)) +# else +# define XXH_FORCE_INLINE static +# endif +# define XXH_NO_INLINE static +/* enable inlining hints */ +#elif defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused)) +# define XXH_NO_INLINE static __attribute__((noinline)) +#elif defined(_MSC_VER) /* Visual Studio */ +# define XXH_FORCE_INLINE static __forceinline +# define XXH_NO_INLINE static __declspec(noinline) +#elif defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */ +# define XXH_FORCE_INLINE static inline +# define XXH_NO_INLINE static +#else +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +#endif + +#if XXH3_INLINE_SECRET +# define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE +#else +# define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE +#endif + + +/* ************************************* +* Debug +***************************************/ +/*! + * @ingroup tuning + * @def XXH_DEBUGLEVEL + * @brief Sets the debugging level. + * + * XXH_DEBUGLEVEL is expected to be defined externally, typically via the + * compiler's command line options. The value must be a number. 
+ */
+#ifndef XXH_DEBUGLEVEL
+# ifdef DEBUGLEVEL /* backwards compat */
+# define XXH_DEBUGLEVEL DEBUGLEVEL
+# else
+# define XXH_DEBUGLEVEL 0
+# endif
+#endif
+
+#if (XXH_DEBUGLEVEL>=1)
+# include <assert.h> /* note: can still be disabled with NDEBUG */
+# define XXH_ASSERT(c) assert(c)
+#else
+# if defined(__INTEL_COMPILER)
+# define XXH_ASSERT(c) XXH_ASSUME((unsigned char) (c))
+# else
+# define XXH_ASSERT(c) XXH_ASSUME(c)
+# endif
+#endif
+
+/* note: use after variable declarations */
+#ifndef XXH_STATIC_ASSERT
+# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */
+# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)
+# elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */
+# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
+# else
+# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
+# endif
+# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
+#endif
+
+/*!
+ * @internal
+ * @def XXH_COMPILER_GUARD(var)
+ * @brief Used to prevent unwanted optimizations for @p var.
+ *
+ * It uses an empty GCC inline assembly statement with a register constraint
+ * which forces @p var into a general purpose register (eg eax, ebx, ecx
+ * on x86) and marks it as modified.
+ *
+ * This is used in a few places to avoid unwanted autovectorization (e.g.
+ * XXH32_round()). All vectorization we want is explicit via intrinsics,
+ * and _usually_ isn't wanted elsewhere.
+ *
+ * We also use it to prevent unwanted constant folding for AArch64 in
+ * XXH3_initCustomSecret_scalar().
+ */
+#if defined(__GNUC__) || defined(__clang__)
+# define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))
+#else
+# define XXH_COMPILER_GUARD(var) ((void)0)
+#endif
+
+/* Specifically for NEON vectors which use the "w" constraint, on
+ * Clang.
*/
+#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__)
+# define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var))
+#else
+# define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0)
+#endif
+
+/* *************************************
+* Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# ifdef _AIX
+# include <inttypes.h>
+# else
+# include <stdint.h>
+# endif
+ typedef uint8_t xxh_u8;
+#else
+ typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+#ifdef XXH_OLD_NAMES
+# warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly"
+# define BYTE xxh_u8
+# define U8 xxh_u8
+# define U32 xxh_u32
+#endif
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *** Memory access *** */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_read32(const void* ptr)
+ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readBE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit big endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal + * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align) + * @brief Like @ref XXH_readLE32(), but has an option for aligned reads. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is + * always @ref XXH_alignment::XXH_unaligned. + * + * @param ptr The pointer to read from. + * @param align Whether @p ptr is aligned. + * @pre + * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte + * aligned. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE32 and XXH_readBE32. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* + * Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware. + */ +static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; +#endif +static xxh_u32 XXH_read32(const void* ptr) +{ + typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32; + return *((const xxh_unalign32*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u32 XXH_read32(const void* memPtr) +{ + xxh_u32 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* *** Endianness *** */ + +/*! + * @ingroup tuning + * @def XXH_CPU_LITTLE_ENDIAN + * @brief Whether the target is little endian. + * + * Defined to 1 if the target is little endian, or 0 if it is big endian. + * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, + * a runtime check (which is usually constant folded) is used instead. + * + * @note + * This is not necessarily defined to an integer constant. + * + * @see XXH_isLittleEndian() for the runtime check. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +/*! + * @internal + * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN. + * + * Most compilers will constant fold this. + */ +static int XXH_isLittleEndian(void) +{ + /* + * Portable and well-defined behavior. + * Don't use static: it is detrimental to performance. 
+ */
+ const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
+ return one.c[0];
+}
+# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()
+# endif
+#endif
+
+
+
+
+/* ****************************************
+* Compiler-specific Functions and Macros
+******************************************/
+#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#ifdef __has_builtin
+# define XXH_HAS_BUILTIN(x) __has_builtin(x)
+#else
+# define XXH_HAS_BUILTIN(x) 0
+#endif
+
+
+
+/*
+ * C23 and future versions have standard "unreachable()".
+ * Once it has been implemented reliably we can add it as an
+ * additional case:
+ *
+ * ```
+ * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN)
+ * # include <stddef.h>
+ * # ifdef unreachable
+ * # define XXH_UNREACHABLE() unreachable()
+ * # endif
+ * #endif
+ * ```
+ *
+ * Note C++23 also has std::unreachable() which can be detected
+ * as follows:
+ * ```
+ * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)
+ * # include <utility>
+ * # define XXH_UNREACHABLE() std::unreachable()
+ * #endif
+ * ```
+ * NB: `__cpp_lib_unreachable` is defined in the `<version>` header.
+ * We don't use that as including `<utility>` in `extern "C"` blocks
+ * doesn't work on GCC12
+ */
+
+#if XXH_HAS_BUILTIN(__builtin_unreachable)
+# define XXH_UNREACHABLE() __builtin_unreachable()
+
+#elif defined(_MSC_VER)
+# define XXH_UNREACHABLE() __assume(0)
+
+#else
+# define XXH_UNREACHABLE()
+#endif
+
+#if XXH_HAS_BUILTIN(__builtin_assume)
+# define XXH_ASSUME(c) __builtin_assume(c)
+#else
+# define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
+#endif
+
+/*!
+ * @internal
+ * @def XXH_rotl32(x,r)
+ * @brief 32-bit rotate left.
+ *
+ * @param x The 32-bit integer to be rotated.
+ * @param r The number of bits to rotate.
+ * @pre
+ * @p r > 0 && @p r < 32
+ * @note
+ * @p x and @p r may be evaluated multiple times.
+ * @return The rotated result.
+ */ +#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ + && XXH_HAS_BUILTIN(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +/*! + * @internal + * @fn xxh_u32 XXH_swap32(xxh_u32 x) + * @brief A 32-bit byteswap. + * + * @param x The 32-bit integer to byteswap. + * @return @p x, byteswapped. + */ +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ + +/*! + * @internal + * @brief Enum to indicate whether a pointer is aligned. + */ +typedef enum { + XXH_aligned, /*!< Aligned */ + XXH_unaligned /*!< Possibly unaligned */ +} XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. 
+ */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +/*! @ingroup public */ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +/*! + * @} + * @defgroup XXH32_impl XXH32 implementation + * @ingroup impl + * + * Details on the XXH32 implementation. 
+ * @{ + */ + /* #define instead of static const, to be used as initializers */ +#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ +#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ +#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ +#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ +#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ + +#ifdef XXH_OLD_NAMES +# define PRIME32_1 XXH_PRIME32_1 +# define PRIME32_2 XXH_PRIME32_2 +# define PRIME32_3 XXH_PRIME32_3 +# define PRIME32_4 XXH_PRIME32_4 +# define PRIME32_5 XXH_PRIME32_5 +#endif + +/*! + * @internal + * @brief Normal stripe processing routine. + * + * This shuffles the bits so that any bit from @p input impacts several bits in + * @p acc. + * + * @param acc The accumulator lane. + * @param input The stripe of input to mix. + * @return The mixed accumulator lane. + */ +static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) +{ + acc += input * XXH_PRIME32_2; + acc = XXH_rotl32(acc, 13); + acc *= XXH_PRIME32_1; +#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * UGLY HACK: + * A compiler fence is the only thing that prevents GCC and Clang from + * autovectorizing the XXH32 loop (pragmas and attributes don't work for some + * reason) without globally disabling SSE4.1. + * + * The reason we want to avoid vectorization is because despite working on + * 4 integers at a time, there are multiple factors slowing XXH32 down on + * SSE4: + * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on + * newer chips!) making it slightly slower to multiply four integers at + * once compared to four integers independently. Even when pmulld was + * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE + * just to multiply unless doing a long operation. 
+ * + * - Four instructions are required to rotate, + * movqda tmp, v // not required with VEX encoding + * pslld tmp, 13 // tmp <<= 13 + * psrld v, 19 // x >>= 19 + * por v, tmp // x |= tmp + * compared to one for scalar: + * roll v, 13 // reliably fast across the board + * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason + * + * - Instruction level parallelism is actually more beneficial here because + * the SIMD actually serializes this operation: While v1 is rotating, v2 + * can load data, while v3 can multiply. SSE forces them to operate + * together. + * + * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing + * the loop. NEON is only faster on the A53, and with the newer cores, it is less + * than half the speed. + * + * Additionally, this is used on WASM SIMD128 because it JITs to the same + * SIMD instructions and has the same issue. + */ + XXH_COMPILER_GUARD(acc); +#endif + return acc; +} + +/*! + * @internal + * @brief Mixes all bits to finalize the hash. + * + * The final mix ensures that all input bits have a chance to impact any bit in + * the output digest, resulting in an unbiased distribution. + * + * @param hash The hash to avalanche. + * @return The avalanched hash. + */ +static xxh_u32 XXH32_avalanche(xxh_u32 hash) +{ + hash ^= hash >> 15; + hash *= XXH_PRIME32_2; + hash ^= hash >> 13; + hash *= XXH_PRIME32_3; + hash ^= hash >> 16; + return hash; +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +/*! + * @internal + * @brief Processes the last 0-15 bytes of @p ptr. + * + * There may be up to 15 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 16. + * @param align Whether @p ptr is aligned. + * @return The finalized hash. + * @see XXH64_finalize(). 
+ */ +static XXH_PUREF xxh_u32 +XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define XXH_PROCESS1 do { \ + hash += (*ptr++) * XXH_PRIME32_5; \ + hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \ +} while (0) + +#define XXH_PROCESS4 do { \ + hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \ + ptr += 4; \ + hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ +} while (0) + + if (ptr==NULL) XXH_ASSERT(len == 0); + + /* Compact rerolled version; generally faster */ + if (!XXH32_ENDJMP) { + len &= 15; + while (len >= 4) { + XXH_PROCESS4; + len -= 4; + } + while (len > 0) { + XXH_PROCESS1; + --len; + } + return XXH32_avalanche(hash); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 8: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 4: XXH_PROCESS4; + return XXH32_avalanche(hash); + + case 13: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 9: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 5: XXH_PROCESS4; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 14: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 10: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 6: XXH_PROCESS4; + XXH_PROCESS1; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 15: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 11: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 7: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 3: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 2: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 1: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 0: return XXH32_avalanche(hash); + } + XXH_ASSERT(0); + return hash; /* reaching this point is deemed impossible */ + } +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1 XXH_PROCESS1 +# define PROCESS4 XXH_PROCESS4 +#else +# undef XXH_PROCESS1 +# undef XXH_PROCESS4 +#endif + +/*! 
+ * @internal + * @brief The implementation for @ref XXH32(). + * + * @param input , len , seed Directly passed from @ref XXH32(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + xxh_u32 h32; + + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=16) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + xxh_u32 v2 = seed + XXH_PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - XXH_PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + XXH_PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ +#ifndef XXH_NO_STREAM +/*! 
@ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + statePtr->v[1] = seed + XXH_PRIME32_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME32_1; + return XXH_OK; +} + + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const 
xxh_u8* const limit = bEnd - 16; + + do { + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4; + } while (p<=limit); + + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v[0], 1) + + XXH_rotl32(state->v[1], 7) + + XXH_rotl32(state->v[2], 12) + + XXH_rotl32(state->v[3], 18); + } else { + h32 = state->v[2] /* == seed */ + XXH_PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ +/*! + * @} + * @ingroup impl + * @{ + */ +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + +#ifdef XXH_OLD_NAMES +# define U64 xxh_u64 +#endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. 
+ * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + return *(const xxh_u64*) memPtr; +} + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +#endif +static xxh_u64 XXH_read64(const void* ptr) +{ + typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64; + return *((const xxh_unalign64*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. + * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64(xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. 
 */
#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))

/* Byteshift variant: the value is assembled byte-by-byte, so the read is
 * endian-independent and needs neither XXH_read64 nor XXH_swap64. */
XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
{
    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
    return bytePtr[0]
         | ((xxh_u64)bytePtr[1] << 8)
         | ((xxh_u64)bytePtr[2] << 16)
         | ((xxh_u64)bytePtr[3] << 24)
         | ((xxh_u64)bytePtr[4] << 32)
         | ((xxh_u64)bytePtr[5] << 40)
         | ((xxh_u64)bytePtr[6] << 48)
         | ((xxh_u64)bytePtr[7] << 56);
}

XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
{
    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
    return bytePtr[7]
         | ((xxh_u64)bytePtr[6] << 8)
         | ((xxh_u64)bytePtr[5] << 16)
         | ((xxh_u64)bytePtr[4] << 24)
         | ((xxh_u64)bytePtr[3] << 32)
         | ((xxh_u64)bytePtr[2] << 40)
         | ((xxh_u64)bytePtr[1] << 48)
         | ((xxh_u64)bytePtr[0] << 56);
}

#else
/* Generic variant: native-width read, byteswapped when the host endianness
 * does not match the requested byte order. */
XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
{
    return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
}

static xxh_u64 XXH_readBE64(const void* ptr)
{
    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
}
#endif

/* Little-endian read which skips the unaligned-access machinery when the
 * caller guarantees that @p ptr is 8-byte aligned. */
XXH_FORCE_INLINE xxh_u64
XXH_readLE64_align(const void* ptr, XXH_alignment align)
{
    if (align==XXH_unaligned)
        return XXH_readLE64(ptr);
    else
        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
}


/*******   xxh64   *******/
/*!
 * @}
 * @defgroup XXH64_impl XXH64 implementation
 * @ingroup impl
 *
 * Details on the XXH64 implementation.
 * @{
 */
/* #define rather than static const, to be used as initializers */
#define XXH_PRIME64_1  0x9E3779B185EBCA87ULL  /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
#define XXH_PRIME64_2  0xC2B2AE3D27D4EB4FULL  /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
#define XXH_PRIME64_3  0x165667B19E3779F9ULL  /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
#define XXH_PRIME64_4  0x85EBCA77C2B2AE63ULL  /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
#define XXH_PRIME64_5  0x27D4EB2F165667C5ULL  /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */

#ifdef XXH_OLD_NAMES
#  define PRIME64_1 XXH_PRIME64_1
#  define PRIME64_2 XXH_PRIME64_2
#  define PRIME64_3 XXH_PRIME64_3
#  define PRIME64_4 XXH_PRIME64_4
#  define PRIME64_5 XXH_PRIME64_5
#endif

/*! @copydoc XXH32_round */
static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
{
    acc += input * XXH_PRIME64_2;
    acc  = XXH_rotl64(acc, 31);
    acc *= XXH_PRIME64_1;
#if (defined(__AVX512F__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
    /*
     * DISABLE AUTOVECTORIZATION:
     * A compiler fence is used to prevent GCC and Clang from
     * autovectorizing the XXH64 loop (pragmas and attributes don't work for some
     * reason) without globally disabling AVX512.
     *
     * Autovectorization of XXH64 tends to be detrimental,
     * though the exact outcome may change depending on exact cpu and compiler version.
     * For information, it has been reported as detrimental for Skylake-X,
     * but possibly beneficial for Zen4.
     *
     * The default is to disable auto-vectorization,
     * but you can select to enable it instead using `XXH_ENABLE_AUTOVECTORIZE` build variable.
     */
    XXH_COMPILER_GUARD(acc);
#endif
    return acc;
}

/* Folds one lane accumulator @p val into the merged hash @p acc after the
 * main loop, so that every bit of every accumulator affects the result. */
static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
{
    val  = XXH64_round(0, val);
    acc ^= val;
    acc  = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
    return acc;
}

/*!
@copydoc XXH32_avalanche */
static xxh_u64 XXH64_avalanche(xxh_u64 hash)
{
    /* Final xorshift-multiply mixing: spreads entropy across all 64 bits. */
    hash ^= hash >> 33;
    hash *= XXH_PRIME64_2;
    hash ^= hash >> 29;
    hash *= XXH_PRIME64_3;
    hash ^= hash >> 32;
    return hash;
}


#define XXH_get64bits(p) XXH_readLE64_align(p, align)

/*!
 * @internal
 * @brief Processes the last 0-31 bytes of @p ptr.
 *
 * There may be up to 31 bytes remaining to consume from the input.
 * This final stage will digest them to ensure that all input bytes are present
 * in the final mix.
 *
 * @param hash The hash to finalize.
 * @param ptr The pointer to the remaining input.
 * @param len The remaining length, modulo 32.
 * @param align Whether @p ptr is aligned.
 * @return The finalized hash
 * @see XXH32_finalize().
 */
static XXH_PUREF xxh_u64
XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
{
    if (ptr==NULL) XXH_ASSERT(len == 0);
    len &= 31;
    /* Consume 8-byte words first, then one 4-byte word, then single bytes. */
    while (len >= 8) {
        xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
        ptr += 8;
        hash ^= k1;
        hash  = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
        len -= 8;
    }
    if (len >= 4) {
        hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
        ptr += 4;
        hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
        len -= 4;
    }
    while (len > 0) {
        hash ^= (*ptr++) * XXH_PRIME64_5;
        hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1;
        --len;
    }
    return  XXH64_avalanche(hash);
}

#ifdef XXH_OLD_NAMES
#  define PROCESS1_64 XXH_PROCESS1_64
#  define PROCESS4_64 XXH_PROCESS4_64
#  define PROCESS8_64 XXH_PROCESS8_64
#else
#  undef XXH_PROCESS1_64
#  undef XXH_PROCESS4_64
#  undef XXH_PROCESS8_64
#endif

/*!
 * @internal
 * @brief The implementation for @ref XXH64().
 *
 * @param input , len , seed Directly passed from @ref XXH64().
 * @param align Whether @p input is aligned.
 * @return The calculated hash.
+ */ +XXH_FORCE_INLINE XXH_PUREF xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + xxh_u64 h64; + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=32) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 31; + xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + xxh_u64 v2 = seed + XXH_PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - XXH_PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH64_family*/ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! 
@ingroup XXH64_family */
XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)
{
    XXH_ASSERT(statePtr != NULL);
    memset(statePtr, 0, sizeof(*statePtr));
    /* Same accumulator seeding as the one-shot path (XXH64_endian_align). */
    statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
    statePtr->v[1] = seed + XXH_PRIME64_2;
    statePtr->v[2] = seed + 0;
    statePtr->v[3] = seed - XXH_PRIME64_1;
    return XXH_OK;
}

/*! @ingroup XXH64_family */
XXH_PUBLIC_API XXH_errorcode
XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)
{
    if (input==NULL) {
        XXH_ASSERT(len == 0);
        return XXH_OK;
    }

    {   const xxh_u8* p = (const xxh_u8*)input;
        const xxh_u8* const bEnd = p + len;

        state->total_len += len;

        if (state->memsize + len < 32) {  /* fill in tmp buffer */
            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
            state->memsize += (xxh_u32)len;
            return XXH_OK;
        }

        if (state->memsize) {   /* tmp buffer is full */
            /* Complete the buffered stripe with fresh input, then consume it. */
            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
            state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0));
            state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1));
            state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2));
            state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3));
            p += 32 - state->memsize;
            state->memsize = 0;
        }

        if (p+32 <= bEnd) {
            /* Main loop: consume full 32-byte stripes straight from input. */
            const xxh_u8* const limit = bEnd - 32;

            do {
                state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8;
                state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8;
                state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8;
                state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8;
            } while (p<=limit);

        }

        if (p < bEnd) {
            /* Buffer the trailing partial stripe for the next update/digest. */
            XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
            state->memsize = (unsigned)(bEnd-p);
        }
    }

    return XXH_OK;
}


/*!
@ingroup XXH64_family */
XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state)
{
    xxh_u64 h64;

    if (state->total_len >= 32) {
        h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);
        h64 = XXH64_mergeRound(h64, state->v[0]);
        h64 = XXH64_mergeRound(h64, state->v[1]);
        h64 = XXH64_mergeRound(h64, state->v[2]);
        h64 = XXH64_mergeRound(h64, state->v[3]);
    } else {
        h64  = state->v[2] /*seed*/ + XXH_PRIME64_5;
    }

    h64 += (xxh_u64) state->total_len;

    /* mem64 holds at most 31 buffered bytes; finalize digests them. */
    return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);
}
#endif /* !XXH_NO_STREAM */

/******* Canonical representation *******/

/*! @ingroup XXH64_family */
XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash)
{
    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
    /* Canonical form is big-endian; swap on little-endian hosts. */
    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
    XXH_memcpy(dst, &hash, sizeof(*dst));
}

/*! @ingroup XXH64_family */
XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
{
    return XXH_readBE64(src);
}

#if defined (__cplusplus)
}
#endif

#ifndef XXH_NO_XXH3

/* *********************************************************************
*  XXH3
*  New generation hash designed for speed on small keys and vectorization
************************************************************************ */
/*!
 * @}
 * @defgroup XXH3_impl XXH3 implementation
 * @ingroup impl
 * @{
 */

/* === Compiler specifics === */

#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++.
Tested with GCC 5.5 */ +# define XXH_RESTRICT /* disable */ +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ + || (defined (__clang__)) \ + || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \ + || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)) +/* + * There are a LOT more compilers that recognize __restrict but this + * covers the major ones. + */ +# define XXH_RESTRICT __restrict +#else +# define XXH_RESTRICT /* disable */ +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#ifndef XXH_HAS_INCLUDE +# ifdef __has_include +/* + * Not defined as XXH_HAS_INCLUDE(x) (function-like) because + * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion) + */ +# define XXH_HAS_INCLUDE __has_include +# else +# define XXH_HAS_INCLUDE(x) 0 +# endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(__ARM_FEATURE_SVE) +# include +# endif +# if defined(__ARM_NEON__) || defined(__ARM_NEON) \ + || (defined(_M_ARM) && _M_ARM >= 7) \ + || defined(_M_ARM64) || defined(_M_ARM64EC) \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE()) /* WASM SIMD128 via SIMDe */ +# define inline __inline__ /* circumvent a clang bug */ +# include +# undef inline +# elif defined(__AVX2__) +# include +# elif defined(__SSE2__) +# include +# endif +#endif + +#if defined(_MSC_VER) +# include +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. 
+ * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. 
They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + * + * Credit: large sections of the vectorial and asm source code paths + * have been contributed by @easyaspi314 + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ + +#ifdef XXH_DOXYGEN +/*! + * @ingroup tuning + * @brief Overrides the vectorization implementation chosen for XXH3. + * + * Can be defined to 0 to disable SIMD or any of the values mentioned in + * @ref XXH_VECTOR_TYPE. + * + * If this is not defined, it uses predefined macros to determine the best + * implementation. + */ +# define XXH_VECTOR XXH_SCALAR +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Note that these are actually implemented as macros. + * + * If this is not defined, it is detected automatically. + * internal macro XXH_X86DISPATCH overrides this. + */ +enum XXH_VECTOR_TYPE /* fake enum */ { + XXH_SCALAR = 0, /*!< Portable scalar version */ + XXH_SSE2 = 1, /*!< + * SSE2 for Pentium 4, Opteron, all x86_64. + * + * @note SSE2 is also guaranteed on Windows 10, macOS, and + * Android x86. 
+ */ + XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ + XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ + XXH_NEON = 4, /*!< + * NEON for most ARMv7-A, all AArch64, and WASM SIMD128 + * via the SIMDeverywhere polyfill provided with the + * Emscripten SDK. + */ + XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ + XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */ +}; +/*! + * @ingroup tuning + * @brief Selects the minimum alignment for XXH3's accumulators. + * + * When using SIMD, this should match the alignment required for said vector + * type, so, for example, 32 for AVX2. + * + * Default: Auto detected. + */ +# define XXH_ACC_ALIGN 8 +#endif + +/* Actual definition */ +#ifndef XXH_DOXYGEN +# define XXH_SCALAR 0 +# define XXH_SSE2 1 +# define XXH_AVX2 2 +# define XXH_AVX512 3 +# define XXH_NEON 4 +# define XXH_VSX 5 +# define XXH_SVE 6 +#endif + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__ARM_FEATURE_SVE) +# define XXH_VECTOR XXH_SVE +# elif ( \ + defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ + || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE()) /* wasm simd128 via SIMDe */ \ + ) && ( \ + defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + ) +# define XXH_VECTOR XXH_NEON +# elif defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* __ARM_FEATURE_SVE is only supported by GCC & Clang. 
*/ +#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE) +# ifdef _MSC_VER +# pragma warning(once : 4606) +# else +# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead." +# endif +# undef XXH_VECTOR +# define XXH_VECTOR XXH_SCALAR +#endif + +/* + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. + */ +#ifndef XXH_ACC_ALIGN +# if defined(XXH_X86DISPATCH) +# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ +# elif XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_SVE /* sve */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ + || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#elif XXH_VECTOR == XXH_SVE +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#else +# define XXH_SEC_ALIGN 8 +#endif + +#if defined(__GNUC__) || defined(__clang__) +# define XXH_ALIASING __attribute__((may_alias)) +#else +# define XXH_ALIASING /* nothing */ +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. 
+ * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + +#if defined (__cplusplus) +extern "C" { +#endif + +#if XXH_VECTOR == XXH_NEON + +/* + * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3 + * optimizes out the entire hashLong loop because of the aliasing violation. + * + * However, GCC is also inefficient at load-store optimization with vld1q/vst1q, + * so the only option is to mark it as aliasing. + */ +typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING; + +/*! + * @internal + * @brief `vld1q_u64` but faster and alignment-safe. + * + * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only + * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86). + * + * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it + * prohibits load-store optimizations. Therefore, a direct dereference is used. + * + * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe + * unaligned load. 
+ */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */ +{ + return *(xxh_aliasing_uint64x2_t const *)ptr; +} +#else +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) +{ + return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); +} +#endif + +/*! + * @internal + * @brief `vmlal_u32` on low and high halves of a vector. + * + * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with + * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32` + * with `vmlal_u32`. + */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11 +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + /* Inline assembly is the only way */ + __asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs)); + return acc; +} +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + /* This intrinsic works as expected */ + return vmlal_high_u32(acc, lhs, rhs); +} +#else +/* Portable intrinsic versions */ +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs)); +} +/*! @copydoc XXH_vmlal_low_u32 + * Assume the compiler converts this to vmlal_high_u32 on aarch64 */ +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs)); +} +#endif + +/*! + * @ingroup tuning + * @brief Controls the NEON to scalar ratio for XXH3 + * + * This can be set to 2, 4, 6, or 8. + * + * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used. + * + * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those + * can be NEON. 
If you are only using NEON instructions, you are only using 2/3 of the CPU + * bandwidth. + * + * This is even more noticeable on the more advanced cores like the Cortex-A76 which + * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once. + * + * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes + * and 2 scalar lanes, which is chosen by default. + * + * This does not apply to Apple processors or 32-bit processors, which run better with + * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes. + * + * This change benefits CPUs with large micro-op buffers without negatively affecting + * most other CPUs: + * + * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. | + * |:----------------------|:--------------------|----------:|-----------:|------:| + * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% | + * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% | + * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% | + * | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% | + * + * It also seems to fix some bad codegen on GCC, making it almost as fast as clang. + * + * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes meaning + * it effectively becomes worse 4. + * + * @see XXH3_accumulate_512_neon() + */ +# ifndef XXH3_NEON_LANES +# if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \ + && !defined(__APPLE__) && XXH_SIZE_OPT <= 0 +# define XXH3_NEON_LANES 6 +# else +# define XXH3_NEON_LANES XXH_ACC_NB +# endif +# endif +#endif /* XXH_VECTOR == XXH_NEON */ + +#if defined (__cplusplus) +} /* extern "C" */ +#endif + +/* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. 
+ * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ +#if XXH_VECTOR == XXH_VSX +/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`, + * and `pixel`. This is a problem for obvious reasons. + * + * These keywords are unnecessary; the spec literally says they are + * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd + * after including the header. + * + * We use pragma push_macro/pop_macro to keep the namespace clean. */ +# pragma push_macro("bool") +# pragma push_macro("vector") +# pragma push_macro("pixel") +/* silence potential macro redefined warnings */ +# undef bool +# undef vector +# undef pixel + +# if defined(__s390x__) +# include +# else +# include +# endif + +/* Restore the original macro values, if applicable. */ +# pragma pop_macro("pixel") +# pragma pop_macro("vector") +# pragma pop_macro("bool") + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + +/* + * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue. + */ +typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING; + +# ifndef XXH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." +# define XXH_VSX_BE 1 +# else +# define XXH_VSX_BE 0 +# endif +# endif /* !defined(XXH_VSX_BE) */ + +# if XXH_VSX_BE +# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) +# define XXH_vec_revb vec_revb +# else +#if defined (__cplusplus) +extern "C" { +#endif +/*! + * A polyfill for POWER9's vec_revb(). 
+ */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) +{ + xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +#if defined (__cplusplus) +} /* extern "C" */ +#endif +# endif +# endif /* XXH_VSX_BE */ + +#if defined (__cplusplus) +extern "C" { +#endif +/*! + * Performs an unaligned vector load and byte swaps it on big endian. + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) +{ + xxh_u64x2 ret; + XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2)); +# if XXH_VSX_BE + ret = XXH_vec_revb(ret); +# endif + return ret; +} + +/* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a while, + * and they are endian dependent. Also, their meaning swap depending on version. + * */ +# if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ +# define XXH_vec_mulo vec_mulo +# define XXH_vec_mule vec_mule +# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__) +/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ + /* The IBM XL Compiler (which defined __clang__) only implements the vec_* operations */ +# define XXH_vec_mulo __builtin_altivec_vmulouw +# define XXH_vec_mule __builtin_altivec_vmuleuw +# else +/* gcc needs inline assembly */ +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. 
*/ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ + +#if defined (__cplusplus) +} /* extern "C" */ +#endif + +#endif /* XXH_VECTOR == XXH_VSX */ + +#if XXH_VECTOR == XXH_SVE +#define ACCRND(acc, offset) \ +do { \ + svuint64_t input_vec = svld1_u64(mask, xinput + offset); \ + svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \ + svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \ + svuint64_t swapped = svtbl_u64(input_vec, kSwap); \ + svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \ + svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \ + svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \ + acc = svadd_u64_x(mask, acc, mul); \ +} while (0) +#endif /* XXH_VECTOR == XXH_SVE */ + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if XXH_SIZE_OPT >= 1 +# define XXH_PREFETCH(ptr) (void)(ptr) +# elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + +#if defined (__cplusplus) +extern "C" { +#endif +/* ========================================== + * XXH3 default settings + 
* ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/*! Pseudorandom secret taken directly from FARSH. */ +XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + +static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL; /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */ +static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL; /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */ + +#ifdef XXH_OLD_NAMES +# define kSecret XXH3_kSecret +#endif + +#ifdef XXH_DOXYGEN +/*! + * @brief Calculates a 32-bit to 64-bit long multiply. + * + * Implemented as a macro. 
*
* Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't
* need to (but it shouldn't need to anyways, it is about 7 instructions to do
* a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we
* use that instead of the normal method.
*
* If you are compiling for platforms like Thumb-1 and don't have a better option,
* you may also want to write your own long multiply routine here.
*
* @param x, y Numbers to be multiplied
* @return 64-bit product of the low 32 bits of @p x and @p y.
*/
XXH_FORCE_INLINE xxh_u64
XXH_mult32to64(xxh_u64 x, xxh_u64 y)
{
    return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
}
#elif defined(_MSC_VER) && defined(_M_IX86)
#    define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
#else
/*
 * Downcast + upcast is usually better than masking on older compilers like
 * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
 *
 * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
 * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
 */
#    define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
#endif

/*!
 * @brief Calculates a 64->128-bit long multiply.
 *
 * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
 * version.
 *
 * @param lhs , rhs The 64-bit integers to be multiplied
 * @return The 128-bit result represented in an @ref XXH128_hash_t.
 */
static XXH128_hash_t
XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
{
    /*
     * GCC/Clang __uint128_t method.
     *
     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
     * This is usually the best way as it usually uses a native long 64-bit
     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
     *
     * Usually.
     *
     * Despite being a 32-bit platform, Clang (and emscripten) define this type
     * despite not having the arithmetic for it. This results in a laggy
     * compiler builtin call which calculates a full 128-bit multiply.
     * In that case it is best to use the portable one.
     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
     */
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
    && defined(__SIZEOF_INT128__) \
    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

    __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
    XXH128_hash_t r128;
    r128.low64  = (xxh_u64)(product);
    r128.high64 = (xxh_u64)(product >> 64);
    return r128;

    /*
     * MSVC for x64's _umul128 method.
     *
     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
     *
     * This compiles to single operand MUL on x64.
     */
#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)

#ifndef _MSC_VER
#  pragma intrinsic(_umul128)
#endif
    xxh_u64 product_high;
    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
    XXH128_hash_t r128;
    r128.low64  = product_low;
    r128.high64 = product_high;
    return r128;

    /*
     * MSVC for ARM64's __umulh method.
     *
     * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
     */
#elif defined(_M_ARM64) || defined(_M_ARM64EC)

#ifndef _MSC_VER
#  pragma intrinsic(__umulh)
#endif
    XXH128_hash_t r128;
    r128.low64  = lhs * rhs;
    r128.high64 = __umulh(lhs, rhs);
    return r128;

#else
    /*
     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
     *
     * This is a fast and simple grade school multiply, which is shown below
     * with base 10 arithmetic instead of base 0x100000000.
     *
     *           9 3 // D2 lhs = 93
     *         x 7 5 // D2 rhs = 75
     *     ----------
     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
     *     ---------
     *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
     *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
     *     ---------
     *       6 9 7 5 // D4 res   = (27 * 10) + (15 % 10) + (67 * 100) = 6975
     *
     * The reasons for adding the products like this are:
     *  1. It avoids manual carry tracking. Just like how
     *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
     *     This avoids a lot of complexity.
     *
     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
     *     instruction available in ARM's Digital Signal Processing extension
     *     in 32-bit ARMv6 and later, which is shown below:
     *
     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
     *         {
     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
     *             *RdHi = (xxh_u32)(product >> 32);
     *         }
     *
     *     This instruction was designed for efficient long multiplication, and
     *     allows this to be calculated in only 4 instructions at speeds
     *     comparable to some 64-bit ALUs.
     *
     *  3. It isn't terrible on other platforms. Usually this will be a couple
     *     of 32-bit ADD/ADCs.
     */

    /* First calculate all of the cross products. */
    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);

    /* Now add the products together. These will never overflow. */
    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

    XXH128_hash_t r128;
    r128.low64  = lower;
    r128.high64 = upper;
    return r128;
#endif
}

/*!
 * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
 *
 * The reason for the separate function is to prevent passing too many structs
 * around by value. This will hopefully inline the multiply, but we don't force it.
 *
 * @param lhs , rhs The 64-bit integers to multiply
 * @return The low 64 bits of the product XOR'd by the high 64 bits.
 * @see XXH_mult64to128()
 */
static xxh_u64
XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
{
    XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
    return product.low64 ^ product.high64;
}

/*! Seems to produce slightly better code on GCC for some reason. */
XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
{
    XXH_ASSERT(0 <= shift && shift < 64);
    return v64 ^ (v64 >> shift);
}

/*
 * This is a fast avalanche stage,
 * suitable when input bits are already partially mixed
 */
static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
{
    h64 = XXH_xorshift64(h64, 37);
    h64 *= PRIME_MX1;
    h64 = XXH_xorshift64(h64, 32);
    return h64;
}

/*
 * This is a stronger avalanche,
 * inspired by Pelle Evensen's rrmxmx
 * preferable when input has not been previously mixed
 */
static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
{
    /* this mix is inspired by Pelle Evensen's rrmxmx */
    h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
    h64 *= PRIME_MX2;
    h64 ^= (h64 >> 35) + len ;
    h64 *= PRIME_MX2;
    return XXH_xorshift64(h64, 28);
}


/* ==========================================
 * Short keys
 * ==========================================
 * One of the shortcomings of XXH32 and XXH64 was that their performance was
 * sub-optimal on short lengths.
It used an iterative algorithm which strongly
* favored lengths that were a multiple of 4 or 8.
*
* Instead of iterating over individual inputs, we use a set of single shot
* functions which piece together a range of lengths and operate in constant time.
*
* Additionally, the number of multiplies has been significantly reduced. This
* reduces latency, especially when emulating 64-bit multiplies on 32-bit.
*
* Depending on the platform, this may or may not be faster than XXH32, but it
* is almost guaranteed to be faster than XXH64.
*/

/*
 * At very short lengths, there isn't enough input to fully hide secrets, or use
 * the entire secret.
 *
 * There is also only a limited amount of mixing we can do before significantly
 * impacting performance.
 *
 * Therefore, we use different sections of the secret and always mix two secret
 * samples with an XOR. This should have no effect on performance on the
 * seedless or withSeed variants because everything _should_ be constant folded
 * by modern compilers.
 *
 * The XOR mixing hides individual parts of the secret and increases entropy.
 *
 * This adds an extra layer of strength for custom secrets.
 */
XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(secret != NULL);
    /*
     * len = 1: combined = { input[0], 0x01, input[0], input[0] }
     * len = 2: combined = { input[1], 0x02, input[0], input[1] }
     * len = 3: combined = { input[2], 0x03, input[0], input[1] }
     */
    {   xxh_u8  const c1 = input[0];
        xxh_u8  const c2 = input[len >> 1];
        xxh_u8  const c3 = input[len - 1];
        /* Pack the 1-3 bytes plus the length into one 32-bit word. */
        xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2  << 24)
                               | ((xxh_u32)c3 <<  0) | ((xxh_u32)len << 8);
        xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
        xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
        return XXH64_avalanche(keyed);
    }
}

XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    /* Fold a byte-swapped copy of the low seed half into the high half. */
    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
    {   xxh_u32 const input1 = XXH_readLE32(input);
        xxh_u32 const input2 = XXH_readLE32(input + len - 4);
        xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
        /* First and last 4 bytes (possibly overlapping) combined into 64 bits. */
        xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
        xxh_u64 const keyed = input64 ^ bitflip;
        return XXH3_rrmxmx(keyed, len);
    }
}

XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
        xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
        /* First and last 8 bytes (possibly overlapping), each key-mixed. */
        xxh_u64 const input_lo = XXH_readLE64(input)           ^ bitflip1;
        xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
        xxh_u64 const acc = len
                          + XXH_swap64(input_lo) + input_hi
                          + XXH3_mul128_fold64(input_lo, input_hi);
        return XXH3_avalanche(acc);
    }
}

/* Dispatcher for inputs of 0-16 bytes; each sub-range has its own kernel. */
XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (XXH_likely(len >  8)) return XXH3_len_9to16_64b(input, len, secret, seed);
        if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
        if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
        return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));
    }
}

/*
 * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
 * multiplication by zero, affecting hashes of lengths 17 to 240.
 *
 * However, they are very unlikely.
 *
 * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
 * unseeded non-cryptographic hashes, it does not attempt to defend itself
 * against specially crafted inputs, only random inputs.
 *
 * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
 * cancelling out the secret is taken an arbitrary number of times (addressed
 * in XXH3_accumulate_512), this collision is very unlikely with random inputs
 * and/or proper seeding:
 *
 * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
 * function that is only called up to 16 times per hash with up to 240 bytes of
 * input.
 *
 * This is not too bad for a non-cryptographic hash function, especially with
 * only 64 bit outputs.
 *
 * The 128-bit variant (which trades some speed for strength) is NOT affected
 * by this, although it is always a good idea to use a proper seed if you care
 * about strength.
*/
XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
{
#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
  && defined(__i386__) && defined(__SSE2__)  /* x86 + SSE2 */ \
  && !defined(XXH_ENABLE_AUTOVECTORIZE)      /* Define to disable like XXH32 hack */
    /*
     * UGLY HACK:
     * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
     * slower code.
     *
     * By forcing seed64 into a register, we disrupt the cost model and
     * cause it to scalarize. See `XXH32_round()`
     *
     * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
     * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
     * GCC 9.2, despite both emitting scalar code.
     *
     * GCC generates much better scalar code than Clang for the rest of XXH3,
     * which is why finding a more optimal codepath is an interest.
     */
    XXH_COMPILER_GUARD(seed64);
#endif
    {   xxh_u64 const input_lo = XXH_readLE64(input);
        xxh_u64 const input_hi = XXH_readLE64(input+8);
        /* Mix 16 input bytes against 16 secret bytes via a folded 128-bit multiply. */
        return XXH3_mul128_fold64(
            input_lo ^ (XXH_readLE64(secret)   + seed64),
            input_hi ^ (XXH_readLE64(secret+8) - seed64)
        );
    }
}

/* For mid range keys, XXH3 uses a Mum-hash variant. */
XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                     XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   xxh_u64 acc = len * XXH_PRIME64_1;
#if XXH_SIZE_OPT >= 1
        /* Smaller and cleaner, but slightly slower. */
        unsigned int i = (unsigned int)(len - 1) / 32;
        do {
            acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
            acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
        } while (i-- != 0);
#else
        /* Unrolled: mix pairs of 16-byte blocks from both ends inward. */
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc += XXH3_mix16B(input+48, secret+96, seed);
                    acc += XXH3_mix16B(input+len-64, secret+112, seed);
                }
                acc += XXH3_mix16B(input+32, secret+64, seed);
                acc += XXH3_mix16B(input+len-48, secret+80, seed);
            }
            acc += XXH3_mix16B(input+16, secret+32, seed);
            acc += XXH3_mix16B(input+len-32, secret+48, seed);
        }
        acc += XXH3_mix16B(input+0, secret+0, seed);
        acc += XXH3_mix16B(input+len-16, secret+16, seed);
#endif
        return XXH3_avalanche(acc);
    }
}

/*!
 * @brief Maximum size of "short" key in bytes.
 */
#define XXH3_MIDSIZE_MAX 240

XXH_NO_INLINE XXH_PUREF XXH64_hash_t
XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

    #define XXH3_MIDSIZE_STARTOFFSET 3
    #define XXH3_MIDSIZE_LASTOFFSET  17

    {   xxh_u64 acc = len * XXH_PRIME64_1;
        xxh_u64 acc_end;
        unsigned int const nbRounds = (unsigned int)len / 16;
        unsigned int i;
        XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
        /* First 128 bytes: fixed 8 rounds. */
        for (i=0; i<8; i++) {
            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
        }
        /* last bytes */
        acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
        XXH_ASSERT(nbRounds >= 8);
        acc = XXH3_avalanche(acc);
#if defined(__clang__)                                /* Clang */ \
    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
        /*
         * UGLY HACK:
         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
         * In everywhere else, it uses scalar code.
         *
         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
         * would still be slower than UMAAL (see XXH_mult64to128).
         *
         * Unfortunately, Clang doesn't handle the long multiplies properly and
         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
         * scalarized into an ugly mess of VMOV.32 instructions.
         *
         * This mess is difficult to avoid without turning autovectorization
         * off completely, but they are usually relatively minor and/or not
         * worth it to fix.
         *
         * This loop is the easiest to fix, as unlike XXH32, this pragma
         * _actually works_ because it is a loop vectorization instead of an
         * SLP vectorization.
         */
        #pragma clang loop vectorize(disable)
#endif
        /* Remaining 16-byte rounds, offset into the secret by STARTOFFSET. */
        for (i=8 ; i < nbRounds; i++) {
            /*
             * Prevents clang for unrolling the acc loop and interleaving with this one.
             */
            XXH_COMPILER_GUARD(acc);
            acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
        }
        return XXH3_avalanche(acc + acc_end);
    }
}


/* =======     Long Keys     ======= */

#define XXH_STRIPE_LEN 64
#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))

#ifdef XXH_OLD_NAMES
#  define STRIPE_LEN XXH_STRIPE_LEN
#  define ACC_NB XXH_ACC_NB
#endif

#ifndef XXH_PREFETCH_DIST
#  ifdef __clang__
#    define XXH_PREFETCH_DIST 320
#  else
#    if (XXH_VECTOR == XXH_AVX512)
#      define XXH_PREFETCH_DIST 512
#    else
#      define XXH_PREFETCH_DIST 384
#    endif
#  endif  /* __clang__ */
#endif  /* XXH_PREFETCH_DIST */

/*
 * These macros are to generate an XXH3_accumulate() function.
 * The two arguments select the name suffix and target attribute.
 *
 * The name of this symbol is XXH3_accumulate_() and it calls
 * XXH3_accumulate_512_().
 *
 * It may be useful to hand implement this function if the compiler fails to
 * optimize the inline function.
*/
#define XXH3_ACCUMULATE_TEMPLATE(name)                      \
void                                                        \
XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc,           \
                       const xxh_u8* XXH_RESTRICT input,    \
                       const xxh_u8* XXH_RESTRICT secret,   \
                       size_t nbStripes)                    \
{                                                           \
    size_t n;                                               \
    for (n = 0; n < nbStripes; n++ ) {                      \
        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;  \
        /* Prefetch ahead to hide memory latency. */        \
        XXH_PREFETCH(in + XXH_PREFETCH_DIST);               \
        XXH3_accumulate_512_##name(                         \
                 acc,                                       \
                 in,                                        \
                 secret + n*XXH_SECRET_CONSUME_RATE);       \
    }                                                       \
}


/* Stores a 64-bit value at dst in little-endian order, swapping on BE hosts. */
XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
{
    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
    XXH_memcpy(dst, &v64, sizeof(v64));
}

/* Several intrinsic functions below are supposed to accept __int64 as argument,
 * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
 * However, several environments do not define __int64 type,
 * requiring a workaround.
 */
#if !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
    typedef int64_t xxh_i64;
#else
    /* the following type must have a width of 64-bit */
    typedef long long xxh_i64;
#endif


/*
 * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
 *
 * It is a hardened version of UMAC, based off of FARSH's implementation.
 *
 * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
 * implementations, and it is ridiculously fast.
 *
 * We harden it by mixing the original input to the accumulators as well as the product.
 *
 * This means that in the (relatively likely) case of a multiply by zero, the
 * original input is preserved.
 *
 * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
 * cross-pollination, as otherwise the upper and lower halves would be
 * essentially independent.
 *
 * This doesn't matter on 64-bit hashes since they all get merged together in
 * the end, so we skip the extra step.
*
* Both XXH3_64bits and XXH3_128bits use this subroutine.
*/

#if (XXH_VECTOR == XXH_AVX512) \
     || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)

#ifndef XXH_TARGET_AVX512
# define XXH_TARGET_AVX512  /* disable attribute target */
#endif

XXH_FORCE_INLINE XXH_TARGET_AVX512 void
XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
                     const void* XXH_RESTRICT input,
                     const void* XXH_RESTRICT secret)
{
    __m512i* const xacc = (__m512i *) acc;
    XXH_ASSERT((((size_t)acc) & 63) == 0);
    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));

    {
        /* data_vec    = input[0]; */
        __m512i const data_vec    = _mm512_loadu_si512   (input);
        /* key_vec     = secret[0]; */
        __m512i const key_vec     = _mm512_loadu_si512   (secret);
        /* data_key    = data_vec ^ key_vec; */
        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
        /* data_key_lo = data_key >> 32; */
        __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);
        /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
        __m512i const product     = _mm512_mul_epu32     (data_key, data_key_lo);
        /* xacc[0] += swap(data_vec); */
        __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
        __m512i const sum       = _mm512_add_epi64(*xacc, data_swap);
        /* xacc[0] += product; */
        *xacc = _mm512_add_epi64(product, sum);
    }
}
XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)

/*
 * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
 *
 * Multiplication isn't perfect, as explained by Google in HighwayHash:
 *
 *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
 *  // varying degrees. In descending order of goodness, bytes
 *  // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
 *  // As expected, the upper and lower bytes are much worse.
 *
 * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
 *
 * Since our algorithm uses a pseudorandom secret to add some variance into the
 * mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
 *
 * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
 * extraction.
 *
 * Both XXH3_64bits and XXH3_128bits use this subroutine.
 */

XXH_FORCE_INLINE XXH_TARGET_AVX512 void
XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 63) == 0);
    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
    {   __m512i* const xacc = (__m512i*) acc;
        const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);

        /* xacc[0] ^= (xacc[0] >> 47) */
        __m512i const acc_vec     = *xacc;
        __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
        /* xacc[0] ^= secret; */
        __m512i const key_vec     = _mm512_loadu_si512   (secret);
        /* One ternary-logic op computes key_vec ^ acc_vec ^ shifted. */
        __m512i const data_key    = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);

        /* xacc[0] *= XXH_PRIME32_1; */
        __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);
        __m512i const prod_lo     = _mm512_mul_epu32     (data_key, prime32);
        __m512i const prod_hi     = _mm512_mul_epu32     (data_key_hi, prime32);
        *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
    }
}

XXH_FORCE_INLINE XXH_TARGET_AVX512 void
XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
{
    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
    XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
    XXH_ASSERT(((size_t)customSecret & 63) == 0);
    (void)(&XXH_writeLE64);
    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
        __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);
        /* Alternate +seed / -seed across 64-bit lanes (mask 0xAA selects odd lanes). */
        __m512i const seed     = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);

        const __m512i* const src  = (const __m512i*) ((const void*) XXH3_kSecret);
              __m512i* const dest = (      __m512i*) customSecret;
        int i;
        XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
        XXH_ASSERT(((size_t)dest & 63) == 0);
        for (i=0; i < nbRounds; ++i) {
            dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);
    }   }
}

#endif

#if (XXH_VECTOR == XXH_AVX2) \
    || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)

#ifndef XXH_TARGET_AVX2
# define XXH_TARGET_AVX2  /* disable attribute target */
#endif

XXH_FORCE_INLINE XXH_TARGET_AVX2 void
XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   __m256i* const xacc    =       (__m256i *) acc;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm256_loadu_si256 requires  a const __m256i * pointer for some reason. */
        const         __m256i* const xinput  = (const __m256i *) input;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
        const         __m256i* const xsecret = (const __m256i *) secret;

        size_t i;
        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
            /* data_vec    = xinput[i]; */
            __m256i const data_vec    = _mm256_loadu_si256    (xinput+i);
            /* key_vec     = xsecret[i]; */
            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
            /* data_key    = data_vec ^ key_vec; */
            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
            /* data_key_lo = data_key >> 32; */
            __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
            __m256i const product     = _mm256_mul_epu32     (data_key, data_key_lo);
            /* xacc[i] += swap(data_vec); */
            __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
            __m256i const sum       = _mm256_add_epi64(xacc[i], data_swap);
            /* xacc[i] += product; */
            xacc[i] = _mm256_add_epi64(product, sum);
    }   }
}
XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)

XXH_FORCE_INLINE XXH_TARGET_AVX2 void
XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   __m256i* const xacc = (__m256i*) acc;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
        const         __m256i* const xsecret = (const __m256i *) secret;
        const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);

        size_t i;
        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m256i const acc_vec     = xacc[i];
            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
            /* xacc[i] ^= xsecret; */
            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);

            /* xacc[i] *= XXH_PRIME32_1; */
            __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
        }
    }
}

XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
{
    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
    XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
    (void)(&XXH_writeLE64);
    XXH_PREFETCH(customSecret);
    {   /* Lanes alternate +seed64 / -seed64. */
        __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);

        const __m256i* const src  = (const __m256i*) ((const void*) XXH3_kSecret);
              __m256i*       dest = (      __m256i*) customSecret;

#       if defined(__GNUC__) || defined(__clang__)
        /*
         * On GCC & Clang, marking 'dest' as modified will cause the compiler:
         *   - do not extract the secret from sse registers in the internal loop
         *   - use less common registers, and avoid pushing these reg into stack
         */
        XXH_COMPILER_GUARD(dest);
#       endif
        XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */
        XXH_ASSERT(((size_t)dest & 31) == 0);

        /* GCC -O2 need unroll loop manually */
        dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
        dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
        dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
        dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
        dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
        dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);
    }
}

#endif

/* x86dispatch always generates SSE2 */
#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)

#ifndef XXH_TARGET_SSE2
# define XXH_TARGET_SSE2  /* disable attribute target */
#endif

XXH_FORCE_INLINE XXH_TARGET_SSE2 void
XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret)
{
    /* SSE2 is just a half-scale version of the AVX2 version. */
    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   __m128i* const xacc    =       (__m128i *) acc;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
        const         __m128i* const xinput  = (const __m128i *) input;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
        const         __m128i* const xsecret = (const __m128i *) secret;

        size_t i;
        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
            /* data_vec    = xinput[i]; */
            __m128i const data_vec    = _mm_loadu_si128   (xinput+i);
            /* key_vec     = xsecret[i]; */
            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
            /* data_key    = data_vec ^ key_vec; */
            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
            /* data_key_lo = data_key >> 32; */
            __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
            __m128i const product     = _mm_mul_epu32     (data_key, data_key_lo);
            /* xacc[i] += swap(data_vec); */
            __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
            __m128i const sum       = _mm_add_epi64(xacc[i], data_swap);
            /* xacc[i] += product; */
            xacc[i] = _mm_add_epi64(product, sum);
    }   }
}
XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)

XXH_FORCE_INLINE XXH_TARGET_SSE2 void
XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   __m128i* const xacc = (__m128i*) acc;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
        const         __m128i* const xsecret = (const __m128i *) secret;
        const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);

        size_t i;
        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m128i const acc_vec     = xacc[i];
            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
            /* xacc[i] ^= xsecret[i]; */
            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);

            /* xacc[i] *= XXH_PRIME32_1; */
            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
        }
    }
}

XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
{
    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
    (void)(&XXH_writeLE64);
    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);

#       if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
        /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */
        XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) };
        __m128i const seed = _mm_load_si128((__m128i const*)seed64x2);
#       else
        __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
#       endif
        int i;

        const void* const src16 = XXH3_kSecret;
        __m128i* dst16 = (__m128i*) customSecret;
#       if defined(__GNUC__) || defined(__clang__)
        /*
         * On GCC & Clang, marking 'dest' as modified will cause the compiler:
         *   - do not extract the secret from sse registers in the internal loop
         *   - use less common registers, and avoid pushing these reg into stack
         */
        XXH_COMPILER_GUARD(dst16);
#       endif
        XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */
        XXH_ASSERT(((size_t)dst16 & 15) == 0);

        for (i=0; i < nbRounds; ++i) {
            dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed);
    }   }
}

#endif

#if (XXH_VECTOR == XXH_NEON)

/* forward declarations for the scalar routines */
XXH_FORCE_INLINE void
XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,
                 void const* XXH_RESTRICT secret, size_t lane);

XXH_FORCE_INLINE void
XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
                         void const* XXH_RESTRICT secret, size_t lane);

/*!
 * @internal
 * @brief The bulk processing loop for NEON and WASM SIMD128.
 *
 * The NEON code path is actually partially scalar when running on AArch64. This
 * is to optimize the pipelining and can have up to 15% speedup depending on the
 * CPU, and it also mitigates some GCC codegen issues.
 *
 * @see XXH3_NEON_LANES for configuring this and details about this optimization.
 *
 * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit
 * integers instead of the other platforms which mask full 64-bit vectors,
 * so the setup is more complicated than just shifting right.
 *
 * Additionally, there is an optimization for 4 lanes at once noted below.
 *
 * Since, as stated, the most optimal amount of lanes for Cortexes is 6,
 * there needs to be *three* versions of the accumulate operation used
 * for the remaining 2 lanes.
 *
 * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap
 * nearly perfectly.
+ */ + +XXH_FORCE_INLINE void +XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0); + { /* GCC for darwin arm64 does not like aliasing here */ + xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ + uint8_t const* xinput = (const uint8_t *) input; + uint8_t const* xsecret = (const uint8_t *) secret; + + size_t i; +#ifdef __wasm_simd128__ + /* + * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret + * is constant propagated, which results in it converting it to this + * inside the loop: + * + * a = v128.load(XXH3_kSecret + 0 + $secret_offset, offset = 0) + * b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0) + * ... + * + * This requires a full 32-bit address immediate (and therefore a 6 byte + * instruction) as well as an add for each offset. + * + * Putting an asm guard prevents it from folding (at the cost of losing + * the alignment hint), and uses the free offset in `v128.load` instead + * of adding secret_offset each time which overall reduces code size by + * about a kilobyte and improves performance. + */ + XXH_COMPILER_GUARD(xsecret); +#endif + /* Scalar lanes use the normal scalarRound routine */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } + i = 0; + /* 4 NEON lanes at a time. 
*/ + for (; i+1 < XXH3_NEON_LANES / 2; i+=2) { + /* data_vec = xinput[i]; */ + uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput + (i * 16)); + uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput + ((i+1) * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec_1 = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t key_vec_2 = XXH_vld1q_u64(xsecret + ((i+1) * 16)); + /* data_swap = swap(data_vec) */ + uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1); + uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1); + uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2); + + /* + * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a + * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to + * get one vector with the low 32 bits of each lane, and one vector + * with the high 32 bits of each lane. + * + * The intrinsic returns a double vector because the original ARMv7-a + * instruction modified both arguments in place. AArch64 and SIMD128 emit + * two instructions from this intrinsic. + * + * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ] + * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ] + */ + uint32x4x2_t unzipped = vuzpq_u32( + vreinterpretq_u32_u64(data_key_1), + vreinterpretq_u32_u64(data_key_2) + ); + /* data_key_lo = data_key & 0xFFFFFFFF */ + uint32x4_t data_key_lo = unzipped.val[0]; + /* data_key_hi = data_key >> 32 */ + uint32x4_t data_key_hi = unzipped.val[1]; + /* + * Then, we can split the vectors horizontally and multiply which, as for most + * widening intrinsics, have a variant that works on both high half vectors + * for free on AArch64. A similar instruction is available on SIMD128. 
+ * + * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi + */ + uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi); + uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi); + /* + * Clang reorders + * a += b * c; // umlal swap.2d, dkl.2s, dkh.2s + * c += a; // add acc.2d, acc.2d, swap.2d + * to + * c += a; // add acc.2d, acc.2d, swap.2d + * c += b * c; // umlal acc.2d, dkl.2s, dkh.2s + * + * While it would make sense in theory since the addition is faster, + * for reasons likely related to umlal being limited to certain NEON + * pipelines, this is worse. A compiler guard fixes this. + */ + XXH_COMPILER_GUARD_CLANG_NEON(sum_1); + XXH_COMPILER_GUARD_CLANG_NEON(sum_2); + /* xacc[i] = acc_vec + sum; */ + xacc[i] = vaddq_u64(xacc[i], sum_1); + xacc[i+1] = vaddq_u64(xacc[i+1], sum_2); + } + /* Operate on the remaining NEON lanes 2 at a time. */ + for (; i < XXH3_NEON_LANES / 2; i++) { + /* data_vec = xinput[i]; */ + uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key = veorq_u64(data_vec, key_vec); + /* For two lanes, just use VMOVN and VSHRN. 
*/ + /* data_key_lo = data_key & 0xFFFFFFFF; */ + uint32x2_t data_key_lo = vmovn_u64(data_key); + /* data_key_hi = data_key >> 32; */ + uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32); + /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */ + uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi); + /* Same Clang workaround as before */ + XXH_COMPILER_GUARD_CLANG_NEON(sum); + /* xacc[i] = acc_vec + sum; */ + xacc[i] = vaddq_u64 (xacc[i], sum); + } + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + + size_t i; + /* WASM uses operator overloads and doesn't need these. */ +#ifndef __wasm_simd128__ + /* { prime32_1, prime32_1 } */ + uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1); + /* { 0, prime32_1, 0, prime32_1 } */ + uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32)); +#endif + + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } + for (i=0; i < XXH3_NEON_LANES / 2; i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64(acc_vec, 47); + uint64x2_t data_vec = veorq_u64(acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, key_vec); + /* xacc[i] *= XXH_PRIME32_1 */ +#ifdef __wasm_simd128__ + /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */ + xacc[i] = data_key * XXH_PRIME32_1; +#else + /* + * Expanded version with portable NEON intrinsics + * + * lo(x) * lo(y) + (hi(x) * lo(y) << 32) + * + * prod_hi = hi(data_key) * lo(prime) << 32 + * + 
* Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector + * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits + * and avoid the shift. + */ + uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi); + /* Extract low bits for vmlal_u32 */ + uint32x2_t data_key_lo = vmovn_u64(data_key); + /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */ + xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo); +#endif + } + } +} +#endif + +#if (XXH_VECTOR == XXH_VSX) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* presumed aligned */ + xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; + xxh_u8 const* const xinput = (xxh_u8 const*) input; /* no alignment restriction */ + xxh_u8 const* const xsecret = (xxh_u8 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + /* acc_vec = xacc[i]; */ + xxh_u64x2 acc_vec = xacc[i]; + acc_vec += product; + + /* swap high and low halves */ +#ifdef __s390x__ + acc_vec += vec_permi(data_vec, data_vec, 2); +#else + acc_vec += vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] = acc_vec; + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const 
void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; + const xxh_u8* const xsecret = (const xxh_u8*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= XXH_PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_SVE) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_sve( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc); + ACCRND(vacc, 0); + svst1_u64(mask, xacc, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = 
svld1_u64(mask, xacc + 6); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } +} + +XXH_FORCE_INLINE void +XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes) +{ + if (nbStripes != 0) { + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc + 0); + do { + /* svprfd(svbool_t, void *, enum svfprop); */ + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(vacc, 0); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = 
svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } + } +} + +#endif + +/* scalar variants - universal */ + +#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) +/* + * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they + * emit an excess mask and a full 64-bit multiply-add (MADD X-form). + * + * While this might not seem like much, as AArch64 is a 64-bit architecture, only + * big Cortex designs have a full 64-bit multiplier. + * + * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit + * multiplies expand to 2-3 multiplies in microcode. This has a major penalty + * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline. + * + * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does + * not have this penalty and does the mask automatically. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) +{ + xxh_u64 ret; + /* note: %x = 64-bit register, %w = 32-bit register */ + __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc)); + return ret; +} +#else +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) +{ + return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc; +} +#endif + +/*! + * @internal + * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. 
+ */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* xacc = (xxh_u64*) acc; + xxh_u8 const* xinput = (xxh_u8 const*) input; + xxh_u8 const* xsecret = (xxh_u8 const*) secret; + XXH_ASSERT(lane < XXH_ACC_NB); + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + { + xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); + xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ + xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]); + } +} + +/*! + * @internal + * @brief Processes a 64 byte block of data using the scalar path. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + size_t i; + /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__arm__) || defined(__thumb2__)) \ + && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \ + && XXH_SIZE_OPT <= 0 +# pragma GCC unroll 8 +#endif + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) + +/*! + * @internal + * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. 
+ */ +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + XXH_ASSERT(lane < XXH_ACC_NB); + { + xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8); + xxh_u64 acc64 = xacc[lane]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= XXH_PRIME32_1; + xacc[lane] = acc64; + } +} + +/*! + * @internal + * @brief Scrambles the accumulators after a large chunk has been read + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + size_t i; + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } +} + +XXH_FORCE_INLINE void +XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + /* + * We need a separate pointer for the hack below, + * which requires a non-const pointer. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8* kSecretPtr = XXH3_kSecret; + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__GNUC__) && defined(__aarch64__) + /* + * UGLY HACK: + * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), it fights for bandwidth with + * the arithmetic instructions. + * + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes the compiler to assume + * that XXH3_kSecretPtr has been changed), the pipelines are used more + * efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * + * See XXH3_NEON_LANES for details on the pipsline. 
+ * + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + XXH_COMPILER_GUARD(kSecretPtr); +#endif + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes the compiler to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); + XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); + } } +} + + +typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t); +typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); +typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); + + +#if (XXH_VECTOR == XXH_AVX512) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx512 +#define XXH3_accumulate XXH3_accumulate_avx512 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 + +#elif (XXH_VECTOR == XXH_AVX2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx2 +#define XXH3_accumulate XXH3_accumulate_avx2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 + +#elif (XXH_VECTOR == XXH_SSE2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_sse2 +#define XXH3_accumulate XXH3_accumulate_sse2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 + +#elif (XXH_VECTOR == XXH_NEON) + +#define XXH3_accumulate_512 XXH3_accumulate_512_neon +#define XXH3_accumulate XXH3_accumulate_neon +#define XXH3_scrambleAcc XXH3_scrambleAcc_neon +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + 
+#elif (XXH_VECTOR == XXH_VSX) + +#define XXH3_accumulate_512 XXH3_accumulate_512_vsx +#define XXH3_accumulate XXH3_accumulate_vsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_SVE) +#define XXH3_accumulate_512 XXH3_accumulate_512_sve +#define XXH3_accumulate XXH3_accumulate_sve +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#else /* scalar */ + +#define XXH3_accumulate_512 XXH3_accumulate_512_scalar +#define XXH3_accumulate XXH3_accumulate_scalar +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#endif + +#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */ +# undef XXH3_initCustomSecret +# define XXH3_initCustomSecret XXH3_initCustomSecret_scalar +#endif + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; + size_t const nb_blocks = (len - 1) / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + f_acc(acc, input + n*block_len, secret, nbStripesPerBlock); + f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > XXH_STRIPE_LEN); + { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + f_acc(acc, input + nb_blocks*block_len, secret, nbStripes); + + /* last stripe */ + { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; +#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret 
is different from acc & scrambler */ + XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. + * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + XXH_COMPILER_GUARD(result64); +#endif + } + + return XXH3_avalanche(result64); +} + +#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ + XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, + const void* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + /* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + XXH_ASSERT(secretSize >= sizeof(acc) + 
XXH_SECRET_MERGEACCS_START); + return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); +} + +/* + * It's important for performance to transmit secret's size (when it's static) + * so that the compiler can properly optimize the vectorized loop. + * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set. + * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE + * breaks -Og, this is XXH_NO_INLINE. + */ +XXH3_WITH_SECRET_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * It's preferable for performance that XXH3_hashLong is not inlined, + * as it results in a smaller function for small data, easier to the instruction cache. + * Note that inside this no_inline function, we do inline the internal loop, + * and provide a statically defined secret size to allow optimization of vector loop. + */ +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default XXH3_kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, + XXH64_hash_t seed, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ +#if XXH_SIZE_OPT <= 0 + if (seed == 0) + return XXH3_hashLong_64b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc, f_scramble); +#endif + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), + f_acc, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_64b_withSeed_internal(input, len, seed, + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + + +typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong64_f f_hashLong) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secretLen` condition is not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + * Also, note that function signature doesn't offer room to return an error. 
+ */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); +} + + +/* === Public entry point === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length) +{ + return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed) +{ + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); +} + +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + if (length <= XXH3_MIDSIZE_MAX) + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize); +} + + +/* === XXH3 streaming === */ +#ifndef XXH_NO_STREAM +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. 
This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. + * Credit to RedSpah for noticing this bug. + * + * The alignment is done manually: Functions like posix_memalign or _mm_malloc + * are avoided: To maintain portability, we would have to write a fallback + * like this anyways, and besides, testing for the existence of library + * functions without relying on external build tools is impossible. + * + * The method is simple: Overallocate, manually align, and store the offset + * to the original behind the returned pointer. + * + * Align must be a power of 2 and 8 <= align <= 128. + */ +static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align) +{ + XXH_ASSERT(align <= 128 && align >= 8); /* range check */ + XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ + XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ + { /* Overallocate to make room for manual realignment and an offset byte */ + xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); + if (base != NULL) { + /* + * Get the offset needed to align this pointer. + * + * Even if the returned pointer is aligned, there will always be + * at least one byte to store the offset to the original pointer. + */ + size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ + /* Add the offset for the now-aligned pointer */ + xxh_u8* ptr = base + offset; + + XXH_ASSERT((size_t)ptr % align == 0); + + /* Store the offset immediately before the returned pointer. */ + ptr[-1] = (xxh_u8)offset; + return ptr; + } + return NULL; + } +} +/* + * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass + * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. 
+ */ +static void XXH_alignedFree(void* p) +{ + if (p != NULL) { + xxh_u8* ptr = (xxh_u8*)p; + /* Get the offset byte we added in XXH_malloc. */ + xxh_u8 offset = ptr[-1]; + /* Free the original malloc'd pointer */ + xxh_u8* base = ptr - offset; + XXH_free(base); + } +} +/*! @ingroup XXH3_family */ +/*! + * @brief Allocate an @ref XXH3_state_t. + * + * @return An allocated pointer of @ref XXH3_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH3_freeState(). + */ +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) +{ + XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); + if (state==NULL) return NULL; + XXH3_INITSTATE(state); + return state; +} + +/*! @ingroup XXH3_family */ +/*! + * @brief Frees an @ref XXH3_state_t. + * + * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). + * + * @return @ref XXH_OK. + * + * @note Must be allocated with XXH3_createState(). + */ +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) +{ + XXH_alignedFree(statePtr); + return XXH_OK; +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API void +XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state) +{ + XXH_memcpy(dst_state, src_state, sizeof(*dst_state)); +} + +static void +XXH3_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const void* secret, size_t secretSize) +{ + size_t const initStart = offsetof(XXH3_state_t, bufferedSize); + size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; + XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); + XXH_ASSERT(statePtr != NULL); + /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ + memset((char*)statePtr + initStart, 0, initLength); + statePtr->acc[0] = XXH_PRIME32_3; + statePtr->acc[1] = XXH_PRIME64_1; + statePtr->acc[2] = XXH_PRIME64_2; + statePtr->acc[3] = XXH_PRIME64_3; + statePtr->acc[4] = XXH_PRIME64_4; + statePtr->acc[5] = XXH_PRIME32_2; + statePtr->acc[6] = XXH_PRIME64_5; + statePtr->acc[7] = XXH_PRIME32_1; + statePtr->seed = seed; + statePtr->useSeed = (seed != 0); + statePtr->extSecret = (const unsigned char*)secret; + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; + statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + if (seed==0) return XXH3_64bits_reset(statePtr); + if ((seed != statePtr->seed) || (statePtr->extSecret != NULL)) + XXH3_initCustomSecret(statePtr->customSecret, seed); + XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64) +{ + if (statePtr == NULL) return XXH_ERROR; + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + XXH3_reset_internal(statePtr, seed64, secret, secretSize); + statePtr->useSeed = 1; /* always, even if seed64==0 */ + return XXH_OK; +} + +/*! + * @internal + * @brief Processes a large input for XXH3_update() and XXH3_digest_long(). + * + * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block. 
+ * + * @param acc Pointer to the 8 accumulator lanes + * @param nbStripesSoFarPtr In/out pointer to the number of leftover stripes in the block* + * @param nbStripesPerBlock Number of stripes in a block + * @param input Input pointer + * @param nbStripes Number of stripes to process + * @param secret Secret pointer + * @param secretLimit Offset of the last block in @p secret + * @param f_acc Pointer to an XXH3_accumulate implementation + * @param f_scramble Pointer to an XXH3_scrambleAcc implementation + * @return Pointer past the end of @p input after processing + */ +XXH_FORCE_INLINE const xxh_u8 * +XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, + size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, + const xxh_u8* XXH_RESTRICT input, size_t nbStripes, + const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE; + /* Process full blocks */ + if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) { + /* Process the initial partial block... 
*/ + size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr; + + do { + /* Accumulate and scramble */ + f_acc(acc, input, initialSecret, nbStripesThisIter); + f_scramble(acc, secret + secretLimit); + input += nbStripesThisIter * XXH_STRIPE_LEN; + nbStripes -= nbStripesThisIter; + /* Then continue the loop with the full block size */ + nbStripesThisIter = nbStripesPerBlock; + initialSecret = secret; + } while (nbStripes >= nbStripesPerBlock); + *nbStripesSoFarPtr = 0; + } + /* Process a partial block */ + if (nbStripes > 0) { + f_acc(acc, input, initialSecret, nbStripes); + input += nbStripes * XXH_STRIPE_LEN; + *nbStripesSoFarPtr += nbStripes; + } + /* Return end pointer */ + return input; +} + +#ifndef XXH3_STREAM_USE_STACK +# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */ +# define XXH3_STREAM_USE_STACK 1 +# endif +#endif +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* XXH_RESTRICT const state, + const xxh_u8* XXH_RESTRICT input, size_t len, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + XXH_ASSERT(state != NULL); + { const xxh_u8* const bEnd = input + len; + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; +#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* For some reason, gcc and MSVC seem to suffer greatly + * when operating accumulators directly into state. + * Operating into stack space seems to enable proper optimization. 
+ * clang, on the other hand, doesn't seem to need this trick */ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; + XXH_memcpy(acc, state->acc, sizeof(acc)); +#else + xxh_u64* XXH_RESTRICT const acc = state->acc; +#endif + state->totalLen += len; + XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); + + /* small input : just fill in tmp buffer */ + if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) { + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + + /* total input is now > XXH3_INTERNALBUFFER_SIZE */ + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ + + /* + * Internal buffer is partially filled (always, except at beginning) + * Complete it, then consume it. + */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + secret, state->secretLimit, + f_acc, f_scramble); + state->bufferedSize = 0; + } + XXH_ASSERT(input < bEnd); + if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { + size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; + input = XXH3_consumeStripes(acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, nbStripes, + secret, state->secretLimit, + f_acc, f_scramble); + XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); + + } + /* Some remaining input (always) : buffer it */ + XXH_ASSERT(input < bEnd); + XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); + XXH_ASSERT(state->bufferedSize == 0); + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); +#if 
defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* save stack accumulators into state */ + XXH_memcpy(state->acc, acc, sizeof(acc)); +#endif + } + + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, + XXH3_accumulate, XXH3_scrambleAcc); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, + const XXH3_state_t* state, + const unsigned char* secret) +{ + xxh_u8 lastStripe[XXH_STRIPE_LEN]; + const xxh_u8* lastStripePtr; + + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + XXH_memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= XXH_STRIPE_LEN) { + /* Consume remaining stripes then point to remaining data in buffer */ + size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; + size_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, nbStripes, + secret, state->secretLimit, + XXH3_accumulate, XXH3_scrambleAcc); + lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN; + } else { /* bufferedSize < XXH_STRIPE_LEN */ + /* Copy to temp buffer */ + size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; + XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ + XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + lastStripePtr = lastStripe; + } + /* Last stripe */ + XXH3_accumulate_512(acc, + lastStripePtr, + secret + state->secretLimit - XXH_SECRET_LASTACC_START); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + return XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); + } + /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ + if (state->useSeed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ + + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. + * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. 
*/ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + XXH128_hash_t h128; + h128.low64 = XXH64_avalanche(keyed_lo); + h128.high64 = XXH64_avalanche(keyed_hi); + return h128; + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. 
*/ + XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= PRIME_MX2; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. + */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. 
+ * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = XXH_PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); + h128.high64 += m128.high64 * XXH_PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH64_avalanche(seed ^ bitflipl); + h128.high64 = XXH64_avalanche( seed ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
+ */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + +#if XXH_SIZE_OPT >= 1 + { + /* Smaller, but slightly slower. */ + unsigned int i = (unsigned int)(len - 1) / 32; + do { + acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed); + } while (i-- != 0); + } +#else + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); +#endif + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); 
+ + { XXH128_hash_t acc; + unsigned i; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + /* + * We set as `i` as offset + 32. We do this so that unchanged + * `len` can be used as upper bound. This reaches a sweet spot + * where both x86 and aarch64 get simple agen and good codegen + * for the loop. + */ + for (i = 32; i < 160; i += 32) { + acc = XXH128_mix32B(acc, + input + i - 32, + input + i - 16, + secret + i - 32, + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + /* + * NB: `i <= len` will duplicate the last 32-bytes if + * len % 32 was zero. This is an unfortunate necessity to keep + * the hash result stable. + */ + for (i=160; i <= len; i += 32) { + acc = XXH128_mix32B(acc, + input + i - 32, + input + i - 16, + secret + XXH3_MIDSIZE_STARTOFFSET + i - 160, + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + (XXH64_hash_t)0 - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * 
XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * XXH_PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong() is not inlined. + */ +XXH_NO_INLINE XXH_PUREF XXH128_hash_t +XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * It's important for performance to pass @p secretLen (when it's static) + * to the compiler, so that it can properly optimize the vectorized loop. + * + * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE + * breaks -Og, this is XXH_NO_INLINE. + */ +XXH3_WITH_SECRET_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, + XXH3_accumulate, XXH3_scrambleAcc); +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ + if (seed64 == 0) + return XXH3_hashLong_128b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc, f_scramble); + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed64); + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), + f_acc, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
+ */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + +typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const void* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_128bits_internal(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong128_f f_hl128) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hl128(input, len, seed64, secret, secretLen); +} + + +/* === Public XXH128 API === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_128bits_internal(input, len, 0, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_default); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_128bits_internal(input, len, 0, + (const xxh_u8*)secret, secretSize, + XXH3_hashLong_128b_withSecret); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_internal(input, len, seed, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_withSeed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_withSeed(input, len, seed); +} + + +/* === XXH3 128-bit streaming === */ +#ifndef XXH_NO_STREAM +/* + * All initialization and update functions are identical to 64-bit streaming variant. + * The only difference is the finalization routine. + */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) +{ + return XXH3_64bits_reset(statePtr); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSeed(statePtr, seed); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_64bits_update(state, input, len); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + state->secretLimit + XXH_STRIPE_LEN + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)state->totalLen * XXH_PRIME64_2)); + return h128; + } + } + /* len <= XXH3_MIDSIZE_MAX : short code */ + if (state->seed) + return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ +/* 128-bit utility functions */ + +/* return : 1 is equal, 0 if different */ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) +{ + /* note : XXH128_hash_t is compact, it has no padding byte */ + return !(memcmp(&h1, &h2, sizeof(h1))); +} + +/* This prototype is compatible with stdlib's qsort(). 
+ * @return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + XXH_memcpy(dst, &hash.high64, sizeof(hash.high64)); + XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + + + +/* ========================================== + * Secret generators + * ========================================== + */ +#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) + +XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128) +{ + XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 ); + XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 ); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize) +{ +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(secretBuffer != NULL); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); +#else + /* production mode, assert() are disabled */ + if (secretBuffer == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; +#endif + + if (customSeedSize == 0) { + customSeed = XXH3_kSecret; + customSeedSize = XXH_SECRET_DEFAULT_SIZE; + } +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(customSeed != NULL); +#else + if (customSeed == NULL) return XXH_ERROR; +#endif + + /* Fill secretBuffer with a copy of customSeed - repeat as needed */ + { size_t pos = 0; + while (pos < secretSize) { + size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize); + memcpy((char*)secretBuffer + pos, customSeed, toCopy); + pos += toCopy; + } } + + { size_t const nbSeg16 = secretSize / 16; + size_t n; + XXH128_canonical_t scrambler; + XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); + for (n=0; n4GB) */ #define _FILE_OFFSET_BITS 64 #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */ +# ifndef _LARGEFILE_SOURCE # define _LARGEFILE_SOURCE +# endif #elif ! 
defined(__LP64__) /* No point defining Large file for 64 bit */ +# ifndef _LARGEFILE64_SOURCE # define _LARGEFILE64_SOURCE +# endif #endif @@ -37,17 +42,19 @@ #include /* fprintf, fopen, ftello64 */ #include /* clock */ +#ifndef ZDICT_STATIC_LINKING_ONLY +# define ZDICT_STATIC_LINKING_ONLY +#endif + #include "mem.h" /* read */ #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */ -#define HUF_STATIC_LINKING_ONLY #include "huf.h" /* HUF_buildCTable, HUF_writeCTable */ #include "zstd_internal.h" /* includes zstd.h */ #include "xxhash.h" /* XXH64 */ -#include "divsufsort.h" -#ifndef ZDICT_STATIC_LINKING_ONLY -# define ZDICT_STATIC_LINKING_ONLY -#endif +#include "zstd_compress_internal.h" /* ZSTD_loadCEntropy() */ #include "zdict.h" +#include "divsufsort.h" +#include "bits.h" /* ZSTD_NbCommonBytes */ /*-************************************* @@ -61,15 +68,16 @@ #define NOISELENGTH 32 -static const int g_compressionLevel_default = 3; static const U32 g_selectivity_default = 9; /*-************************************* * Console display ***************************************/ -#define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } -#define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ +#undef DISPLAY +#define DISPLAY(...) do { fprintf(stderr, __VA_ARGS__); fflush( stderr ); } while (0) +#undef DISPLAYLEVEL +#define DISPLAYLEVEL(l, ...) 
do { if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } } while (0) /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */ static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; } @@ -99,69 +107,30 @@ unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize) return MEM_readLE32((const char*)dictBuffer + 4); } - -/*-******************************************************** -* Dictionary training functions -**********************************************************/ -static unsigned ZDICT_NbCommonBytes (size_t val) +size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize) { - if (MEM_isLittleEndian()) { - if (MEM_64bits()) { -# if defined(_MSC_VER) && defined(_WIN64) - unsigned long r = 0; - _BitScanForward64( &r, (U64)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_ctzll((U64)val) >> 3); -# else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; -# endif - } else { /* 32 bits */ -# if defined(_MSC_VER) - unsigned long r=0; - _BitScanForward( &r, (U32)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_ctz((U32)val) >> 3); -# else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif + size_t headerSize; + if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted); + + { ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t)); + U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE); + 
if (!bs || !wksp) { + headerSize = ERROR(memory_allocation); + } else { + ZSTD_reset_compressedBlockState(bs); + headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize); } - } else { /* Big Endian CPU */ - if (MEM_64bits()) { -# if defined(_MSC_VER) && defined(_WIN64) - unsigned long r = 0; - _BitScanReverse64( &r, val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_clzll(val) >> 3); -# else - unsigned r; - const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ - if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -# endif - } else { /* 32 bits */ -# if defined(_MSC_VER) - unsigned long r = 0; - _BitScanReverse( &r, (unsigned long)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_clz((U32)val) >> 3); -# else - unsigned r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; -# endif - } } -} + free(bs); + free(wksp); + } + + return headerSize; +} +/*-******************************************************** +* Dictionary training functions +**********************************************************/ /*! ZDICT_count() : Count the nb of common bytes between 2 pointers. Note : this function presumes end of buffer followed by noisy guard band. 
@@ -176,7 +145,7 @@ static size_t ZDICT_count(const void* pIn, const void* pMatch) pMatch = (const char*)pMatch+sizeof(size_t); continue; } - pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff); + pIn = (const char*)pIn+ZSTD_NbCommonBytes(diff); return (size_t)((const char*)pIn - pStart); } } @@ -208,7 +177,7 @@ static dictItem ZDICT_analyzePos( U32 savings[LLIMIT] = {0}; const BYTE* b = (const BYTE*)buffer; size_t maxLength = LLIMIT; - size_t pos = suffix[start]; + size_t pos = (size_t)suffix[start]; U32 end = start; dictItem solution; @@ -342,7 +311,7 @@ static dictItem ZDICT_analyzePos( savings[i] = savings[i-1] + (lengthList[i] * (i-3)); DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n", - (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / maxLength); + (unsigned)pos, (unsigned)maxLength, (unsigned)savings[maxLength], (double)savings[maxLength] / (double)maxLength); solution.pos = (U32)pos; solution.length = (U32)maxLength; @@ -352,7 +321,7 @@ static dictItem ZDICT_analyzePos( { U32 id; for (id=start; id1) && (table[u-1].savings < elt.savings)) - table[u] = table[u-1], u--; + table[u] = table[u-1], u--; table[u] = elt; return u; } } @@ -415,7 +384,7 @@ static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { /* overlap, existing < new */ /* append */ - int const addedLength = (int)eltEnd - (table[u].pos + table[u].length); + int const addedLength = (int)eltEnd - (int)(table[u].pos + table[u].length); table[u].savings += elt.length / 8; /* rough approx bonus */ if (addedLength > 0) { /* otherwise, elt fully included into existing */ table[u].length += addedLength; @@ -508,10 +477,17 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize, clock_t displayClock = 0; clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10; -# define DISPLAYUPDATE(l, ...) 
if (notificationLevel>=l) { \ - if (ZDICT_clockSpan(displayClock) > refreshRate) \ - { displayClock = clock(); DISPLAY(__VA_ARGS__); \ - if (notificationLevel>=4) fflush(stderr); } } +# undef DISPLAYUPDATE +# define DISPLAYUPDATE(l, ...) \ + do { \ + if (notificationLevel>=l) { \ + if (ZDICT_clockSpan(displayClock) > refreshRate) { \ + displayClock = clock(); \ + DISPLAY(__VA_ARGS__); \ + } \ + if (notificationLevel>=4) fflush(stderr); \ + } \ + } while (0) /* init */ DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ @@ -554,7 +530,7 @@ static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize, if (solution.length==0) { cursor++; continue; } ZDICT_insertDictItem(dictList, dictListSize, solution, buffer); cursor += solution.length; - DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100); + DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / (double)bufferSize * 100.0); } } _cleanup: @@ -588,24 +564,24 @@ typedef struct #define MAXREPOFFSET 1024 -static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params, +static void ZDICT_countEStats(EStats_ress_t esr, const ZSTD_parameters* params, unsigned* countLit, unsigned* offsetcodeCount, unsigned* matchlengthCount, unsigned* litlengthCount, U32* repOffsets, const void* src, size_t srcSize, U32 notificationLevel) { - size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog); + size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params->cParams.windowLog); size_t cSize; if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ - { size_t const errorCode = ZSTD_compressBegin_usingCDict(esr.zc, esr.dict); + { size_t const errorCode = ZSTD_compressBegin_usingCDict_deprecated(esr.zc, esr.dict); if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; } } - cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); + cSize = 
ZSTD_compressBlock_deprecated(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize); return; } if (cSize) { /* if == 0; block is not compressible */ - const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc); + const SeqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc); /* literals stats */ { const BYTE* bytePtr; @@ -633,9 +609,9 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params, } if (nbSeq >= 2) { /* rep offsets */ - const seqDef* const seq = seqStorePtr->sequencesStart; - U32 offset1 = seq[0].offset - 3; - U32 offset2 = seq[1].offset - 3; + const SeqDef* const seq = seqStorePtr->sequencesStart; + U32 offset1 = seq[0].offBase - ZSTD_REP_NUM; + U32 offset2 = seq[1].offBase - ZSTD_REP_NUM; if (offset1 >= MAXREPOFFSET) offset1 = 0; if (offset2 >= MAXREPOFFSET) offset2 = 0; repOffsets[offset1] += 3; @@ -682,7 +658,7 @@ static void ZDICT_flatLit(unsigned* countLit) #define OFFCODE_MAX 30 /* only applicable to first block */ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, - unsigned compressionLevel, + int compressionLevel, const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles, const void* dictBuffer, size_t dictBufferSize, unsigned notificationLevel) @@ -706,6 +682,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles); size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles); BYTE* dstPtr = (BYTE*)dstBuffer; + U32 wksp[HUF_CTABLE_WORKSPACE_SIZE_U32]; /* init */ DEBUGLOG(4, "ZDICT_analyzeEntropy"); @@ -717,7 +694,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, memset(repOffset, 0, sizeof(repOffset)); repOffset[1] = repOffset[4] = repOffset[8] = 1; memset(bestRepOffset, 0, sizeof(bestRepOffset)); - if (compressionLevel==0) compressionLevel = 
g_compressionLevel_default; + if (compressionLevel==0) compressionLevel = ZSTD_CLEVEL_DEFAULT; params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize); esr.dict = ZSTD_createCDict_advanced(dictBuffer, dictBufferSize, ZSTD_dlm_byRef, ZSTD_dct_rawContent, params.cParams, ZSTD_defaultCMem); @@ -731,15 +708,22 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, /* collect stats on all samples */ for (u=0; u= 4) { + /* writeStats */ + DISPLAYLEVEL(4, "Offset Code Frequencies : \n"); + for (u=0; u<=offcodeMax; u++) { + DISPLAYLEVEL(4, "%2u :%7u \n", u, offcodeCount[u]); + } } + /* analyze, build stats, starting with literals */ - { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog); + { size_t maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp)); if (HUF_isError(maxNbBits)) { eSize = maxNbBits; DISPLAYLEVEL(1, " HUF_buildCTable error \n"); @@ -748,7 +732,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */ DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n"); ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */ - maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog); + maxNbBits = HUF_buildCTable_wksp(hufTable, countLit, 255, huffLog, wksp, sizeof(wksp)); assert(maxNbBits==9); } huffLog = (U32)maxNbBits; @@ -762,7 +746,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, /* note : the result of this phase should be used to better appreciate the impact on statistics */ total=0; for (u=0; u<=offcodeMax; u++) total+=offcodeCount[u]; - errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax); + errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, 
total, offcodeMax, /* useLowProbCount */ 1); if (FSE_isError(errorCode)) { eSize = errorCode; DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n"); @@ -771,7 +755,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, Offlog = (U32)errorCode; total=0; for (u=0; u<=MaxML; u++) total+=matchLengthCount[u]; - errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML); + errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML, /* useLowProbCount */ 1); if (FSE_isError(errorCode)) { eSize = errorCode; DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n"); @@ -780,7 +764,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, mlLog = (U32)errorCode; total=0; for (u=0; u<=MaxLL; u++) total+=litLengthCount[u]; - errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL); + errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL, /* useLowProbCount */ 1); if (FSE_isError(errorCode)) { eSize = errorCode; DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n"); @@ -789,7 +773,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, llLog = (U32)errorCode; /* write result to buffer */ - { size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, 255, huffLog); + { size_t const hhSize = HUF_writeCTable_wksp(dstPtr, maxDstSize, hufTable, 255, huffLog, wksp, sizeof(wksp)); if (HUF_isError(hhSize)) { eSize = hhSize; DISPLAYLEVEL(1, "HUF_writeCTable error \n"); @@ -844,7 +828,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, MEM_writeLE32(dstPtr+8, bestRepOffset[2].offset); #else /* at this stage, we don't use the result of "most common first offset", - as the impact of statistics is not properly evaluated */ + * as the impact of statistics is not properly evaluated */ MEM_writeLE32(dstPtr+0, repStartValue[0]); MEM_writeLE32(dstPtr+4, 
repStartValue[1]); MEM_writeLE32(dstPtr+8, repStartValue[2]); @@ -860,6 +844,17 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, } +/** + * @returns the maximum repcode value + */ +static U32 ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM]) +{ + U32 maxRep = reps[0]; + int r; + for (r = 1; r < ZSTD_REP_NUM; ++r) + maxRep = MAX(maxRep, reps[r]); + return maxRep; +} size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, const void* customDictContent, size_t dictContentSize, @@ -869,13 +864,15 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, size_t hSize; #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */ BYTE header[HBUFFSIZE]; - int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel; + int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel; U32 const notificationLevel = params.notificationLevel; + /* The final dictionary content must be at least as large as the largest repcode */ + size_t const minContentSize = (size_t)ZDICT_maxRep(repStartValue); + size_t paddingSize; /* check conditions */ DEBUGLOG(4, "ZDICT_finalizeDictionary"); if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall); - if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong); if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall); /* dictionary header */ @@ -899,12 +896,43 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, hSize += eSize; } - /* copy elements in final buffer ; note : src and dst buffer can overlap */ - if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize; - { size_t const dictSize = hSize + dictContentSize; - char* dictEnd = (char*)dictBuffer + dictSize; - memmove(dictEnd - dictContentSize, customDictContent, dictContentSize); - memcpy(dictBuffer, 
header, hSize); + /* Shrink the content size if it doesn't fit in the buffer */ + if (hSize + dictContentSize > dictBufferCapacity) { + dictContentSize = dictBufferCapacity - hSize; + } + + /* Pad the dictionary content with zeros if it is too small */ + if (dictContentSize < minContentSize) { + RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall, + "dictBufferCapacity too small to fit max repcode"); + paddingSize = minContentSize - dictContentSize; + } else { + paddingSize = 0; + } + + { + size_t const dictSize = hSize + paddingSize + dictContentSize; + + /* The dictionary consists of the header, optional padding, and the content. + * The padding comes before the content because the "best" position in the + * dictionary is the last byte. + */ + BYTE* const outDictHeader = (BYTE*)dictBuffer; + BYTE* const outDictPadding = outDictHeader + hSize; + BYTE* const outDictContent = outDictPadding + paddingSize; + + assert(dictSize <= dictBufferCapacity); + assert(outDictContent + dictContentSize == (BYTE*)dictBuffer + dictSize); + + /* First copy the customDictContent into its final location. + * `customDictContent` and `dictBuffer` may overlap, so we must + * do this before any other writes into the output buffer. + * Then copy the header & padding into the output buffer. + */ + memmove(outDictContent, customDictContent, dictContentSize); + memcpy(outDictHeader, header, hSize); + memset(outDictPadding, 0, paddingSize); + return dictSize; } } @@ -915,7 +943,7 @@ static size_t ZDICT_addEntropyTablesFromBuffer_advanced( const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_params_t params) { - int const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel; + int const compressionLevel = (params.compressionLevel == 0) ? 
ZSTD_CLEVEL_DEFAULT : params.compressionLevel; U32 const notificationLevel = params.notificationLevel; size_t hSize = 8; @@ -944,16 +972,11 @@ static size_t ZDICT_addEntropyTablesFromBuffer_advanced( return MIN(dictBufferCapacity, hSize+dictContentSize); } -/* Hidden declaration for dbio.c */ -size_t ZDICT_trainFromBuffer_unsafe_legacy( - void* dictBuffer, size_t maxDictSize, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, - ZDICT_legacy_params_t params); /*! ZDICT_trainFromBuffer_unsafe_legacy() : -* Warning : `samplesBuffer` must be followed by noisy guard band. +* Warning : `samplesBuffer` must be followed by noisy guard band !!! * @return : size of dictionary, or an error code which can be tested with ZDICT_isError() */ -size_t ZDICT_trainFromBuffer_unsafe_legacy( +static size_t ZDICT_trainFromBuffer_unsafe_legacy( void* dictBuffer, size_t maxDictSize, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_legacy_params_t params) @@ -1090,8 +1113,8 @@ size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, memset(¶ms, 0, sizeof(params)); params.d = 8; params.steps = 4; - /* Default to level 6 since no compression level information is available */ - params.zParams.compressionLevel = 3; + /* Use default level since no compression level information is available */ + params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1) params.zParams.notificationLevel = DEBUGLEVEL; #endif @@ -1109,3 +1132,5 @@ size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize samplesBuffer, samplesSizes, nbSamples, params); } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zdict.h b/vendor/github.com/DataDog/zstd/zdict.h index 37978ec..506775f 100644 --- a/vendor/github.com/DataDog/zstd/zdict.h +++ b/vendor/github.com/DataDog/zstd/zdict.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann 
Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -8,34 +9,184 @@ * You may select, at your option, one of the above-listed licenses. */ -#ifndef DICTBUILDER_H_001 -#define DICTBUILDER_H_001 - -#if defined (__cplusplus) -extern "C" { -#endif +#ifndef ZSTD_ZDICT_H +#define ZSTD_ZDICT_H /*====== Dependencies ======*/ #include /* size_t */ +#if defined (__cplusplus) +extern "C" { +#endif /* ===== ZDICTLIB_API : control library symbols visibility ===== */ -#ifndef ZDICTLIB_VISIBILITY -# if defined(__GNUC__) && (__GNUC__ >= 4) -# define ZDICTLIB_VISIBILITY __attribute__ ((visibility ("default"))) +#ifndef ZDICTLIB_VISIBLE + /* Backwards compatibility with old macro name */ +# ifdef ZDICTLIB_VISIBILITY +# define ZDICTLIB_VISIBLE ZDICTLIB_VISIBILITY +# elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__) +# define ZDICTLIB_VISIBLE __attribute__ ((visibility ("default"))) +# else +# define ZDICTLIB_VISIBLE +# endif +#endif + +#ifndef ZDICTLIB_HIDDEN +# if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__) +# define ZDICTLIB_HIDDEN __attribute__ ((visibility ("hidden"))) # else -# define ZDICTLIB_VISIBILITY +# define ZDICTLIB_HIDDEN # endif #endif + #if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) -# define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBILITY +# define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBLE #elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) -# define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +# define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBLE /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ #else -# define ZDICTLIB_API ZDICTLIB_VISIBILITY +# define 
ZDICTLIB_API ZDICTLIB_VISIBLE #endif +/******************************************************************************* + * Zstd dictionary builder + * + * FAQ + * === + * Why should I use a dictionary? + * ------------------------------ + * + * Zstd can use dictionaries to improve compression ratio of small data. + * Traditionally small files don't compress well because there is very little + * repetition in a single sample, since it is small. But, if you are compressing + * many similar files, like a bunch of JSON records that share the same + * structure, you can train a dictionary on ahead of time on some samples of + * these files. Then, zstd can use the dictionary to find repetitions that are + * present across samples. This can vastly improve compression ratio. + * + * When is a dictionary useful? + * ---------------------------- + * + * Dictionaries are useful when compressing many small files that are similar. + * The larger a file is, the less benefit a dictionary will have. Generally, + * we don't expect dictionary compression to be effective past 100KB. And the + * smaller a file is, the more we would expect the dictionary to help. + * + * How do I use a dictionary? + * -------------------------- + * + * Simply pass the dictionary to the zstd compressor with + * `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to + * the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other + * more advanced functions that allow selecting some options, see zstd.h for + * complete documentation. + * + * What is a zstd dictionary? + * -------------------------- + * + * A zstd dictionary has two pieces: Its header, and its content. The header + * contains a magic number, the dictionary ID, and entropy tables. These + * entropy tables allow zstd to save on header costs in the compressed file, + * which really matters for small data. The content is just bytes, which are + * repeated content that is common across many samples. 
+ * + * What is a raw content dictionary? + * --------------------------------- + * + * A raw content dictionary is just bytes. It doesn't have a zstd dictionary + * header, a dictionary ID, or entropy tables. Any buffer is a valid raw + * content dictionary. + * + * How do I train a dictionary? + * ---------------------------- + * + * Gather samples from your use case. These samples should be similar to each + * other. If you have several use cases, you could try to train one dictionary + * per use case. + * + * Pass those samples to `ZDICT_trainFromBuffer()` and that will train your + * dictionary. There are a few advanced versions of this function, but this + * is a great starting point. If you want to further tune your dictionary + * you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow + * you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`. + * + * If the dictionary training function fails, that is likely because you + * either passed too few samples, or a dictionary would not be effective + * for your data. Look at the messages that the dictionary trainer printed, + * if it doesn't say too few samples, then a dictionary would not be effective. + * + * How large should my dictionary be? + * ---------------------------------- + * + * A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB. + * The zstd CLI defaults to a 110KB dictionary. You likely don't need a + * dictionary larger than that. But, most use cases can get away with a + * smaller dictionary. The advanced dictionary builders can automatically + * shrink the dictionary for you, and select the smallest size that doesn't + * hurt compression ratio too much. See the `shrinkDict` parameter. + * A smaller dictionary can save memory, and potentially speed up + * compression. + * + * How many samples should I provide to the dictionary builder? 
+ * ------------------------------------------------------------ + * + * We generally recommend passing ~100x the size of the dictionary + * in samples. A few thousand should suffice. Having too few samples + * can hurt the dictionaries effectiveness. Having more samples will + * only improve the dictionaries effectiveness. But having too many + * samples can slow down the dictionary builder. + * + * How do I determine if a dictionary will be effective? + * ----------------------------------------------------- + * + * Simply train a dictionary and try it out. You can use zstd's built in + * benchmarking tool to test the dictionary effectiveness. + * + * # Benchmark levels 1-3 without a dictionary + * zstd -b1e3 -r /path/to/my/files + * # Benchmark levels 1-3 with a dictionary + * zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary + * + * When should I retrain a dictionary? + * ----------------------------------- + * + * You should retrain a dictionary when its effectiveness drops. Dictionary + * effectiveness drops as the data you are compressing changes. Generally, we do + * expect dictionaries to "decay" over time, as your data changes, but the rate + * at which they decay depends on your use case. Internally, we regularly + * retrain dictionaries, and if the new dictionary performs significantly + * better than the old dictionary, we will ship the new dictionary. + * + * I have a raw content dictionary, how do I turn it into a zstd dictionary? + * ------------------------------------------------------------------------- + * + * If you have a raw content dictionary, e.g. by manually constructing it, or + * using a third-party dictionary builder, you can turn it into a zstd + * dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to + * provide some samples of the data. 
It will add the zstd header to the + * raw content, which contains a dictionary ID and entropy tables, which + * will improve compression ratio, and allow zstd to write the dictionary ID + * into the frame, if you so choose. + * + * Do I have to use zstd's dictionary builder? + * ------------------------------------------- + * + * No! You can construct dictionary content however you please, it is just + * bytes. It will always be valid as a raw content dictionary. If you want + * a zstd dictionary, which can improve compression ratio, use + * `ZDICT_finalizeDictionary()`. + * + * What is the attack surface of a zstd dictionary? + * ------------------------------------------------ + * + * Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so + * zstd should never crash, or access out-of-bounds memory no matter what + * the dictionary is. However, if an attacker can control the dictionary + * during decompression, they can cause zstd to generate arbitrary bytes, + * just like if they controlled the compressed data. + * + ******************************************************************************/ + /*! ZDICT_trainFromBuffer(): * Train a dictionary from an array of samples. @@ -61,15 +212,89 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCap const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); +typedef struct { + int compressionLevel; /**< optimize for a specific zstd compression level; 0 means default */ + unsigned notificationLevel; /**< Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ + unsigned dictID; /**< force dictID value; 0 means auto mode (32-bits random value) + * NOTE: The zstd format reserves some dictionary IDs for future use. + * You may use them in private settings, but be warned that they + * may be used by zstd in a public dictionary registry in the future. 
+ * These dictionary IDs are: + * - low range : <= 32767 + * - high range : >= (2^31) + */ +} ZDICT_params_t; + +/*! ZDICT_finalizeDictionary(): + * Given a custom content as a basis for dictionary, and a set of samples, + * finalize dictionary by adding headers and statistics according to the zstd + * dictionary format. + * + * Samples must be stored concatenated in a flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each + * sample in order. The samples are used to construct the statistics, so they + * should be representative of what you will compress with this dictionary. + * + * The compression level can be set in `parameters`. You should pass the + * compression level you expect to use in production. The statistics for each + * compression level differ, so tuning the dictionary for the compression level + * can help quite a bit. + * + * You can set an explicit dictionary ID in `parameters`, or allow us to pick + * a random dictionary ID for you, but we can't guarantee no collisions. + * + * The dstDictBuffer and the dictContent may overlap, and the content will be + * appended to the end of the header. If the header + the content doesn't fit in + * maxDictSize the beginning of the content is truncated to make room, since it + * is presumed that the most profitable content is at the end of the dictionary, + * since that is the cheapest to reference. + * + * `maxDictSize` must be >= max(dictContentSize, ZDICT_DICTSIZE_MIN). + * + * @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`), + * or an error code, which can be tested by ZDICT_isError(). + * Note: ZDICT_finalizeDictionary() will push notifications into stderr if + * instructed to, using notificationLevel>0. 
+ * NOTE: This function currently may fail in several edge cases including: + * * Not enough samples + * * Samples are uncompressible + * * Samples are all exactly the same + */ +ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize, + const void* dictContent, size_t dictContentSize, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, + ZDICT_params_t parameters); + /*====== Helper functions ======*/ ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize); /**< extracts dictID; @return zero if error (not a valid dictionary) */ +ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize); /* returns dict header size; returns a ZSTD error code on failure */ ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode); ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode); +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_ZDICT_H */ + +#if defined(ZDICT_STATIC_LINKING_ONLY) && !defined(ZSTD_ZDICT_H_STATIC) +#define ZSTD_ZDICT_H_STATIC +#if defined (__cplusplus) +extern "C" { +#endif -#ifdef ZDICT_STATIC_LINKING_ONLY +/* This can be overridden externally to hide static symbols. */ +#ifndef ZDICTLIB_STATIC_API +# if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZDICTLIB_STATIC_API __declspec(dllexport) ZDICTLIB_VISIBLE +# elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) +# define ZDICTLIB_STATIC_API __declspec(dllimport) ZDICTLIB_VISIBLE +# else +# define ZDICTLIB_STATIC_API ZDICTLIB_VISIBLE +# endif +#endif /* ==================================================================================== * The definitions in this section are considered experimental. @@ -78,11 +303,9 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode); * Use them only in association with static linking. 
* ==================================================================================== */ -typedef struct { - int compressionLevel; /* optimize for a specific zstd compression level; 0 means default */ - unsigned notificationLevel; /* Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ - unsigned dictID; /* force dictID value; 0 means auto mode (32-bits random value) */ -} ZDICT_params_t; +#define ZDICT_DICTSIZE_MIN 256 +/* Deprecated: Remove in v1.6.0 */ +#define ZDICT_CONTENTSIZE_MIN 128 /*! ZDICT_cover_params_t: * k and d are the only required parameters. @@ -127,7 +350,7 @@ typedef struct { * In general, it's recommended to provide a few thousands samples, though this can vary a lot. * It's recommended that total size of all samples be about ~x100 times the target size of dictionary. */ -ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( +ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_cover( void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_cover_params_t parameters); @@ -149,7 +372,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( * See ZDICT_trainFromBuffer() for details on failure modes. * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread. */ -ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( +ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_cover( void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_cover_params_t* parameters); @@ -170,7 +393,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( * In general, it's recommended to provide a few thousands samples, though this can vary a lot. * It's recommended that total size of all samples be about ~x100 times the target size of dictionary. 
*/ -ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer, +ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters); @@ -193,33 +416,11 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer, * See ZDICT_trainFromBuffer() for details on failure modes. * Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 6 * 2^f bytes of memory for each thread. */ -ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer, +ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t* parameters); -/*! ZDICT_finalizeDictionary(): - * Given a custom content as a basis for dictionary, and a set of samples, - * finalize dictionary by adding headers and statistics. - * - * Samples must be stored concatenated in a flat buffer `samplesBuffer`, - * supplied with an array of sizes `samplesSizes`, providing the size of each sample in order. - * - * dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes. - * maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes. - * - * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`), - * or an error code, which can be tested by ZDICT_isError(). - * Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0. 
- * Note 2: dictBuffer and dictContent can overlap - */ -#define ZDICT_CONTENTSIZE_MIN 128 -#define ZDICT_DICTSIZE_MIN 256 -ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, - const void* dictContent, size_t dictContentSize, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, - ZDICT_params_t parameters); - typedef struct { unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */ ZDICT_params_t zParams; @@ -240,43 +441,44 @@ typedef struct { * It's recommended that total size of all samples be about ~x100 times the target size of dictionary. * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0. */ -ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy( - void *dictBuffer, size_t dictBufferCapacity, - const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, +ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_legacy( + void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_legacy_params_t parameters); + /* Deprecation warnings */ /* It is generally possible to disable deprecation warnings from compiler, for example with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. 
Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS */ #ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS -# define ZDICT_DEPRECATED(message) ZDICTLIB_API /* disable deprecation warnings */ +# define ZDICT_DEPRECATED(message) /* disable deprecation warnings */ #else # define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) # if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ -# define ZDICT_DEPRECATED(message) [[deprecated(message)]] ZDICTLIB_API -# elif (ZDICT_GCC_VERSION >= 405) || defined(__clang__) -# define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated(message))) +# define ZDICT_DEPRECATED(message) [[deprecated(message)]] +# elif defined(__clang__) || (ZDICT_GCC_VERSION >= 405) +# define ZDICT_DEPRECATED(message) __attribute__((deprecated(message))) # elif (ZDICT_GCC_VERSION >= 301) -# define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated)) +# define ZDICT_DEPRECATED(message) __attribute__((deprecated)) # elif defined(_MSC_VER) -# define ZDICT_DEPRECATED(message) ZDICTLIB_API __declspec(deprecated(message)) +# define ZDICT_DEPRECATED(message) __declspec(deprecated(message)) # else # pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler") -# define ZDICT_DEPRECATED(message) ZDICTLIB_API +# define ZDICT_DEPRECATED(message) # endif #endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */ ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead") +ZDICTLIB_STATIC_API size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); - -#endif /* ZDICT_STATIC_LINKING_ONLY */ - #if defined (__cplusplus) } #endif -#endif /* DICTBUILDER_H_001 */ +#endif /* ZSTD_ZDICT_H_STATIC */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd.go b/vendor/github.com/DataDog/zstd/zstd.go index b6af4eb..8499bf1 100644 --- 
a/vendor/github.com/DataDog/zstd/zstd.go +++ b/vendor/github.com/DataDog/zstd/zstd.go @@ -1,29 +1,18 @@ package zstd /* -#define ZSTD_STATIC_LINKING_ONLY -#include "zstd.h" -#include "stdint.h" // for uintptr_t - -// The following *_wrapper function are used for removing superflouos -// memory allocations when calling the wrapped functions from Go code. -// See https://github.com/golang/go/issues/24450 for details. - -static size_t ZSTD_compress_wrapper(uintptr_t dst, size_t maxDstSize, const uintptr_t src, size_t srcSize, int compressionLevel) { - return ZSTD_compress((void*)dst, maxDstSize, (const void*)src, srcSize, compressionLevel); -} - -static size_t ZSTD_decompress_wrapper(uintptr_t dst, size_t maxDstSize, uintptr_t src, size_t srcSize) { - return ZSTD_decompress((void*)dst, maxDstSize, (const void *)src, srcSize); -} +// support decoding of "legacy" zstd payloads from versions [0.4, 0.8], matching the +// default configuration of the zstd command line tool: +// https://github.com/facebook/zstd/blob/dev/programs/README.md +#cgo CFLAGS: -DZSTD_LEGACY_SUPPORT=4 -DZSTD_MULTITHREAD=1 +#include "zstd.h" */ import "C" import ( "bytes" "errors" "io/ioutil" - "runtime" "unsafe" ) @@ -39,6 +28,17 @@ var ( ErrEmptySlice = errors.New("Bytes slice is empty") ) +const ( + // decompressSizeBufferLimit is the limit we set on creating a decompression buffer for the Decompress API + // This is made to prevent DOS from maliciously-created payloads (aka zipbomb). + // For large payloads with a compression ratio > 10, you can do your own allocation and pass it to the method: + // dst := make([]byte, 1GB) + // decompressed, err := zstd.Decompress(dst, src) + decompressSizeBufferLimit = 1000 * 1000 + + zstdFrameHeaderSizeMin = 2 // From zstd.h. Since it's experimental API, hardcoding it +) + // CompressBound returns the worst case size needed for a destination buffer, // which can be used to preallocate a destination buffer or select a previously // allocated buffer from a pool. 
@@ -57,6 +57,33 @@ func cCompressBound(srcSize int) int { return int(C.ZSTD_compressBound(C.size_t(srcSize))) } +// decompressSizeHint tries to give a hint on how much of the output buffer size we should have +// based on zstd frame descriptors. To prevent DOS from maliciously-created payloads, limit the size +func decompressSizeHint(src []byte) int { + // 1 MB or 50x input size + upperBound := 50 * len(src) + if upperBound < decompressSizeBufferLimit { + upperBound = decompressSizeBufferLimit + } + + hint := upperBound + if len(src) >= zstdFrameHeaderSizeMin { + hint = int(C.ZSTD_getFrameContentSize(unsafe.Pointer(&src[0]), C.size_t(len(src)))) + if hint < 0 { // On error, just use upperBound + hint = upperBound + } + if hint == 0 { // When compressing the empty slice, we need an output of at least 1 to pass down to the C lib + hint = 1 + } + } + + // Take the minimum of both + if hint > upperBound { + return upperBound + } + return hint +} + // Compress src into dst. If you have a buffer to use, you can pass it to // prevent allocation. If it is too small, or if nil is passed, a new buffer // will be allocated and returned. @@ -73,19 +100,26 @@ func CompressLevel(dst, src []byte, level int) ([]byte, error) { dst = make([]byte, bound) } - srcPtr := C.uintptr_t(uintptr(0)) // Do not point anywhere, if src is empty - if len(src) > 0 { - srcPtr = C.uintptr_t(uintptr(unsafe.Pointer(&src[0]))) + // We need unsafe.Pointer(&src[0]) in the Cgo call to avoid "Go pointer to Go pointer" panics. + // This means we need to special case empty input. 
See: + // https://github.com/golang/go/issues/14210#issuecomment-346402945 + var cWritten C.size_t + if len(src) == 0 { + cWritten = C.ZSTD_compress( + unsafe.Pointer(&dst[0]), + C.size_t(len(dst)), + unsafe.Pointer(nil), + C.size_t(0), + C.int(level)) + } else { + cWritten = C.ZSTD_compress( + unsafe.Pointer(&dst[0]), + C.size_t(len(dst)), + unsafe.Pointer(&src[0]), + C.size_t(len(src)), + C.int(level)) } - cWritten := C.ZSTD_compress_wrapper( - C.uintptr_t(uintptr(unsafe.Pointer(&dst[0]))), - C.size_t(len(dst)), - srcPtr, - C.size_t(len(src)), - C.int(level)) - - runtime.KeepAlive(src) written := int(cWritten) // Check if the return is an Error code if err := getError(written); err != nil { @@ -101,43 +135,20 @@ func Decompress(dst, src []byte) ([]byte, error) { if len(src) == 0 { return []byte{}, ErrEmptySlice } - decompress := func(dst, src []byte) ([]byte, error) { - cWritten := C.ZSTD_decompress_wrapper( - C.uintptr_t(uintptr(unsafe.Pointer(&dst[0]))), - C.size_t(len(dst)), - C.uintptr_t(uintptr(unsafe.Pointer(&src[0]))), - C.size_t(len(src))) - - runtime.KeepAlive(src) - written := int(cWritten) - // Check error - if err := getError(written); err != nil { - return nil, err - } - return dst[:written], nil + bound := decompressSizeHint(src) + if cap(dst) >= bound { + dst = dst[0:cap(dst)] + } else { + dst = make([]byte, bound) } - if len(dst) == 0 { - // Attempt to use zStd to determine decompressed size (may result in error or 0) - size := int(C.size_t(C.ZSTD_getDecompressedSize(unsafe.Pointer(&src[0]), C.size_t(len(src))))) - - if err := getError(size); err != nil { - return nil, err - } - - if size > 0 { - dst = make([]byte, size) - } else { - dst = make([]byte, len(src)*3) // starting guess - } + written, err := DecompressInto(dst, src) + if err == nil { + return dst[:written], nil } - for i := 0; i < 3; i++ { // 3 tries to allocate a bigger buffer - result, err := decompress(dst, src) - if !IsDstSizeTooSmallError(err) { - return result, err - } - dst = 
make([]byte, len(dst)*2) // Grow buffer by 2 + if !IsDstSizeTooSmallError(err) { + return nil, err } // We failed getting a dst buffer of correct size, use stream API @@ -145,3 +156,19 @@ func Decompress(dst, src []byte) ([]byte, error) { defer r.Close() return ioutil.ReadAll(r) } + +// DecompressInto decompresses src into dst. Unlike Decompress, DecompressInto +// requires that dst be sufficiently large to hold the decompressed payload. +// DecompressInto may be used when the caller knows the size of the decompressed +// payload before attempting decompression. +// +// It returns the number of bytes copied and an error if any is encountered. If +// dst is too small, DecompressInto errors. +func DecompressInto(dst, src []byte) (int, error) { + written := int(C.ZSTD_decompress( + unsafe.Pointer(&dst[0]), + C.size_t(len(dst)), + unsafe.Pointer(&src[0]), + C.size_t(len(src)))) + return written, getError(written) +} diff --git a/vendor/github.com/DataDog/zstd/zstd.h b/vendor/github.com/DataDog/zstd/zstd.h index 72080ea..54a7cd4 100644 --- a/vendor/github.com/DataDog/zstd/zstd.h +++ b/vendor/github.com/DataDog/zstd/zstd.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -7,34 +8,73 @@ * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. 
*/ -#if defined (__cplusplus) -extern "C" { -#endif #ifndef ZSTD_H_235446 #define ZSTD_H_235446 -/* ====== Dependency ======*/ -#include <limits.h> /* INT_MAX */ + +/* ====== Dependencies ======*/ #include <stddef.h> /* size_t */ +#include "zstd_errors.h" /* list of errors */ +#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) +#include <limits.h> /* INT_MAX */ +#endif /* ZSTD_STATIC_LINKING_ONLY */ + +#if defined (__cplusplus) +extern "C" { +#endif /* ===== ZSTDLIB_API : control library symbols visibility ===== */ -#ifndef ZSTDLIB_VISIBILITY -# if defined(__GNUC__) && (__GNUC__ >= 4) -# define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default"))) +#ifndef ZSTDLIB_VISIBLE + /* Backwards compatibility with old macro name */ +# ifdef ZSTDLIB_VISIBILITY +# define ZSTDLIB_VISIBLE ZSTDLIB_VISIBILITY +# elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__) +# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) # else -# define ZSTDLIB_VISIBILITY +# define ZSTDLIB_VISIBLE # endif #endif + +#ifndef ZSTDLIB_HIDDEN +# if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__) +# define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) +# else +# define ZSTDLIB_HIDDEN +# endif +#endif + #if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) -# define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY +# define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBLE #elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) -# define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +# define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBLE /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ #else -# define ZSTDLIB_API ZSTDLIB_VISIBILITY +# define ZSTDLIB_API ZSTDLIB_VISIBLE #endif +/* Deprecation warnings : + * Should these warnings be a problem, 
it is generally possible to disable them, + * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. + * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. + */ +#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ +#else +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define ZSTD_DEPRECATED(message) [[deprecated(message)]] +# elif (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) || defined(__clang__) || defined(__IAR_SYSTEMS_ICC__) +# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) +# elif defined(__GNUC__) && (__GNUC__ >= 3) +# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define ZSTD_DEPRECATED(message) __declspec(deprecated(message)) +# else +# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +# define ZSTD_DEPRECATED(message) +# endif +#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ + /******************************************************************************* Introduction @@ -71,17 +111,22 @@ extern "C" { /*------ Version ------*/ #define ZSTD_VERSION_MAJOR 1 -#define ZSTD_VERSION_MINOR 4 -#define ZSTD_VERSION_RELEASE 4 - +#define ZSTD_VERSION_MINOR 5 +#define ZSTD_VERSION_RELEASE 7 #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) -ZSTDLIB_API unsigned ZSTD_versionNumber(void); /**< to check runtime library version */ + +/*! ZSTD_versionNumber() : + * Return runtime library version, the value is (MAJOR*100*100 + MINOR*100 + RELEASE). 
*/ +ZSTDLIB_API unsigned ZSTD_versionNumber(void); #define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE #define ZSTD_QUOTE(str) #str #define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str) #define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION) -ZSTDLIB_API const char* ZSTD_versionString(void); /* requires v1.3.0+ */ + +/*! ZSTD_versionString() : + * Return runtime library version, like "1.4.5". Requires v1.3.0+. */ +ZSTDLIB_API const char* ZSTD_versionString(void); /* ************************************* * Default constant @@ -104,13 +149,13 @@ ZSTDLIB_API const char* ZSTD_versionString(void); /* requires v1.3.0+ */ #define ZSTD_BLOCKSIZE_MAX (1<= `ZSTD_compressBound(srcSize)`. + * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have + * enough space to successfully compress the data. * @return : compressed size written into `dst` (<= `dstCapacity), * or an error code if it fails (which can be tested using ZSTD_isError()). */ ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, @@ -118,65 +163,106 @@ ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, int compressionLevel); /*! ZSTD_decompress() : - * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. - * `dstCapacity` is an upper bound of originalSize to regenerate. - * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. - * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), - * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ + * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. + * Multiple compressed frames can be decompressed at once with this method. + * The result will be the concatenation of all decompressed frames, back to back. + * `dstCapacity` is an upper bound of originalSize to regenerate. 
+ * First frame's decompressed size can be extracted using ZSTD_getFrameContentSize(). + * If maximum upper bound isn't known, prefer using streaming mode to decompress data. + * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), + * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, const void* src, size_t compressedSize); + +/*====== Decompression helper functions ======*/ + /*! ZSTD_getFrameContentSize() : requires v1.3.0+ - * `src` should point to the start of a ZSTD encoded frame. - * `srcSize` must be at least as large as the frame header. - * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. - * @return : - decompressed size of `src` frame content, if known - * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined - * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) - * note 1 : a 0 return value means the frame is valid but "empty". - * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. - * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. - * In which case, it's necessary to use streaming mode to decompress data. - * Optionally, application can rely on some implicit limit, - * as ZSTD_decompress() only needs an upper bound of decompressed size. - * (For example, data could be necessarily cut into blocks <= 16 KB). - * note 3 : decompressed size is always present when compression is completed using single-pass functions, - * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). - * note 4 : decompressed size can be very large (64-bits value), - * potentially larger than what local system can handle as a single memory segment. - * In which case, it's necessary to use streaming mode to decompress data. 
- * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. - * Always ensure return value fits within application's authorized limits. - * Each application can set its own limits. - * note 6 : This function replaces ZSTD_getDecompressedSize() */ + * `src` should point to the start of a ZSTD encoded frame. + * `srcSize` must be at least as large as the frame header. + * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. + * @return : - decompressed size of `src` frame content, if known + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) + * note 1 : a 0 return value means the frame is valid but "empty". + * When invoking this method on a skippable frame, it will return 0. + * note 2 : decompressed size is an optional field, it may not be present (typically in streaming mode). + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * Optionally, application can rely on some implicit limit, + * as ZSTD_decompress() only needs an upper bound of decompressed size. + * (For example, data could be necessarily cut into blocks <= 16 KB). + * note 3 : decompressed size is always present when compression is completed using single-pass functions, + * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). + * note 4 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure return value fits within application's authorized limits. + * Each application can set its own limits. 
+ * note 6 : This function replaces ZSTD_getDecompressedSize() */ #define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) #define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); -/*! ZSTD_getDecompressedSize() : - * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). +/*! ZSTD_getDecompressedSize() (obsolete): + * This function is now obsolete, in favor of ZSTD_getFrameContentSize(). * Both functions work the same way, but ZSTD_getDecompressedSize() blends * "empty", "unknown" and "error" results to the same return value (0), * while ZSTD_getFrameContentSize() gives them separate return values. * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); -/*! ZSTD_findFrameCompressedSize() : +/*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ * `src` should point to the start of a ZSTD frame or skippable frame. * `srcSize` must be >= first frame size * @return : the compressed size of the first frame starting at `src`, * suitable to pass as `srcSize` to `ZSTD_decompress` or similar, - * or an error code if input is invalid */ + * or an error code if input is invalid + * Note 1: this method is called _find*() because it's not enough to read the header, + * it may have to scan through the frame's content, to reach its end. + * Note 2: this method also works with Skippable Frames. In which case, + * it returns the size of the complete skippable frame, + * which is always equal to its content size + 8 bytes for headers. */ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); -/*====== Helper functions ======*/ -#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? 
(((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ -ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ -ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ -ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ -ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed */ -ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ +/*====== Compression helper functions ======*/ + +/*! ZSTD_compressBound() : + * maximum compressed size in worst case single-pass scenario. + * When invoking `ZSTD_compress()`, or any other one-pass compression function, + * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) + * as it eliminates one potential failure scenario, + * aka not enough room in dst buffer to write the compressed frame. + * Note : ZSTD_compressBound() itself can fail, if @srcSize >= ZSTD_MAX_INPUT_SIZE . + * In which case, ZSTD_compressBound() will return an error code + * which can be tested using ZSTD_isError(). + * + * ZSTD_COMPRESSBOUND() : + * same as ZSTD_compressBound(), but as a macro. + * It can be used to produce constants, which can be useful for static allocation, + * for example to size a static array on stack. + * Will produce constant value 0 if srcSize is too large. + */ +#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U) +#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? 
(((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ + + +/*====== Error helper functions ======*/ +/* ZSTD_isError() : + * Most ZSTD_* functions returning a size_t value can be tested for error, + * using ZSTD_isError(). + * @return 1 if error, 0 otherwise + */ +ZSTDLIB_API unsigned ZSTD_isError(size_t result); /*!< tells if a `size_t` function result is an error code */ +ZSTDLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); /* convert a result into an error code, which can be compared to error enum list */ +ZSTDLIB_API const char* ZSTD_getErrorName(size_t result); /*!< provides readable string from a function result */ +ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ +ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ /*************************************** @@ -184,25 +270,25 @@ ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compres ***************************************/ /*= Compression context * When compressing many times, - * it is recommended to allocate a context just once, - * and re-use it for each successive compression operation. - * This will make workload friendlier for system's memory. + * it is recommended to allocate a compression context just once, + * and reuse it for each successive compression operation. + * This will make the workload easier for system's memory. * Note : re-using context is just a speed / resource optimization. * It doesn't change the compression ratio, which remains identical. 
- * Note 2 : In multi-threaded environments, - * use one different context per thread for parallel execution. + * Note 2: For parallel execution in multi-threaded environments, + * use one different context per thread . */ typedef struct ZSTD_CCtx_s ZSTD_CCtx; ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); -ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); +ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* compatible with NULL pointer */ /*! ZSTD_compressCCtx() : * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. - * Important : in order to behave similarly to `ZSTD_compress()`, - * this function compresses at requested compression level, - * __ignoring any other parameter__ . + * Important : in order to mirror `ZSTD_compress()` behavior, + * this function compresses at the requested compression level, + * __ignoring any other advanced parameter__ . * If any advanced parameter was set using the advanced API, - * they will all be reset. Only `compressionLevel` remains. + * they will all be reset. Only @compressionLevel remains. */ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, @@ -212,38 +298,38 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, /*= Decompression context * When decompressing many times, * it is recommended to allocate a context only once, - * and re-use it for each successive compression operation. + * and reuse it for each successive compression operation. * This will make workload friendlier for system's memory. * Use one context per thread for parallel execution. */ typedef struct ZSTD_DCtx_s ZSTD_DCtx; ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void); -ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer */ /*! ZSTD_decompressDCtx() : * Same as ZSTD_decompress(), * requires an allocated ZSTD_DCtx. - * Compatible with sticky parameters. + * Compatible with sticky parameters (see below). 
*/ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); -/*************************************** -* Advanced compression API -***************************************/ +/********************************************* +* Advanced compression API (Requires v1.4.0+) +**********************************************/ /* API design : * Parameters are pushed one by one into an existing context, * using ZSTD_CCtx_set*() functions. * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! - * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . + * __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ . * * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). * - * This API supercedes all other "advanced" API entry points in the experimental section. - * In the future, we expect to remove from experimental API entry points which are redundant with this API. + * This API supersedes all other "advanced" API entry points in the experimental section. + * In the future, we expect to remove API entry points from experimental which are redundant with this API. */ @@ -261,7 +347,6 @@ typedef enum { ZSTD_fast=1, Only the order (from fast to strong) is guaranteed */ } ZSTD_strategy; - typedef enum { /* compression parameters @@ -274,7 +359,10 @@ typedef enum { * Default level is ZSTD_CLEVEL_DEFAULT==3. * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT. * Note 1 : it's possible to pass a negative compression level. - * Note 2 : setting a level resets all other compression parameters to default */ + * Note 2 : setting a level does not automatically set all other compression parameters + * to default. 
Setting this will however eventually dynamically impact the compression + * parameters which have not been manually set. The manually set + * ones will 'stick'. */ /* Advanced compression parameters : * It's possible to pin down compression parameters to some specific values. * In which case, these values are no longer dynamically selected by the compressor */ @@ -325,13 +413,27 @@ typedef enum { * resulting in stronger and slower compression. * Special: value 0 means "use default strategy". */ + ZSTD_c_targetCBlockSize=130, /* v1.5.6+ + * Attempts to fit compressed block size into approximately targetCBlockSize. + * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX. + * Note that it's not a guarantee, just a convergence target (default:0). + * No target when targetCBlockSize == 0. + * This is helpful in low bandwidth streaming environments to improve end-to-end latency, + * when a client can make use of partial documents (a prominent example being Chrome). + * Note: this parameter is stable since v1.5.6. + * It was present as an experimental parameter in earlier versions, + * but it's not recommended using it with earlier library versions + * due to massive performance regressions. + */ /* LDM mode parameters */ ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. * This parameter is designed to improve compression ratio * for large inputs, by finding large matches at long distance. * It increases memory usage and window size. * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB - * except when expressly set to a different value. */ + * except when expressly set to a different value. + * Note: will be enabled by default if ZSTD_c_windowLog >= 128 MB and + * compression strategy >= ZSTD_btopt (== compression level 16+) */ ZSTD_c_ldmHashLog=161, /* Size of the table for long distance matching, as a power of 2. * Larger values increase memory usage and compression ratio, * but decrease compression speed. 
@@ -362,20 +464,24 @@ typedef enum { ZSTD_c_dictIDFlag=202, /* When applicable, dictionary's ID is written into frame header (default:1) */ /* multi-threading parameters */ - /* These parameters are only useful if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD). - * They return an error otherwise. */ + /* These parameters are only active if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD). + * Otherwise, trying to set any other value than default (0) will be a no-op and return an error. + * In a situation where it's unknown if the linked library supports multi-threading or not, + * setting ZSTD_c_nbWorkers to any value >= 1 and consulting the return value provides a quick way to check this property. + */ ZSTD_c_nbWorkers=400, /* Select how many threads will be spawned to compress in parallel. - * When nbWorkers >= 1, triggers asynchronous mode when used with ZSTD_compressStream*() : + * When nbWorkers >= 1, triggers asynchronous mode when invoking ZSTD_compressStream*() : * ZSTD_compressStream*() consumes input and flush output if possible, but immediately gives back control to caller, - * while compression work is performed in parallel, within worker threads. + * while compression is performed in parallel, within worker thread(s). * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end : * in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call). * More workers improve speed, but also increase memory usage. - * Default value is `0`, aka "single-threaded mode" : no worker is spawned, compression is performed inside Caller's thread, all invocations are blocking */ + * Default value is `0`, aka "single-threaded mode" : no worker is spawned, + * compression is performed inside Caller's thread, and all invocations are blocking */ ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1. 
* Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads. * 0 means default, which is dynamically determined based on compression parameters. - * Job size must be a minimum of overlap size, or 1 MB, whichever is largest. + * Job size must be a minimum of overlap size, or ZSTDMT_JOBSIZE_MIN (= 512 KB), whichever is largest. * The minimum size is automatically and transparently enforced. */ ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size. * The overlap size is an amount of data reloaded from previous job at the beginning of a new job. @@ -398,8 +504,18 @@ typedef enum { * ZSTD_c_forceMaxWindow * ZSTD_c_forceAttachDict * ZSTD_c_literalCompressionMode - * ZSTD_c_targetCBlockSize * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer + * ZSTD_c_stableOutBuffer + * ZSTD_c_blockDelimiters + * ZSTD_c_validateSequences + * ZSTD_c_blockSplitterLevel + * ZSTD_c_splitAfterSequences + * ZSTD_c_useRowMatchFinder + * ZSTD_c_prefetchCDictTables + * ZSTD_c_enableSeqProducerFallback + * ZSTD_c_maxBlockSize * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly; * also, the enums values themselves are unstable and can still change. 
@@ -409,8 +525,21 @@ typedef enum { ZSTD_c_experimentalParam3=1000, ZSTD_c_experimentalParam4=1001, ZSTD_c_experimentalParam5=1002, - ZSTD_c_experimentalParam6=1003, - ZSTD_c_experimentalParam7=1004 + /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */ + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, + ZSTD_c_experimentalParam10=1007, + ZSTD_c_experimentalParam11=1008, + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, + ZSTD_c_experimentalParam15=1012, + ZSTD_c_experimentalParam16=1013, + ZSTD_c_experimentalParam17=1014, + ZSTD_c_experimentalParam18=1015, + ZSTD_c_experimentalParam19=1016, + ZSTD_c_experimentalParam20=1017 } ZSTD_cParameter; typedef struct { @@ -473,7 +602,7 @@ typedef enum { * They will be used to compress next frame. * Resetting session never fails. * - The parameters : changes all parameters back to "default". - * This removes any reference to any dictionary too. + * This also removes any reference to any dictionary or external sequence producer. * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) * - Both : similar to resetting the session, followed by resetting parameters. @@ -482,11 +611,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); /*! ZSTD_compress2() : * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. + * (note that this entry point doesn't even expose a compression level parameter). * ZSTD_compress2() always starts a new frame. * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. 
* - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() * - The function is always blocking, returns when compression is completed. - * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. + * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have + * enough space to successfully compress the data, though it is possible it fails for other reasons. * @return : compressed size written into `dst` (<= `dstCapacity), * or an error code if it fails (which can be tested using ZSTD_isError()). */ @@ -495,9 +626,9 @@ ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx, const void* src, size_t srcSize); -/*************************************** -* Advanced decompression API -***************************************/ +/*********************************************** +* Advanced decompression API (Requires v1.4.0+) +************************************************/ /* The advanced API pushes parameters one by one into an existing DCtx context. * Parameters are sticky, and remain valid for all following frames @@ -519,11 +650,21 @@ typedef enum { /* note : additional experimental parameters are also available * within the experimental section of the API. * At the time of this writing, they include : - * ZSTD_c_format + * ZSTD_d_format + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts + * ZSTD_d_disableHuffmanAssembly + * ZSTD_d_maxBlockSize * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? 
names directly */ - ZSTD_d_experimentalParam1=1000 + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, + ZSTD_d_experimentalParam4=1003, + ZSTD_d_experimentalParam5=1004, + ZSTD_d_experimentalParam6=1005 } ZSTD_dParameter; @@ -578,14 +719,14 @@ typedef struct ZSTD_outBuffer_s { * A ZSTD_CStream object is required to track streaming operation. * Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. * ZSTD_CStream objects can be reused multiple times on consecutive compression operations. -* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. +* It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. * * For parallel execution, use one separate ZSTD_CStream per thread. * * note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. * * Parameters are sticky : when starting a new compression on the same context, -* it will re-use the same sticky parameters as previous compression session. +* it will reuse the same sticky parameters as previous compression session. * When in doubt, it's recommended to fully initialize the context before usage. 
* Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), * ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to @@ -637,7 +778,7 @@ typedef ZSTD_CCtx ZSTD_CStream; /**< CCtx and CStream are now effectively same /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */ /*===== ZSTD_CStream management functions =====*/ ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void); -ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs); +ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs); /* accept NULL pointer */ /*===== Streaming compression functions =====*/ typedef enum { @@ -653,14 +794,15 @@ typedef enum { : note : multithreaded compression will block to flush as much output as possible. */ } ZSTD_EndDirective; -/*! ZSTD_compressStream2() : +/*! ZSTD_compressStream2() : Requires v1.4.0+ * Behaves about the same as ZSTD_compressStream, with additional control on end directive. * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() * - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode) * - output->pos must be <= dstCapacity, input->pos must be <= srcSize * - output->pos and input->pos will be updated. They are guaranteed to remain below their respective limit. + * - endOp must be a valid directive * - When nbWorkers==0 (default), function is blocking : it completes its job before returning to caller. - * - When nbWorkers>=1, function is non-blocking : it just acquires a copy of input, and distributes jobs to internal worker threads, flush whatever is available, + * - When nbWorkers>=1, function is non-blocking : it copies a portion of input, distributes jobs to internal worker threads, flush to output whatever is available, * and then immediately returns, just indicating that there is some data remaining to be flushed. 
* The function nonetheless guarantees forward progress : it will return only after it reads or write at least 1+ byte. * - Exception : if the first call requests a ZSTD_e_end directive and provides enough dstCapacity, the function delegates to ZSTD_compress2() which is always blocking. @@ -673,6 +815,11 @@ typedef enum { * only ZSTD_e_end or ZSTD_e_flush operations are allowed. * Before starting a new compression job, or changing compression parameters, * it is required to fully flush internal buffers. + * - note: if an operation ends with an error, it may leave @cctx in an undefined state. + * Therefore, it's UB to invoke ZSTD_compressStream2() of ZSTD_compressStream() on such a state. + * In order to be re-employed after an error, a state must be reset, + * which can be done explicitly (ZSTD_CCtx_reset()), + * or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx()) */ ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, ZSTD_outBuffer* output, @@ -698,11 +845,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /**< recommended size for output /* ***************************************************************************** - * This following is a legacy streaming API. + * This following is a legacy streaming API, available since v1.0+ . * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). * It is redundant, but remains fully supported. - * Advanced parameters and dictionary compression can only be used through the - * new API. ******************************************************************************/ /*! @@ -711,6 +856,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /**< recommended size for output * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + * + * Note that ZSTD_initCStream() clears any previously set dictionary. 
Use the new API + * to compress with a dictionary. */ ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); /*! @@ -731,7 +879,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); * * A ZSTD_DStream object is required to track streaming operations. * Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. -* ZSTD_DStream objects can be re-used multiple times. +* ZSTD_DStream objects can be re-employed multiple times. * * Use ZSTD_initDStream() to start a new decompression operation. * @return : recommended first input size @@ -741,33 +889,63 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); * The function will update both `pos` fields. * If `input.pos < input.size`, some input has not been consumed. * It's up to the caller to present again remaining data. +* * The function tries to flush all data decoded immediately, respecting output buffer size. * If `output.pos < output.size`, decoder has flushed everything it could. -* But if `output.pos == output.size`, there might be some data left within internal buffers., +* +* However, when `output.pos == output.size`, it's more difficult to know. +* If @return > 0, the frame is not complete, meaning +* either there is still some data left to flush within internal buffers, +* or there is more input to read to complete the frame (or both). * In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer. * Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX. * @return : 0 when a frame is completely decoded and fully flushed, * or an error code, which can be tested using ZSTD_isError(), * or any other value > 0, which means there is still some decoding or flushing to do to complete current frame : * the return value is a suggested next input size (just a hint for better latency) -* that will never request more than the remaining frame size. 
+* that will never request more than the remaining content of the compressed frame. * *******************************************************************************/ typedef ZSTD_DCtx ZSTD_DStream; /**< DCtx and DStream are now effectively same object (>= v1.3.0) */ /* For compatibility with versions <= v1.2.0, prefer differentiating them. */ /*===== ZSTD_DStream management functions =====*/ ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void); -ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); +ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer */ /*===== Streaming decompression functions =====*/ -/* This function is redundant with the advanced API and equivalent to: +/*! ZSTD_initDStream() : + * Initialize/reset DStream state for new decompression operation. + * Call before new decompression operation using same DStream. * - * ZSTD_DCtx_reset(zds); + * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); * ZSTD_DCtx_refDDict(zds, NULL); */ ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); +/*! ZSTD_decompressStream() : + * Streaming decompression function. + * Call repetitively to consume full input updating it as necessary. + * Function will update both input and output `pos` fields exposing current state via these fields: + * - `input.pos < input.size`, some input remaining and caller should provide remaining input + * on the next call. + * - `output.pos < output.size`, decoder flushed internal output buffer. + * - `output.pos == output.size`, unflushed data potentially present in the internal buffers, + * check ZSTD_decompressStream() @return value, + * if > 0, invoke it again to flush remaining data to output. + * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. 
+ * + * @return : 0 when a frame is completely decoded and fully flushed, + * or an error code, which can be tested using ZSTD_isError(), + * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. + * + * Note: when an operation returns with an error code, the @zds state may be left in undefined state. + * It's UB to invoke `ZSTD_decompressStream()` on such a state. + * In order to re-use such a state, it must be first reset, + * which can be done explicitly (`ZSTD_DCtx_reset()`), + * or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`) + */ ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ @@ -780,7 +958,7 @@ ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output /*! ZSTD_compress_usingDict() : * Compression at an explicit compression level using a Dictionary. * A dictionary can be any arbitrary data segment (also called a prefix), - * or a buffer with specified information (see dictBuilder/zdict.h). + * or a buffer with specified information (see zdict.h). * Note : This function loads the dictionary, resulting in significant startup delay. * It's intended for a dictionary used only once. * Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */ @@ -823,7 +1001,8 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize int compressionLevel); /*! ZSTD_freeCDict() : - * Function frees memory allocated by ZSTD_createCDict(). */ + * Function frees memory allocated by ZSTD_createCDict(). + * If a NULL pointer is passed, no operation is performed. */ ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict); /*! 
ZSTD_compress_usingCDict() : @@ -845,7 +1024,8 @@ typedef struct ZSTD_DDict_s ZSTD_DDict; ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize); /*! ZSTD_freeDDict() : - * Function frees memory allocated with ZSTD_createDDict() */ + * Function frees memory allocated with ZSTD_createDDict() + * If a NULL pointer is passed, no operation is performed. */ ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict); /*! ZSTD_decompress_usingDDict() : @@ -861,24 +1041,30 @@ ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, * Dictionary helper functions *******************************/ -/*! ZSTD_getDictID_fromDict() : +/*! ZSTD_getDictID_fromDict() : Requires v1.4.0+ * Provides the dictID stored within dictionary. * if @return == 0, the dictionary is not conformant with Zstandard specification. * It can still be loaded, but as a content-only dictionary. */ ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize); -/*! ZSTD_getDictID_fromDDict() : +/*! ZSTD_getDictID_fromCDict() : Requires v1.5.0+ + * Provides the dictID of the dictionary loaded into `cdict`. + * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict); + +/*! ZSTD_getDictID_fromDDict() : Requires v1.4.0+ * Provides the dictID of the dictionary loaded into `ddict`. * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); -/*! ZSTD_getDictID_fromFrame() : +/*! ZSTD_getDictID_fromFrame() : Requires v1.4.0+ * Provides the dictID required to decompressed the frame stored within `src`. * If @return == 0, the dictID could not be decoded. 
* This could for one of the following reasons : * - The frame does not require a dictionary to be decoded (most common case). - * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. + * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. * Note : this use case also happens when using a non-conformant dictionary. * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). * - This is not a Zstandard frame. @@ -887,23 +1073,26 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); /******************************************************************************* - * Advanced dictionary and prefix API + * Advanced dictionary and prefix API (Requires v1.4.0+) * * This API allows dictionaries to be used with ZSTD_compress2(), - * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and - * only reset with the context is reset with ZSTD_reset_parameters or - * ZSTD_reset_session_and_parameters. Prefixes are single-use. + * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). + * Dictionaries are sticky, they remain valid when same context is reused, + * they only reset when the context is reset + * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. + * In contrast, Prefixes are single-use. ******************************************************************************/ -/*! ZSTD_CCtx_loadDictionary() : +/*! ZSTD_CCtx_loadDictionary() : Requires v1.4.0+ * Create an internal CDict from `dict` buffer. * Decompression will have to use same dictionary. * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, * meaning "return to no-dictionary mode". 
- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. - * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). + * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, + * until parameters are reset, a new dictionary is loaded, or the dictionary + * is explicitly invalidated by loading a NULL dictionary. * Note 2 : Loading a dictionary involves building tables. * It's also a CPU consuming operation, with non-negligible impact on latency. * Tables are dependent on compression parameters, and for this reason, @@ -912,14 +1101,18 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. * In such a case, dictionary buffer must outlive its users. * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() - * to precisely select how dictionary content must be interpreted. */ + * to precisely select how dictionary content must be interpreted. + * Note 5 : This method does not benefit from LDM (long distance mode). + * If you want to employ LDM on some large dictionary content, + * prefer employing ZSTD_CCtx_refPrefix() described below. + */ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); -/*! ZSTD_CCtx_refCDict() : - * Reference a prepared dictionary, to be used for all next compressed frames. +/*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ + * Reference a prepared dictionary, to be used for all future compressed frames. * Note that compression parameters are enforced from within CDict, * and supersede any compression parameter previously set within CCtx. - * The parameters ignored are labled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. 
* The ignored parameters will be used again if the CCtx is returned to no-dictionary mode. * The dictionary will remain valid for future compressed frames using same CCtx. * @result : 0, or an error code (which can be tested with ZSTD_isError()). @@ -929,12 +1122,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, s * Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); -/*! ZSTD_CCtx_refPrefix() : +/*! ZSTD_CCtx_refPrefix() : Requires v1.4.0+ * Reference a prefix (single-usage dictionary) for next compressed frame. * A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end). * Decompression will need same prefix to properly regenerate data. * Compressing with a prefix is similar in outcome as performing a diff and compressing it, * but performs much faster, especially during decompression (compression speed is tunable with compression level). + * This method is compatible with LDM (long distance mode). * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary * Note 1 : Prefix buffer is referenced. It **must** outlive compression. @@ -950,10 +1144,10 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize); -/*! ZSTD_DCtx_loadDictionary() : - * Create an internal DDict from dict buffer, - * to be used to decompress next frames. - * The dictionary remains valid for all future frames, until explicitly invalidated. +/*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ + * Create an internal DDict from dict buffer, to be used to decompress all future frames. 
+ * The dictionary remains valid for all future frames, until explicitly invalidated, or + * a new dictionary is loaded. * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, * meaning "return to no-dictionary mode". @@ -967,18 +1161,26 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, */ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); -/*! ZSTD_DCtx_refDDict() : +/*! ZSTD_DCtx_refDDict() : Requires v1.4.0+ * Reference a prepared dictionary, to be used to decompress next frames. * The dictionary remains active for decompression of future frames using same DCtx. + * + * If called with ZSTD_d_refMultipleDDicts enabled, repeated calls of this function + * will store the DDict references in a table, and the DDict used for decompression + * will be determined at decompression time, as per the dict ID in the frame. + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * + * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary + * will be managed, and referencing a dictionary effectively "discards" any previous one. + * * @result : 0, or an error code (which can be tested with ZSTD_isError()). - * Note 1 : Currently, only one dictionary can be managed. - * Referencing a new dictionary effectively "discards" any previous one. * Special: referencing a NULL DDict means "return to no-dictionary mode". * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. */ ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); -/*! ZSTD_DCtx_refPrefix() : +/*! ZSTD_DCtx_refPrefix() : Requires v1.4.0+ * Reference a prefix (single-usage dictionary) to decompress next frame. 
* This is the reverse operation of ZSTD_CCtx_refPrefix(), * and must use the same prefix as the one used during compression. @@ -999,7 +1201,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, /* === Memory management === */ -/*! ZSTD_sizeof_*() : +/*! ZSTD_sizeof_*() : Requires v1.4.0+ * These functions give the _current_ memory usage of selected object. * Note that object memory usage can evolve (increase or decrease) over time. */ ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx); @@ -1009,6 +1211,10 @@ ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds); ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict); ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); +#if defined (__cplusplus) +} +#endif + #endif /* ZSTD_H_235446 */ @@ -1024,6 +1230,21 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) #define ZSTD_H_ZSTD_STATIC_LINKING_ONLY +#if defined (__cplusplus) +extern "C" { +#endif + +/* This can be overridden externally to hide static symbols. 
*/ +#ifndef ZSTDLIB_STATIC_API +# if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZSTDLIB_STATIC_API __declspec(dllexport) ZSTDLIB_VISIBLE +# elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) +# define ZSTDLIB_STATIC_API __declspec(dllimport) ZSTDLIB_VISIBLE +# else +# define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE +# endif +#endif + /**************************************************************************************** * experimental API (static linking only) **************************************************************************************** @@ -1058,6 +1279,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ #define ZSTD_STRATEGY_MIN ZSTD_fast #define ZSTD_STRATEGY_MAX ZSTD_btultra2 +#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ #define ZSTD_OVERLAPLOG_MIN 0 @@ -1081,35 +1303,51 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN) /* Advanced parameter bounds */ -#define ZSTD_TARGETCBLOCKSIZE_MIN 64 +#define ZSTD_TARGETCBLOCKSIZE_MIN 1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */ #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX #define ZSTD_SRCSIZEHINT_MIN 0 #define ZSTD_SRCSIZEHINT_MAX INT_MAX -/* internal */ -#define ZSTD_HASHLOG3_MAX 17 - /* --- Advanced types --- */ typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params; typedef struct { - unsigned int matchPos; /* Match pos in dst */ - /* If seqDef.offset > 3, then this is seqDef.offset - 3 - * If seqDef.offset < 3, then this is the corresponding repeat offset - * But if seqDef.offset < 3 and litLength == 0, this is the - * repeat offset before the corresponding repeat offset - * And if seqDef.offset == 3 and litLength == 0, this is the - * most 
recent repeat offset - 1 - */ - unsigned int offset; - unsigned int litLength; /* Literal length */ - unsigned int matchLength; /* Match length */ - /* 0 when seq not rep and seqDef.offset otherwise - * when litLength == 0 this will be <= 4, otherwise <= 3 like normal - */ - unsigned int rep; + unsigned int offset; /* The offset of the match. (NOT the same as the offset code) + * If offset == 0 and matchLength == 0, this sequence represents the last + * literals in the block of litLength size. + */ + + unsigned int litLength; /* Literal length of the sequence. */ + unsigned int matchLength; /* Match length of the sequence. */ + + /* Note: Users of this API may provide a sequence with matchLength == litLength == offset == 0. + * In this case, we will treat the sequence as a marker for a block boundary. + */ + + unsigned int rep; /* Represents which repeat offset is represented by the field 'offset'. + * Ranges from [0, 3]. + * + * Repeat offsets are essentially previous offsets from previous sequences sorted in + * recency order. For more detail, see doc/zstd_compression_format.md + * + * If rep == 0, then 'offset' does not contain a repeat offset. + * If rep > 0: + * If litLength != 0: + * rep == 1 --> offset == repeat_offset_1 + * rep == 2 --> offset == repeat_offset_2 + * rep == 3 --> offset == repeat_offset_3 + * If litLength == 0: + * rep == 1 --> offset == repeat_offset_2 + * rep == 2 --> offset == repeat_offset_3 + * rep == 3 --> offset == repeat_offset_1 - 1 + * + * Note: This field is optional. ZSTD_generateSequences() will calculate the value of + * 'rep', but repeat offsets do not necessarily need to be calculated from an external + * sequence provider perspective. For example, ZSTD_compressSequences() does not + * use this 'rep' field at all (as of now). + */ } ZSTD_Sequence; typedef struct { @@ -1151,6 +1389,18 @@ typedef enum { * Decoder cannot recognise automatically this format, requiring this instruction. 
*/ } ZSTD_format_e; +typedef enum { + /* Note: this enum controls ZSTD_d_forceIgnoreChecksum */ + ZSTD_d_validateChecksum = 0, + ZSTD_d_ignoreChecksum = 1 +} ZSTD_forceIgnoreChecksum_e; + +typedef enum { + /* Note: this enum controls ZSTD_d_refMultipleDDicts */ + ZSTD_rmd_refSingleDDict = 0, + ZSTD_rmd_refMultipleDDicts = 1 +} ZSTD_refMultipleDDicts_e; + typedef enum { /* Note: this enum and the behavior it controls are effectively internal * implementation details of the compressor. They are expected to continue @@ -1199,9 +1449,19 @@ typedef enum { ZSTD_lcm_uncompressed = 2 /**< Always emit uncompressed literals. */ } ZSTD_literalCompressionMode_e; +typedef enum { + /* Note: This enum controls features which are conditionally beneficial. + * Zstd can take a decision on whether or not to enable the feature (ZSTD_ps_auto), + * but setting the switch to ZSTD_ps_enable or ZSTD_ps_disable force enable/disable the feature. + */ + ZSTD_ps_auto = 0, /* Let the library automatically determine whether the feature shall be enabled */ + ZSTD_ps_enable = 1, /* Force-enable the feature */ + ZSTD_ps_disable = 2 /* Do not use the feature */ +} ZSTD_ParamSwitch_e; +#define ZSTD_paramSwitch_e ZSTD_ParamSwitch_e /* old name */ /*************************************** -* Frame size functions +* Frame header and size functions ***************************************/ /*! ZSTD_findDecompressedSize() : @@ -1225,14 +1485,14 @@ typedef enum { * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to * read each contained frame header. This is fast as most of the data is skipped, * however it does mean that all frame data must be present and valid. */ -ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize); +ZSTDLIB_STATIC_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize); /*! 
ZSTD_decompressBound() : * `src` should point to the start of a series of ZSTD encoded and/or skippable frames * `srcSize` must be the _exact_ size of this series * (i.e. there should be a frame boundary at `src + srcSize`) * @return : - upper-bound for the decompressed size of all data in all successive frames - * - if an error occured: ZSTD_CONTENTSIZE_ERROR + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR * * note 1 : an error can occur if `src` contains an invalid or incorrectly formatted frame. * note 2 : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`. @@ -1240,22 +1500,253 @@ ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t * note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by: * upper-bound = # blocks * min(128 KB, Window_Size) */ -ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); +ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); /*! ZSTD_frameHeaderSize() : - * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX. + * srcSize must be large enough, aka >= ZSTD_FRAMEHEADERSIZE_PREFIX. * @return : size of the Frame Header, * or an error code (if srcSize is too small) */ -ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + +typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_FrameType_e; +#define ZSTD_frameType_e ZSTD_FrameType_e /* old name */ +typedef struct { + unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 
0 means "empty" */ + unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ + unsigned blockSizeMax; + ZSTD_FrameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ + unsigned headerSize; + unsigned dictID; /* for ZSTD_skippableFrame, contains the skippable magic variant [0-15] */ + unsigned checksumFlag; + unsigned _reserved1; + unsigned _reserved2; +} ZSTD_FrameHeader; +#define ZSTD_frameHeader ZSTD_FrameHeader /* old name */ + +/*! ZSTD_getFrameHeader() : + * decode Frame Header into `zfhPtr`, or requires larger `srcSize`. + * @return : 0 => header is complete, `zfhPtr` is correctly filled, + * >0 => `srcSize` is too small, @return value is the wanted `srcSize` amount, `zfhPtr` is not filled, + * or an error code, which can be tested using ZSTD_isError() */ +ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize); +/*! ZSTD_getFrameHeader_advanced() : + * same as ZSTD_getFrameHeader(), + * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + +/*! ZSTD_decompressionMargin() : + * Zstd supports in-place decompression, where the input and output buffers overlap. + * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, + * and the input buffer must be at the end of the output buffer. + * + * _______________________ Output Buffer ________________________ + * | | + * | ____ Input Buffer ____| + * | | | + * v v v + * |---------------------------------------|-----------|----------| + * ^ ^ ^ + * |___________________ Output_Size ___________________|_ Margin _| + * + * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). + * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or + * ZSTD_decompressDCtx(). 
+ * NOTE: This function supports multi-frame input. + * + * @param src The compressed frame(s) + * @param srcSize The size of the compressed frame(s) + * @returns The decompression margin or an error that can be checked with ZSTD_isError(). + */ +ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); + +/*! ZSTD_DECOMPRESS_MARGIN() : + * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from + * the compressed frame, compute it from the original size and the blockSizeLog. + * See ZSTD_decompressionMargin() for details. + * + * WARNING: This macro does not support multi-frame input, the input must be a single + * zstd frame. If you need that support use the function, or implement it yourself. + * + * @param originalSize The original uncompressed size of the data. + * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). + * Unless you explicitly set the windowLog smaller than + * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. + */ +#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ + ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ + 4 /* checksum */ + \ + ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ + (blockSize) /* One block of margin */ \ + )) + +typedef enum { + ZSTD_sf_noBlockDelimiters = 0, /* ZSTD_Sequence[] has no block delimiters, just sequences */ + ZSTD_sf_explicitBlockDelimiters = 1 /* ZSTD_Sequence[] contains explicit block delimiters */ +} ZSTD_SequenceFormat_e; +#define ZSTD_sequenceFormat_e ZSTD_SequenceFormat_e /* old name */ + +/*! ZSTD_sequenceBound() : + * `srcSize` : size of the input buffer + * @return : upper-bound for the number of sequences that can be generated + * from a buffer of srcSize bytes + * + * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). + */ +ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); + +/*! 
ZSTD_generateSequences() : + * WARNING: This function is meant for debugging and informational purposes ONLY! + * Its implementation is flawed, and it will be deleted in a future version. + * It is not guaranteed to succeed, as there are several cases where it will give + * up and fail. You should NOT use this function in production code. + * + * This function is deprecated, and will be removed in a future version. + * + * Generate sequences using ZSTD_compress2(), given a source buffer. + * + * @param zc The compression context to be used for ZSTD_compress2(). Set any + * compression parameters you need on this context. + * @param outSeqs The output sequences buffer of size @p outSeqsSize + * @param outSeqsCapacity The size of the output sequences buffer. + * ZSTD_sequenceBound(srcSize) is an upper bound on the number + * of sequences that can be generated. + * @param src The source buffer to generate sequences from of size @p srcSize. + * @param srcSize The size of the source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * + * @returns The number of sequences generated, necessarily less than + * ZSTD_sequenceBound(srcSize), or an error code that can be checked + * with ZSTD_isError(). + */ +ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()") +ZSTDLIB_STATIC_API size_t +ZSTD_generateSequences(ZSTD_CCtx* zc, + ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t srcSize); + +/*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals + * by merging them into the literals of the next sequence. 
+ * + * As such, the final generated result has no explicit representation of block boundaries, + * and the final last literals segment is not represented in the sequences. + * + * The output of this function can be fed into ZSTD_compressSequences() with CCtx + * setting of ZSTD_c_blockDelimiters as ZSTD_sf_noBlockDelimiters + * @return : number of sequences left after merging + */ +ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + +/*! ZSTD_compressSequences() : + * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. + * @src contains the entire input (not just the literals). + * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals + * If a dictionary is included, then the cctx should reference the dict (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.). + * The entire source is compressed into a single frame. + * + * The compression behavior changes based on cctx params. In particular: + * If ZSTD_c_blockDelimiters == ZSTD_sf_noBlockDelimiters, the array of ZSTD_Sequence is expected to contain + * no block delimiters (defined in ZSTD_Sequence). Block boundaries are roughly determined based on + * the block size derived from the cctx, and sequences may be split. This is the default setting. + * + * If ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, the array of ZSTD_Sequence is expected to contain + * valid block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. + * + * When ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, it's possible to decide generating repcodes + * using the advanced parameter ZSTD_c_repcodeResolution. Repcodes will improve compression ratio, though the benefit + * can vary greatly depending on Sequences. On the other hand, repcode resolution is an expensive operation. 
+ * By default, it's disabled at low (<10) compression levels, and enabled above the threshold (>=10). + * ZSTD_c_repcodeResolution makes it possible to directly manage this processing in either direction. + * + * If ZSTD_c_validateSequences == 0, this function blindly accepts the Sequences provided. Invalid Sequences cause undefined + * behavior. If ZSTD_c_validateSequences == 1, then the function will detect invalid Sequences (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) and then bail out and return an error. + * + * In addition to the two adjustable experimental params, there are other important cctx params. + * - ZSTD_c_minMatch MUST be set as less than or equal to the smallest match generated by the match finder. It has a minimum value of ZSTD_MINMATCH_MIN. + * - ZSTD_c_compressionLevel accordingly adjusts the strength of the entropy coder, as it would in typical compression. + * - ZSTD_c_windowLog affects offset validation: this function will return an error at higher debug levels if a provided offset + * is larger than what the spec allows for a given window log and dictionary (if present). See: doc/zstd_compression_format.md + * + * Note: Repcodes are, as of now, always re-calculated within this function, ZSTD_Sequence.rep is effectively unused. + * Dev Note: Once ability to ingest repcodes become available, the explicit block delims mode must respect those repcodes exactly, + * and cannot emit an RLE block that disagrees with the repcode history. + * @return : final compressed size, or a ZSTD error code. + */ +ZSTDLIB_STATIC_API size_t +ZSTD_compressSequences(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize); + + +/*! 
ZSTD_compressSequencesAndLiterals() : + * This is a variant of ZSTD_compressSequences() which, + * instead of receiving (src,srcSize) as input parameter, receives (literals,litSize), + * aka all the literals, already extracted and laid out into a single continuous buffer. + * This can be useful if the process generating the sequences also happens to generate the buffer of literals, + * thus skipping an extraction + caching stage. + * It's a speed optimization, useful when the right conditions are met, + * but it also features the following limitations: + * - Only supports explicit delimiter mode + * - Currently does not support Sequences validation (so input Sequences are trusted) + * - Not compatible with frame checksum, which must be disabled + * - If any block is incompressible, will fail and return an error + * - @litSize must be == sum of all @.litLength fields in @inSeqs. Any discrepancy will generate an error. + * - @litBufCapacity is the size of the underlying buffer into which literals are written, starting at address @literals. + * @litBufCapacity must be at least 8 bytes larger than @litSize. + * - @decompressedSize must be correct, and correspond to the sum of all Sequences. Any discrepancy will generate an error. + * @return : final compressed size, or a ZSTD error code. + */ +ZSTDLIB_STATIC_API size_t +ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t nbSequences, + const void* literals, size_t litSize, size_t litBufCapacity, + size_t decompressedSize); + -/*! ZSTD_getSequences() : - * Extract sequences from the sequence store - * zc can be used to insert custom compression params. - * This function invokes ZSTD_compress2 - * @return : number of sequences extracted +/*! ZSTD_writeSkippableFrame() : + * Generates a zstd skippable frame containing data given by src, and writes it to dst buffer. + * + * Skippable frames begin with a 4-byte magic number. 
There are 16 possible choices of magic number, + * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15. + * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, + * so the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. + * + * Returns an error if destination buffer is not large enough, if the source size is not representable + * with a 4-byte unsigned int, or if the parameter magicVariant is greater than 15 (and therefore invalid). + * + * @return : number of bytes written or a ZSTD error. */ -ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, - size_t outSeqsSize, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + unsigned magicVariant); + +/*! ZSTD_readSkippableFrame() : + * Retrieves the content of a zstd skippable frame starting at @src, and writes it to @dst buffer. + * + * The parameter @magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. + * This can be NULL if the caller is not interested in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * + * @return : number of bytes written or a ZSTD error. + */ +ZSTDLIB_STATIC_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, + unsigned* magicVariant, + const void* src, size_t srcSize); + +/*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + */ +ZSTDLIB_STATIC_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + /*************************************** @@ -1263,55 +1754,71 @@ ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, ***************************************/ /*! 
ZSTD_estimate*() : - * These functions make it possible to estimate memory usage of a future - * {D,C}Ctx, before its creation. - * - * ZSTD_estimateCCtxSize() will provide a budget large enough for any - * compression level up to selected one. Unlike ZSTD_estimateCStreamSize*(), - * this estimate does not include space for a window buffer, so this estimate - * is guaranteed to be enough for single-shot compressions, but not streaming - * compressions. It will however assume the input may be arbitrarily large, - * which is the worst case. If srcSize is known to always be small, - * ZSTD_estimateCCtxSize_usingCParams() can provide a tighter estimation. - * ZSTD_estimateCCtxSize_usingCParams() can be used in tandem with - * ZSTD_getCParams() to create cParams from compressionLevel. - * ZSTD_estimateCCtxSize_usingCCtxParams() can be used in tandem with - * ZSTD_CCtxParams_setParameter(). - * - * Note: only single-threaded compression is supported. This function will - * return an error code if ZSTD_c_nbWorkers is >= 1. */ -ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel); -ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); -ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); -ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. + * This is useful in combination with ZSTD_initStatic(), + * which makes it possible to employ a static buffer for ZSTD_CCtx* state. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough + * to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2() + * associated with any compression level up to max specified one. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. 
+ * + * Note that the size estimation is specific for one-shot compression, + * it is not valid for streaming (see ZSTD_estimateCStreamSize*()) + * nor other potential ways of using a ZSTD_CCtx* state. + * + * When srcSize can be bound by a known and rather "small" value, + * this knowledge can be used to provide a tighter budget estimation + * because the ZSTD_CCtx* state will need less memory for small inputs. + * This tighter estimation can be provided by employing more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * + * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); /*! ZSTD_estimateCStreamSize() : - * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. - * It will also consider src size to be arbitrarily "large", which is worst case. + * ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression + * using any compression level up to the max specified one. + * It will also consider src size to be arbitrarily "large", which is a worst case scenario. * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. 
* ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. * Note : CStream size estimation is only correct for single-threaded compression. - * ZSTD_DStream memory budget depends on window Size. + * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. + * Size estimates assume that no external sequence producer is registered. + * + * ZSTD_DStream memory budget depends on frame's window Size. * This information can be passed manually, using ZSTD_estimateDStreamSize, * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); + * Any frame requesting a window size larger than max specified one will be rejected. * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), * an internal ?Dict will be created, which additional size is not estimated here. 
- * In this case, get total size by adding ZSTD_estimate?DictSize */ -ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel); -ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); -ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); -ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize); -ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + * In this case, get total size by adding ZSTD_estimate?DictSize + */ +ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); /*! ZSTD_estimate?DictSize() : * ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict(). * ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced(). * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller. 
*/ -ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel); -ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); -ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod); /*! ZSTD_initStatic*() : * Initialize an object using a pre-allocated fixed-size buffer. @@ -1334,20 +1841,20 @@ ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e * Limitation 2 : static cctx currently not compatible with multi-threading. * Limitation 3 : static dctx is incompatible with legacy support. 
*/ -ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize); -ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticCCtx() */ +ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize); +ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticCCtx() */ -ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize); -ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticDCtx() */ +ZSTDLIB_STATIC_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize); +ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticDCtx() */ -ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict( +ZSTDLIB_STATIC_API const ZSTD_CDict* ZSTD_initStaticCDict( void* workspace, size_t workspaceSize, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, ZSTD_compressionParameters cParams); -ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict( +ZSTDLIB_STATIC_API const ZSTD_DDict* ZSTD_initStaticDDict( void* workspace, size_t workspaceSize, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, @@ -1362,25 +1869,63 @@ ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict( typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size); typedef void (*ZSTD_freeFunction) (void* opaque, void* address); typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem; -static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /**< this constant defers to stdlib's functions */ +static +#ifdef __GNUC__ +__attribute__((__unused__)) +#endif + +#if defined(__clang__) && __clang_major__ >= 5 +#pragma clang diagnostic push +#pragma 
clang diagnostic ignored "-Wzero-as-null-pointer-constant" +#endif +ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /**< this constant defers to stdlib's functions */ +#if defined(__clang__) && __clang_major__ >= 5 +#pragma clang diagnostic pop +#endif -ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); -ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); -ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem); -ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem); +ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); +ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); +ZSTDLIB_STATIC_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem); +ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem); -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, +ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, ZSTD_compressionParameters cParams, ZSTD_customMem customMem); -ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, - ZSTD_dictLoadMethod_e dictLoadMethod, - ZSTD_dictContentType_e dictContentType, - ZSTD_customMem customMem); +/*! Thread pool : + * These prototypes make it possible to share a thread pool among multiple compression contexts. + * This can limit resources for applications with multiple threads where each one uses + * a threaded compression mode (via ZSTD_c_nbWorkers parameter). + * ZSTD_createThreadPool creates a new thread pool with a given number of threads. + * Note that the lifetime of such pool must exist while being used. + * ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL argument value + * to use an internal thread pool). 
+ * ZSTD_freeThreadPool frees a thread pool, accepts NULL pointer. + */ +typedef struct POOL_ctx_s ZSTD_threadPool; +ZSTDLIB_STATIC_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThreads); +ZSTDLIB_STATIC_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool); /* accept NULL pointer */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool); +/* + * This API is temporary and is expected to change or disappear in the future! + */ +ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_advanced2( + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + const ZSTD_CCtx_params* cctxParams, + ZSTD_customMem customMem); + +ZSTDLIB_STATIC_API ZSTD_DDict* ZSTD_createDDict_advanced( + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_customMem customMem); + /*************************************** * Advanced compression functions @@ -1392,22 +1937,22 @@ ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictS * As a consequence, `dictBuffer` **must** outlive CDict, * and its content must remain unmodified throughout the lifetime of CDict. * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */ -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); +ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); /*! ZSTD_getCParams() : * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize. 
* `estimatedSrcSize` value is optional, select 0 if not known */ -ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); /*! ZSTD_getParams() : * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`. * All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */ -ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); /*! ZSTD_checkCParams() : * Ensure param values remain within authorized range. * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */ -ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); +ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); /*! ZSTD_adjustCParams() : * optimize params for a given `srcSize` and `dictSize`. @@ -1415,23 +1960,48 @@ ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); * `dictSize` must be `0` when there is no dictionary. * cPar can be invalid : all parameters will be clamped within valid range in the @return struct. * This function never fails (wide contract) */ -ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + +/*! ZSTD_CCtx_setCParams() : + * Set all parameters provided within @p cparams into the working @p cctx. 
+ * Note : if modifying parameters during compression (MT mode only), + * note that changes to the .windowLog parameter will be ignored. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()). + * On failure, no parameters are updated. + */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); + +/*! ZSTD_CCtx_setFParams() : + * Set all parameters provided within @p fparams into the working @p cctx. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()). + */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); + +/*! ZSTD_CCtx_setParams() : + * Set all parameters provided within @p params into the working @p cctx. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()). + */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); /*! ZSTD_compress_advanced() : * Note : this function is now DEPRECATED. * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. - * This prototype will be marked as deprecated and generate compilation warning on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - const void* dict,size_t dictSize, - ZSTD_parameters params); + * This prototype will generate compilation warnings. */ +ZSTD_DEPRECATED("use ZSTD_compress2") +ZSTDLIB_STATIC_API +size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params); /*! ZSTD_compress_usingCDict_advanced() : - * Note : this function is now REDUNDANT. + * Note : this function is now DEPRECATED. * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. 
- * This prototype will be marked as deprecated and generate compilation warning in some future version */ -ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + * This prototype will generate compilation warnings. */ +ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") +ZSTDLIB_STATIC_API +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const ZSTD_CDict* cdict, @@ -1441,18 +2011,18 @@ ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, /*! ZSTD_CCtx_loadDictionary_byReference() : * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx. * It saves some memory, but also requires that `dict` outlives its usage within `cctx` */ -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); /*! ZSTD_CCtx_loadDictionary_advanced() : * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over * how to load the dictionary (by copy ? by reference ?) * and how to interpret it (automatic ? force raw mode ? full mode only ?) */ -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); /*! ZSTD_CCtx_refPrefix_advanced() : * Same as ZSTD_CCtx_refPrefix(), but gives finer control over * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) 
*/ -ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); /* === experimental parameters === */ /* these parameters can be used with ZSTD_setParameter() @@ -1491,29 +2061,308 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre * See the comments on that enum for an explanation of the feature. */ #define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 -/* Controls how the literals are compressed (default is auto). - * The value must be of type ZSTD_literalCompressionMode_e. - * See ZSTD_literalCompressionMode_t enum definition for details. +/* Controlled with ZSTD_ParamSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never compress literals. + * Set to ZSTD_ps_enable to always compress literals. (Note: uncompressed literals + * may still be emitted if huffman is not beneficial to use.) + * + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use + * literals compression based on the compression parameters - specifically, + * negative compression levels do not use literal compression. */ #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 -/* Tries to fit compressed block size to be around targetCBlockSize. - * No target when targetCBlockSize == 0. - * There is no guarantee on compressed block size (default:0) */ -#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 - /* User's best guess of source size. * Hint is not valid when srcSizeHint == 0. 
* There is no guarantee that hint is close to actual source size, * but compression ratio may regress significantly if guess considerably underestimates */ #define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7 +/* Controls whether the new and experimental "dedicated dictionary search + * structure" can be used. This feature is still rough around the edges, be + * prepared for surprising behavior! + * + * How to use it: + * + * When using a CDict, whether to use this feature or not is controlled at + * CDict creation, and it must be set in a CCtxParams set passed into that + * construction (via ZSTD_createCDict_advanced2()). A compression will then + * use the feature or not based on how the CDict was constructed; the value of + * this param, set in the CCtx, will have no effect. + * + * However, when a dictionary buffer is passed into a CCtx, such as via + * ZSTD_CCtx_loadDictionary(), this param can be set on the CCtx to control + * whether the CDict that is created internally can use the feature or not. + * + * What it does: + * + * Normally, the internal data structures of the CDict are analogous to what + * would be stored in a CCtx after compressing the contents of a dictionary. + * To an approximation, a compression using a dictionary can then use those + * data structures to simply continue what is effectively a streaming + * compression where the simulated compression of the dictionary left off. + * Which is to say, the search structures in the CDict are normally the same + * format as in the CCtx. + * + * It is possible to do better, since the CDict is not like a CCtx: the search + * structures are written once during CDict creation, and then are only read + * after that, while the search structures in the CCtx are both read and + * written as the compression goes along. This means we can choose a search + * structure for the dictionary that is read-optimized. + * + * This feature enables the use of that different structure. 
+ * + * Note that some of the members of the ZSTD_compressionParameters struct have + * different semantics and constraints in the dedicated search structure. It is + * highly recommended that you simply set a compression level in the CCtxParams + * you pass into the CDict creation call, and avoid messing with the cParams + * directly. + * + * Effects: + * + * This will only have any effect when the selected ZSTD_strategy + * implementation supports this feature. Currently, that's limited to + * ZSTD_greedy, ZSTD_lazy, and ZSTD_lazy2. + * + * Note that this means that the CDict tables can no longer be copied into the + * CCtx, so the dict attachment mode ZSTD_dictForceCopy will no longer be + * usable. The dictionary can only be attached or reloaded. + * + * In general, you should expect compression to be faster--sometimes very much + * so--and CDict creation to be slightly slower. Eventually, we will probably + * make this mode the default. + */ +#define ZSTD_c_enableDedicatedDictSearch ZSTD_c_experimentalParam8 + +/* ZSTD_c_stableInBuffer + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * + * Tells the compressor that input data presented with ZSTD_inBuffer + * will ALWAYS be the same between calls. + * Technically, the @src pointer must never be changed, + * and the @pos field can only be updated by zstd. + * However, it's possible to increase the @size field, + * allowing scenarios where more data can be appended after compressions starts. + * These conditions are checked by the compressor, + * and compression will fail if they are not respected. + * Also, data in the ZSTD_inBuffer within the range [src, src + pos) + * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until + * the frame is complete. 
But, it will still allocate an output buffer + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. + * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds + * memory. However, compression WILL fail if conditions are not respected. + * + * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST + * not be modified during compression or it will result in data corruption. + * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, + * but passing this flag tells zstd to rely on user provided buffer instead. + */ +#define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +/* ZSTD_c_stableOutBuffer + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * + * Tells he compressor that the ZSTD_outBuffer will not be resized between + * calls. Specifically: (out.size - out.pos) will never grow. This gives the + * compressor the freedom to say: If the compressed data doesn't fit in the + * output buffer then return ZSTD_error_dstSizeTooSmall. This allows us to + * always decompress directly into the output buffer, instead of decompressing + * into an internal buffer and copying to the output buffer. + * + * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer. It will still allocate the + * input window buffer (see ZSTD_c_stableInBuffer). + * + * Zstd will check that (out.size - out.pos) never grows and return an error + * if it does. While not strictly necessary, this should prevent surprises. + */ +#define ZSTD_c_stableOutBuffer ZSTD_c_experimentalParam10 + +/* ZSTD_c_blockDelimiters + * Default is 0 == ZSTD_sf_noBlockDelimiters. 
+ * + * For use with sequence compression API: ZSTD_compressSequences(). + * + * Designates whether or not the given array of ZSTD_Sequence contains block delimiters + * and last literals, which are defined as sequences with offset == 0 and matchLength == 0. + * See the definition of ZSTD_Sequence for more specifics. + */ +#define ZSTD_c_blockDelimiters ZSTD_c_experimentalParam11 + +/* ZSTD_c_validateSequences + * Default is 0 == disabled. Set to 1 to enable sequence validation. + * + * For use with sequence compression API: ZSTD_compressSequences*(). + * Designates whether or not provided sequences are validated within ZSTD_compressSequences*() + * during function execution. + * + * When Sequence validation is disabled (default), Sequences are compressed as-is, + * so they must correct, otherwise it would result in a corruption error. + * + * Sequence validation adds some protection, by ensuring that all values respect boundary conditions. + * If a Sequence is detected invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. + */ +#define ZSTD_c_validateSequences ZSTD_c_experimentalParam12 + +/* ZSTD_c_blockSplitterLevel + * note: this parameter only influences the first splitter stage, + * which is active before producing the sequences. + * ZSTD_c_splitAfterSequences controls the next splitter stage, + * which is active after sequence production. + * Note that both can be combined. + * Allowed values are between 0 and ZSTD_BLOCKSPLITTER_LEVEL_MAX included. + * 0 means "auto", which will select a value depending on current ZSTD_c_strategy. + * 1 means no splitting. + * Then, values from 2 to 6 are sorted in increasing cpu load order. + * + * Note that currently the first block is never split, + * to ensure expansion guarantees in presence of incompressible data. 
+ */ +#define ZSTD_BLOCKSPLITTER_LEVEL_MAX 6 +#define ZSTD_c_blockSplitterLevel ZSTD_c_experimentalParam20 + +/* ZSTD_c_splitAfterSequences + * This is a stronger splitter algorithm, + * based on actual sequences previously produced by the selected parser. + * It's also slower, and as a consequence, mostly used for high compression levels. + * While the post-splitter does overlap with the pre-splitter, + * both can nonetheless be combined, + * notably with ZSTD_c_blockSplitterLevel at ZSTD_BLOCKSPLITTER_LEVEL_MAX, + * resulting in higher compression ratio than just one of them. + * + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use block splitter. + * Set to ZSTD_ps_enable to always use block splitter. + * + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use + * block splitting based on the compression parameters. + */ +#define ZSTD_c_splitAfterSequences ZSTD_c_experimentalParam13 + +/* ZSTD_c_useRowMatchFinder + * Controlled with ZSTD_ParamSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use row-based matchfinder. + * Set to ZSTD_ps_enable to force usage of row-based matchfinder. + * + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use + * the row-based matchfinder based on support for SIMD instructions and the window log. + * Note that this only pertains to compression strategies: greedy, lazy, and lazy2 + */ +#define ZSTD_c_useRowMatchFinder ZSTD_c_experimentalParam14 + +/* ZSTD_c_deterministicRefPrefix + * Default is 0 == disabled. Set to 1 to enable. + * + * Zstd produces different results for prefix compression when the prefix is + * directly adjacent to the data about to be compressed vs. when it isn't. + * This is because zstd detects that the two buffers are contiguous and it can + * use a more efficient match finding algorithm. However, this produces different + * results than when the two buffers are non-contiguous. 
This flag forces zstd + * to always load the prefix in non-contiguous mode, even if it happens to be + * adjacent to the data, to guarantee determinism. + * + * If you really care about determinism when using a dictionary or prefix, + * like when doing delta compression, you should select this option. It comes + * at a speed penalty of about ~2.5% if the dictionary and data happened to be + * contiguous, and is free if they weren't contiguous. We don't expect that + * intentionally making the dictionary and data contiguous will be worth the + * cost to memcpy() the data. + */ +#define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + +/* ZSTD_c_prefetchCDictTables + * Controlled with ZSTD_ParamSwitch_e enum. Default is ZSTD_ps_auto. + * + * In some situations, zstd uses CDict tables in-place rather than copying them + * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). + * In such situations, compression speed is seriously impacted when CDict tables are + * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables + * when they are used in-place. + * + * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. + * For sufficiently large inputs, zstd will by default memcpy() CDict tables + * into the working context, so there is no need to prefetch. This parameter is + * targeted at a middle range of input sizes, where a prefetch is cheap enough to be + * useful but memcpy() is too expensive. The exact range of input sizes where this + * makes sense is best determined by careful experimentation. + * + * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, + * but in the future zstd may conditionally enable this feature via an auto-detection + * heuristic for cold CDicts. + * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. 
+ */ +#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 + +/* ZSTD_c_enableSeqProducerFallback + * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. + * + * Controls whether zstd will fall back to an internal sequence producer if an + * external sequence producer is registered and returns an error code. This fallback + * is block-by-block: the internal sequence producer will only be called for blocks + * where the external sequence producer returns an error code. Fallback parsing will + * follow any other cParam settings, such as compression level, the same as in a + * normal (fully-internal) compression operation. + * + * The user is strongly encouraged to read the full Block-Level Sequence Producer API + * documentation (below) before setting this parameter. */ +#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 + +/* ZSTD_c_maxBlockSize + * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). + * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. + * + * This parameter can be used to set an upper bound on the blocksize + * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper + * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make + * compressBound() inaccurate). Only currently meant to be used for testing. + */ +#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 + +/* ZSTD_c_repcodeResolution + * This parameter only has an effect if ZSTD_c_blockDelimiters is + * set to ZSTD_sf_explicitBlockDelimiters (may change in the future). + * + * This parameter affects how zstd parses external sequences, + * provided via the ZSTD_compressSequences*() API + * or from an external block-level sequence producer. + * + * If set to ZSTD_ps_enable, the library will check for repeated offsets within + * external sequences, even if those repcodes are not explicitly indicated in + * the "rep" field. 
Note that this is the only way to exploit repcode matches + * while using compressSequences*() or an external sequence producer, since zstd + * currently ignores the "rep" field of external sequences. + * + * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in + * external sequences, regardless of whether the "rep" field has been set. This + * reduces sequence compression overhead by about 25% while sacrificing some + * compression ratio. + * + * The default value is ZSTD_ps_auto, for which the library will enable/disable + * based on compression level (currently: level<10 disables, level>=10 enables). + */ +#define ZSTD_c_repcodeResolution ZSTD_c_experimentalParam19 +#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 /* older name */ + + /*! ZSTD_CCtx_getParameter() : * Get the requested compression parameter value, selected by enum ZSTD_cParameter, * and store it into int* value. * @return : 0, or an error code (which can be tested with ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value); /*! ZSTD_CCtx_params : @@ -1528,45 +2377,47 @@ ZSTDLIB_API size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param * These parameters will be applied to * all subsequent frames. * - ZSTD_compressStream2() : Do compression using the CCtx. - * - ZSTD_freeCCtxParams() : Free the memory. + * - ZSTD_freeCCtxParams() : Free the memory, accept NULL pointer. * * This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams() * for static allocation of CCtx for single-threaded compression. 
*/ -ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); -ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); +ZSTDLIB_STATIC_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); +ZSTDLIB_STATIC_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); /* accept NULL pointer */ /*! ZSTD_CCtxParams_reset() : * Reset params to default values. */ -ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); /*! ZSTD_CCtxParams_init() : * Initializes the compression parameters of cctxParams according to * compression level. All other parameters are reset to their default values. */ -ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel); /*! ZSTD_CCtxParams_init_advanced() : * Initializes the compression and frame parameters of cctxParams according to * params. All other parameters are reset to their default values. */ -ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params); -/*! ZSTD_CCtxParams_setParameter() : +/*! ZSTD_CCtxParams_setParameter() : Requires v1.4.0+ * Similar to ZSTD_CCtx_setParameter. * Set one compression parameter, selected by enum ZSTD_cParameter. - * Parameters must be applied to a ZSTD_CCtx using ZSTD_CCtx_setParametersUsingCCtxParams(). - * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Parameters must be applied to a ZSTD_CCtx using + * ZSTD_CCtx_setParametersUsingCCtxParams(). + * @result : a code representing success or failure (which can be tested with + * ZSTD_isError()). 
*/ -ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value); /*! ZSTD_CCtxParams_getParameter() : * Similar to ZSTD_CCtx_getParameter. * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter. * @result : 0, or an error code (which can be tested with ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value); /*! ZSTD_CCtx_setParametersUsingCCtxParams() : * Apply a set of ZSTD_CCtx_params to the compression context. @@ -1575,7 +2426,7 @@ ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(ZSTD_CCtx_params* params, ZSTD_c * if nbWorkers>=1, new parameters will be picked up at next job, * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated). */ -ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params); /*! ZSTD_compressStream2_simpleArgs() : @@ -1584,7 +2435,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( * This variant might be helpful for binders from dynamic languages * which have troubles handling structures containing memory pointers. */ -ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs ( +ZSTDLIB_STATIC_API size_t ZSTD_compressStream2_simpleArgs ( ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, size_t* dstPos, const void* src, size_t srcSize, size_t* srcPos, @@ -1600,33 +2451,33 @@ ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs ( * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. 
* Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled. * Note 3 : Skippable Frame Identifiers are considered valid. */ -ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size); +ZSTDLIB_STATIC_API unsigned ZSTD_isFrame(const void* buffer, size_t size); /*! ZSTD_createDDict_byReference() : * Create a digested dictionary, ready to start decompression operation without startup delay. * Dictionary content is referenced, and therefore stays in dictBuffer. * It is important that dictBuffer outlives DDict, * it must remain read accessible throughout the lifetime of DDict */ -ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize); /*! ZSTD_DCtx_loadDictionary_byReference() : * Same as ZSTD_DCtx_loadDictionary(), * but references `dict` content instead of copying it into `dctx`. * This saves memory if `dict` remains around., * However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */ -ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); /*! ZSTD_DCtx_loadDictionary_advanced() : * Same as ZSTD_DCtx_loadDictionary(), * but gives direct control over * how to load the dictionary (by copy ? by reference ?) * and how to interpret it (automatic ? force raw mode ? full mode only ?). 
*/ -ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); /*! ZSTD_DCtx_refPrefix_advanced() : * Same as ZSTD_DCtx_refPrefix(), but gives finer control over * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */ -ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); /*! ZSTD_DCtx_setMaxWindowSize() : * Refuses allocating internal buffers for frames requiring a window size larger than provided limit. @@ -1635,20 +2486,123 @@ ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* pre * By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT) * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize); + +/*! ZSTD_DCtx_getParameter() : + * Get the requested decompression parameter value, selected by enum ZSTD_dParameter, + * and store it into int* value. + * @return : 0, or an error code (which can be tested with ZSTD_isError()). 
+ */ +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value); /* ZSTD_d_format * experimental parameter, * allowing selection between ZSTD_format_e input compression formats */ #define ZSTD_d_format ZSTD_d_experimentalParam1 +/* ZSTD_d_stableOutBuffer + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * + * Tells the decompressor that the ZSTD_outBuffer will ALWAYS be the same + * between calls, except for the modifications that zstd makes to pos (the + * caller must not modify pos). This is checked by the decompressor, and + * decompression will fail if it ever changes. Therefore the ZSTD_outBuffer + * MUST be large enough to fit the entire decompressed frame. This will be + * checked when the frame content size is known. The data in the ZSTD_outBuffer + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * + * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. + * If you need to avoid the input buffer allocation use the buffer-less + * streaming API. + * + * NOTE: So long as the ZSTD_outBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds + * memory. However, decompression WILL fail if you violate the preconditions. + * + * WARNING: The data in the ZSTD_outBuffer in the range [dst, dst + pos) MUST + * not be modified during decompression or you will get data corruption. This + * is because zstd needs to reference data in the ZSTD_outBuffer to regenerate + * matches. Normally zstd maintains its own buffer for this purpose, but passing + * this flag tells zstd to use the user provided buffer. 
+ */ +#define ZSTD_d_stableOutBuffer ZSTD_d_experimentalParam2 + +/* ZSTD_d_forceIgnoreChecksum + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable + * + * Tells the decompressor to skip checksum validation during decompression, regardless + * of whether checksumming was specified during compression. This offers some + * slight performance benefits, and may be useful for debugging. + * Param has values of type ZSTD_forceIgnoreChecksum_e + */ +#define ZSTD_d_forceIgnoreChecksum ZSTD_d_experimentalParam3 + +/* ZSTD_d_refMultipleDDicts + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable + * + * If enabled and dctx is allocated on the heap, then additional memory will be allocated + * to store references to multiple ZSTD_DDict. That is, multiple calls of ZSTD_refDDict() + * using a given ZSTD_DCtx, rather than overwriting the previous DDict reference, will instead + * store all references. At decompression time, the appropriate dictID is selected + * from the set of DDicts based on the dictID in the frame. + * + * Usage is simply calling ZSTD_refDDict() on multiple dict buffers. + * + * Param has values of byte ZSTD_refMultipleDDicts_e + * + * WARNING: Enabling this parameter and calling ZSTD_DCtx_refDDict(), will trigger memory + * allocation for the hash table. ZSTD_freeDCtx() also frees this memory. + * Memory is allocated as per ZSTD_DCtx::customMem. + * + * Although this function allocates memory for the table, the user is still responsible for + * memory management of the underlying ZSTD_DDict* themselves. + */ +#define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + +/* ZSTD_d_disableHuffmanAssembly + * Set to 1 to disable the Huffman assembly implementation. + * The default value is 0, which allows zstd to use the Huffman assembly + * implementation if available. + * + * This parameter can be used to disable Huffman assembly at runtime. 
+ * If you want to disable it at compile time you can define the macro + * ZSTD_DISABLE_ASM. + */ +#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 + +/* ZSTD_d_maxBlockSize + * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). + * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. + * + * Forces the decompressor to reject blocks whose content size is + * larger than the configured maxBlockSize. When maxBlockSize is + * larger than the windowSize, the windowSize is used instead. + * This saves memory on the decoder when you know all blocks are small. + * + * This option is typically used in conjunction with ZSTD_c_maxBlockSize. + * + * WARNING: This causes the decoder to reject otherwise valid frames + * that have block sizes larger than the configured maxBlockSize. + */ +#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6 + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). * Instruct the decoder context about what kind of data to decode next. * This instruction is mandatory to decode data without a fully-formed header, * such ZSTD_f_zstd1_magicless for example. * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); +ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") +ZSTDLIB_STATIC_API +size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); /*! ZSTD_decompressStream_simpleArgs() : * Same as ZSTD_decompressStream(), @@ -1656,7 +2610,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); * This can be helpful for binders from dynamic languages * which have troubles handling structures containing memory pointers. 
*/ -ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( +ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, size_t* dstPos, const void* src, size_t srcSize, size_t* srcPos); @@ -1670,8 +2624,9 @@ ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( ********************************************************************/ /*===== Advanced Streaming compression functions =====*/ -/**! ZSTD_initCStream_srcSize() : - * This function is deprecated, and equivalent to: + +/*! ZSTD_initCStream_srcSize() : + * This function is DEPRECATED, and equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); @@ -1680,15 +2635,16 @@ ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( * pledgedSrcSize must be correct. If it is not known at init time, use * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, * "0" also disables frame content size field. It may be enabled in the future. - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API +size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize); -/**! ZSTD_initCStream_usingDict() : - * This function is deprecated, and is equivalent to: +/*! 
ZSTD_initCStream_usingDict() : + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); @@ -1697,81 +2653,85 @@ ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, * dict == NULL or dictSize < 8, in which case no dict is used. * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy. - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API +size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); -/**! ZSTD_initCStream_advanced() : - * This function is deprecated, and is approximately equivalent to: +/*! ZSTD_initCStream_advanced() : + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); - * // Pseudocode: Set each zstd parameter and leave the rest as-is. - * for ((param, value) : params) { - * ZSTD_CCtx_setParameter(zcs, param, value); - * } + * ZSTD_CCtx_setParams(zcs, params); * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); * * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy. * pledgedSrcSize must be correct. * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. 
*/ -ZSTDLIB_API size_t -ZSTD_initCStream_advanced(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API +size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); -/**! ZSTD_initCStream_usingCDict() : - * This function is deprecated, and equivalent to: +/*! ZSTD_initCStream_usingCDict() : + * This function is DEPRECATED, and equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, cdict); * * note : cdict will just be referenced, and must outlive compression session - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); +ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API +size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); -/**! ZSTD_initCStream_usingCDict_advanced() : - * This function is DEPRECATED, and is approximately equivalent to: +/*! ZSTD_initCStream_usingCDict_advanced() : + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); - * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. - * for ((fParam, value) : fParams) { - * ZSTD_CCtx_setParameter(zcs, fParam, value); - * } + * ZSTD_CCtx_setFParams(zcs, fParams); * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); * ZSTD_CCtx_refCDict(zcs, cdict); * * same as ZSTD_initCStream_usingCDict(), with control over frame parameters. * pledgedSrcSize must be correct. If srcSize is not known at init time, use * value ZSTD_CONTENTSIZE_UNKNOWN. 
- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API +size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams, unsigned long long pledgedSrcSize); /*! ZSTD_resetCStream() : - * This function is deprecated, and is equivalent to: + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * Note: ZSTD_resetCStream() interprets pledgedSrcSize == 0 as ZSTD_CONTENTSIZE_UNKNOWN, but + * ZSTD_CCtx_setPledgedSrcSize() does not do the same, so ZSTD_CONTENTSIZE_UNKNOWN must be + * explicitly specified. * * start a new frame, using same parameters from previous frame. - * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. + * This is typically useful to skip dictionary loading stage, since it will reuse it in-place. * Note that zcs must be init at least once before using ZSTD_resetCStream(). * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. * For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs, * but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead. * @return : 0, or an error code (which can be tested using ZSTD_isError()) - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. 
*/ -ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API +size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); typedef struct { @@ -1789,7 +2749,7 @@ typedef struct { * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed. * Aggregates progression inside active worker threads. */ -ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx); +ZSTDLIB_STATIC_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx); /*! ZSTD_toFlushNow() : * Tell how many bytes are ready to be flushed immediately. @@ -1804,49 +2764,234 @@ ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx * therefore flush speed is limited by production speed of oldest job * irrespective of the speed of concurrent (and newer) jobs. */ -ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); +ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); /*===== Advanced Streaming decompression functions =====*/ -/** + +/*! * This function is deprecated, and is equivalent to: * * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); * * note: no dictionary will be used if dict == NULL or dictSize < 8 - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); +ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); -/** +/*! 
* This function is deprecated, and is equivalent to: * * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); * ZSTD_DCtx_refDDict(zds, ddict); * * note : ddict is referenced, it must outlive decompression session - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); +ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); -/** +/*! * This function is deprecated, and is equivalent to: * * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); * - * re-use decompression parameters from previous init; saves dictionary loading - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * reuse decompression parameters from previous init; saves dictionary loading */ -ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); +ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + +/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* + * + * *** OVERVIEW *** + * The Block-Level Sequence Producer API allows users to provide their own custom + * sequence producer which libzstd invokes to process each block. The produced list + * of sequences (literals and matches) is then post-processed by libzstd to produce + * valid compressed blocks. + * + * This block-level offload API is a more granular complement of the existing + * frame-level offload API compressSequences() (introduced in v1.5.1). 
It offers + * an easier migration story for applications already integrated with libzstd: the + * user application continues to invoke the same compression functions + * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits + * from the specific advantages of the external sequence producer. For example, + * the sequence producer could be tuned to take advantage of known characteristics + * of the input, to offer better speed / ratio, or could leverage hardware + * acceleration not available within libzstd itself. + * + * See contrib/externalSequenceProducer for an example program employing the + * Block-Level Sequence Producer API. + * + * *** USAGE *** + * The user is responsible for implementing a function of type + * ZSTD_sequenceProducer_F. For each block, zstd will pass the following + * arguments to the user-provided function: + * + * - sequenceProducerState: a pointer to a user-managed state for the sequence + * producer. + * + * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. + * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory + * backing outSeqs is managed by the CCtx. + * + * - src, srcSize: an input buffer for the sequence producer to parse. + * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. + * + * - dict, dictSize: a history buffer, which may be empty, which the sequence + * producer may reference as it parses the src buffer. Currently, zstd will + * always pass dictSize == 0 into external sequence producers, but this will + * change in the future. + * + * - compressionLevel: a signed integer representing the zstd compression level + * set by the user for the current operation. The sequence producer may choose + * to use this information to change its compression strategy and speed/ratio + * tradeoff. Note: the compression level does not reflect zstd parameters set + * through the advanced API. 
+ * + * - windowSize: a size_t representing the maximum allowed offset for external + * sequences. Note that sequence offsets are sometimes allowed to exceed the + * windowSize if a dictionary is present, see doc/zstd_compression_format.md + * for details. + * + * The user-provided function shall return a size_t representing the number of + * sequences written to outSeqs. This return value will be treated as an error + * code if it is greater than outSeqsCapacity. The return value must be non-zero + * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided + * for convenience, but any value greater than outSeqsCapacity will be treated as + * an error code. + * + * If the user-provided function does not return an error code, the sequences + * written to outSeqs must be a valid parse of the src buffer. Data corruption may + * occur if the parse is not valid. A parse is defined to be valid if the + * following conditions hold: + * - The sum of matchLengths and literalLengths must equal srcSize. + * - All sequences in the parse, except for the final sequence, must have + * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have + * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. + * - All offsets must respect the windowSize parameter as specified in + * doc/zstd_compression_format.md. + * - If the final sequence has matchLength == 0, it must also have offset == 0. + * + * zstd will only validate these conditions (and fail compression if they do not + * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence + * validation has a performance cost. + * + * If the user-provided function returns an error, zstd will either fall back + * to an internal sequence producer or fail the compression operation. The user can + * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback + * cParam. 
Fallback compression will follow any other cParam settings, such as + * compression level, the same as in a normal compression operation. + * + * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F + * function by calling + * ZSTD_registerSequenceProducer(cctx, + * sequenceProducerState, + * sequenceProducer) + * This setting will persist until the next parameter reset of the CCtx. + * + * The sequenceProducerState must be initialized by the user before calling + * ZSTD_registerSequenceProducer(). The user is responsible for destroying the + * sequenceProducerState. + * + * *** LIMITATIONS *** + * This API is compatible with all zstd compression APIs which respect advanced parameters. + * However, there are three limitations: + * + * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. + * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level + * external sequence producer. + * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some + * cases (see its documentation for details). Users must explicitly set + * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external + * sequence producer is registered. + * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default + * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should + * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence + * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). + * + * Second, history buffers are not currently supported. Concretely, zstd will always pass + * dictSize == 0 to the external sequence producer (for now). This has two implications: + * - Dictionaries are not currently supported. Compression will *not* fail if the user + * references a dictionary, but the dictionary won't have any effect. + * - Stream history is not currently supported. 
All advanced compression APIs, including + * streaming APIs, work with external sequence producers, but each block is treated as + * an independent chunk without history from previous blocks. + * + * Third, multi-threading within a single compression is not currently supported. In other words, + * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. + * Multi-threading across compressions is fine: simply create one CCtx per thread. + * + * Long-term, we plan to overcome all three limitations. There is no technical blocker to + * overcoming them. It is purely a question of engineering effort. + */ + +#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) + +typedef size_t (*ZSTD_sequenceProducer_F) ( + void* sequenceProducerState, + ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel, + size_t windowSize +); + +/*! ZSTD_registerSequenceProducer() : + * Instruct zstd to use a block-level external sequence producer function. + * + * The sequenceProducerState must be initialized by the caller, and the caller is + * responsible for managing its lifetime. This parameter is sticky across + * compressions. It will remain set until the user explicitly resets compression + * parameters. + * + * Sequence producer registration is considered to be an "advanced parameter", + * part of the "advanced API". This means it will only have an effect on compression + * APIs which respect advanced parameters, such as compress2() and compressStream2(). + * Older compression APIs such as compressCCtx(), which predate the introduction of + * "advanced parameters", will ignore any external sequence producer setting. + * + * The sequence producer can be "cleared" by registering a NULL function pointer. This + * removes all limitations described above in the "LIMITATIONS" section of the API docs. 
+ * + * The user is strongly encouraged to read the full API documentation (above) before + * calling this function. */ +ZSTDLIB_STATIC_API void +ZSTD_registerSequenceProducer( + ZSTD_CCtx* cctx, + void* sequenceProducerState, + ZSTD_sequenceProducer_F sequenceProducer +); + +/*! ZSTD_CCtxParams_registerSequenceProducer() : + * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params. + * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(), + * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx(). + * + * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx() + * is required, then this function is for you. Otherwise, you probably don't need it. + * + * See tests/zstreamtest.c for example usage. */ +ZSTDLIB_STATIC_API void +ZSTD_CCtxParams_registerSequenceProducer( + ZSTD_CCtx_params* params, + void* sequenceProducerState, + ZSTD_sequenceProducer_F sequenceProducer +); /********************************************************************* -* Buffer-less and synchronous inner streaming functions +* Buffer-less and synchronous inner streaming functions (DEPRECATED) * -* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. -* But it's also a complex one, with several restrictions, documented below. -* Prefer normal streaming API for an easier experience. +* This API is deprecated, and will be removed in a future version. +* It allows streaming (de)compression with user allocated buffers. +* However, it is hard to use, and not as well tested as the rest of +* our API. +* +* Please use the normal streaming API instead: ZSTD_compressStream2, +* and ZSTD_decompressStream. +* If there is functionality that you need, but it doesn't provide, +* please open an issue on our GitHub. 
********************************************************************* */ /** @@ -1854,12 +2999,10 @@ ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); A ZSTD_CCtx object is required to track streaming operations. Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. - ZSTD_CCtx object can be re-used multiple times within successive compression operations. + ZSTD_CCtx object can be reused multiple times within successive compression operations. Start by initializing a context. - Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression, - or ZSTD_compressBegin_advanced(), for finer parameter control. - It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. Then, consume your input using ZSTD_compressContinue(). There are some important considerations to keep in mind when using this advanced function : @@ -1877,37 +3020,49 @@ ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. - `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. + `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again. 
*/ /*===== Buffer-less streaming compression functions =====*/ -ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); -ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); -ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ -ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */ -ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ -ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ - -ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); -ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); - - -/*- +ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); +ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); +ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. 
See docs.") +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */ + +ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") +ZSTDLIB_STATIC_API +size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") +ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") +ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +/* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ +ZSTD_DEPRECATED("use advanced API to access custom parameters") +ZSTDLIB_STATIC_API +size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTD_DEPRECATED("use advanced API to access custom parameters") +ZSTDLIB_STATIC_API +size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ +/** Buffer-less streaming decompression (synchronous mode) A ZSTD_DCtx object is required to track streaming operations. Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. 
- A ZSTD_DCtx object can be re-used multiple times. + A ZSTD_DCtx object can be reused multiple times. First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. Data fragment must be large enough to ensure successful decoding. `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. - @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. - >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. + result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. + >0 : `srcSize` is too small, please provide at least result bytes on next attempt. errorCode, which can be tested using ZSTD_isError(). - It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, + It fills a ZSTD_FrameHeader structure with important information to correctly decode the frame, such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`). Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information. As a consequence, check that values remain within valid application range. @@ -1923,7 +3078,7 @@ ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapaci The most memory efficient way is to use a round buffer of sufficient size. Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), - which can @return an error code if required value is too large for current system (in 32-bits mode). + which can return an error code if required value is too large for current system (in 32-bits mode). 
In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, up to the moment there is not enough room left in the buffer to guarantee decoding another full block, which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. @@ -1943,7 +3098,7 @@ ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapaci ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. - @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. It can also be an error code, which can be tested with ZSTD_isError(). @@ -1966,49 +3121,42 @@ ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapaci */ /*===== Buffer-less streaming decompression functions =====*/ -typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; -typedef struct { - unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ - unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ - unsigned blockSizeMax; - ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ - unsigned headerSize; - unsigned dictID; - unsigned checksumFlag; -} ZSTD_frameHeader; -/*! ZSTD_getFrameHeader() : - * decode Frame Header, or requires larger `srcSize`. 
- * @return : 0, `zfhPtr` is correctly filled, - * >0, `srcSize` is too small, value is wanted `srcSize` amount, - * or an error code, which can be tested using ZSTD_isError() */ -ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /**< doesn't consume input */ -/*! ZSTD_getFrameHeader_advanced() : - * same as ZSTD_getFrameHeader(), - * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ -ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); -ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ -ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); -ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); -ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); -ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); -ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); +ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); /* misc */ -ZSTDLIB_API void 
ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); +ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") +ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; -ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); -/* ============================ */ -/** Block level API */ -/* ============================ */ +/* ========================================= */ +/** Block level API (DEPRECATED) */ +/* ========================================= */ /*! + + This API is deprecated in favor of the regular compression API. + You can get the frame header down to 2 bytes by setting: + - ZSTD_c_format = ZSTD_f_zstd1_magicless + - ZSTD_c_contentSizeFlag = 0 + - ZSTD_c_checksumFlag = 0 + - ZSTD_c_dictIDFlag = 0 + + This API is not as well tested as our normal API, so we recommend not using it. + We will be removing it in a future version. If the normal API doesn't provide + the functionality you need, please open a GitHub issue. + Block functions produce and decode raw zstd blocks, without frame metadata. Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. 
@@ -2019,7 +3167,6 @@ ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); - It is necessary to init context before starting + compression : any ZSTD_compressBegin*() variant, including with dictionary + decompression : any ZSTD_decompressBegin*() variant, including with dictionary - + copyCCtx() and copyDCtx() can be used too - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + If input is larger than a block size, it's necessary to split input data into multiple blocks + For inputs larger than a single block, consider using regular ZSTD_compress() instead. @@ -2036,14 +3183,21 @@ ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); */ /*===== Raw zstd block functions =====*/ -ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); -ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); -ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); -ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ - - -#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ +ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") +ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); +ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") +ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. 
See docs.") +ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") +ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ #if defined (__cplusplus) } #endif + +#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +#else /* USE_EXTERNAL_ZSTD */ +#include_next +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_bulk.go b/vendor/github.com/DataDog/zstd/zstd_bulk.go new file mode 100644 index 0000000..6294a65 --- /dev/null +++ b/vendor/github.com/DataDog/zstd/zstd_bulk.go @@ -0,0 +1,151 @@ +package zstd + +/* +#include "zstd.h" +*/ +import "C" +import ( + "errors" + "runtime" + "unsafe" +) + +var ( + // ErrEmptyDictionary is returned when the given dictionary is empty + ErrEmptyDictionary = errors.New("Dictionary is empty") + // ErrBadDictionary is returned when cannot load the given dictionary + ErrBadDictionary = errors.New("Cannot load dictionary") +) + +// BulkProcessor implements Bulk processing dictionary API. +// When compressing multiple messages or blocks using the same dictionary, +// it's recommended to digest the dictionary only once, since it's a costly operation. +// NewBulkProcessor() will create a state from digesting a dictionary. +// The resulting state can be used for future compression/decompression operations with very limited startup cost. +// BulkProcessor can be created once and shared by multiple threads concurrently, since its usage is read-only. +// The state will be freed when gc cleans up BulkProcessor. 
+type BulkProcessor struct { + cDict *C.struct_ZSTD_CDict_s + dDict *C.struct_ZSTD_DDict_s +} + +// NewBulkProcessor creates a new BulkProcessor with a pre-trained dictionary and compression level +func NewBulkProcessor(dictionary []byte, compressionLevel int) (*BulkProcessor, error) { + if len(dictionary) < 1 { + return nil, ErrEmptyDictionary + } + + p := &BulkProcessor{} + runtime.SetFinalizer(p, finalizeBulkProcessor) + + p.cDict = C.ZSTD_createCDict( + unsafe.Pointer(&dictionary[0]), + C.size_t(len(dictionary)), + C.int(compressionLevel), + ) + if p.cDict == nil { + return nil, ErrBadDictionary + } + p.dDict = C.ZSTD_createDDict( + unsafe.Pointer(&dictionary[0]), + C.size_t(len(dictionary)), + ) + if p.dDict == nil { + return nil, ErrBadDictionary + } + + return p, nil +} + +// Compress compresses `src` into `dst` with the dictionary given when creating the BulkProcessor. +// If you have a buffer to use, you can pass it to prevent allocation. +// If it is too small, or if nil is passed, a new buffer will be allocated and returned. +func (p *BulkProcessor) Compress(dst, src []byte) ([]byte, error) { + bound := CompressBound(len(src)) + if cap(dst) >= bound { + dst = dst[0:bound] + } else { + dst = make([]byte, bound) + } + + cctx := C.ZSTD_createCCtx() + // We need unsafe.Pointer(&src[0]) in the Cgo call to avoid "Go pointer to Go pointer" panics. + // This means we need to special case empty input. 
See: + // https://github.com/golang/go/issues/14210#issuecomment-346402945 + var cWritten C.size_t + if len(src) == 0 { + cWritten = C.ZSTD_compress_usingCDict( + cctx, + unsafe.Pointer(&dst[0]), + C.size_t(len(dst)), + unsafe.Pointer(nil), + C.size_t(len(src)), + p.cDict, + ) + } else { + cWritten = C.ZSTD_compress_usingCDict( + cctx, + unsafe.Pointer(&dst[0]), + C.size_t(len(dst)), + unsafe.Pointer(&src[0]), + C.size_t(len(src)), + p.cDict, + ) + } + + C.ZSTD_freeCCtx(cctx) + + written := int(cWritten) + if err := getError(written); err != nil { + return nil, err + } + return dst[:written], nil +} + +// Decompress decompresses `src` into `dst` with the dictionary given when creating the BulkProcessor. +// If you have a buffer to use, you can pass it to prevent allocation. +// If it is too small, or if nil is passed, a new buffer will be allocated and returned. +func (p *BulkProcessor) Decompress(dst, src []byte) ([]byte, error) { + if len(src) == 0 { + return nil, ErrEmptySlice + } + + contentSize := decompressSizeHint(src) + if cap(dst) >= contentSize { + dst = dst[0:cap(dst)] + } else { + dst = make([]byte, contentSize) + } + + if len(dst) == 0 { + return dst, nil + } + + dctx := C.ZSTD_createDCtx() + cWritten := C.ZSTD_decompress_usingDDict( + dctx, + unsafe.Pointer(&dst[0]), + C.size_t(len(dst)), + unsafe.Pointer(&src[0]), + C.size_t(len(src)), + p.dDict, + ) + C.ZSTD_freeDCtx(dctx) + + written := int(cWritten) + if err := getError(written); err != nil { + return nil, err + } + + return dst[:written], nil +} + +// finalizeBulkProcessor frees compression and decompression dictionaries from memory +func finalizeBulkProcessor(p *BulkProcessor) { + if p.cDict != nil { + C.ZSTD_freeCDict(p.cDict) + } + if p.dDict != nil { + C.ZSTD_freeDDict(p.dDict) + } +} diff --git a/vendor/github.com/DataDog/zstd/zstd_common.c b/vendor/github.com/DataDog/zstd/zstd_common.c index 667f4a2..656c46c 100644 --- a/vendor/github.com/DataDog/zstd/zstd_common.c +++ 
b/vendor/github.com/DataDog/zstd/zstd_common.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -13,8 +14,7 @@ /*-************************************* * Dependencies ***************************************/ -#include <stdlib.h> /* malloc, calloc, free */ -#include <string.h> /* memset */ +#define ZSTD_DEPS_NEED_MALLOC #include "error_private.h" #include "zstd_internal.h" @@ -48,36 +48,4 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } * provides error code string from enum */ const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } - - -/*=************************************************************** -* Custom allocator -****************************************************************/ -void* ZSTD_malloc(size_t size, ZSTD_customMem customMem) -{ - if (customMem.customAlloc) - return customMem.customAlloc(customMem.opaque, size); - return malloc(size); -} - -void* ZSTD_calloc(size_t size, ZSTD_customMem customMem) -{ - if (customMem.customAlloc) { - /* calloc implemented as malloc+memset; - * not as efficient as calloc, but next best guess for custom malloc */ - void* const ptr = customMem.customAlloc(customMem.opaque, size); - memset(ptr, 0, size); - return ptr; - } - return calloc(1, size); -} - -void ZSTD_free(void* ptr, ZSTD_customMem customMem) -{ - if (ptr!=NULL) { - if (customMem.customFree) - customMem.customFree(customMem.opaque, ptr); - else - free(ptr); - } -} +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_compress.c b/vendor/github.com/DataDog/zstd/zstd_compress.c index 35346b9..fcfd4d4 100644 --- a/vendor/github.com/DataDog/zstd/zstd_compress.c +++ b/vendor/github.com/DataDog/zstd/zstd_compress.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c)
2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,14 +12,13 @@ /*-************************************* * Dependencies ***************************************/ -#include <limits.h> /* INT_MAX */ -#include <string.h> /* memset */ -#include "cpu.h" +#include "allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ +#include "zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ #include "mem.h" +#include "error_private.h" #include "hist.h" /* HIST_countFast_wksp */ #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ #include "fse.h" -#define HUF_STATIC_LINKING_ONLY #include "huf.h" #include "zstd_compress_internal.h" #include "zstd_compress_sequences.h" @@ -28,13 +28,50 @@ #include "zstd_lazy.h" #include "zstd_opt.h" #include "zstd_ldm.h" +#include "zstd_compress_superblock.h" +#include "bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ + +/* *************************************************************** +* Tuning parameters +*****************************************************************/ +/*! + * COMPRESS_HEAPMODE : + * Select how default decompression function ZSTD_compress() allocates its context, + * on stack (0, default), or into heap (1). + * Note that functions with explicit context such as ZSTD_compressCCtx() are unaffected. + */ +#ifndef ZSTD_COMPRESS_HEAPMODE +# define ZSTD_COMPRESS_HEAPMODE 0 +#endif +/*! + * ZSTD_HASHLOG3_MAX : + * Maximum size of the hash table dedicated to find 3-bytes matches, + * in log format, aka 17 => 1 << 17 == 128Ki positions. + * This structure is only used in zstd_opt. + * Since allocation is centralized for all strategies, it has to be known here. + * The actual (selected) size of the hash table is then stored in ZSTD_MatchState_t.hashLog3, + * so that zstd_opt.c doesn't need to know about this constant.
+ */ +#ifndef ZSTD_HASHLOG3_MAX +# define ZSTD_HASHLOG3_MAX 17 +#endif /*-************************************* * Helper functions ***************************************/ +/* ZSTD_compressBound() + * Note that the result from this function is only valid for + * the one-pass compression functions. + * When employing the streaming mode, + * if flushes are frequently altering the size of blocks, + * the overhead from block headers can make the compressed data larger + * than the return value of ZSTD_compressBound(). + */ size_t ZSTD_compressBound(size_t srcSize) { - return ZSTD_COMPRESSBOUND(srcSize); + size_t const r = ZSTD_COMPRESSBOUND(srcSize); + if (r==0) return ERROR(srcSize_wrong); + return r; } @@ -44,13 +81,18 @@ size_t ZSTD_compressBound(size_t srcSize) { struct ZSTD_CDict_s { const void* dictContent; size_t dictContentSize; + ZSTD_dictContentType_e dictContentType; /* The dictContentType the CDict was created with */ U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */ ZSTD_cwksp workspace; - ZSTD_matchState_t matchState; + ZSTD_MatchState_t matchState; ZSTD_compressedBlockState_t cBlockState; ZSTD_customMem customMem; U32 dictID; int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */ + ZSTD_ParamSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use + * row-based matchfinder. Unless the cdict is reloaded, we will use + * the same greedy/lazy matchfinder at compression time. 
+ */ }; /* typedef'd to ZSTD_CDict within "zstd.h" */ ZSTD_CCtx* ZSTD_createCCtx(void) @@ -61,9 +103,9 @@ ZSTD_CCtx* ZSTD_createCCtx(void) static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager) { assert(cctx != NULL); - memset(cctx, 0, sizeof(*cctx)); + ZSTD_memset(cctx, 0, sizeof(*cctx)); cctx->customMem = memManager; - cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + cctx->bmi2 = ZSTD_cpuSupportsBmi2(); { size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters); assert(!ZSTD_isError(err)); (void)err; @@ -74,36 +116,35 @@ ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem) { ZSTD_STATIC_ASSERT(zcss_init==0); ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN==(0ULL - 1)); - if (!customMem.customAlloc ^ !customMem.customFree) return NULL; - { ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_malloc(sizeof(ZSTD_CCtx), customMem); + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; + { ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_customMalloc(sizeof(ZSTD_CCtx), customMem); if (!cctx) return NULL; ZSTD_initCCtx(cctx, customMem); return cctx; } } -ZSTD_CCtx* ZSTD_initStaticCCtx(void *workspace, size_t workspaceSize) +ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize) { ZSTD_cwksp ws; ZSTD_CCtx* cctx; if (workspaceSize <= sizeof(ZSTD_CCtx)) return NULL; /* minimum size */ if ((size_t)workspace & 7) return NULL; /* must be 8-aligned */ - ZSTD_cwksp_init(&ws, workspace, workspaceSize); + ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_static_alloc); cctx = (ZSTD_CCtx*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CCtx)); - if (cctx == NULL) { - return NULL; - } - memset(cctx, 0, sizeof(ZSTD_CCtx)); + if (cctx == NULL) return NULL; + + ZSTD_memset(cctx, 0, sizeof(ZSTD_CCtx)); ZSTD_cwksp_move(&cctx->workspace, &ws); cctx->staticSize = workspaceSize; - /* statically sized space. 
entropyWorkspace never moves (but prev/next block swap places) */ - if (!ZSTD_cwksp_check_available(&cctx->workspace, HUF_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; + /* statically sized space. tmpWorkspace never moves (but prev/next block swap places) */ + if (!ZSTD_cwksp_check_available(&cctx->workspace, TMP_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); - cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object( - &cctx->workspace, HUF_WORKSPACE_SIZE); + cctx->tmpWorkspace = ZSTD_cwksp_reserve_object(&cctx->workspace, TMP_WORKSPACE_SIZE); + cctx->tmpWkspSize = TMP_WORKSPACE_SIZE; cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); return cctx; } @@ -113,10 +154,10 @@ ZSTD_CCtx* ZSTD_initStaticCCtx(void *workspace, size_t workspaceSize) */ static void ZSTD_clearAllDicts(ZSTD_CCtx* cctx) { - ZSTD_free(cctx->localDict.dictBuffer, cctx->customMem); + ZSTD_customFree(cctx->localDict.dictBuffer, cctx->customMem); ZSTD_freeCDict(cctx->localDict.cdict); - memset(&cctx->localDict, 0, sizeof(cctx->localDict)); - memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); + ZSTD_memset(&cctx->localDict, 0, sizeof(cctx->localDict)); + ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); cctx->cdict = NULL; } @@ -140,15 +181,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) { + DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); if (cctx==NULL) return 0; /* support free on NULL */ RETURN_ERROR_IF(cctx->staticSize, memory_allocation, "not compatible with static CCtx"); - { - int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + { int cctxInWorkspace = 
ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ZSTD_freeCCtxContent(cctx); - if (!cctxInWorkspace) { - ZSTD_free(cctx, cctx->customMem); - } + if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); } return 0; } @@ -181,17 +220,109 @@ size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) } /* private API call, for dictBuilder only */ -const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } +const SeqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } + +/* Returns true if the strategy supports using a row based matchfinder */ +static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { + return (strategy >= ZSTD_greedy && strategy <= ZSTD_lazy2); +} + +/* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder + * for this compression. + */ +static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_ParamSwitch_e mode) { + assert(mode != ZSTD_ps_auto); + return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_ps_enable); +} + +/* Returns row matchfinder usage given an initial mode and cParams */ +static ZSTD_ParamSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */ + mode = ZSTD_ps_disable; + if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; + if (cParams->windowLog > 14) mode = ZSTD_ps_enable; + return mode; +} + +/* Returns block splitter usage (generally speaking, when using slower/stronger compression modes) */ +static ZSTD_ParamSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17) ? 
ZSTD_ps_enable : ZSTD_ps_disable; +} + +/* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */ +static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + const ZSTD_ParamSwitch_e useRowMatchFinder, + const U32 forDDSDict) { + assert(useRowMatchFinder != ZSTD_ps_auto); + /* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate. + * We do not allocate a chaintable if we are using ZSTD_fast, or are using the row-based matchfinder. + */ + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); +} + +/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). + * Returns ZSTD_ps_disable otherwise. + */ +static ZSTD_ParamSwitch_e ZSTD_resolveEnableLdm(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; +} + +static int ZSTD_resolveExternalSequenceValidation(int mode) { + return mode; +} + +/* Resolves maxBlockSize to the default if no value is present. */ +static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { + if (maxBlockSize == 0) { + return ZSTD_BLOCKSIZE_MAX; + } else { + return maxBlockSize; + } +} + +static ZSTD_ParamSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_ParamSwitch_e value, int cLevel) { + if (value != ZSTD_ps_auto) return value; + if (cLevel < 10) { + return ZSTD_ps_disable; + } else { + return ZSTD_ps_enable; + } +} + +/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. + * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. 
*/ +static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { + return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; +} static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( ZSTD_compressionParameters cParams) { ZSTD_CCtx_params cctxParams; - memset(&cctxParams, 0, sizeof(cctxParams)); + /* should not matter, as all cParams are presumed properly defined */ + ZSTD_CCtxParams_init(&cctxParams, ZSTD_CLEVEL_DEFAULT); cctxParams.cParams = cParams; - cctxParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */ + + /* Adjust advanced params according to cParams */ + cctxParams.ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams.ldmParams.enableLdm, &cParams); + if (cctxParams.ldmParams.enableLdm == ZSTD_ps_enable) { + ZSTD_ldm_adjustParameters(&cctxParams.ldmParams, &cParams); + assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog); + assert(cctxParams.ldmParams.hashRateLog < 32); + } + cctxParams.postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.postBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); + cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); + cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); + cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, + cctxParams.compressionLevel); assert(!ZSTD_checkCParams(cParams)); - cctxParams.fParams.contentSizeFlag = 1; return cctxParams; } @@ -199,13 +330,12 @@ static ZSTD_CCtx_params* ZSTD_createCCtxParams_advanced( ZSTD_customMem customMem) { ZSTD_CCtx_params* params; - if (!customMem.customAlloc ^ !customMem.customFree) return NULL; - params = (ZSTD_CCtx_params*)ZSTD_calloc( + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; + params = 
(ZSTD_CCtx_params*)ZSTD_customCalloc( sizeof(ZSTD_CCtx_params), customMem); if (!params) { return NULL; } + ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT); params->customMem = customMem; - params->compressionLevel = ZSTD_CLEVEL_DEFAULT; - params->fParams.contentSizeFlag = 1; return params; } @@ -217,7 +347,7 @@ ZSTD_CCtx_params* ZSTD_createCCtxParams(void) size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params) { if (params == NULL) { return 0; } - ZSTD_free(params, params->customMem); + ZSTD_customFree(params, params->customMem); return 0; } @@ -227,36 +357,64 @@ size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params) } size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) { - RETURN_ERROR_IF(!cctxParams, GENERIC); - memset(cctxParams, 0, sizeof(*cctxParams)); + RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!"); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); cctxParams->compressionLevel = compressionLevel; cctxParams->fParams.contentSizeFlag = 1; return 0; } +#define ZSTD_NO_CLEVEL 0 + +/** + * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +static void +ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, + const ZSTD_parameters* params, + int compressionLevel) +{ + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); + cctxParams->cParams = params->cParams; + cctxParams->fParams = params->fParams; + /* Should not matter, as all cParams are presumed properly defined. + * But, set it for tracing anyway. 
+ */ + cctxParams->compressionLevel = compressionLevel; + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams); + cctxParams->postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->postBlockSplitter, &params->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams); + cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); + cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); + cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", + cctxParams->useRowMatchFinder, cctxParams->postBlockSplitter, cctxParams->ldmParams.enableLdm); +} + size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params) { - RETURN_ERROR_IF(!cctxParams, GENERIC); - FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) ); - memset(cctxParams, 0, sizeof(*cctxParams)); - assert(!ZSTD_checkCParams(params.cParams)); - cctxParams->cParams = params.cParams; - cctxParams->fParams = params.fParams; - cctxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */ + RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!"); + FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , ""); + ZSTD_CCtxParams_init_internal(cctxParams, &params, ZSTD_NO_CLEVEL); return 0; } -/* ZSTD_assignParamsToCCtxParams() : - * params is presumed valid at this stage */ -static ZSTD_CCtx_params ZSTD_assignParamsToCCtxParams( - const ZSTD_CCtx_params* cctxParams, ZSTD_parameters params) -{ - ZSTD_CCtx_params ret = *cctxParams; - assert(!ZSTD_checkCParams(params.cParams)); - ret.cParams = params.cParams; - ret.fParams = params.fParams; - ret.compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not
matter, as all cParams are presumed properly defined */ - return ret; +/** + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. + * @param params Validated zstd parameters. + */ +static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +{ + assert(!ZSTD_checkCParams(params->cParams)); + cctxParams->cParams = params->cParams; + cctxParams->fParams = params->fParams; + /* Should not matter, as all cParams are presumed properly defined. + * But, set it for tracing anyway. + */ + cctxParams->compressionLevel = ZSTD_NO_CLEVEL; } ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) @@ -339,15 +497,25 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) return bounds; case ZSTD_c_overlapLog: +#ifdef ZSTD_MULTITHREAD bounds.lowerBound = ZSTD_OVERLAPLOG_MIN; bounds.upperBound = ZSTD_OVERLAPLOG_MAX; +#else + bounds.lowerBound = 0; + bounds.upperBound = 0; +#endif return bounds; - case ZSTD_c_enableLongDistanceMatching: + case ZSTD_c_enableDedicatedDictSearch: bounds.lowerBound = 0; bounds.upperBound = 1; return bounds; + case ZSTD_c_enableLongDistanceMatching: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + case ZSTD_c_ldmHashLog: bounds.lowerBound = ZSTD_LDM_HASHLOG_MIN; bounds.upperBound = ZSTD_LDM_HASHLOG_MAX; @@ -386,15 +554,15 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) return bounds; case ZSTD_c_forceAttachDict: - ZSTD_STATIC_ASSERT(ZSTD_dictDefaultAttach < ZSTD_dictForceCopy); + ZSTD_STATIC_ASSERT(ZSTD_dictDefaultAttach < ZSTD_dictForceLoad); bounds.lowerBound = ZSTD_dictDefaultAttach; bounds.upperBound = ZSTD_dictForceLoad; /* note : how to ensure at compile time that this is the highest value enum ? 
*/ return bounds; case ZSTD_c_literalCompressionMode: - ZSTD_STATIC_ASSERT(ZSTD_lcm_auto < ZSTD_lcm_huffman && ZSTD_lcm_huffman < ZSTD_lcm_uncompressed); - bounds.lowerBound = ZSTD_lcm_auto; - bounds.upperBound = ZSTD_lcm_uncompressed; + ZSTD_STATIC_ASSERT(ZSTD_ps_auto < ZSTD_ps_enable && ZSTD_ps_enable < ZSTD_ps_disable); + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; return bounds; case ZSTD_c_targetCBlockSize: @@ -407,10 +575,65 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) bounds.upperBound = ZSTD_SRCSIZEHINT_MAX; return bounds; + case ZSTD_c_stableInBuffer: + case ZSTD_c_stableOutBuffer: + bounds.lowerBound = (int)ZSTD_bm_buffered; + bounds.upperBound = (int)ZSTD_bm_stable; + return bounds; + + case ZSTD_c_blockDelimiters: + bounds.lowerBound = (int)ZSTD_sf_noBlockDelimiters; + bounds.upperBound = (int)ZSTD_sf_explicitBlockDelimiters; + return bounds; + + case ZSTD_c_validateSequences: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_splitAfterSequences: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_blockSplitterLevel: + bounds.lowerBound = 0; + bounds.upperBound = ZSTD_BLOCKSPLITTER_LEVEL_MAX; + return bounds; + + case ZSTD_c_useRowMatchFinder: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_deterministicRefPrefix: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_prefetchCDictTables: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_enableSeqProducerFallback: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_maxBlockSize: + bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; + bounds.upperBound = ZSTD_BLOCKSIZE_MAX; + return bounds; + + case ZSTD_c_repcodeResolution: + bounds.lowerBound = (int)ZSTD_ps_auto; + 
bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + default: - { ZSTD_bounds const boundError = { ERROR(parameter_unsupported), 0, 0 }; - return boundError; - } + bounds.error = ERROR(parameter_unsupported); + return bounds; } } @@ -426,10 +649,11 @@ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) return 0; } -#define BOUNDCHECK(cParam, val) { \ - RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ - parameter_outOfBound); \ -} +#define BOUNDCHECK(cParam, val) \ + do { \ + RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ + parameter_outOfBound, "Param out of bounds"); \ + } while (0) static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) @@ -443,6 +667,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) case ZSTD_c_minMatch: case ZSTD_c_targetLength: case ZSTD_c_strategy: + case ZSTD_c_blockSplitterLevel: return 1; case ZSTD_c_format: @@ -455,6 +680,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) case ZSTD_c_jobSize: case ZSTD_c_overlapLog: case ZSTD_c_rsyncable: + case ZSTD_c_enableDedicatedDictSearch: case ZSTD_c_enableLongDistanceMatching: case ZSTD_c_ldmHashLog: case ZSTD_c_ldmMinMatch: @@ -464,6 +690,17 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) case ZSTD_c_literalCompressionMode: case ZSTD_c_targetCBlockSize: case ZSTD_c_srcSizeHint: + case ZSTD_c_stableInBuffer: + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: + case ZSTD_c_splitAfterSequences: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: + case ZSTD_c_prefetchCDictTables: + case ZSTD_c_enableSeqProducerFallback: + case ZSTD_c_maxBlockSize: + case ZSTD_c_repcodeResolution: default: return 0; } @@ -476,7 +713,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) if (ZSTD_isUpdateAuthorized(param)) { cctx->cParamsChanged = 1; } else { - RETURN_ERROR(stage_wrong); + RETURN_ERROR(stage_wrong, "can only set params in 
cctx init stage"); } } switch(param) @@ -505,15 +742,28 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) case ZSTD_c_jobSize: case ZSTD_c_overlapLog: case ZSTD_c_rsyncable: + case ZSTD_c_enableDedicatedDictSearch: case ZSTD_c_enableLongDistanceMatching: case ZSTD_c_ldmHashLog: case ZSTD_c_ldmMinMatch: case ZSTD_c_ldmBucketSizeLog: case ZSTD_c_targetCBlockSize: case ZSTD_c_srcSizeHint: + case ZSTD_c_stableInBuffer: + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: + case ZSTD_c_splitAfterSequences: + case ZSTD_c_blockSplitterLevel: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: + case ZSTD_c_prefetchCDictTables: + case ZSTD_c_enableSeqProducerFallback: + case ZSTD_c_maxBlockSize: + case ZSTD_c_repcodeResolution: break; - default: RETURN_ERROR(parameter_unsupported); + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return ZSTD_CCtxParams_setParameter(&cctx->requestedParams, param, value); } @@ -530,10 +780,11 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, return (size_t)CCtxParams->format; case ZSTD_c_compressionLevel : { - FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value)); - if (value) { /* 0 : does not change current level */ + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); + if (value == 0) + CCtxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT; /* 0 == default */ + else CCtxParams->compressionLevel = value; - } if (CCtxParams->compressionLevel >= 0) return (size_t)CCtxParams->compressionLevel; return 0; /* return type (size_t) cannot represent negative values */ } @@ -565,12 +816,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, case ZSTD_c_minMatch : if (value!=0) /* 0 => use default */ BOUNDCHECK(ZSTD_c_minMatch, value); - CCtxParams->cParams.minMatch = value; + CCtxParams->cParams.minMatch = (U32)value; return CCtxParams->cParams.minMatch; case ZSTD_c_targetLength : 
BOUNDCHECK(ZSTD_c_targetLength, value); - CCtxParams->cParams.targetLength = value; + CCtxParams->cParams.targetLength = (U32)value; return CCtxParams->cParams.targetLength; case ZSTD_c_strategy : @@ -583,12 +834,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, /* Content size written in frame header _when known_ (default:1) */ DEBUGLOG(4, "set content size flag = %u", (value!=0)); CCtxParams->fParams.contentSizeFlag = value != 0; - return CCtxParams->fParams.contentSizeFlag; + return (size_t)CCtxParams->fParams.contentSizeFlag; case ZSTD_c_checksumFlag : /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ CCtxParams->fParams.checksumFlag = value != 0; - return CCtxParams->fParams.checksumFlag; + return (size_t)CCtxParams->fParams.checksumFlag; case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); @@ -597,18 +848,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, case ZSTD_c_forceMaxWindow : CCtxParams->forceWindow = (value != 0); - return CCtxParams->forceWindow; + return (size_t)CCtxParams->forceWindow; case ZSTD_c_forceAttachDict : { const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; - BOUNDCHECK(ZSTD_c_forceAttachDict, pref); + BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); CCtxParams->attachDictPref = pref; return CCtxParams->attachDictPref; } case ZSTD_c_literalCompressionMode : { - const ZSTD_literalCompressionMode_e lcm = (ZSTD_literalCompressionMode_e)value; - BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); + const ZSTD_ParamSwitch_e lcm = (ZSTD_ParamSwitch_e)value; + BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); CCtxParams->literalCompressionMode = lcm; return CCtxParams->literalCompressionMode; } @@ -618,9 +869,9 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with 
multithreading"); return 0; #else - FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value)); + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); CCtxParams->nbWorkers = value; - return CCtxParams->nbWorkers; + return (size_t)(CCtxParams->nbWorkers); #endif case ZSTD_c_jobSize : @@ -631,9 +882,9 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, /* Adjust to the minimum non-default value. */ if (value != 0 && value < ZSTDMT_JOBSIZE_MIN) value = ZSTDMT_JOBSIZE_MIN; - FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value)); + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); assert(value >= 0); - CCtxParams->jobSize = value; + CCtxParams->jobSize = (size_t)value; return CCtxParams->jobSize; #endif @@ -642,9 +893,9 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); return 0; #else - FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value)); + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), ""); CCtxParams->overlapLog = value; - return CCtxParams->overlapLog; + return (size_t)CCtxParams->overlapLog; #endif case ZSTD_c_rsyncable : @@ -652,67 +903,136 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); return 0; #else - FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value)); + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), ""); CCtxParams->rsyncable = value; - return CCtxParams->rsyncable; + return (size_t)CCtxParams->rsyncable; #endif + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); + return (size_t)CCtxParams->enableDedicatedDictSearch; + case ZSTD_c_enableLongDistanceMatching : - CCtxParams->ldmParams.enableLdm = (value!=0); + BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); + 
CCtxParams->ldmParams.enableLdm = (ZSTD_ParamSwitch_e)value; return CCtxParams->ldmParams.enableLdm; case ZSTD_c_ldmHashLog : if (value!=0) /* 0 ==> auto */ BOUNDCHECK(ZSTD_c_ldmHashLog, value); - CCtxParams->ldmParams.hashLog = value; + CCtxParams->ldmParams.hashLog = (U32)value; return CCtxParams->ldmParams.hashLog; case ZSTD_c_ldmMinMatch : if (value!=0) /* 0 ==> default */ BOUNDCHECK(ZSTD_c_ldmMinMatch, value); - CCtxParams->ldmParams.minMatchLength = value; + CCtxParams->ldmParams.minMatchLength = (U32)value; return CCtxParams->ldmParams.minMatchLength; case ZSTD_c_ldmBucketSizeLog : if (value!=0) /* 0 ==> default */ BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); - CCtxParams->ldmParams.bucketSizeLog = value; + CCtxParams->ldmParams.bucketSizeLog = (U32)value; return CCtxParams->ldmParams.bucketSizeLog; case ZSTD_c_ldmHashRateLog : - RETURN_ERROR_IF(value > ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN, - parameter_outOfBound); - CCtxParams->ldmParams.hashRateLog = value; + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); + CCtxParams->ldmParams.hashRateLog = (U32)value; return CCtxParams->ldmParams.hashRateLog; case ZSTD_c_targetCBlockSize : - if (value!=0) /* 0 ==> default */ + if (value!=0) { /* 0 ==> default */ + value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); BOUNDCHECK(ZSTD_c_targetCBlockSize, value); - CCtxParams->targetCBlockSize = value; + } + CCtxParams->targetCBlockSize = (U32)value; return CCtxParams->targetCBlockSize; case ZSTD_c_srcSizeHint : if (value!=0) /* 0 ==> default */ BOUNDCHECK(ZSTD_c_srcSizeHint, value); CCtxParams->srcSizeHint = value; - return CCtxParams->srcSizeHint; + return (size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); + CCtxParams->inBufferMode = (ZSTD_bufferMode_e)value; + return CCtxParams->inBufferMode; + + case ZSTD_c_stableOutBuffer: + BOUNDCHECK(ZSTD_c_stableOutBuffer, value); + CCtxParams->outBufferMode = (ZSTD_bufferMode_e)value; + return 
CCtxParams->outBufferMode; + + case ZSTD_c_blockDelimiters: + BOUNDCHECK(ZSTD_c_blockDelimiters, value); + CCtxParams->blockDelimiters = (ZSTD_SequenceFormat_e)value; + return CCtxParams->blockDelimiters; + + case ZSTD_c_validateSequences: + BOUNDCHECK(ZSTD_c_validateSequences, value); + CCtxParams->validateSequences = value; + return (size_t)CCtxParams->validateSequences; + + case ZSTD_c_splitAfterSequences: + BOUNDCHECK(ZSTD_c_splitAfterSequences, value); + CCtxParams->postBlockSplitter = (ZSTD_ParamSwitch_e)value; + return CCtxParams->postBlockSplitter; + + case ZSTD_c_blockSplitterLevel: + BOUNDCHECK(ZSTD_c_blockSplitterLevel, value); + CCtxParams->preBlockSplitter_level = value; + return (size_t)CCtxParams->preBlockSplitter_level; + + case ZSTD_c_useRowMatchFinder: + BOUNDCHECK(ZSTD_c_useRowMatchFinder, value); + CCtxParams->useRowMatchFinder = (ZSTD_ParamSwitch_e)value; + return CCtxParams->useRowMatchFinder; + + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; + return (size_t)CCtxParams->deterministicRefPrefix; + + case ZSTD_c_prefetchCDictTables: + BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); + CCtxParams->prefetchCDictTables = (ZSTD_ParamSwitch_e)value; + return CCtxParams->prefetchCDictTables; + + case ZSTD_c_enableSeqProducerFallback: + BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); + CCtxParams->enableMatchFinderFallback = value; + return (size_t)CCtxParams->enableMatchFinderFallback; + + case ZSTD_c_maxBlockSize: + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_maxBlockSize, value); + assert(value>=0); + CCtxParams->maxBlockSize = (size_t)value; + return CCtxParams->maxBlockSize; + + case ZSTD_c_repcodeResolution: + BOUNDCHECK(ZSTD_c_repcodeResolution, value); + CCtxParams->searchForExternalRepcodes = (ZSTD_ParamSwitch_e)value; + return CCtxParams->searchForExternalRepcodes; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } } 
-size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value) +size_t ZSTD_CCtx_getParameter(ZSTD_CCtx const* cctx, ZSTD_cParameter param, int* value) { return ZSTD_CCtxParams_getParameter(&cctx->requestedParams, param, value); } size_t ZSTD_CCtxParams_getParameter( - ZSTD_CCtx_params* CCtxParams, ZSTD_cParameter param, int* value) + ZSTD_CCtx_params const* CCtxParams, ZSTD_cParameter param, int* value) { switch(param) { case ZSTD_c_format : - *value = CCtxParams->format; + *value = (int)CCtxParams->format; break; case ZSTD_c_compressionLevel : *value = CCtxParams->compressionLevel; @@ -727,16 +1047,16 @@ size_t ZSTD_CCtxParams_getParameter( *value = (int)CCtxParams->cParams.chainLog; break; case ZSTD_c_searchLog : - *value = CCtxParams->cParams.searchLog; + *value = (int)CCtxParams->cParams.searchLog; break; case ZSTD_c_minMatch : - *value = CCtxParams->cParams.minMatch; + *value = (int)CCtxParams->cParams.minMatch; break; case ZSTD_c_targetLength : - *value = CCtxParams->cParams.targetLength; + *value = (int)CCtxParams->cParams.targetLength; break; case ZSTD_c_strategy : - *value = (unsigned)CCtxParams->cParams.strategy; + *value = (int)CCtxParams->cParams.strategy; break; case ZSTD_c_contentSizeFlag : *value = CCtxParams->fParams.contentSizeFlag; @@ -751,10 +1071,10 @@ size_t ZSTD_CCtxParams_getParameter( *value = CCtxParams->forceWindow; break; case ZSTD_c_forceAttachDict : - *value = CCtxParams->attachDictPref; + *value = (int)CCtxParams->attachDictPref; break; case ZSTD_c_literalCompressionMode : - *value = CCtxParams->literalCompressionMode; + *value = (int)CCtxParams->literalCompressionMode; break; case ZSTD_c_nbWorkers : #ifndef ZSTD_MULTITHREAD @@ -784,20 +1104,23 @@ size_t ZSTD_CCtxParams_getParameter( *value = CCtxParams->rsyncable; break; #endif + case ZSTD_c_enableDedicatedDictSearch : + *value = CCtxParams->enableDedicatedDictSearch; + break; case ZSTD_c_enableLongDistanceMatching : - *value = CCtxParams->ldmParams.enableLdm; + 
*value = (int)CCtxParams->ldmParams.enableLdm; break; case ZSTD_c_ldmHashLog : - *value = CCtxParams->ldmParams.hashLog; + *value = (int)CCtxParams->ldmParams.hashLog; break; case ZSTD_c_ldmMinMatch : - *value = CCtxParams->ldmParams.minMatchLength; + *value = (int)CCtxParams->ldmParams.minMatchLength; break; case ZSTD_c_ldmBucketSizeLog : - *value = CCtxParams->ldmParams.bucketSizeLog; + *value = (int)CCtxParams->ldmParams.bucketSizeLog; break; case ZSTD_c_ldmHashRateLog : - *value = CCtxParams->ldmParams.hashRateLog; + *value = (int)CCtxParams->ldmParams.hashRateLog; break; case ZSTD_c_targetCBlockSize : *value = (int)CCtxParams->targetCBlockSize; @@ -805,6 +1128,42 @@ size_t ZSTD_CCtxParams_getParameter( case ZSTD_c_srcSizeHint : *value = (int)CCtxParams->srcSizeHint; break; + case ZSTD_c_stableInBuffer : + *value = (int)CCtxParams->inBufferMode; + break; + case ZSTD_c_stableOutBuffer : + *value = (int)CCtxParams->outBufferMode; + break; + case ZSTD_c_blockDelimiters : + *value = (int)CCtxParams->blockDelimiters; + break; + case ZSTD_c_validateSequences : + *value = (int)CCtxParams->validateSequences; + break; + case ZSTD_c_splitAfterSequences : + *value = (int)CCtxParams->postBlockSplitter; + break; + case ZSTD_c_blockSplitterLevel : + *value = CCtxParams->preBlockSplitter_level; + break; + case ZSTD_c_useRowMatchFinder : + *value = (int)CCtxParams->useRowMatchFinder; + break; + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; + case ZSTD_c_prefetchCDictTables: + *value = (int)CCtxParams->prefetchCDictTables; + break; + case ZSTD_c_enableSeqProducerFallback: + *value = CCtxParams->enableMatchFinderFallback; + break; + case ZSTD_c_maxBlockSize: + *value = (int)CCtxParams->maxBlockSize; + break; + case ZSTD_c_repcodeResolution: + *value = (int)CCtxParams->searchForExternalRepcodes; + break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return 0; @@ -821,31 +1180,79 @@ size_t 
ZSTD_CCtx_setParametersUsingCCtxParams( ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params) { DEBUGLOG(4, "ZSTD_CCtx_setParametersUsingCCtxParams"); - RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong); - RETURN_ERROR_IF(cctx->cdict, stage_wrong); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "The context is in the wrong stage!"); + RETURN_ERROR_IF(cctx->cdict, stage_wrong, + "Can't override parameters with cdict attached (some must " + "be inherited from the cdict)."); cctx->requestedParams = *params; return 0; } -ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) +size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) { - DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); - RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong); + ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); + DEBUGLOG(4, "ZSTD_CCtx_setCParams"); + /* only update if all parameters are valid */ + FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, (int)cparams.windowLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, (int)cparams.chainLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, (int)cparams.hashLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, (int)cparams.searchLog), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, (int)cparams.minMatch), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, (int)cparams.targetLength), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, (int)cparams.strategy), ""); + return 0; +} + +size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) +{ + ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); + DEBUGLOG(4, "ZSTD_CCtx_setFParams"); + 
FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); + FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); + return 0; +} + +size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) +{ + DEBUGLOG(4, "ZSTD_CCtx_setParams"); + /* First check cParams, because we want to update all or none. */ + FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); + /* Next set fParams, because this could fail if the cctx isn't in init stage. */ + FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); + /* Finally set cParams, which should succeed. */ + FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); + return 0; +} + +size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; return 0; } +static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams( + int const compressionLevel, + size_t const dictSize); +static int ZSTD_dedicatedDictSearch_isSupported( + const ZSTD_compressionParameters* cParams); +static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + /** - * Initializes the local dict using the requested parameters. - * NOTE: This does not use the pledged src size, because it may be used for more - * than one compression. + * Initializes the local dictionary using requested parameters. + * NOTE: Initialization does not employ the pledged src size, + * because the dictionary may be used for multiple compressions. 
*/ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) { ZSTD_localDict* const dl = &cctx->localDict; - ZSTD_compressionParameters const cParams = ZSTD_getCParamsFromCCtxParams( - &cctx->requestedParams, 0, dl->dictSize); if (dl->dict == NULL) { /* No local dictionary. */ assert(dl->dictBuffer == NULL); @@ -854,59 +1261,65 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) return 0; } if (dl->cdict != NULL) { - assert(cctx->cdict == dl->cdict); /* Local dictionary already initialized. */ + assert(cctx->cdict == dl->cdict); return 0; } assert(dl->dictSize > 0); assert(cctx->cdict == NULL); assert(cctx->prefixDict.dict == NULL); - dl->cdict = ZSTD_createCDict_advanced( + dl->cdict = ZSTD_createCDict_advanced2( dl->dict, dl->dictSize, ZSTD_dlm_byRef, dl->dictContentType, - cParams, + &cctx->requestedParams, cctx->customMem); - RETURN_ERROR_IF(!dl->cdict, memory_allocation); + RETURN_ERROR_IF(!dl->cdict, memory_allocation, "ZSTD_createCDict_advanced failed"); cctx->cdict = dl->cdict; return 0; } size_t ZSTD_CCtx_loadDictionary_advanced( - ZSTD_CCtx* cctx, const void* dict, size_t dictSize, - ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) + ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) { - RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong); - RETURN_ERROR_IF(cctx->staticSize, memory_allocation, - "no malloc for static CCtx"); DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); - ZSTD_clearAllDicts(cctx); /* in case one already exists */ - if (dict == NULL || dictSize == 0) /* no dictionary mode */ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't load a dictionary when cctx is not in init stage."); + ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ + if (dict == NULL || dictSize == 0) /* no dictionary */ return 0; if (dictLoadMethod == ZSTD_dlm_byRef) { cctx->localDict.dict = 
dict; } else { - void* dictBuffer = ZSTD_malloc(dictSize, cctx->customMem); - RETURN_ERROR_IF(!dictBuffer, memory_allocation); - memcpy(dictBuffer, dict, dictSize); - cctx->localDict.dictBuffer = dictBuffer; - cctx->localDict.dict = dictBuffer; + /* copy dictionary content inside CCtx to own its lifetime */ + void* dictBuffer; + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "static CCtx can't allocate for an internal copy of dictionary"); + dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); + RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, + "allocation failed for dictionary content"); + ZSTD_memcpy(dictBuffer, dict, dictSize); + cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ + cctx->localDict.dict = dictBuffer; /* read-only reference */ } cctx->localDict.dictSize = dictSize; cctx->localDict.dictContentType = dictContentType; return 0; } -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference( +size_t ZSTD_CCtx_loadDictionary_byReference( ZSTD_CCtx* cctx, const void* dict, size_t dictSize) { return ZSTD_CCtx_loadDictionary_advanced( cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); } -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize) +size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize) { return ZSTD_CCtx_loadDictionary_advanced( cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto); @@ -915,13 +1328,22 @@ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, s size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) { - RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't ref a dict when ctx not in init stage."); /* Free the existing local cdict (if any) to save memory. 
*/ ZSTD_clearAllDicts(cctx); cctx->cdict = cdict; return 0; } +size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool) +{ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't ref a pool when ctx not in init stage."); + cctx->pool = pool; + return 0; +} + size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize) { return ZSTD_CCtx_refPrefix_advanced(cctx, prefix, prefixSize, ZSTD_dct_rawContent); @@ -930,11 +1352,14 @@ size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSiz size_t ZSTD_CCtx_refPrefix_advanced( ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) { - RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't ref a prefix when ctx not in init stage."); ZSTD_clearAllDicts(cctx); - cctx->prefixDict.dict = prefix; - cctx->prefixDict.dictSize = prefixSize; - cctx->prefixDict.dictContentType = dictContentType; + if (prefix != NULL && prefixSize > 0) { + cctx->prefixDict.dict = prefix; + cctx->prefixDict.dictSize = prefixSize; + cctx->prefixDict.dictContentType = dictContentType; + } return 0; } @@ -949,7 +1374,8 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) } if ( (reset == ZSTD_reset_parameters) || (reset == ZSTD_reset_session_and_parameters) ) { - RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Reset parameters is only possible during init stage."); ZSTD_clearAllDicts(cctx); return ZSTD_CCtxParams_reset(&cctx->requestedParams); } @@ -968,7 +1394,7 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog); BOUNDCHECK(ZSTD_c_minMatch, (int)cParams.minMatch); BOUNDCHECK(ZSTD_c_targetLength,(int)cParams.targetLength); - BOUNDCHECK(ZSTD_c_strategy, cParams.strategy); + BOUNDCHECK(ZSTD_c_strategy, 
(int)cParams.strategy); return 0; } @@ -978,11 +1404,12 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) static ZSTD_compressionParameters ZSTD_clampCParams(ZSTD_compressionParameters cParams) { -# define CLAMP_TYPE(cParam, val, type) { \ - ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ - if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \ - } +# define CLAMP_TYPE(cParam, val, type) \ + do { \ + ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ + if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \ + } while (0) # define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned) CLAMP(ZSTD_c_windowLog, cParams.windowLog); CLAMP(ZSTD_c_chainLog, cParams.chainLog); @@ -996,50 +1423,189 @@ ZSTD_clampCParams(ZSTD_compressionParameters cParams) /** ZSTD_cycleLog() : * condition for correct operation : hashLog > 1 */ -static U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat) +U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat) { U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2); return hashLog - btScale; } +/** ZSTD_dictAndWindowLog() : + * Returns an adjusted window log that is large enough to fit the source and the dictionary. + * The zstd format says that the entire dictionary is valid if one byte of the dictionary + * is within the window. So the hashLog and chainLog should be large enough to reference both + * the dictionary and the window. So we must use this adjusted dictAndWindowLog when downsizing + * the hashLog and windowLog. + * NOTE: srcSize must not be ZSTD_CONTENTSIZE_UNKNOWN.
+ */ +static U32 ZSTD_dictAndWindowLog(U32 windowLog, U64 srcSize, U64 dictSize) +{ + const U64 maxWindowSize = 1ULL << ZSTD_WINDOWLOG_MAX; + /* No dictionary ==> No change */ + if (dictSize == 0) { + return windowLog; + } + assert(windowLog <= ZSTD_WINDOWLOG_MAX); + assert(srcSize != ZSTD_CONTENTSIZE_UNKNOWN); /* Handled in ZSTD_adjustCParams_internal() */ + { + U64 const windowSize = 1ULL << windowLog; + U64 const dictAndWindowSize = dictSize + windowSize; + /* If the window size is already large enough to fit both the source and the dictionary + * then just use the window size. Otherwise adjust so that it fits the dictionary and + * the window. + */ + if (windowSize >= dictSize + srcSize) { + return windowLog; /* Window size large enough already */ + } else if (dictAndWindowSize >= maxWindowSize) { + return ZSTD_WINDOWLOG_MAX; /* Larger than max window log */ + } else { + return ZSTD_highbit32((U32)dictAndWindowSize - 1) + 1; + } + } +} + /** ZSTD_adjustCParams_internal() : * optimize `cPar` for a specified input (`srcSize` and `dictSize`). * mostly downsize to reduce memory consumption and initialization latency. * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known. - * note : for the time being, `srcSize==0` means "unknown" too, for compatibility with older convention. + * `mode` is the mode for parameter adjustment. See docs for `ZSTD_CParamMode_e`. + * note : `srcSize==0` means 0! * condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). 
*/ static ZSTD_compressionParameters ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, unsigned long long srcSize, - size_t dictSize) + size_t dictSize, + ZSTD_CParamMode_e mode, + ZSTD_ParamSwitch_e useRowMatchFinder) { - static const U64 minSrcSize = 513; /* (1<<9) + 1 */ - static const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); assert(ZSTD_checkCParams(cPar)==0); - if (dictSize && (srcSize+1<2) /* ZSTD_CONTENTSIZE_UNKNOWN and 0 mean "unknown" */ ) - srcSize = minSrcSize; /* presumed small when there is a dictionary */ - else if (srcSize == 0) - srcSize = ZSTD_CONTENTSIZE_UNKNOWN; /* 0 == unknown : presumed large */ + /* Cascade the selected strategy down to the next-highest one built into + * this binary. */ +#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_btultra2) { + cPar.strategy = ZSTD_btultra; + } + if (cPar.strategy == ZSTD_btultra) { + cPar.strategy = ZSTD_btopt; + } +#endif +#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_btopt) { + cPar.strategy = ZSTD_btlazy2; + } +#endif +#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_btlazy2) { + cPar.strategy = ZSTD_lazy2; + } +#endif +#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_lazy2) { + cPar.strategy = ZSTD_lazy; + } +#endif +#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_lazy) { + cPar.strategy = ZSTD_greedy; + } +#endif +#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_greedy) { + cPar.strategy = ZSTD_dfast; + } +#endif +#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + if (cPar.strategy == ZSTD_dfast) { + cPar.strategy = ZSTD_fast; + cPar.targetLength = 0; + } +#endif + + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: + /* If we don't know the source size, don't make any + * assumptions about it. 
We will already have selected + * smaller parameters if a dictionary is in use. + */ + break; + case ZSTD_cpm_createCDict: + /* Assume a small source size when creating a dictionary + * with an unknown source size. + */ + if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN) + srcSize = minSrcSize; + break; + case ZSTD_cpm_attachDict: + /* Dictionary has its own dedicated parameters which have + * already been selected. We are selecting parameters + * for only the source. + */ + dictSize = 0; + break; + default: + assert(0); + break; + } /* resize windowLog if input is small enough, to use less memory */ - if ( (srcSize < maxWindowResize) - && (dictSize < maxWindowResize) ) { + if ( (srcSize <= maxWindowResize) + && (dictSize <= maxWindowResize) ) { U32 const tSize = (U32)(srcSize + dictSize); static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : ZSTD_highbit32(tSize-1) + 1; if (cPar.windowLog > srcLog) cPar.windowLog = srcLog; } - if (cPar.hashLog > cPar.windowLog+1) cPar.hashLog = cPar.windowLog+1; - { U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); - if (cycleLog > cPar.windowLog) - cPar.chainLog -= (cycleLog - cPar.windowLog); + if (srcSize != ZSTD_CONTENTSIZE_UNKNOWN) { + U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize); + U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); + if (cPar.hashLog > dictAndWindowLog+1) cPar.hashLog = dictAndWindowLog+1; + if (cycleLog > dictAndWindowLog) + cPar.chainLog -= (cycleLog - dictAndWindowLog); } if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + /* We can't use more than 32 bits of hash in total, so that means that we require: + * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 + */ + if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { + U32 const maxShortCacheHashLog = 32 
- ZSTD_SHORT_CACHE_TAG_BITS; + if (cPar.hashLog > maxShortCacheHashLog) { + cPar.hashLog = maxShortCacheHashLog; + } + if (cPar.chainLog > maxShortCacheHashLog) { + cPar.chainLog = maxShortCacheHashLog; + } + } + + + /* At this point, we aren't 100% sure if we are using the row match finder. + * Unless it is explicitly disabled, conservatively assume that it is enabled. + * In this case it will only be disabled for small sources, so shrinking the + * hash log a little bit shouldn't result in any ratio loss. + */ + if (useRowMatchFinder == ZSTD_ps_auto) + useRowMatchFinder = ZSTD_ps_enable; + + /* We can't hash more than 32-bits in total. So that means that we require: + * (hashLog - rowLog + 8) <= 32 + */ + if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { + /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); + U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; + U32 const maxHashLog = maxRowHashLog + rowLog; + assert(cPar.hashLog >= rowLog); + if (cPar.hashLog > maxHashLog) { + cPar.hashLog = maxHashLog; + } + } + return cPar; } @@ -1049,34 +1615,52 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, size_t dictSize) { cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ - return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize); + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); +} + +static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); +static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); + +static void ZSTD_overrideCParams( + ZSTD_compressionParameters* cParams, + const ZSTD_compressionParameters* overrides) +{ + if (overrides->windowLog) 
cParams->windowLog = overrides->windowLog; + if (overrides->hashLog) cParams->hashLog = overrides->hashLog; + if (overrides->chainLog) cParams->chainLog = overrides->chainLog; + if (overrides->searchLog) cParams->searchLog = overrides->searchLog; + if (overrides->minMatch) cParams->minMatch = overrides->minMatch; + if (overrides->targetLength) cParams->targetLength = overrides->targetLength; + if (overrides->strategy) cParams->strategy = overrides->strategy; } ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( - const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize) + const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) { ZSTD_compressionParameters cParams; if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) { - srcSizeHint = CCtxParams->srcSizeHint; - } - cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize); - if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; - if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog; - if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog; - if (CCtxParams->cParams.chainLog) cParams.chainLog = CCtxParams->cParams.chainLog; - if (CCtxParams->cParams.searchLog) cParams.searchLog = CCtxParams->cParams.searchLog; - if (CCtxParams->cParams.minMatch) cParams.minMatch = CCtxParams->cParams.minMatch; - if (CCtxParams->cParams.targetLength) cParams.targetLength = CCtxParams->cParams.targetLength; - if (CCtxParams->cParams.strategy) cParams.strategy = CCtxParams->cParams.strategy; + assert(CCtxParams->srcSizeHint>=0); + srcSizeHint = (U64)CCtxParams->srcSizeHint; + } + cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode); + if (CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); 
assert(!ZSTD_checkCParams(cParams)); - return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize); + /* srcSizeHint == 0 means 0 */ + return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); } static size_t ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + const ZSTD_ParamSwitch_e useRowMatchFinder, + const int enableDedicatedDictSearch, const U32 forCCtx) { - size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); + /* chain table size should be 0 for fast or row-hash strategies */ + size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, enableDedicatedDictSearch && !forCCtx) + ? ((size_t)1 << cParams->chainLog) + : 0; size_t const hSize = ((size_t)1) << cParams->hashLog; U32 const hashLog3 = (forCCtx && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0; @@ -1086,58 +1670,131 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + hSize * sizeof(U32) + h3Size * sizeof(U32); size_t const optPotentialSpace = - ZSTD_cwksp_alloc_size((MaxML+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((MaxLL+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((MaxOff+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((1<strategy, useRowMatchFinder) + ? ZSTD_cwksp_aligned64_alloc_size(hSize) + : 0; size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) ? 
optPotentialSpace : 0; + size_t const slackSpace = ZSTD_cwksp_slack_space_required(); + + /* tables are guaranteed to be sized in multiples of 64 bytes (or 16 uint32_t) */ + ZSTD_STATIC_ASSERT(ZSTD_HASHLOG_MIN >= 4 && ZSTD_WINDOWLOG_MIN >= 4 && ZSTD_CHAINLOG_MIN >= 4); + assert(useRowMatchFinder != ZSTD_ps_auto); + DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u", (U32)chainSize, (U32)hSize, (U32)h3Size); - return tableSpace + optSpace; + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; +} + +/* Helper function for calculating memory requirements. + * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ +static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { + U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4; + return blockSize / divider; +} + +static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_compressionParameters* cParams, + const ldmParams_t* ldmParams, + const int isStatic, + const ZSTD_ParamSwitch_e useRowMatchFinder, + const size_t buffInSize, + const size_t buffOutSize, + const U64 pledgedSrcSize, + int useSequenceProducer, + size_t maxBlockSize) +{ + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); + size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); + size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_aligned64_alloc_size(maxNbSeq * sizeof(SeqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); + size_t const tmpWorkSpace = ZSTD_cwksp_alloc_size(TMP_WORKSPACE_SIZE); + size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); + size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); + + size_t const 
ldmSpace = ZSTD_ldm_getTableSize(*ldmParams); + size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize); + size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ? + ZSTD_cwksp_aligned64_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; + + + size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) + + ZSTD_cwksp_alloc_size(buffOutSize); + + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + + size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); + size_t const externalSeqSpace = useSequenceProducer + ? ZSTD_cwksp_aligned64_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) + : 0; + + size_t const neededSpace = + cctxSpace + + tmpWorkSpace + + blockStateSpace + + ldmSpace + + ldmSeqSpace + + matchStateSize + + tokenSpace + + bufferSpace + + externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; } size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) { - RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); - { ZSTD_compressionParameters const cParams = - ZSTD_getCParamsFromCCtxParams(params, 0, 0); - size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); - U32 const divider = (cParams.minMatch==3) ? 
3 : 4; - size_t const maxNbSeq = blockSize / divider; - size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) - + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef)) - + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); - size_t const entropySpace = ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE); - size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); - size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 1); - - size_t const ldmSpace = ZSTD_ldm_getTableSize(params->ldmParams); - size_t const ldmSeqSpace = ZSTD_cwksp_alloc_size(ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize) * sizeof(rawSeq)); + ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); + ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, + &cParams); - size_t const neededSpace = entropySpace + blockStateSpace + tokenSpace + - matchStateSize + ldmSpace + ldmSeqSpace; - size_t const cctxSpace = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)); - - DEBUGLOG(5, "sizeof(ZSTD_CCtx) : %u", (U32)cctxSpace); - DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); - return cctxSpace + neededSpace; - } + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + /* estimateCCtxSize is for one-shot compression. So no buffers should + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. 
*/ + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); } size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) { - ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams); - return ZSTD_estimateCCtxSize_usingCCtxParams(¶ms); + ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams); + if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { + /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */ + size_t noRowCCtxSize; + size_t rowCCtxSize; + initialParams.useRowMatchFinder = ZSTD_ps_disable; + noRowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); + initialParams.useRowMatchFinder = ZSTD_ps_enable; + rowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); + return MAX(noRowCCtxSize, rowCCtxSize); + } else { + return ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); + } } static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel) { - ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, 0); - return ZSTD_estimateCCtxSize_usingCParams(cParams); + int tier = 0; + size_t largestSize = 0; + static const unsigned long long srcSizeTiers[4] = {16 KB, 128 KB, 256 KB, ZSTD_CONTENTSIZE_UNKNOWN}; + for (; tier < 4; ++tier) { + /* Choose the set of cParams for a given level across all srcSizes that give the largest cctxSize */ + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeTiers[tier], 0, ZSTD_cpm_noAttachDict); + largestSize = MAX(ZSTD_estimateCCtxSize_usingCParams(cParams), largestSize); + } + return largestSize; } size_t ZSTD_estimateCCtxSize(int compressionLevel) @@ -1145,6 +1802,7 @@ size_t ZSTD_estimateCCtxSize(int compressionLevel) int level; size_t memBudget = 0; for (level=MIN(compressionLevel, 1); level<=compressionLevel; 
level++) { + /* Ensure monotonically increasing memory usage as compression level increases */ size_t const newMB = ZSTD_estimateCCtxSize_internal(level); if (newMB > memBudget) memBudget = newMB; } @@ -1155,27 +1813,42 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) { RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); { ZSTD_compressionParameters const cParams = - ZSTD_getCParamsFromCCtxParams(params, 0, 0); - size_t const CCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(params); - size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); - size_t const inBuffSize = ((size_t)1 << cParams.windowLog) + blockSize; - size_t const outBuffSize = ZSTD_compressBound(blockSize) + 1; - size_t const streamingSize = ZSTD_cwksp_alloc_size(inBuffSize) - + ZSTD_cwksp_alloc_size(outBuffSize); - - return CCtxSize + streamingSize; + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); + size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); + size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) + ? ((size_t)1 << cParams.windowLog) + blockSize + : 0; + size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered) + ? 
ZSTD_compressBound(blockSize) + 1 + : 0; + ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, ¶ms->cParams); + + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, + ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); } } size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams) { - ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams); - return ZSTD_estimateCStreamSize_usingCCtxParams(¶ms); + ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams); + if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { + /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */ + size_t noRowCCtxSize; + size_t rowCCtxSize; + initialParams.useRowMatchFinder = ZSTD_ps_disable; + noRowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); + initialParams.useRowMatchFinder = ZSTD_ps_enable; + rowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); + return MAX(noRowCCtxSize, rowCCtxSize); + } else { + return ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); + } } static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) { - ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, 0); + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); return ZSTD_estimateCStreamSize_usingCParams(cParams); } @@ -1243,7 +1916,7 @@ static void ZSTD_assertEqualCParams(ZSTD_compressionParameters cParams1, assert(cParams1.strategy == cParams2.strategy); } -static void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs) +void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs) { int i; for (i = 0; i < ZSTD_REP_NUM; ++i) @@ -1258,7 +1931,7 @@ static void 
ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs) * Invalidate all the matches in the match finder tables. * Requires nextSrc and base to be set (can be NULL). */ -static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms) +static void ZSTD_invalidateMatchState(ZSTD_MatchState_t* ms) { ZSTD_window_clear(&ms->window); @@ -1268,16 +1941,6 @@ static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms) ms->dictMatchState = NULL; } -/** - * Indicates whether this compression proceeds directly from user-provided - * source buffer to user-provided destination buffer (ZSTDb_not_buffered), or - * whether the context needs to buffer the input/output (ZSTDb_buffered). - */ -typedef enum { - ZSTDb_not_buffered, - ZSTDb_buffered -} ZSTD_buffered_policy_e; - /** * Controls, for this matchState reset, whether the tables need to be cleared / * prepared for the coming compression (ZSTDcrp_makeClean), or whether the @@ -1305,29 +1968,47 @@ typedef enum { ZSTD_resetTarget_CCtx } ZSTD_resetTarget_e; +/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ +static U64 ZSTD_bitmix(U64 val, U64 len) { + val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); + val *= 0x9FB21C651E98DF25ULL; + val ^= (val >> 35) + len ; + val *= 0x9FB21C651E98DF25ULL; + return val ^ (val >> 28); +} + +/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ +static void ZSTD_advanceHashSalt(ZSTD_MatchState_t* ms) { + ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); +} + static size_t -ZSTD_reset_matchState(ZSTD_matchState_t* ms, +ZSTD_reset_matchState(ZSTD_MatchState_t* ms, ZSTD_cwksp* ws, const ZSTD_compressionParameters* cParams, + const ZSTD_ParamSwitch_e useRowMatchFinder, const ZSTD_compResetPolicy_e crp, const ZSTD_indexResetPolicy_e forceResetIndex, const ZSTD_resetTarget_e forWho) { - size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 
0 : ((size_t)1 << cParams->chainLog); + /* disable chain table allocation for fast or row-based strategies */ + size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, + ms->dedicatedDictSearch && (forWho == ZSTD_resetTarget_CDict)) + ? ((size_t)1 << cParams->chainLog) + : 0; size_t const hSize = ((size_t)1) << cParams->hashLog; U32 const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0; DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset); + assert(useRowMatchFinder != ZSTD_ps_auto); if (forceResetIndex == ZSTDirp_reset) { - memset(&ms->window, 0, sizeof(ms->window)); - ms->window.dictLimit = 1; /* start from 1, so that 1st position is valid */ - ms->window.lowLimit = 1; /* it ensures first and later CCtx usages compress the same */ - ms->window.nextSrc = ms->window.base + 1; /* see issue #1241 */ + ZSTD_window_init(&ms->window); ZSTD_cwksp_mark_tables_dirty(ws); } ms->hashLog3 = hashLog3; + ms->lazySkipping = 0; ZSTD_invalidateMatchState(ms); @@ -1349,22 +2030,42 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, ZSTD_cwksp_clean_tables(ws); } + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { + /* Row match finder needs an additional table of hashes ("tags") */ + size_t const tagTableSize = hSize; + /* We want to generate a new salt in case we reset a Cctx, but we always want to use + * 0 when we reset a Cdict */ + if(forWho == ZSTD_resetTarget_CCtx) { + ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); + ZSTD_advanceHashSalt(ms); + } else { + /* When we are not salting we want to always memset the memory */ + ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned64(ws, tagTableSize); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ms->hashSalt = 0; + } + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, 
cParams->searchLog, 6); + assert(cParams->hashLog >= rowLog); + ms->rowHashLog = cParams->hashLog - rowLog; + } + } + /* opt parser space */ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { DEBUGLOG(4, "reserving optimal parser space"); - ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned)); - ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); - ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); - ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)); - ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (1<opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxLL+1) * sizeof(unsigned)); + ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxML+1) * sizeof(unsigned)); + ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxOff+1) * sizeof(unsigned)); + ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); + ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); } ms->cParams = *cParams; RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, "failed a workspace allocation in ZSTD_reset_matchState"); - return 0; } @@ -1381,75 +2082,86 @@ static int ZSTD_indexTooCloseToMax(ZSTD_window_t w) return (size_t)(w.nextSrc - w.base) > (ZSTD_CURRENT_MAX - ZSTD_INDEXOVERFLOW_MARGIN); } +/** ZSTD_dictTooBig(): + * When dictionaries are larger than ZSTD_CHUNKSIZE_MAX they can't be loaded in + * one go generically. 
So we ensure that in that case we reset the tables to zero, + * so that we can load as much of the dictionary as possible. + */ +static int ZSTD_dictTooBig(size_t const loadedDictSize) +{ + return loadedDictSize > ZSTD_CHUNKSIZE_MAX; +} + /*! ZSTD_resetCCtx_internal() : - note : `params` are assumed fully validated at this stage */ + * @param loadedDictSize The size of the dictionary to be loaded + * into the context, if any. If no dictionary is used, or the + * dictionary is being attached / copied, then pass 0. + * note : `params` are assumed fully validated at this stage. + */ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - ZSTD_CCtx_params params, + ZSTD_CCtx_params const* params, U64 const pledgedSrcSize, + size_t const loadedDictSize, ZSTD_compResetPolicy_e const crp, ZSTD_buffered_policy_e const zbuff) { ZSTD_cwksp* const ws = &zc->workspace; - DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u", - (U32)pledgedSrcSize, params.cParams.windowLog); - assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d", + (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->postBlockSplitter); + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); zc->isFirstBlock = 1; - if (params.ldmParams.enableLdm) { + /* Set applied params early so we can modify them for LDM, + * and point params at the applied params. 
+ */ + zc->appliedParams = *params; + params = &zc->appliedParams; + + assert(params->useRowMatchFinder != ZSTD_ps_auto); + assert(params->postBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); + assert(params->maxBlockSize != 0); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* Adjust long distance matching parameters */ - ZSTD_ldm_adjustParameters(¶ms.ldmParams, ¶ms.cParams); - assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog); - assert(params.ldmParams.hashRateLog < 32); - zc->ldmState.hashPower = ZSTD_rollingHash_primePower(params.ldmParams.minMatchLength); - } - - { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize)); - size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); - U32 const divider = (params.cParams.minMatch==3) ? 3 : 4; - size_t const maxNbSeq = blockSize / divider; - size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) - + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef)) - + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); - size_t const buffOutSize = (zbuff==ZSTDb_buffered) ? ZSTD_compressBound(blockSize)+1 : 0; - size_t const buffInSize = (zbuff==ZSTDb_buffered) ? 
windowSize + blockSize : 0; - size_t const matchStateSize = ZSTD_sizeof_matchState(¶ms.cParams, /* forCCtx */ 1); - size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize); - - ZSTD_indexResetPolicy_e needsIndexReset = ZSTDirp_continue; - - if (ZSTD_indexTooCloseToMax(zc->blockState.matchState.window)) { - needsIndexReset = ZSTDirp_reset; - } + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); + assert(params->ldmParams.hashLog >= params->ldmParams.bucketSizeLog); + assert(params->ldmParams.hashRateLog < 32); + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); + size_t const blockSize = MIN(params->maxBlockSize, windowSize); + size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params)); + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; + size_t const buffInSize = (zbuff == ZSTDb_buffered && params->inBufferMode == ZSTD_bm_buffered) + ? windowSize + blockSize + : 0; + size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize); + + int const indexTooClose = ZSTD_indexTooCloseToMax(zc->blockState.matchState.window); + int const dictTooBig = ZSTD_dictTooBig(loadedDictSize); + ZSTD_indexResetPolicy_e needsIndexReset = + (indexTooClose || dictTooBig || !zc->initialized) ? ZSTDirp_reset : ZSTDirp_continue; - ZSTD_cwksp_bump_oversized_duration(ws, 0); - - /* Check if workspace is large enough, alloc a new one if needed */ - { size_t const cctxSpace = zc->staticSize ? 
ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; - size_t const entropySpace = ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE); - size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); - size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) + ZSTD_cwksp_alloc_size(buffOutSize); - size_t const ldmSpace = ZSTD_ldm_getTableSize(params.ldmParams); - size_t const ldmSeqSpace = ZSTD_cwksp_alloc_size(maxNbLdmSeq * sizeof(rawSeq)); - - size_t const neededSpace = - cctxSpace + - entropySpace + - blockStateSpace + - ldmSpace + - ldmSeqSpace + - matchStateSize + - tokenSpace + - bufferSpace; + size_t const neededSpace = + ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, + buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); + + if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0); + + { /* Check if workspace is large enough, alloc a new one if needed */ int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); - - DEBUGLOG(4, "Need %zuKB workspace, including %zuKB for match state, and %zuKB for buffers", - neededSpace>>10, matchStateSize>>10, bufferSpace>>10); + int resizeWorkspace = workspaceTooSmall || workspaceWasteful; + DEBUGLOG(4, "Need %zu B workspace", neededSpace); DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); - if (workspaceTooSmall || workspaceWasteful) { + if (resizeWorkspace) { DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB", ZSTD_cwksp_sizeof(ws) >> 10, neededSpace >> 10); @@ -1459,26 +2171,27 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, needsIndexReset = ZSTDirp_reset; ZSTD_cwksp_free(ws, zc->customMem); - FORWARD_IF_ERROR(ZSTD_cwksp_create(ws, neededSpace, zc->customMem)); + 
FORWARD_IF_ERROR(ZSTD_cwksp_create(ws, neededSpace, zc->customMem), ""); DEBUGLOG(5, "reserving object space"); /* Statically sized space. - * entropyWorkspace never moves, + * tmpWorkspace never moves, * though prev/next block swap places */ assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t))); zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock"); zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock"); - zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, HUF_WORKSPACE_SIZE); - RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate entropyWorkspace"); + zc->tmpWorkspace = ZSTD_cwksp_reserve_object(ws, TMP_WORKSPACE_SIZE); + RETURN_ERROR_IF(zc->tmpWorkspace == NULL, memory_allocation, "couldn't allocate tmpWorkspace"); + zc->tmpWkspSize = TMP_WORKSPACE_SIZE; } } ZSTD_cwksp_clear(ws); /* init params */ - zc->appliedParams = params; - zc->blockState.matchState.cParams = params.cParams; + zc->blockState.matchState.cParams = params->cParams; + zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; zc->consumedSrcSize = 0; zc->producedCSize = 0; @@ -1486,34 +2199,69 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, zc->appliedParams.fParams.contentSizeFlag = 0; DEBUGLOG(4, "pledged content size : %u ; flag : %u", (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag); - zc->blockSize = blockSize; + zc->blockSizeMax = blockSize; XXH64_reset(&zc->xxhState, 0); zc->stage = ZSTDcs_init; zc->dictID = 0; + zc->dictContentSize = 0; 
ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + FORWARD_IF_ERROR(ZSTD_reset_matchState( + &zc->blockState.matchState, + ws, + ¶ms->cParams, + params->useRowMatchFinder, + crp, + needsIndexReset, + ZSTD_resetTarget_CCtx), ""); + + zc->seqStore.sequencesStart = (SeqDef*)ZSTD_cwksp_reserve_aligned64(ws, maxNbSeq * sizeof(SeqDef)); + + /* ldm hash table */ + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* TODO: avoid memset? */ + size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; + zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned64(ws, ldmHSize * sizeof(ldmEntry_t)); + ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); + zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned64(ws, maxNbLdmSeq * sizeof(rawSeq)); + zc->maxNbLdmSequences = maxNbLdmSeq; + + ZSTD_window_init(&zc->ldmState.window); + zc->ldmState.loadedDictEnd = 0; + } + + /* reserve space for block-level external sequences */ + if (ZSTD_hasExtSeqProd(params)) { + size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); + zc->extSeqBufCapacity = maxNbExternalSeq; + zc->extSeqBuf = + (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned64(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); + } + + /* buffers */ + /* ZSTD_wildcopy() is used to copy into the literals buffer, * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. */ zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); zc->seqStore.maxNbLit = blockSize; - /* buffers */ + zc->bufferedPolicy = zbuff; zc->inBuffSize = buffInSize; zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); zc->outBuffSize = buffOutSize; zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); /* ldm bucketOffsets table */ - if (params.ldmParams.enableLdm) { + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* TODO: avoid memset? 
*/ - size_t const ldmBucketSize = - ((size_t)1) << (params.ldmParams.hashLog - - params.ldmParams.bucketSizeLog); - zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, ldmBucketSize); - memset(zc->ldmState.bucketOffsets, 0, ldmBucketSize); + size_t const numBuckets = + ((size_t)1) << (params->ldmParams.hashLog - + params->ldmParams.bucketSizeLog); + zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets); + ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); } /* sequences storage */ @@ -1522,30 +2270,11 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); - zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); - - FORWARD_IF_ERROR(ZSTD_reset_matchState( - &zc->blockState.matchState, - ws, - ¶ms.cParams, - crp, - needsIndexReset, - ZSTD_resetTarget_CCtx)); - - /* ldm hash table */ - if (params.ldmParams.enableLdm) { - /* TODO: avoid memset? 
*/ - size_t const ldmHSize = ((size_t)1) << params.ldmParams.hashLog; - zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); - memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); - zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); - zc->maxNbLdmSequences = maxNbLdmSeq; - - memset(&zc->ldmState.window, 0, sizeof(zc->ldmState.window)); - ZSTD_window_clear(&zc->ldmState.window); - } DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); + assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); + + zc->initialized = 1; return 0; } @@ -1583,12 +2312,14 @@ static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict, U64 pledgedSrcSize) { size_t cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy]; - return ( pledgedSrcSize <= cutoff - || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN - || params->attachDictPref == ZSTD_dictForceAttach ) - && params->attachDictPref != ZSTD_dictForceCopy - && !params->forceWindow; /* dictMatchState isn't correctly - * handled in _enforceMaxDist */ + int const dedicatedDictSearch = cdict->matchState.dedicatedDictSearch; + return dedicatedDictSearch + || ( ( pledgedSrcSize <= cutoff + || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN + || params->attachDictPref == ZSTD_dictForceAttach ) + && params->attachDictPref != ZSTD_dictForceCopy + && !params->forceWindow ); /* dictMatchState isn't correctly + * handled in _enforceMaxDist */ } static size_t @@ -1598,16 +2329,29 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { - { const ZSTD_compressionParameters* const cdict_cParams = &cdict->matchState.cParams; + DEBUGLOG(4, "ZSTD_resetCCtx_byAttachingCDict() pledgedSrcSize=%llu", + (unsigned long long)pledgedSrcSize); + { + ZSTD_compressionParameters adjusted_cdict_cParams = cdict->matchState.cParams; unsigned const windowLog = 
params.cParams.windowLog; assert(windowLog != 0); /* Resize working context table params for input only, since the dict * has its own tables. */ - params.cParams = ZSTD_adjustCParams_internal(*cdict_cParams, pledgedSrcSize, 0); - params.cParams.windowLog = windowLog; - FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, - ZSTDcrp_makeClean, zbuff)); - assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy); + /* pledgedSrcSize == 0 means 0! */ + + if (cdict->matchState.dedicatedDictSearch) { + ZSTD_dedicatedDictSearch_revertCParams(&adjusted_cdict_cParams); + } + + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, + cdict->dictContentSize, ZSTD_cpm_attachDict, + params.useRowMatchFinder); + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_makeClean, zbuff), ""); + assert(cctx->appliedParams.cParams.strategy == adjusted_cdict_cParams.strategy); } { const U32 cdictEnd = (U32)( cdict->matchState.window.nextSrc @@ -1632,13 +2376,30 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, } } cctx->dictID = cdict->dictID; + cctx->dictContentSize = cdict->dictContentSize; /* copy block state */ - memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); + ZSTD_memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); return 0; } +static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, + ZSTD_compressionParameters const* cParams) { + if (ZSTD_CDictIndicesAreTagged(cParams)){ + /* Remove tags from the CDict table if they are present. + * See docs on "short cache" in zstd_compress_internal.h for context. 
*/ + size_t i; + for (i = 0; i < tableSize; i++) { + U32 const taggedIndex = src[i]; + U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; + dst[i] = index; + } + } else { + ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); + } +} + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, ZSTD_CCtx_params params, @@ -1647,55 +2408,76 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, { const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams; - DEBUGLOG(4, "copying dictionary into context"); + assert(!cdict->matchState.dedicatedDictSearch); + DEBUGLOG(4, "ZSTD_resetCCtx_byCopyingCDict() pledgedSrcSize=%llu", + (unsigned long long)pledgedSrcSize); { unsigned const windowLog = params.cParams.windowLog; assert(windowLog != 0); /* Copy only compression parameters related to tables. */ params.cParams = *cdict_cParams; params.cParams.windowLog = windowLog; - FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, - ZSTDcrp_leaveDirty, zbuff)); + params.useRowMatchFinder = cdict->useRowMatchFinder; + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff), ""); assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy); assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog); assert(cctx->appliedParams.cParams.chainLog == cdict_cParams->chainLog); } ZSTD_cwksp_mark_tables_dirty(&cctx->workspace); + assert(params.useRowMatchFinder != ZSTD_ps_auto); /* copy tables */ - { size_t const chainSize = (cdict_cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict_cParams->chainLog); + { size_t const chainSize = ZSTD_allocateChainTable(cdict_cParams->strategy, cdict->useRowMatchFinder, 0 /* DDS guaranteed disabled */) + ? 
((size_t)1 << cdict_cParams->chainLog) + : 0; size_t const hSize = (size_t)1 << cdict_cParams->hashLog; - memcpy(cctx->blockState.matchState.hashTable, - cdict->matchState.hashTable, - hSize * sizeof(U32)); - memcpy(cctx->blockState.matchState.chainTable, - cdict->matchState.chainTable, - chainSize * sizeof(U32)); + ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, + cdict->matchState.hashTable, + hSize, cdict_cParams); + + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { + ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, + cdict->matchState.chainTable, + chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { + size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, + cdict->matchState.tagTable, + tagTableSize); + cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } } /* Zero the hashTable3, since the cdict never fills it */ - { int const h3log = cctx->blockState.matchState.hashLog3; + assert(cctx->blockState.matchState.hashLog3 <= 31); + { U32 const h3log = cctx->blockState.matchState.hashLog3; size_t const h3Size = h3log ? 
((size_t)1 << h3log) : 0; assert(cdict->matchState.hashLog3 == 0); - memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32)); + ZSTD_memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32)); } ZSTD_cwksp_mark_tables_clean(&cctx->workspace); /* copy dictionary offsets */ - { ZSTD_matchState_t const* srcMatchState = &cdict->matchState; - ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState; + { ZSTD_MatchState_t const* srcMatchState = &cdict->matchState; + ZSTD_MatchState_t* dstMatchState = &cctx->blockState.matchState; dstMatchState->window = srcMatchState->window; dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; } cctx->dictID = cdict->dictID; + cctx->dictContentSize = cdict->dictContentSize; /* copy block state */ - memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); + ZSTD_memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); return 0; } @@ -1735,15 +2517,23 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { + RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong, + "Can't copy a ctx that's not in init stage."); DEBUGLOG(5, "ZSTD_copyCCtx_internal"); - RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong); - - memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); + ZSTD_memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); { ZSTD_CCtx_params params = dstCCtx->requestedParams; /* Copy only compression parameters related to tables. 
*/ params.cParams = srcCCtx->appliedParams.cParams; + assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto); + assert(srcCCtx->appliedParams.postBlockSplitter != ZSTD_ps_auto); + assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto); + params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder; + params.postBlockSplitter = srcCCtx->appliedParams.postBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; params.fParams = fParams; - ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize, + params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff); assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog); assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy); @@ -1755,18 +2545,22 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace); /* copy tables */ - { size_t const chainSize = (srcCCtx->appliedParams.cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog); + { size_t const chainSize = ZSTD_allocateChainTable(srcCCtx->appliedParams.cParams.strategy, + srcCCtx->appliedParams.useRowMatchFinder, + 0 /* forDDSDict */) + ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog) + : 0; size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; - int const h3log = srcCCtx->blockState.matchState.hashLog3; + U32 const h3log = srcCCtx->blockState.matchState.hashLog3; size_t const h3Size = h3log ? 
((size_t)1 << h3log) : 0; - memcpy(dstCCtx->blockState.matchState.hashTable, + ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable, srcCCtx->blockState.matchState.hashTable, hSize * sizeof(U32)); - memcpy(dstCCtx->blockState.matchState.chainTable, + ZSTD_memcpy(dstCCtx->blockState.matchState.chainTable, srcCCtx->blockState.matchState.chainTable, chainSize * sizeof(U32)); - memcpy(dstCCtx->blockState.matchState.hashTable3, + ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable3, srcCCtx->blockState.matchState.hashTable3, h3Size * sizeof(U32)); } @@ -1775,16 +2569,17 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, /* copy dictionary offsets */ { - const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState; - ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState; + const ZSTD_MatchState_t* srcMatchState = &srcCCtx->blockState.matchState; + ZSTD_MatchState_t* dstMatchState = &dstCCtx->blockState.matchState; dstMatchState->window = srcMatchState->window; dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; } dstCCtx->dictID = srcCCtx->dictID; + dstCCtx->dictContentSize = srcCCtx->dictContentSize; /* copy block state */ - memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock)); + ZSTD_memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock)); return 0; } @@ -1797,7 +2592,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize) { ZSTD_frameParameters fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; - ZSTD_buffered_policy_e const zbuff = (ZSTD_buffered_policy_e)(srcCCtx->inBuffSize>0); + ZSTD_buffered_policy_e const zbuff = srcCCtx->bufferedPolicy; ZSTD_STATIC_ASSERT((U32)ZSTDb_buffered==1); if (pledgedSrcSize==0) pledgedSrcSize = 
ZSTD_CONTENTSIZE_UNKNOWN; fParams.contentSizeFlag = (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN); @@ -1821,11 +2616,13 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa int const nbRows = (int)size / ZSTD_ROWSIZE; int cellNb = 0; int rowNb; + /* Protect special index values < ZSTD_WINDOW_START_INDEX. */ + U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX; assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */ - assert(size < (1U<<31)); /* can be casted to int */ + assert(size < (1U<<31)); /* can be cast to int */ -#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) - /* To validate that the table re-use logic is sound, and that we don't +#if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) + /* To validate that the table reuse logic is sound, and that we don't * access table space that we haven't cleaned, we re-"poison" the table * space every time we mark it dirty. * @@ -1840,12 +2637,17 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa for (rowNb=0 ; rowNb < nbRows ; rowNb++) { int column; for (column=0; columncParams.hashLog; ZSTD_reduceTable(ms->hashTable, hSize, reducerValue); } - if (params->cParams.strategy != ZSTD_fast) { + if (ZSTD_allocateChainTable(params->cParams.strategy, params->useRowMatchFinder, (U32)ms->dedicatedDictSearch)) { U32 const chainSize = (U32)1 << params->cParams.chainLog; if (params->cParams.strategy == ZSTD_btlazy2) ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue); @@ -1889,210 +2691,286 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par /* See doc/zstd_compression_format.md for detailed format description */ -static size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) -{ - U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); - RETURN_ERROR_IF(srcSize + 
ZSTD_blockHeaderSize > dstCapacity, - dstSize_tooSmall); - MEM_writeLE24(dst, cBlockHeader24); - memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize); - return ZSTD_blockHeaderSize + srcSize; -} - -void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) +int ZSTD_seqToCodes(const SeqStore_t* seqStorePtr) { - const seqDef* const sequences = seqStorePtr->sequencesStart; + const SeqDef* const sequences = seqStorePtr->sequencesStart; BYTE* const llCodeTable = seqStorePtr->llCode; BYTE* const ofCodeTable = seqStorePtr->ofCode; BYTE* const mlCodeTable = seqStorePtr->mlCode; U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); U32 u; + int longOffsets = 0; assert(nbSeq <= seqStorePtr->maxNbSeq); for (u=0; u= STREAM_ACCUMULATOR_MIN)); + if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) + longOffsets = 1; } - if (seqStorePtr->longLengthID==1) + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) llCodeTable[seqStorePtr->longLengthPos] = MaxLL; - if (seqStorePtr->longLengthID==2) + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) mlCodeTable[seqStorePtr->longLengthPos] = MaxML; + return longOffsets; } -static int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParams) +/* ZSTD_useTargetCBlockSize(): + * Returns if target compressed block size param is being used. + * If used, compression will do best effort to make a compressed block size to be around targetCBlockSize. + * Returns 1 if true, 0 otherwise. 
*/ +static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams) { - switch (cctxParams->literalCompressionMode) { - case ZSTD_lcm_huffman: - return 0; - case ZSTD_lcm_uncompressed: - return 1; - default: - assert(0 /* impossible: pre-validated */); - /* fall-through */ - case ZSTD_lcm_auto: - return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0); - } + DEBUGLOG(5, "ZSTD_useTargetCBlockSize (targetCBlockSize=%zu)", cctxParams->targetCBlockSize); + return (cctxParams->targetCBlockSize != 0); } -/* ZSTD_compressSequences_internal(): - * actually compresses both literals and sequences */ -MEM_STATIC size_t -ZSTD_compressSequences_internal(seqStore_t* seqStorePtr, - const ZSTD_entropyCTables_t* prevEntropy, - ZSTD_entropyCTables_t* nextEntropy, - const ZSTD_CCtx_params* cctxParams, - void* dst, size_t dstCapacity, - void* entropyWorkspace, size_t entropyWkspSize, - const int bmi2) +/* ZSTD_blockSplitterEnabled(): + * Returns if block splitting param is being used + * If used, compression will do best effort to split a block in order to improve compression ratio. + * At the time this function is called, the parameter must be finalized. + * Returns 1 if true, 0 otherwise. 
*/ +static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams) { - const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; - ZSTD_strategy const strategy = cctxParams->cParams.strategy; - unsigned count[MaxSeq+1]; - FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; - FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; - FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; - U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ - const seqDef* const sequences = seqStorePtr->sequencesStart; + DEBUGLOG(5, "ZSTD_blockSplitterEnabled (postBlockSplitter=%d)", cctxParams->postBlockSplitter); + assert(cctxParams->postBlockSplitter != ZSTD_ps_auto); + return (cctxParams->postBlockSplitter == ZSTD_ps_enable); +} + +/* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types + * and size of the sequences statistics + */ +typedef struct { + U32 LLtype; + U32 Offtype; + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ + int longOffsets; +} ZSTD_symbolEncodingTypeStats_t; + +/* ZSTD_buildSequencesStatistics(): + * Returns a ZSTD_symbolEncodingTypeStats_t, or a zstd error code in the `size` field. + * Modifies `nextEntropy` to have the appropriate values as a side effect. + * nbSeq must be greater than 0. 
+ * + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildSequencesStatistics( + const SeqStore_t* seqStorePtr, size_t nbSeq, + const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, + BYTE* dst, const BYTE* const dstEnd, + ZSTD_strategy strategy, unsigned* countWorkspace, + void* entropyWorkspace, size_t entropyWkspSize) +{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; + FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; const BYTE* const ofCodeTable = seqStorePtr->ofCode; const BYTE* const llCodeTable = seqStorePtr->llCode; const BYTE* const mlCodeTable = seqStorePtr->mlCode; - BYTE* const ostart = (BYTE*)dst; - BYTE* const oend = ostart + dstCapacity; - BYTE* op = ostart; - size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); - BYTE* seqHead; - BYTE* lastNCount = NULL; - - DEBUGLOG(5, "ZSTD_compressSequences_internal (nbSeq=%zu)", nbSeq); - ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<litStart; - size_t const litSize = (size_t)(seqStorePtr->lit - literals); - size_t const cSize = ZSTD_compressLiterals( - &prevEntropy->huf, &nextEntropy->huf, - cctxParams->cParams.strategy, - ZSTD_disableLiteralsCompression(cctxParams), - op, dstCapacity, - literals, litSize, - entropyWorkspace, entropyWkspSize, - bmi2); - FORWARD_IF_ERROR(cSize); - assert(cSize <= dstCapacity); - op += cSize; - } - - /* Sequences Header */ - RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, - dstSize_tooSmall); - if (nbSeq < 128) { - *op++ = (BYTE)nbSeq; - } else if (nbSeq < LONGNBSEQ) { - op[0] = (BYTE)((nbSeq>>8) + 0x80); - op[1] = (BYTE)nbSeq; - op+=2; - } else { - op[0]=0xFF; - MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)); - op+=3; - } - assert(op <= oend); - if 
(nbSeq==0) { - /* Copy the old tables over as if we repeated them */ - memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); - return (size_t)(op - ostart); - } - - /* seqHead : flags for FSE encoding type */ - seqHead = op++; - assert(op <= oend); + ZSTD_symbolEncodingTypeStats_t stats; + stats.lastCountSize = 0; /* convert length/distances into codes */ - ZSTD_seqToCodes(seqStorePtr); + stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ /* build CTable for Literal Lengths */ { unsigned max = MaxLL; - size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + size_t const mostFrequent = HIST_countFast_wksp(countWorkspace, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ DEBUGLOG(5, "Building LL table"); - nextEntropy->fse.litlength_repeatMode = prevEntropy->fse.litlength_repeatMode; - LLtype = ZSTD_selectEncodingType(&nextEntropy->fse.litlength_repeatMode, - count, max, mostFrequent, nbSeq, - LLFSELog, prevEntropy->fse.litlengthCTable, + nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; + stats.LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + LLFSELog, prevEntropy->litlengthCTable, LL_defaultNorm, LL_defaultNormLog, ZSTD_defaultAllowed, strategy); assert(set_basic < set_compressed && set_rle < set_compressed); - assert(!(LLtype < set_compressed && nextEntropy->fse.litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ { size_t const countSize = ZSTD_buildCTable( op, (size_t)(oend - op), - CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, - count, max, llCodeTable, nbSeq, + CTable_LitLength, LLFSELog, 
(SymbolEncodingType_e)stats.LLtype, + countWorkspace, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL, - prevEntropy->fse.litlengthCTable, - sizeof(prevEntropy->fse.litlengthCTable), + prevEntropy->litlengthCTable, + sizeof(prevEntropy->litlengthCTable), entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize); - if (LLtype == set_compressed) - lastNCount = op; + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for LitLens failed"); + stats.size = countSize; + return stats; + } + if (stats.LLtype == set_compressed) + stats.lastCountSize = countSize; op += countSize; assert(op <= oend); } } /* build CTable for Offsets */ { unsigned max = MaxOff; size_t const mostFrequent = HIST_countFast_wksp( - count, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ - ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + ZSTD_DefaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; DEBUGLOG(5, "Building OF table"); - nextEntropy->fse.offcode_repeatMode = prevEntropy->fse.offcode_repeatMode; - Offtype = ZSTD_selectEncodingType(&nextEntropy->fse.offcode_repeatMode, - count, max, mostFrequent, nbSeq, - OffFSELog, prevEntropy->fse.offcodeCTable, + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; + stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + OffFSELog, prevEntropy->offcodeCTable, OF_defaultNorm, OF_defaultNormLog, defaultPolicy, strategy); - assert(!(Offtype < set_compressed && nextEntropy->fse.offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ { size_t const countSize = ZSTD_buildCTable( op, (size_t)(oend - op), - CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, - count, max, ofCodeTable, nbSeq, + CTable_OffsetBits, OffFSELog, (SymbolEncodingType_e)stats.Offtype, + countWorkspace, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, - prevEntropy->fse.offcodeCTable, - sizeof(prevEntropy->fse.offcodeCTable), + prevEntropy->offcodeCTable, + sizeof(prevEntropy->offcodeCTable), entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize); - if (Offtype == set_compressed) - lastNCount = op; + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for Offsets failed"); + stats.size = countSize; + return stats; + } + if (stats.Offtype == set_compressed) + stats.lastCountSize = countSize; op += countSize; assert(op <= oend); } } /* build CTable for MatchLengths */ { unsigned max = MaxML; size_t const mostFrequent = HIST_countFast_wksp( - count, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + countWorkspace, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ DEBUGLOG(5, 
"Building ML table (remaining space : %i)", (int)(oend-op)); - nextEntropy->fse.matchlength_repeatMode = prevEntropy->fse.matchlength_repeatMode; - MLtype = ZSTD_selectEncodingType(&nextEntropy->fse.matchlength_repeatMode, - count, max, mostFrequent, nbSeq, - MLFSELog, prevEntropy->fse.matchlengthCTable, + nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; + stats.MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + MLFSELog, prevEntropy->matchlengthCTable, ML_defaultNorm, ML_defaultNormLog, ZSTD_defaultAllowed, strategy); - assert(!(MLtype < set_compressed && nextEntropy->fse.matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ { size_t const countSize = ZSTD_buildCTable( op, (size_t)(oend - op), - CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, - count, max, mlCodeTable, nbSeq, + CTable_MatchLength, MLFSELog, (SymbolEncodingType_e)stats.MLtype, + countWorkspace, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML, - prevEntropy->fse.matchlengthCTable, - sizeof(prevEntropy->fse.matchlengthCTable), + prevEntropy->matchlengthCTable, + sizeof(prevEntropy->matchlengthCTable), entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize); - if (MLtype == set_compressed) - lastNCount = op; + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for MatchLengths failed"); + stats.size = countSize; + return stats; + } + if (stats.MLtype == set_compressed) + stats.lastCountSize = countSize; op += countSize; assert(op <= oend); } } + stats.size = (size_t)(op-ostart); + return stats; +} + +/* ZSTD_entropyCompressSeqStore_internal(): + * compresses both literals and sequences + * Returns compressed size of block, or a zstd error. 
+ */ +#define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 +MEM_STATIC size_t +ZSTD_entropyCompressSeqStore_internal( + void* dst, size_t dstCapacity, + const void* literals, size_t litSize, + const SeqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + void* entropyWorkspace, size_t entropyWkspSize, + const int bmi2) +{ + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; + const SeqDef* const sequences = seqStorePtr->sequencesStart; + const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; + int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + + DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= HUF_WORKSPACE_SIZE); + + /* Compress literals */ + { size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ + int const suspectUncompressible = (numSequences == 0) || (litSize / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); + + size_t const cSize = ZSTD_compressLiterals( + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, + &prevEntropy->huf, &nextEntropy->huf, + cctxParams->cParams.strategy, + 
ZSTD_literalsCompressionIsDisabled(cctxParams), + suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; + } - *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, + dstSize_tooSmall, "Can't fit seq hdr in output buf!"); + if (nbSeq < 128) { + *op++ = (BYTE)nbSeq; + } else if (nbSeq < LONGNBSEQ) { + op[0] = (BYTE)((nbSeq>>8) + 0x80); + op[1] = (BYTE)nbSeq; + op+=2; + } else { + op[0]=0xFF; + MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)); + op+=3; + } + assert(op <= oend); + if (nbSeq==0) { + /* Copy the old tables over as if we repeated them */ + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } + { BYTE* const seqHead = op++; + /* build stats for sequences */ + const ZSTD_symbolEncodingTypeStats_t stats = + ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, + entropyWorkspace, entropyWkspSize); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; + longOffsets = stats.longOffsets; + } { size_t const bitstreamSize = ZSTD_encodeSequences( op, (size_t)(oend - op), @@ -2101,7 +2979,7 @@ ZSTD_compressSequences_internal(seqStore_t* seqStorePtr, CTable_LitLength, llCodeTable, sequences, nbSeq, longOffsets, bmi2); - FORWARD_IF_ERROR(bitstreamSize); + FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed"); op += bitstreamSize; assert(op <= oend); /* zstd versions <= 1.3.4 mistakenly report corruption when @@ -2112,9 +2990,9 @@ ZSTD_compressSequences_internal(seqStore_t* seqStorePtr, * In this exceedingly rare case, we will simply emit an uncompressed * block, since it isn't worth optimizing. 
*/ - if (lastNCount && (op - lastNCount) < 4) { - /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ - assert(op - lastNCount == 3); + if (lastCountSize && (lastCountSize + bitstreamSize) < 4) { + /* lastCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ + assert(lastCountSize + bitstreamSize == 3); DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by " "emitting an uncompressed block."); return 0; @@ -2125,107 +3003,280 @@ ZSTD_compressSequences_internal(seqStore_t* seqStorePtr, return (size_t)(op - ostart); } -MEM_STATIC size_t -ZSTD_compressSequences(seqStore_t* seqStorePtr, - const ZSTD_entropyCTables_t* prevEntropy, - ZSTD_entropyCTables_t* nextEntropy, - const ZSTD_CCtx_params* cctxParams, - void* dst, size_t dstCapacity, - size_t srcSize, - void* entropyWorkspace, size_t entropyWkspSize, - int bmi2) -{ - size_t const cSize = ZSTD_compressSequences_internal( - seqStorePtr, prevEntropy, nextEntropy, cctxParams, +static size_t +ZSTD_entropyCompressSeqStore_wExtLitBuffer( + void* dst, size_t dstCapacity, + const void* literals, size_t litSize, + size_t blockSize, + const SeqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + void* entropyWorkspace, size_t entropyWkspSize, + int bmi2) +{ + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( dst, dstCapacity, + literals, litSize, + seqStorePtr, prevEntropy, nextEntropy, cctxParams, entropyWorkspace, entropyWkspSize, bmi2); if (cSize == 0) return 0; /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
*/ - if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) + if ((cSize == ERROR(dstSize_tooSmall)) & (blockSize <= dstCapacity)) { + DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); return 0; /* block not compressed */ - FORWARD_IF_ERROR(cSize); + } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); /* Check compressibility */ - { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + { size_t const maxCSize = blockSize - ZSTD_minGain(blockSize, cctxParams->cParams.strategy); if (cSize >= maxCSize) return 0; /* block not compressed */ } - + DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); + /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. + * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. + */ + assert(cSize < ZSTD_BLOCKSIZE_MAX); return cSize; } +static size_t +ZSTD_entropyCompressSeqStore( + const SeqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + size_t srcSize, + void* entropyWorkspace, size_t entropyWkspSize, + int bmi2) +{ + return ZSTD_entropyCompressSeqStore_wExtLitBuffer( + dst, dstCapacity, + seqStorePtr->litStart, (size_t)(seqStorePtr->lit - seqStorePtr->litStart), + srcSize, + seqStorePtr, + prevEntropy, nextEntropy, + cctxParams, + entropyWorkspace, entropyWkspSize, + bmi2); +} + /* ZSTD_selectBlockCompressor() : * Not static, but internal use only (used by long distance matcher) * assumption : strat is a valid strategy */ -ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode) +ZSTD_BlockCompressor_f ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_ParamSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) { - 
static const ZSTD_blockCompressor blockCompressor[3][ZSTD_STRATEGY_MAX+1] = { + static const ZSTD_BlockCompressor_f blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { { ZSTD_compressBlock_fast /* default for 0 */, ZSTD_compressBlock_fast, - ZSTD_compressBlock_doubleFast, - ZSTD_compressBlock_greedy, - ZSTD_compressBlock_lazy, - ZSTD_compressBlock_lazy2, - ZSTD_compressBlock_btlazy2, - ZSTD_compressBlock_btopt, - ZSTD_compressBlock_btultra, - ZSTD_compressBlock_btultra2 }, + ZSTD_COMPRESSBLOCK_DOUBLEFAST, + ZSTD_COMPRESSBLOCK_GREEDY, + ZSTD_COMPRESSBLOCK_LAZY, + ZSTD_COMPRESSBLOCK_LAZY2, + ZSTD_COMPRESSBLOCK_BTLAZY2, + ZSTD_COMPRESSBLOCK_BTOPT, + ZSTD_COMPRESSBLOCK_BTULTRA, + ZSTD_COMPRESSBLOCK_BTULTRA2 + }, { ZSTD_compressBlock_fast_extDict /* default for 0 */, ZSTD_compressBlock_fast_extDict, - ZSTD_compressBlock_doubleFast_extDict, - ZSTD_compressBlock_greedy_extDict, - ZSTD_compressBlock_lazy_extDict, - ZSTD_compressBlock_lazy2_extDict, - ZSTD_compressBlock_btlazy2_extDict, - ZSTD_compressBlock_btopt_extDict, - ZSTD_compressBlock_btultra_extDict, - ZSTD_compressBlock_btultra_extDict }, + ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, + ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, + ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, + ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, + ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, + ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, + ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, + ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT + }, { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, ZSTD_compressBlock_fast_dictMatchState, - ZSTD_compressBlock_doubleFast_dictMatchState, - ZSTD_compressBlock_greedy_dictMatchState, - ZSTD_compressBlock_lazy_dictMatchState, - ZSTD_compressBlock_lazy2_dictMatchState, - ZSTD_compressBlock_btlazy2_dictMatchState, - ZSTD_compressBlock_btopt_dictMatchState, - ZSTD_compressBlock_btultra_dictMatchState, - ZSTD_compressBlock_btultra_dictMatchState } + ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, + 
ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, + ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE + }, + { NULL /* default for 0 */, + NULL, + NULL, + ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, + ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, + ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, + NULL, + NULL, + NULL, + NULL } }; - ZSTD_blockCompressor selectedCompressor; + ZSTD_BlockCompressor_f selectedCompressor; ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); - assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); - selectedCompressor = blockCompressor[(int)dictMode][(int)strat]; + assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + DEBUGLOG(5, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { + static const ZSTD_BlockCompressor_f rowBasedBlockCompressors[4][3] = { + { + ZSTD_COMPRESSBLOCK_GREEDY_ROW, + ZSTD_COMPRESSBLOCK_LAZY_ROW, + ZSTD_COMPRESSBLOCK_LAZY2_ROW + }, + { + ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, + ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, + ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW + }, + { + ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, + ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, + ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW + }, + { + ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, + ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, + ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW + } + }; + DEBUGLOG(5, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_ps_auto); + selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy]; + } else { + selectedCompressor = blockCompressor[(int)dictMode][(int)strat]; + } assert(selectedCompressor != NULL); return selectedCompressor; } -static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr, +static 
void ZSTD_storeLastLiterals(SeqStore_t* seqStorePtr, const BYTE* anchor, size_t lastLLSize) { - memcpy(seqStorePtr->lit, anchor, lastLLSize); + ZSTD_memcpy(seqStorePtr->lit, anchor, lastLLSize); seqStorePtr->lit += lastLLSize; } -void ZSTD_resetSeqStore(seqStore_t* ssPtr) +void ZSTD_resetSeqStore(SeqStore_t* ssPtr) { ssPtr->lit = ssPtr->litStart; ssPtr->sequences = ssPtr->sequencesStart; - ssPtr->longLengthID = 0; + ssPtr->longLengthType = ZSTD_llt_none; +} + +/* ZSTD_postProcessSequenceProducerResult() : + * Validates and post-processes sequences obtained through the external matchfinder API: + * - Checks whether nbExternalSeqs represents an error condition. + * - Appends a block delimiter to outSeqs if one is not already present. + * See zstd.h for context regarding block delimiters. + * Returns the number of sequences after post-processing, or an error code. */ +static size_t ZSTD_postProcessSequenceProducerResult( + ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize +) { + RETURN_ERROR_IF( + nbExternalSeqs > outSeqsCapacity, + sequenceProducer_failed, + "External sequence producer returned error code %lu", + (unsigned long)nbExternalSeqs + ); + + RETURN_ERROR_IF( + nbExternalSeqs == 0 && srcSize > 0, + sequenceProducer_failed, + "Got zero sequences from external sequence producer for a non-empty src buffer!" + ); + + if (srcSize == 0) { + ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); + return 1; + } + + { + ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; + + /* We can return early if lastSeq is already a block delimiter. */ + if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { + return nbExternalSeqs; + } + + /* This error condition is only possible if the external matchfinder + * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ + RETURN_ERROR_IF( + nbExternalSeqs == outSeqsCapacity, + sequenceProducer_failed, + "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" 
+ ); + + /* lastSeq is not a block delimiter, so we need to append one. */ + ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); + return nbExternalSeqs + 1; + } +} + +/* ZSTD_fastSequenceLengthSum() : + * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. + * Similar to another function in zstd_compress.c (determine_blockSize), + * except it doesn't check for a block delimiter to end summation. + * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). + * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */ +static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { + size_t matchLenSum, litLenSum, i; + matchLenSum = 0; + litLenSum = 0; + for (i = 0; i < seqBufSize; i++) { + litLenSum += seqBuf[i].litLength; + matchLenSum += seqBuf[i].matchLength; + } + return litLenSum + matchLenSum; +} + +/** + * Function to validate sequences produced by a block compressor. + */ +static void ZSTD_validateSeqStore(const SeqStore_t* seqStore, const ZSTD_compressionParameters* cParams) +{ +#if DEBUGLEVEL >= 1 + const SeqDef* seq = seqStore->sequencesStart; + const SeqDef* const seqEnd = seqStore->sequences; + size_t const matchLenLowerBound = cParams->minMatch == 3 ? 
3 : 4; + for (; seq < seqEnd; ++seq) { + const ZSTD_SequenceLength seqLength = ZSTD_getSequenceLength(seqStore, seq); + assert(seqLength.matchLength >= matchLenLowerBound); + (void)seqLength; + (void)matchLenLowerBound; + } +#else + (void)seqStore; + (void)cParams; +#endif } -typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; +static size_t +ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, + ZSTD_SequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize, + ZSTD_ParamSwitch_e externalRepSearch); + +typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_BuildSeqStore_e; static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) { - ZSTD_matchState_t* const ms = &zc->blockState.matchState; + ZSTD_MatchState_t* const ms = &zc->blockState.matchState; DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize); assert(srcSize <= ZSTD_BLOCKSIZE_MAX); /* Assert that we have correctly flushed the ctx params into the ms's copy */ ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); - if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { - ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.minMatch); + /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding + * additional 1. 
We need to revisit and change this logic to be more consistent */ + if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { + ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.minMatch); + } return ZSTDbss_noCompress; /* don't even attempt compression below a certain srcSize */ } ZSTD_resetSeqStore(&(zc->seqStore)); @@ -2241,10 +3292,10 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) /* limited update after a very long match */ { const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; - const U32 current = (U32)(istart-base); + const U32 curr = (U32)(istart-base); if (sizeof(ptrdiff_t)==8) assert(istart - base < (ptrdiff_t)(U32)(-1)); /* ensure no overflow */ - if (current > ms->nextToUpdate + 384) - ms->nextToUpdate = current - MIN(192, (U32)(current - ms->nextToUpdate - 384)); + if (curr > ms->nextToUpdate + 384) + ms->nextToUpdate = curr - MIN(192, (U32)(curr - ms->nextToUpdate - 384)); } /* select and store sequences */ @@ -2255,99 +3306,237 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i]; } if (zc->externSeqStore.pos < zc->externSeqStore.size) { - assert(!zc->appliedParams.ldmParams.enableLdm); + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); + + /* External matchfinder + LDM is technically possible, just not implemented yet. + * We need to revisit soon and implement it. */ + RETURN_ERROR_IF( + ZSTD_hasExtSeqProd(&zc->appliedParams), + parameter_combination_unsupported, + "Long-distance matching with external sequence producer enabled is not currently supported." 
+ ); + /* Updates ldmSeqStore.pos */ lastLLSize = ZSTD_ldm_blockCompress(&zc->externSeqStore, ms, &zc->seqStore, zc->blockState.nextCBlock->rep, + zc->appliedParams.useRowMatchFinder, src, srcSize); assert(zc->externSeqStore.pos <= zc->externSeqStore.size); - } else if (zc->appliedParams.ldmParams.enableLdm) { - rawSeqStore_t ldmSeqStore = {NULL, 0, 0, 0}; + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + RawSeqStore_t ldmSeqStore = kNullRawSeqStore; + + /* External matchfinder + LDM is technically possible, just not implemented yet. + * We need to revisit soon and implement it. */ + RETURN_ERROR_IF( + ZSTD_hasExtSeqProd(&zc->appliedParams), + parameter_combination_unsupported, + "Long-distance matching with external sequence producer enabled is not currently supported." + ); ldmSeqStore.seq = zc->ldmSequences; ldmSeqStore.capacity = zc->maxNbLdmSequences; /* Updates ldmSeqStore.size */ FORWARD_IF_ERROR(ZSTD_ldm_generateSequences(&zc->ldmState, &ldmSeqStore, &zc->appliedParams.ldmParams, - src, srcSize)); + src, srcSize), ""); /* Updates ldmSeqStore.pos */ lastLLSize = ZSTD_ldm_blockCompress(&ldmSeqStore, ms, &zc->seqStore, zc->blockState.nextCBlock->rep, + zc->appliedParams.useRowMatchFinder, src, srcSize); assert(ldmSeqStore.pos == ldmSeqStore.size); - } else { /* not long range mode */ - ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, dictMode); + } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { + assert( + zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) + ); + assert(zc->appliedParams.extSeqProdFunc != NULL); + + { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; + + size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( + zc->appliedParams.extSeqProdState, + zc->extSeqBuf, + zc->extSeqBufCapacity, + src, srcSize, + NULL, 0, /* dict and dictSize, currently not supported */ + zc->appliedParams.compressionLevel, + windowSize + ); + + size_t 
const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( + zc->extSeqBuf, + nbExternalSeqs, + zc->extSeqBufCapacity, + srcSize + ); + + /* Return early if there is no error, since we don't need to worry about last literals */ + if (!ZSTD_isError(nbPostProcessedSeqs)) { + ZSTD_SequencePosition seqPos = {0,0,0}; + size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); + RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); + FORWARD_IF_ERROR( + ZSTD_transferSequences_wBlockDelim( + zc, &seqPos, + zc->extSeqBuf, nbPostProcessedSeqs, + src, srcSize, + zc->appliedParams.searchForExternalRepcodes + ), + "Failed to copy external sequences to seqStore!" + ); + ms->ldmSeqStore = NULL; + DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); + return ZSTDbss_compress; + } + + /* Propagate the error if fallback is disabled */ + if (!zc->appliedParams.enableMatchFinderFallback) { + return nbPostProcessedSeqs; + } + + /* Fallback to software matchfinder */ + { ZSTD_BlockCompressor_f const blockCompressor = + ZSTD_selectBlockCompressor( + zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); + ms->ldmSeqStore = NULL; + DEBUGLOG( + 5, + "External sequence producer returned error code %lu. 
Falling back to internal parser.", + (unsigned long)nbExternalSeqs + ); + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } } + } else { /* not long range mode and no external matchfinder */ + ZSTD_BlockCompressor_f const blockCompressor = ZSTD_selectBlockCompressor( + zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); + ms->ldmSeqStore = NULL; lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); } { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize; ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize); } } + ZSTD_validateSeqStore(&zc->seqStore, &zc->appliedParams.cParams); return ZSTDbss_compress; } -static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) +static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const SeqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) { - const seqStore_t* seqStore = ZSTD_getSeqStore(zc); - const seqDef* seqs = seqStore->sequencesStart; - size_t seqsSize = seqStore->sequences - seqs; - - ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; - size_t i; size_t position; int repIdx; + const SeqDef* inSeqs = seqStore->sequencesStart; + const size_t nbInSequences = (size_t)(seqStore->sequences - inSeqs); + const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); - assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); - for (i = 0, position = 0; i < seqsSize; ++i) { - outSeqs[i].offset = seqs[i].offset; - outSeqs[i].litLength = seqs[i].litLength; - outSeqs[i].matchLength = seqs[i].matchLength + MINMATCH; + ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? 
seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; + const size_t nbOutSequences = nbInSequences + 1; + size_t nbOutLiterals = 0; + Repcodes_t repcodes; + size_t i; + /* Bounds check that we have enough space for every input sequence + * and the block delimiter + */ + assert(seqCollector->seqIndex <= seqCollector->maxSequences); + RETURN_ERROR_IF( + nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), + dstSize_tooSmall, + "Not enough space to copy sequences"); + + ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); + for (i = 0; i < nbInSequences; ++i) { + U32 rawOffset; + outSeqs[i].litLength = inSeqs[i].litLength; + outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; + outSeqs[i].rep = 0; + + /* Handle the possible single length >= 64K + * There can only be one because we add MINMATCH to every match length, + * and blocks are at most 128K. + */ if (i == seqStore->longLengthPos) { - if (seqStore->longLengthID == 1) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { outSeqs[i].litLength += 0x10000; - } else if (seqStore->longLengthID == 2) { + } else if (seqStore->longLengthType == ZSTD_llt_matchLength) { outSeqs[i].matchLength += 0x10000; } } - if (outSeqs[i].offset <= ZSTD_REP_NUM) { - outSeqs[i].rep = outSeqs[i].offset; - repIdx = (unsigned int)i - outSeqs[i].offset; - - if (outSeqs[i].litLength == 0) { - if (outSeqs[i].offset < 3) { - --repIdx; + /* Determine the raw offset given the offBase, which may be a repcode. 
*/ + if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { + const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); + assert(repcode > 0); + outSeqs[i].rep = repcode; + if (outSeqs[i].litLength != 0) { + rawOffset = repcodes.rep[repcode - 1]; + } else { + if (repcode == 3) { + assert(repcodes.rep[0] > 1); + rawOffset = repcodes.rep[0] - 1; } else { - repIdx = (unsigned int)i - 1; + rawOffset = repcodes.rep[repcode]; } - ++outSeqs[i].rep; - } - assert(repIdx >= -3); - outSeqs[i].offset = repIdx >= 0 ? outSeqs[repIdx].offset : repStartValue[-repIdx - 1]; - if (outSeqs[i].rep == 4) { - --outSeqs[i].offset; } } else { - outSeqs[i].offset -= ZSTD_REP_NUM; + rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); } + outSeqs[i].offset = rawOffset; - position += outSeqs[i].litLength; - outSeqs[i].matchPos = (unsigned int)position; - position += outSeqs[i].matchLength; + /* Update repcode history for the sequence */ + ZSTD_updateRep(repcodes.rep, + inSeqs[i].offBase, + inSeqs[i].litLength == 0); + + nbOutLiterals += outSeqs[i].litLength; + } + /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. + * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker + * for the block boundary, according to the API. 
+ */ + assert(nbInLiterals >= nbOutLiterals); + { + const size_t lastLLSize = nbInLiterals - nbOutLiterals; + outSeqs[nbInSequences].litLength = (U32)lastLLSize; + outSeqs[nbInSequences].matchLength = 0; + outSeqs[nbInSequences].offset = 0; + assert(nbOutSequences == nbInSequences + 1); } - zc->seqCollector.seqIndex += seqsSize; + seqCollector->seqIndex += nbOutSequences; + assert(seqCollector->seqIndex <= seqCollector->maxSequences); + + return 0; } -size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, - size_t outSeqsSize, const void* src, size_t srcSize) +size_t ZSTD_sequenceBound(size_t srcSize) { + const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; + const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; + return maxNbSeq + maxNbDelims; +} + +size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize) { const size_t dstCapacity = ZSTD_compressBound(srcSize); - void* dst = ZSTD_malloc(dstCapacity, ZSTD_defaultCMem); + void* dst; /* Make C90 happy. 
*/ SeqCollector seqCollector; + { + int targetCBlockSize; + FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); + RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); + } + { + int nbWorkers; + FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); + RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); + } - RETURN_ERROR_IF(dst == NULL, memory_allocation); + dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); seqCollector.collectSequences = 1; seqCollector.seqStart = outSeqs; @@ -2355,96 +3544,995 @@ size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, seqCollector.maxSequences = outSeqsSize; zc->seqCollector = seqCollector; - ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); - ZSTD_free(dst, ZSTD_defaultCMem); + { + const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); + ZSTD_customFree(dst, ZSTD_defaultCMem); + FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); + } + assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize)); return zc->seqCollector.seqIndex; } -/* Returns true if the given block is a RLE block */ -static int ZSTD_isRLE(const BYTE *ip, size_t length) { +size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize) { + size_t in = 0; + size_t out = 0; + for (; in < seqsSize; ++in) { + if (sequences[in].offset == 0 && sequences[in].matchLength == 0) { + if (in != seqsSize - 1) { + sequences[in+1].litLength += sequences[in].litLength; + } + } else { + sequences[out] = sequences[in]; + ++out; + } + } + return out; +} + +/* Unrolled loop to read four size_ts of input at a time. Returns 1 if is RLE, 0 if not. 
*/ +static int ZSTD_isRLE(const BYTE* src, size_t length) { + const BYTE* ip = src; + const BYTE value = ip[0]; + const size_t valueST = (size_t)((U64)value * 0x0101010101010101ULL); + const size_t unrollSize = sizeof(size_t) * 4; + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; size_t i; - if (length < 2) return 1; - for (i = 1; i < length; ++i) { - if (ip[0] != ip[i]) return 0; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; } + for (i = prefixLength; i != length; i += unrollSize) { + size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; + } } } return 1; } -static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, U32 frame) +/* Returns true if the given block may be RLE. + * This is just a heuristic based on the compressibility. + * It may return both false positives and false negatives. + */ +static int ZSTD_maybeRLE(SeqStore_t const* seqStore) { - /* This the upper bound for the length of an rle block. - * This isn't the actual upper bound. Finding the real threshold - * needs further investigation. 
- */ - const U32 rleMaxLength = 25; - size_t cSize; - const BYTE* ip = (const BYTE*)src; - BYTE* op = (BYTE*)dst; - DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", - (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, - (unsigned)zc->blockState.matchState.nextToUpdate); + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart); - { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); - FORWARD_IF_ERROR(bss); - if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } - } + return nbSeqs < 4 && nbLits < 10; +} - if (zc->seqCollector.collectSequences) { - ZSTD_copyBlockSequences(zc); +static void +ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) +{ + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; + bs->nextCBlock = tmp; +} + +/* Writes the block header */ +static void +writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) +{ + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader); + DEBUGLOG(5, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); +} + +/** ZSTD_buildBlockEntropyStats_literals() : + * Builds entropy for the literals. + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. 
+ * Requires ENTROPY_WORKSPACE_SIZE workspace + * @return : size of huffman description table, or an error code + */ +static size_t +ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, + const ZSTD_hufCTables_t* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_hufCTablesMetadata_t* hufMetadata, + const int literalsCompressionIsDisabled, + void* workspace, size_t wkspSize, + int hufFlags) +{ + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; + BYTE* const countWkspStart = wkspStart; + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; + const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; + unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + + if (literalsCompressionIsDisabled) { + DEBUGLOG(5, "set_basic - disabled"); + hufMetadata->hType = set_basic; return 0; } - /* encode sequences and literals */ - cSize = ZSTD_compressSequences(&zc->seqStore, - &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, - &zc->appliedParams, - dst, dstCapacity, - srcSize, - zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */, - zc->bmi2); + /* small ? don't even attempt compression (speed opt) */ +#ifndef COMPRESS_LITERALS_SIZE_MIN +# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ +#endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; + } } - if (frame && - /* We don't want to emit our first block as a RLE even if it qualifies because - * doing so will cause the decoder (cli only) to throw a "should consume all input error." - * This is only an issue for zstd <= v1.4.3 - */ - !zc->isFirstBlock && - cSize < rleMaxLength && - ZSTD_isRLE(ip, srcSize)) - { - cSize = 1; - op[0] = ip[0]; + /* Scan input and build symbol stats */ + { size_t const largest = + HIST_count_wksp (countWksp, &maxSymbolValue, + (const BYTE*)src, srcSize, + workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { + /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { + /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; + } } + + /* Validate the previous Huffman table */ + if (repeat == HUF_repeat_check + && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; } -out: - if (!ZSTD_isError(cSize) && cSize > 1) { - /* confirm repcodes and entropy tables when emitting a compressed block */ - ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock; - zc->blockState.prevCBlock = zc->blockState.nextCBlock; - zc->blockState.nextCBlock = tmp; + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); + assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; } - /* We 
check that dictionaries have offset codes available for the first - * block. After the first block, the offcode table might not have large - * enough codes to represent the offsets in the data. - */ + { /* Build and write the CTable */ + size_t const newCSize = HUF_estimateCompressedSize( + (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); + size_t const hSize = HUF_writeCTable_wksp( + hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), + (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + /* Check against repeating the previous CTable */ + if (repeat != HUF_repeat_none) { + size_t const oldCSize = HUF_estimateCompressedSize( + (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); + if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { + DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType = set_repeat; + return 0; + } } + if (newCSize + hSize >= srcSize) { + DEBUGLOG(5, "set_basic - no gains"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType = set_basic; + return 0; + } + DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); + hufMetadata->hType = set_compressed; + nextHuf->repeatMode = HUF_repeat_check; + return hSize; + } +} + + +/* ZSTD_buildDummySequencesStatistics(): + * Returns a ZSTD_symbolEncodingTypeStats_t with all encoding types as set_basic, + * and updates nextEntropy to the appropriate repeatMode. + */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) +{ + ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; + return stats; +} + +/** ZSTD_buildBlockEntropyStats_sequences() : + * Builds entropy for the sequences. 
+ * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. + * @return : size of fse tables or error code */ +static size_t +ZSTD_buildBlockEntropyStats_sequences( + const SeqStore_t* seqStorePtr, + const ZSTD_fseCTables_t* prevEntropy, + ZSTD_fseCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize) +{ + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; + unsigned* countWorkspace = (unsigned*)workspace; + unsigned* entropyWorkspace = countWorkspace + (MaxSeq + 1); + size_t entropyWorkspaceSize = wkspSize - (MaxSeq + 1) * sizeof(*countWorkspace); + ZSTD_symbolEncodingTypeStats_t stats; + + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_sequences (nbSeq=%zu)", nbSeq); + stats = nbSeq != 0 ? ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + prevEntropy, nextEntropy, op, oend, + strategy, countWorkspace, + entropyWorkspace, entropyWorkspaceSize) + : ZSTD_buildDummySequencesStatistics(nextEntropy); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); + fseMetadata->llType = (SymbolEncodingType_e) stats.LLtype; + fseMetadata->ofType = (SymbolEncodingType_e) stats.Offtype; + fseMetadata->mlType = (SymbolEncodingType_e) stats.MLtype; + fseMetadata->lastCountSize = stats.lastCountSize; + return stats.size; +} + + +/** ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * Requires workspace size ENTROPY_WORKSPACE_SIZE + * @return : 0 on success, or an error code + * Note : also employed in superblock + */ +size_t ZSTD_buildBlockEntropyStats( + const SeqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize) +{ + size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); + int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); + int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0; + + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), + workspace, wkspSize, hufFlags); + + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, + &prevEntropy->fse, &nextEntropy->fse, + cctxParams, + &entropyMetadata->fseMetadata, + workspace, wkspSize); + FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildBlockEntropyStats_sequences failed"); + return 0; +} + +/* Returns the size estimate for the literals section (header + content) of a block */ +static size_t +ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, + const ZSTD_hufCTables_t* huf, + const ZSTD_hufCTablesMetadata_t* hufMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; + size_t literalSectionHeaderSize = 3 + (litSize >= 1 KB) + (litSize >= 16 KB); + U32 singleStream = litSize < 256; + + if (hufMetadata->hType == set_basic) return litSize; + else if (hufMetadata->hType 
== set_rle) return 1; + else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) { + size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize); + if (ZSTD_isError(largest)) return litSize; + { size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue); + if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize; + if (!singleStream) cLitSizeEstimate += 6; /* multi-stream huffman uses 6-byte jump table */ + return cLitSizeEstimate + literalSectionHeaderSize; + } } + assert(0); /* impossible */ + return 0; +} + +/* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +static size_t +ZSTD_estimateBlockSize_symbolType(SymbolEncodingType_e type, + const BYTE* codeTable, size_t nbSeq, unsigned maxCode, + const FSE_CTable* fseCTable, + const U8* additionalBits, + short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, + void* workspace, size_t wkspSize) +{ + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; + const BYTE* const ctStart = ctp; + const BYTE* const ctEnd = ctStart + nbSeq; + size_t cSymbolTypeSizeEstimateInBits = 0; + unsigned max = maxCode; + + HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */ + if (type == set_basic) { + /* We selected this encoding type, so it must be valid. 
*/ + assert(max <= defaultMax); + (void)defaultMax; + cSymbolTypeSizeEstimateInBits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max); + } else if (type == set_rle) { + cSymbolTypeSizeEstimateInBits = 0; + } else if (type == set_compressed || type == set_repeat) { + cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max); + } + if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) { + return nbSeq * 10; + } + while (ctp < ctEnd) { + if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp]; + else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */ + ctp++; + } + return cSymbolTypeSizeEstimateInBits >> 3; +} + +/* Returns the size estimate for the sequences section (header + content) of a block */ +static size_t +ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, + fseTables->offcodeCTable, NULL, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, + fseTables->litlengthCTable, LL_bits, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, + fseTables->matchlengthCTable, ML_bits, + ML_defaultNorm, ML_defaultNormLog, MaxML, + workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + 
sequencesSectionHeaderSize; +} + +/* Returns the size estimate for a given stream of literals, of, ll, ml */ +static size_t +ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, + int writeLitEntropy, int writeSeqEntropy) +{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, + &entropy->huf, &entropyMetadata->hufMetadata, + workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; +} + +/* Builds entropy statistics and uses them for blocksize estimation. + * + * @return: estimated compressed size of the seqStore, or a zstd error. 
+ */ +static size_t +ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(SeqStore_t* seqStore, ZSTD_CCtx* zc) +{ + ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, + zc->tmpWorkspace, zc->tmpWkspSize), ""); + return ZSTD_estimateBlockSize( + seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), + &zc->blockState.nextCBlock->entropy, + entropyMetadata, + zc->tmpWorkspace, zc->tmpWkspSize, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); +} + +/* Returns literals bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreLiteralsBytes(const SeqStore_t* const seqStore) +{ + size_t literalsBytes = 0; + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { + SeqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; + } } + return literalsBytes; +} + +/* Returns match bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreMatchBytes(const SeqStore_t* const seqStore) +{ + size_t matchBytes = 0; + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { + SeqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; + } } + return matchBytes; +} + +/* Derives the seqStore that is a chunk of the originalSeqStore from 
[startIdx, endIdx). + * Stores the result in resultSeqStore. + */ +static void ZSTD_deriveSeqStoreChunk(SeqStore_t* resultSeqStore, + const SeqStore_t* originalSeqStore, + size_t startIdx, size_t endIdx) +{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move longLengthPos into the correct position if necessary */ + if (originalSeqStore->longLengthType != ZSTD_llt_none) { + if (originalSeqStore->longLengthPos < startIdx || originalSeqStore->longLengthPos > endIdx) { + resultSeqStore->longLengthType = ZSTD_llt_none; + } else { + resultSeqStore->longLengthPos -= (U32)startIdx; + } + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ + assert(resultSeqStore->lit == originalSeqStore->lit); + } else { + size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; + resultSeqStore->ofCode += startIdx; +} + +/** + * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. + * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). 
+ */ +static U32 +ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) +{ + U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ + assert(OFFBASE_IS_REPCODE(offBase)); + if (adjustedRepCode == ZSTD_REP_NUM) { + assert(ll0); + /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 + * This is only valid if it results in a valid offset value, aka > 0. + * Note : it may happen that `rep[0]==1` in exceptional circumstances. + * In which case this function will return 0, which is an invalid offset. + * It's not an issue though, since this value will be + * compared and discarded within ZSTD_seqStore_resolveOffCodes(). + */ + return rep[0] - 1; + } + return rep[adjustedRepCode]; +} + +/** + * ZSTD_seqStore_resolveOffCodes() reconciles any possible divergences in offset history that may arise + * due to emission of RLE/raw blocks that disturb the offset history, + * and replaces any repcodes within the seqStore that may be invalid. + * + * dRepcodes are updated as would be on the decompression side. + * cRepcodes are updated exactly in accordance with the seqStore. + * + * Note : this function assumes seq->offBase respects the following numbering scheme : + * 0 : invalid + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +static void +ZSTD_seqStore_resolveOffCodes(Repcodes_t* const dRepcodes, Repcodes_t* const cRepcodes, + const SeqStore_t* const seqStore, U32 const nbSeq) +{ + U32 idx = 0; + U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? 
seqStore->longLengthPos : nbSeq; + for (; idx < nbSeq; ++idx) { + SeqDef* const seq = seqStore->sequencesStart + idx; + U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); + U32 const offBase = seq->offBase; + assert(offBase > 0); + if (OFFBASE_IS_REPCODE(offBase)) { + U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); + U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { + seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ + ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); + ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } +} + +/* ZSTD_compressSeqStore_singleBlock(): + * Compresses a seqStore into a block with a block header, into the buffer dst. + * + * Returns the total size of that block (including header) or a ZSTD error code. 
+ */ +static size_t +ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, + const SeqStore_t* const seqStore, + Repcodes_t* const dRep, Repcodes_t* const cRep, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) +{ + const U32 rleMaxLength = 25; + BYTE* op = (BYTE*)dst; + const BYTE* ip = (const BYTE*)src; + size_t cSize; + size_t cSeqsSize; + + /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */ + Repcodes_t const dRepOriginal = *dRep; + DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock"); + if (isPartition) + ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart)); + + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "Block header doesn't fit"); + cSeqsSize = ZSTD_entropyCompressSeqStore(seqStore, + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, + srcSize, + zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, + zc->bmi2); + FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); + + if (!zc->isFirstBlock && + cSeqsSize < rleMaxLength && + ZSTD_isRLE((BYTE const*)src, srcSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
+ * This is only an issue for zstd <= v1.4.3 + */ + cSeqsSize = 1; + } + + /* Sequence collection not supported when block splitting */ + if (zc->seqCollector.collectSequences) { + FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } + + if (cSeqsSize == 0) { + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "Nocompress block failed"); + DEBUGLOG(5, "Writing out nocompress block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else if (cSeqsSize == 1) { + cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "RLE compress block failed"); + DEBUGLOG(5, "Writing out RLE block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); + cSize = ZSTD_blockHeaderSize + cSeqsSize; + DEBUGLOG(5, "Writing out compressed block, size: %zu", cSize); + } + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; +} + +/* Struct to keep track of where we are in our recursive calls. */ +typedef struct { + U32* splitLocations; /* Array of split indices */ + size_t idx; /* The current index within splitLocations being worked on */ +} seqStoreSplits; + +#define MIN_SEQUENCES_BLOCK_SPLITTING 300 + +/* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. + * If advantageous to split, then we recurse down the two sub-blocks. 
+ * If not, or if an error occurred in estimation, then we do not recurse. + * + * Note: The recursion depth is capped by a heuristic minimum number of sequences, + * defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * + * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. + * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ +static void +ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + ZSTD_CCtx* zc, const SeqStore_t* origSeqStore) +{ + SeqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; + SeqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; + SeqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + + DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { + DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = 
ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); + DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { + DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; + ZSTD_deriveBlockSplitsHelper(splits, midIdx, endIdx, zc, origSeqStore); + } +} + +/* Base recursive function. + * Populates a table with intra-block partition indices that can improve compression ratio. + * + * @return: number of splits made (which equals the size of the partition table - 1). + */ +static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) +{ + seqStoreSplits splits; + splits.splitLocations = partitions; + splits.idx = 0; + if (nbSeq <= 4) { + DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } + ZSTD_deriveBlockSplitsHelper(&splits, 0, nbSeq, zc, &zc->seqStore); + splits.splitLocations[splits.idx] = nbSeq; + DEBUGLOG(5, "ZSTD_deriveBlockSplits: final nb partitions: %zu", splits.idx+1); + return splits.idx; +} + +/* ZSTD_compressBlock_splitBlock(): + * Attempts to split a given block into multiple blocks to improve compression ratio. + * + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. 
+ */ +static size_t +ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t blockSize, + U32 lastBlock, U32 nbSeq) +{ + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; + U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ + SeqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; + SeqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; + size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two + * separate repcode histories that simulate repcode history on compression and decompression side, + * and use the histories to determine whether we must replace a particular repcode with its raw offset. + * + * 1) cRep gets updated for each partition, regardless of whether the block was emitted as uncompressed + * or RLE. This allows us to retrieve the offset value that an invalid repcode references within + * a nocompress/RLE block. + * 2) dRep gets updated only for compressed partitions, and when a repcode gets replaced, will use + * the replacement offset value rather than the original repcode to update the repcode history. + * dRep also will be the final repcode history sent to the next block. + * + * See ZSTD_seqStore_resolveOffCodes() for more details. 
+ */ + Repcodes_t dRep; + Repcodes_t cRep; + ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); + ZSTD_memset(nextSeqStore, 0, sizeof(SeqStore_t)); + + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { + size_t cSizeSingleBlock = + ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, + &dRep, &cRep, + op, dstCapacity, + ip, blockSize, + lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); + assert(zc->blockSizeMax <= ZSTD_BLOCKSIZE_MAX); + assert(cSizeSingleBlock <= zc->blockSizeMax + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + + size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ + srcBytes += blockSize - srcBytesTotal; + lastBlockEntireSrc = lastBlock; + } else { + ZSTD_deriveSeqStoreChunk(nextSeqStore, &zc->seqStore, partitions[i], partitions[i+1]); + } + + cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, currSeqStore, + &dRep, &cRep, + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); + DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", + ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk 
failed!"); + + ip += srcBytes; + op += cSizeChunk; + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; + assert(cSizeChunk <= zc->blockSizeMax + ZSTD_blockHeaderSize); + } + /* cRep and dRep may have diverged during the compression. + * If so, we use the dRep repcodes for the next block. + */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(Repcodes_t)); + return cSize; +} + +static size_t +ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) +{ + U32 nbSeq; + size_t cSize; + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock"); + assert(zc->appliedParams.postBlockSplitter == ZSTD_ps_enable); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); + cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; + } + nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart); + } + + cSize = ZSTD_compressBlock_splitBlock_internal(zc, dst, dstCapacity, src, srcSize, lastBlock, nbSeq); + FORWARD_IF_ERROR(cSize, "Splitting blocks failed!"); + return cSize; +} + +static size_t +ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) +{ + /* This is an estimated upper bound for the length of an rle block. + * This isn't the actual upper bound. + * Finding the real threshold needs further investigation. 
+ */ + const U32 rleMaxLength = 25; + size_t cSize; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss == ZSTDbss_noCompress) { + RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); + cSize = 0; + goto out; + } + } + + if (zc->seqCollector.collectSequences) { + FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } + + /* encode sequences and literals */ + cSize = ZSTD_entropyCompressSeqStore(&zc->seqStore, + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + dst, dstCapacity, + srcSize, + zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, + zc->bmi2); + + if (frame && + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + !zc->isFirstBlock && + cSize < rleMaxLength && + ZSTD_isRLE(ip, srcSize)) + { + cSize = 1; + op[0] = ip[0]; + } + +out: + if (!ZSTD_isError(cSize) && cSize > 1) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + } + /* We check that dictionaries have offset codes available for the first + * block. After the first block, the offcode table might not have large + * enough codes to represent the offsets in the data. 
+ */ if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; return cSize; } +static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const size_t bss, U32 lastBlock) +{ + DEBUGLOG(6, "Attempting ZSTD_compressSuperBlock()"); + if (bss == ZSTDbss_compress) { + if (/* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + !zc->isFirstBlock && + ZSTD_maybeRLE(&zc->seqStore) && + ZSTD_isRLE((BYTE const*)src, srcSize)) + { + return ZSTD_rleCompressBlock(dst, dstCapacity, *(BYTE const*)src, srcSize, lastBlock); + } + /* Attempt superblock compression. + * + * Note that compressed size of ZSTD_compressSuperBlock() is not bound by the + * standard ZSTD_compressBound(). This is a problem, because even if we have + * space now, taking an extra byte now could cause us to run out of space later + * and violate ZSTD_compressBound(). + * + * Define blockBound(blockSize) = blockSize + ZSTD_blockHeaderSize. + * + * In order to respect ZSTD_compressBound() we must attempt to emit a raw + * uncompressed block in these cases: + * * cSize == 0: Return code for an uncompressed block. + * * cSize == dstSize_tooSmall: We may have expanded beyond blockBound(srcSize). + * ZSTD_noCompressBlock() will return dstSize_tooSmall if we are really out of + * output space. + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. 
+ */ + { size_t const cSize = + ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { + size_t const maxCSize = + srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return cSize; + } + } + } + } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. + * The decoder will be able to stream this block since it is uncompressed. + */ + return ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); +} + +static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastBlock) +{ + size_t cSize = 0; + const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + DEBUGLOG(5, "ZSTD_compressBlock_targetCBlockSize (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u, srcSize=%zu)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + + cSize = ZSTD_compressBlock_targetCBlockSize_body(zc, dst, dstCapacity, src, srcSize, bss, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize_body failed"); -static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; +} + +static void ZSTD_overflowCorrectIfNeeded(ZSTD_MatchState_t* ms, ZSTD_cwksp* ws, ZSTD_CCtx_params const* params, void const* ip, void const* iend) { - if (ZSTD_window_needOverflowCorrection(ms->window, iend)) { - U32 const 
maxDist = (U32)1 << params->cParams.windowLog; - U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy); + U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy); + U32 const maxDist = (U32)1 << params->cParams.windowLog; + if (ZSTD_window_needOverflowCorrection(ms->window, cycleLog, maxDist, ms->loadedDictEnd, ip, iend)) { U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip); ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30); ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30); @@ -2460,60 +4548,134 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, } } +#include "zstd_preSplit.h" + +static size_t ZSTD_optimalBlockSize(ZSTD_CCtx* cctx, const void* src, size_t srcSize, size_t blockSizeMax, int splitLevel, ZSTD_strategy strat, S64 savings) +{ + /* split level based on compression strategy, from `fast` to `btultra2` */ + static const int splitLevels[] = { 0, 0, 1, 2, 2, 3, 3, 4, 4, 4 }; + /* note: conservatively only split full blocks (128 KB) currently. + * While it's possible to go lower, let's keep it simple for a first implementation. + * Besides, benefits of splitting are reduced when blocks are already small. + */ + if (srcSize < 128 KB || blockSizeMax < 128 KB) + return MIN(srcSize, blockSizeMax); + /* do not split incompressible data though: + * require verified savings to allow pre-splitting. + * Note: as a consequence, the first full block is not split. + */ + if (savings < 3) { + DEBUGLOG(6, "don't attempt splitting: savings (%i) too low", (int)savings); + return 128 KB; + } + /* apply @splitLevel, or use default value (which depends on @strat). 
+ * note that splitting heuristic is still conditioned by @savings >= 3, + * so the first block will not reach this code path */ + if (splitLevel == 1) return 128 KB; + if (splitLevel == 0) { + assert(ZSTD_fast <= strat && strat <= ZSTD_btultra2); + splitLevel = splitLevels[strat]; + } else { + assert(2 <= splitLevel && splitLevel <= 6); + splitLevel -= 2; + } + return ZSTD_splitBlock(src, blockSizeMax, splitLevel, cctx->tmpWorkspace, cctx->tmpWkspSize); +} + /*! ZSTD_compress_frameChunk() : * Compress a chunk of data into one or multiple blocks. * All blocks will be terminated, all input will be consumed. * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. * Frame is supposed already started (header already produced) -* @return : compressed size, or an error code +* @return : compressed size, or an error code */ -static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, +static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastFrameChunk) { - size_t blockSize = cctx->blockSize; + size_t blockSizeMax = cctx->blockSizeMax; size_t remaining = srcSize; const BYTE* ip = (const BYTE*)src; BYTE* const ostart = (BYTE*)dst; BYTE* op = ostart; U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog; + S64 savings = (S64)cctx->consumedSrcSize - (S64)cctx->producedCSize; + assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX); - DEBUGLOG(5, "ZSTD_compress_frameChunk (blockSize=%u)", (unsigned)blockSize); + DEBUGLOG(5, "ZSTD_compress_frameChunk (srcSize=%u, blockSizeMax=%u)", (unsigned)srcSize, (unsigned)blockSizeMax); if (cctx->appliedParams.fParams.checksumFlag && srcSize) XXH64_update(&cctx->xxhState, src, srcSize); while (remaining) { - ZSTD_matchState_t* const ms = &cctx->blockState.matchState; - U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); - - RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + 
MIN_CBLOCK_SIZE, + ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; + size_t const blockSize = ZSTD_optimalBlockSize(cctx, + ip, remaining, + blockSizeMax, + cctx->appliedParams.preBlockSplitter_level, + cctx->appliedParams.cParams.strategy, + savings); + U32 const lastBlock = lastFrameChunk & (blockSize == remaining); + assert(blockSize <= remaining); + + /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding + * additional 1. We need to revisit and change this logic to be more consistent */ + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, dstSize_tooSmall, "not enough space to store compressed block"); - if (remaining < blockSize) blockSize = remaining; ZSTD_overflowCorrectIfNeeded( ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize); ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); + ZSTD_window_enforceMaxDist(&ms->window, ip, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); /* Ensure hash/chain table insertion resumes no sooner than lowlimit */ if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit; - { size_t cSize = ZSTD_compressBlock_internal(cctx, - op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize, - ip, blockSize, 1 /* frame */); - FORWARD_IF_ERROR(cSize); - if (cSize == 0) { /* block is not compressible */ - cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); - FORWARD_IF_ERROR(cSize); + { size_t cSize; + if (ZSTD_useTargetCBlockSize(&cctx->appliedParams)) { + cSize = ZSTD_compressBlock_targetCBlockSize(cctx, op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize failed"); + assert(cSize > 0); + assert(cSize <= blockSize + ZSTD_blockHeaderSize); + } else if (ZSTD_blockSplitterEnabled(&cctx->appliedParams)) { + cSize = ZSTD_compressBlock_splitBlock(cctx, op, dstCapacity, ip, blockSize, lastBlock); + 
FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_splitBlock failed"); + assert(cSize > 0 || cctx->seqCollector.collectSequences == 1); } else { - const U32 cBlockHeader = cSize == 1 ? - lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : - lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); - MEM_writeLE24(op, cBlockHeader); - cSize += ZSTD_blockHeaderSize; - } + cSize = ZSTD_compressBlock_internal(cctx, + op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize, + ip, blockSize, 1 /* frame */); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_internal failed"); + + if (cSize == 0) { /* block is not compressible */ + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + } else { + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } + } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ + + /* @savings is employed to ensure that splitting doesn't worsen expansion of incompressible data. + * Without splitting, the maximum expansion is 3 bytes per full block. + * An adversarial input could attempt to fudge the split detector, + * and make it split incompressible data, resulting in more block headers. + * Note that, since ZSTD_COMPRESSBOUND() assumes a worst case scenario of 1KB per block, + * and the splitter never creates blocks that small (current lower limit is 8 KB), + * there is already no risk to expand beyond ZSTD_COMPRESSBOUND() limit. + * But if the goal is to not expand by more than 3-bytes per 128 KB full block, + * then yes, it becomes possible to make the block splitter oversplit incompressible data. 
+ * Using @savings, we enforce an even more conservative condition, + * requiring the presence of enough savings (at least 3 bytes) to authorize splitting, + * otherwise only full blocks are used. + * But being conservative is fine, + * since splitting barely compressible blocks is not fruitful anyway */ + savings += (S64)blockSize - (S64)cSize; ip += blockSize; assert(remaining >= blockSize); @@ -2532,8 +4694,10 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, - const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID) -{ BYTE* const op = (BYTE*)dst; + const ZSTD_CCtx_params* params, + U64 pledgedSrcSize, U32 dictID) +{ + BYTE* const op = (BYTE*)dst; U32 const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */ U32 const dictIDSizeCode = params->fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength; /* 0-3 */ U32 const checksumFlag = params->fParams.checksumFlag>0; @@ -2546,10 +4710,10 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, size_t pos=0; assert(!(params->fParams.contentSizeFlag && pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)); - RETURN_ERROR_IF(dstCapacity < ZSTD_FRAMEHEADERSIZE_MAX, dstSize_tooSmall); + RETURN_ERROR_IF(dstCapacity < ZSTD_FRAMEHEADERSIZE_MAX, dstSize_tooSmall, + "dst buf is too small to fit worst-case frame header size."); DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u", !params->fParams.noDictIDFlag, (unsigned)dictID, (unsigned)dictIDSizeCode); - if (params->format == ZSTD_f_zstd1) { MEM_writeLE32(dst, ZSTD_MAGICNUMBER); pos = 4; @@ -2558,7 +4722,9 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, if (!singleSegment) op[pos++] = windowLogByte; switch(dictIDSizeCode) { - default: assert(0); /* impossible */ + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; case 0 : break; case 1 : op[pos] = (BYTE)(dictID); pos++; break; case 2 : 
MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break; @@ -2566,7 +4732,9 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, } switch(fcsCode) { - default: assert(0); /* impossible */ + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break; case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break; case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break; @@ -2575,6 +4743,26 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, return pos; } +/* ZSTD_writeSkippableFrame_advanced() : + * Writes out a skippable frame with the specified magic number variant (16 are supported), + * from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15, and the desired source data. + * + * Returns the total number of bytes written, or a ZSTD error code. + */ +size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, unsigned magicVariant) { + BYTE* op = (BYTE*)dst; + RETURN_ERROR_IF(dstCapacity < srcSize + ZSTD_SKIPPABLEHEADERSIZE /* Skippable frame overhead */, + dstSize_tooSmall, "Not enough room for skippable frame"); + RETURN_ERROR_IF(srcSize > (unsigned)0xFFFFFFFF, srcSize_wrong, "Src size too large for skippable frame"); + RETURN_ERROR_IF(magicVariant > 15, parameter_outOfBound, "Skippable frame magic number variant not supported"); + + MEM_writeLE32(op, (U32)(ZSTD_MAGIC_SKIPPABLE_START + magicVariant)); + MEM_writeLE32(op+4, (U32)srcSize); + ZSTD_memcpy(op+8, src, srcSize); + return srcSize + ZSTD_SKIPPABLEHEADERSIZE; +} + /* ZSTD_writeLastEmptyBlock() : * output an empty Block with end-of-frame mark to complete a frame * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h)) @@ -2582,23 +4770,23 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, */ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity) { - 
RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall); + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, + "dst buf is too small to write frame trailer empty block."); { U32 const cBlockHeader24 = 1 /*lastBlock*/ + (((U32)bt_raw)<<1); /* 0 size */ MEM_writeLE24(dst, cBlockHeader24); return ZSTD_blockHeaderSize; } } -size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) +void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) { - RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong); - RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm, - parameter_unsupported); + assert(cctx->stage == ZSTDcs_init); + assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); cctx->externSeqStore.seq = seq; cctx->externSeqStore.size = nbSeq; cctx->externSeqStore.capacity = nbSeq; cctx->externSeqStore.pos = 0; - return 0; + cctx->externSeqStore.posInSequence = 0; } @@ -2607,7 +4795,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, const void* src, size_t srcSize, U32 frame, U32 lastFrameChunk) { - ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; size_t fhSize = 0; DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u", @@ -2618,7 +4806,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, if (frame && (cctx->stage==ZSTDcs_init)) { fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, cctx->pledgedSrcSizePlusOne-1, cctx->dictID); - FORWARD_IF_ERROR(fhSize); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); assert(fhSize <= dstCapacity); dstCapacity -= fhSize; dst = (char*)dst + fhSize; @@ -2627,11 +4815,12 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, if (!srcSize) return fhSize; /* do not generate an empty block if no input */ - if (!ZSTD_window_update(&ms->window, src, srcSize)) { + if 
(!ZSTD_window_update(&ms->window, src, srcSize, ms->forceNonContiguous)) { + ms->forceNonContiguous = 0; ms->nextToUpdate = ms->window.dictLimit; } - if (cctx->appliedParams.ldmParams.enableLdm) { - ZSTD_window_update(&cctx->ldmState.window, src, srcSize); + if (cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceNonContiguous */ 0); } if (!frame) { @@ -2641,11 +4830,11 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, src, (BYTE const*)src + srcSize); } - DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSize); + DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSizeMax); { size_t const cSize = frame ? ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) : ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize, 0 /* frame */); - FORWARD_IF_ERROR(cSize); + FORWARD_IF_ERROR(cSize, "%s", frame ? "ZSTD_compress_frameChunk failed" : "ZSTD_compressBlock_internal failed"); cctx->consumedSrcSize += srcSize; cctx->producedCSize += (cSize + fhSize); assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0)); @@ -2662,87 +4851,189 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, } } -size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize) +size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) { DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); } +/* NOTE: Must just wrap ZSTD_compressContinue_public() */ +size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + return ZSTD_compressContinue_public(cctx, dst, 
dstCapacity, src, srcSize); +} -size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) +static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) { ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; assert(!ZSTD_checkCParams(cParams)); - return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); + return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); } -size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */ +size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) +{ + return ZSTD_getBlockSize_deprecated(cctx); +} + +/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ +size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); - { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); - RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong); } + { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); } +/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ +size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); +} + /*! 
ZSTD_loadDictionaryContent() : * @return : 0, or an error code */ -static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, - ZSTD_cwksp* ws, - ZSTD_CCtx_params const* params, - const void* src, size_t srcSize, - ZSTD_dictTableLoadMethod_e dtlm) +static size_t +ZSTD_loadDictionaryContent(ZSTD_MatchState_t* ms, + ldmState_t* ls, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp) { const BYTE* ip = (const BYTE*) src; const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; - ZSTD_window_update(&ms->window, src, srcSize); - ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); - - /* Assert that we the ms params match the params we're being given */ + /* Assert that the ms params match the params we're being given */ ZSTD_assertEqualCParams(params->cParams, ms->cParams); - if (srcSize <= HASH_READ_SIZE) return 0; + { /* Ensure large dictionaries can't cause index overflow */ - while (iend - ip > HASH_READ_SIZE) { - size_t const remaining = (size_t)(iend - ip); - size_t const chunk = MIN(remaining, ZSTD_CHUNKSIZE_MAX); - const BYTE* const ichunk = ip + chunk; + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ + U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; + + int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); + if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { + /* Some dictionary matchfinders in zstd use "short cache", + * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each + * CDict hashtable entry as a tag rather than as part of an index. + * When short cache is used, we need to truncate the dictionary + * so that its indices don't overlap with the tag. 
*/ + U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; + maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); + assert(!loadLdmDict); + } - ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, ichunk); + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; + src = ip; + srcSize = maxDictSize; + } + } - switch(params->cParams.strategy) - { - case ZSTD_fast: - ZSTD_fillHashTable(ms, ichunk, dtlm); - break; - case ZSTD_dfast: - ZSTD_fillDoubleHashTable(ms, ichunk, dtlm); - break; + if (srcSize > ZSTD_CHUNKSIZE_MAX) { + /* We must have cleared our windows when our source is this large. */ + assert(ZSTD_window_isEmpty(ms->window)); + if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); + } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); - case ZSTD_greedy: - case ZSTD_lazy: - case ZSTD_lazy2: - if (chunk >= HASH_READ_SIZE) - ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE); - break; + DEBUGLOG(4, "ZSTD_loadDictionaryContent: useRowMatchFinder=%d", (int)params->useRowMatchFinder); - case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ - case ZSTD_btopt: - case ZSTD_btultra: - case ZSTD_btultra2: - if (chunk >= HASH_READ_SIZE) - ZSTD_updateTree(ms, ichunk-HASH_READ_SIZE, ichunk); - break; + if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ + DEBUGLOG(4, "ZSTD_loadDictionaryContent: Trigger loadLdmDict"); + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); + ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); + DEBUGLOG(4, "ZSTD_loadDictionaryContent: ZSTD_ldm_fillHashTable completes"); + } - default: - assert(0); /* not possible : not a valid strategy id */ + /* If the dict is larger than we can reasonably index in our tables, only load the suffix. 
*/ + { U32 maxDictSize = 1U << MIN(MAX(params->cParams.hashLog + 3, params->cParams.chainLog + 1), 31); + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; + src = ip; + srcSize = maxDictSize; + } + } + + ms->nextToUpdate = (U32)(ip - ms->window.base); + ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); + ms->forceNonContiguous = params->deterministicRefPrefix; + + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + + switch(params->cParams.strategy) + { + case ZSTD_fast: + ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); +#else + assert(0); /* shouldn't be called: cparams should've been adjusted. */ +#endif + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); + ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, iend-HASH_READ_SIZE); + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { + size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); + } else { + ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); + } } +#else + assert(0); /* shouldn't be called: cparams should've been adjusted. 
*/ +#endif + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: +#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + DEBUGLOG(4, "Fill %u bytes into the Binary Tree", (unsigned)srcSize); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); +#else + assert(0); /* shouldn't be called: cparams should've been adjusted. */ +#endif + break; - ip = ichunk; + default: + assert(0); /* not possible : not a valid strategy id */ } ms->nextToUpdate = (U32)(iend - ms->window.base); @@ -2751,102 +5042,90 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, /* Dictionaries that assign zero probability to symbols that show up causes problems - when FSE encoding. Refuse dictionaries that assign zero probability to symbols - that we may encounter during compression. - NOTE: This behavior is not standard and could be improved in the future. */ -static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) { + * when FSE encoding. Mark dictionaries with zero probability symbols as FSE_repeat_check + * and only dictionaries with 100% valid symbols can be assumed valid. 
+ */ +static FSE_repeat ZSTD_dictNCountRepeat(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) +{ U32 s; - RETURN_ERROR_IF(dictMaxSymbolValue < maxSymbolValue, dictionary_corrupted); + if (dictMaxSymbolValue < maxSymbolValue) { + return FSE_repeat_check; + } for (s = 0; s <= maxSymbolValue; ++s) { - RETURN_ERROR_IF(normalizedCounter[s] == 0, dictionary_corrupted); + if (normalizedCounter[s] == 0) { + return FSE_repeat_check; + } } - return 0; + return FSE_repeat_valid; } - -/* Dictionary format : - * See : - * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format - */ -/*! ZSTD_loadZstdDictionary() : - * @return : dictID, or an error code - * assumptions : magic number supposed already checked - * dictSize supposed >= 8 - */ -static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, - ZSTD_matchState_t* ms, - ZSTD_cwksp* ws, - ZSTD_CCtx_params const* params, - const void* dict, size_t dictSize, - ZSTD_dictTableLoadMethod_e dtlm, - void* workspace) +size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + const void* const dict, size_t dictSize) { - const BYTE* dictPtr = (const BYTE*)dict; - const BYTE* const dictEnd = dictPtr + dictSize; short offcodeNCount[MaxOff+1]; unsigned offcodeMaxValue = MaxOff; - size_t dictID; + const BYTE* dictPtr = (const BYTE*)dict; /* skip magic num and dict ID */ + const BYTE* const dictEnd = dictPtr + dictSize; + dictPtr += 8; + bs->entropy.huf.repeatMode = HUF_repeat_check; - ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= 8); - assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY); + { unsigned maxSymbolValue = 255; + unsigned hasZeroWeights = 1; + size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, + (size_t)(dictEnd-dictPtr), &hasZeroWeights); - dictPtr += 4; /* skip magic number */ - dictID = params->fParams.noDictIDFlag ? 
0 : MEM_readLE32(dictPtr); - dictPtr += 4; + /* We only set the loaded table as valid if it contains all non-zero + * weights. Otherwise, we set it to check */ + if (!hasZeroWeights && maxSymbolValue == 255) + bs->entropy.huf.repeatMode = HUF_repeat_valid; - { unsigned maxSymbolValue = 255; - size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, dictEnd-dictPtr); - RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted); - RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted); + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); dictPtr += hufHeaderSize; } { unsigned offcodeLog; - size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); - RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted); - RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted); - /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */ + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); /* fill all offset symbols to avoid garbage at end of table */ RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( bs->entropy.fse.offcodeCTable, offcodeNCount, MaxOff, offcodeLog, workspace, HUF_WORKSPACE_SIZE)), - dictionary_corrupted); + dictionary_corrupted, ""); + /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */ dictPtr += offcodeHeaderSize; } { short matchlengthNCount[MaxML+1]; unsigned matchlengthMaxValue = MaxML, matchlengthLog; - size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); - RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), 
dictionary_corrupted); - RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted); - /* Every match length code must have non-zero probability */ - FORWARD_IF_ERROR( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML)); + size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( bs->entropy.fse.matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, workspace, HUF_WORKSPACE_SIZE)), - dictionary_corrupted); + dictionary_corrupted, ""); + bs->entropy.fse.matchlength_repeatMode = ZSTD_dictNCountRepeat(matchlengthNCount, matchlengthMaxValue, MaxML); dictPtr += matchlengthHeaderSize; } { short litlengthNCount[MaxLL+1]; unsigned litlengthMaxValue = MaxLL, litlengthLog; - size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); - RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted); - RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted); - /* Every literal length code must have non-zero probability */ - FORWARD_IF_ERROR( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL)); + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( bs->entropy.fse.litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, workspace, HUF_WORKSPACE_SIZE)), - dictionary_corrupted); + dictionary_corrupted, ""); + bs->entropy.fse.litlength_repeatMode = 
ZSTD_dictNCountRepeat(litlengthNCount, litlengthMaxValue, MaxLL); dictPtr += litlengthHeaderSize; } - RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted); + RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, ""); bs->rep[0] = MEM_readLE32(dictPtr+0); bs->rep[1] = MEM_readLE32(dictPtr+4); bs->rep[2] = MEM_readLE32(dictPtr+8); @@ -2858,40 +5137,75 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */ offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */ } - /* All offset values <= dictContentSize + 128 KB must be representable */ - FORWARD_IF_ERROR(ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff))); - /* All repCodes must be <= dictContentSize and != 0*/ + /* All offset values <= dictContentSize + 128 KB must be representable for a valid table */ + bs->entropy.fse.offcode_repeatMode = ZSTD_dictNCountRepeat(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)); + + /* All repCodes must be <= dictContentSize and != 0 */ { U32 u; for (u=0; u<3; u++) { - RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted); - RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted); - } } + RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, ""); + RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, ""); + } } } + + return (size_t)(dictPtr - (const BYTE*)dict); +} + +/* Dictionary format : + * See : + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#dictionary-format + */ +/*! 
ZSTD_loadZstdDictionary() : + * @return : dictID, or an error code + * assumptions : magic number supposed already checked + * dictSize supposed >= 8 + */ +static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp, + void* workspace) +{ + const BYTE* dictPtr = (const BYTE*)dict; + const BYTE* const dictEnd = dictPtr + dictSize; + size_t dictID; + size_t eSize; + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= 8); + assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY); + + dictID = params->fParams.noDictIDFlag ? 0 : MEM_readLE32(dictPtr + 4 /* skip magic number */ ); + eSize = ZSTD_loadCEntropy(bs, workspace, dict, dictSize); + FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed"); + dictPtr += eSize; - bs->entropy.huf.repeatMode = HUF_repeat_valid; - bs->entropy.fse.offcode_repeatMode = FSE_repeat_valid; - bs->entropy.fse.matchlength_repeatMode = FSE_repeat_valid; - bs->entropy.fse.litlength_repeatMode = FSE_repeat_valid; + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( - ms, ws, params, dictPtr, dictContentSize, dtlm)); - return dictID; + ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); } + return dictID; } /** ZSTD_compress_insertDictionary() : * @return : dictID, or an error code */ static size_t ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, - ZSTD_matchState_t* ms, + ZSTD_MatchState_t* ms, + ldmState_t* ls, ZSTD_cwksp* ws, const ZSTD_CCtx_params* params, const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp, void* workspace) { DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); if ((dict==NULL) || (dictSize<8)) { - RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, 
dictionary_wrong); + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); return 0; } @@ -2899,27 +5213,28 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, /* dict restricted modes */ if (dictContentType == ZSTD_dct_rawContent) - return ZSTD_loadDictionaryContent(ms, ws, params, dict, dictSize, dtlm); + return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { if (dictContentType == ZSTD_dct_auto) { DEBUGLOG(4, "raw content dictionary detected"); return ZSTD_loadDictionaryContent( - ms, ws, params, dict, dictSize, dtlm); + ms, ls, ws, params, dict, dictSize, dtlm, tfp); } - RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong); + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); assert(0); /* impossible */ } /* dict as full zstd dictionary */ return ZSTD_loadZstdDictionary( - bs, ms, ws, params, dict, dictSize, dtlm, workspace); + bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); } #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) -#define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6) +#define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) /*! ZSTD_compressBegin_internal() : + * Assumption : either @dict OR @cdict (or none) is non-NULL, never both * @return : 0, or an error code */ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, @@ -2929,6 +5244,10 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { + size_t const dictContentSize = cdict ? cdict->dictContentSize : dictSize; +#if ZSTD_TRACE + cctx->traceCtx = (ZSTD_trace_compress_begin != NULL) ? 
ZSTD_trace_compress_begin(cctx) : 0; +#endif DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params->cParams.windowLog); /* params are supposed to be fully validated at this point */ assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); @@ -2943,20 +5262,23 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff); } - FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, *params, pledgedSrcSize, - ZSTDcrp_makeClean, zbuff) ); + FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + dictContentSize, + ZSTDcrp_makeClean, zbuff) , ""); { size_t const dictID = cdict ? ZSTD_compress_insertDictionary( cctx->blockState.prevCBlock, &cctx->blockState.matchState, - &cctx->workspace, params, cdict->dictContent, cdict->dictContentSize, - dictContentType, dtlm, cctx->entropyWorkspace) + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, + ZSTD_tfp_forCCtx, cctx->tmpWorkspace) : ZSTD_compress_insertDictionary( cctx->blockState.prevCBlock, &cctx->blockState.matchState, - &cctx->workspace, params, dict, dictSize, - dictContentType, dtlm, cctx->entropyWorkspace); - FORWARD_IF_ERROR(dictID); + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, + dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->tmpWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= UINT_MAX); cctx->dictID = (U32)dictID; + cctx->dictContentSize = dictContentSize; } return 0; } @@ -2971,7 +5293,7 @@ size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx, { DEBUGLOG(4, "ZSTD_compressBegin_advanced_internal: wlog=%u", params->cParams.windowLog); /* compression parameters verification and optimization */ - FORWARD_IF_ERROR( ZSTD_checkCParams(params->cParams) ); + FORWARD_IF_ERROR( ZSTD_checkCParams(params->cParams) , ""); return ZSTD_compressBegin_internal(cctx, dict, 
dictSize, dictContentType, dtlm, cdict, @@ -2985,27 +5307,35 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize) { - ZSTD_CCtx_params const cctxParams = - ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, params); + ZSTD_CCtx_params cctxParams; + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, ZSTD_NO_CLEVEL); return ZSTD_compressBegin_advanced_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL /*cdict*/, &cctxParams, pledgedSrcSize); } -size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) +static size_t +ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_parameters const params = ZSTD_getParams(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); - ZSTD_CCtx_params const cctxParams = - ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, params); + ZSTD_CCtx_params cctxParams; + { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? 
ZSTD_CLEVEL_DEFAULT : compressionLevel); + } DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); } +size_t +ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) +{ + return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); +} + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) { - return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); + return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); } @@ -3016,15 +5346,14 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) { BYTE* const ostart = (BYTE*)dst; BYTE* op = ostart; - size_t fhSize = 0; DEBUGLOG(4, "ZSTD_writeEpilogue"); RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); /* special case : empty frame */ if (cctx->stage == ZSTDcs_init) { - fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); - FORWARD_IF_ERROR(fhSize); + size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); dstCapacity -= fhSize; op += fhSize; cctx->stage = ZSTDcs_ongoing; @@ -3033,35 +5362,60 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) if (cctx->stage != ZSTDcs_ending) { /* write one last empty block, make it the "last" block */ U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; - RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall); - MEM_writeLE32(op, cBlockHeader24); + ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); + RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); + MEM_writeLE24(op, cBlockHeader24); op += ZSTD_blockHeaderSize; dstCapacity -= ZSTD_blockHeaderSize; } if 
(cctx->appliedParams.fParams.checksumFlag) { U32 const checksum = (U32) XXH64_digest(&cctx->xxhState); - RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); DEBUGLOG(4, "ZSTD_writeEpilogue: write checksum : %08X", (unsigned)checksum); MEM_writeLE32(op, checksum); op += 4; } cctx->stage = ZSTDcs_created; /* return to "created but no init" status */ - return op-ostart; + return (size_t)(op-ostart); } -size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize) +void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) +{ +#if ZSTD_TRACE + if (cctx->traceCtx && ZSTD_trace_compress_end != NULL) { + int const streaming = cctx->inBuffSize > 0 || cctx->outBuffSize > 0 || cctx->appliedParams.nbWorkers > 0; + ZSTD_Trace trace; + ZSTD_memset(&trace, 0, sizeof(trace)); + trace.version = ZSTD_VERSION_NUMBER; + trace.streaming = streaming; + trace.dictionaryID = cctx->dictID; + trace.dictionarySize = cctx->dictContentSize; + trace.uncompressedSize = cctx->consumedSrcSize; + trace.compressedSize = cctx->producedCSize + extraCSize; + trace.params = &cctx->appliedParams; + trace.cctx = cctx; + ZSTD_trace_compress_end(cctx->traceCtx, &trace); + } + cctx->traceCtx = 0; +#else + (void)cctx; + (void)extraCSize; +#endif +} + +size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) { size_t endResult; size_t const cSize = ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 1 /* last chunk */); - FORWARD_IF_ERROR(cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressContinue_internal failed"); endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize); - FORWARD_IF_ERROR(endResult); + FORWARD_IF_ERROR(endResult, "ZSTD_writeEpilogue failed"); assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0)); if 
(cctx->pledgedSrcSizePlusOne != 0) { /* control src size */ ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1); @@ -3073,24 +5427,16 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, (unsigned)cctx->pledgedSrcSizePlusOne-1, (unsigned)cctx->consumedSrcSize); } + ZSTD_CCtx_trace(cctx, endResult); return cSize + endResult; } - -static size_t ZSTD_compress_internal (ZSTD_CCtx* cctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - const void* dict,size_t dictSize, - ZSTD_parameters params) +/* NOTE: Must just wrap ZSTD_compressEnd_public() */ +size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) { - ZSTD_CCtx_params const cctxParams = - ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, params); - DEBUGLOG(4, "ZSTD_compress_internal"); - return ZSTD_compress_advanced_internal(cctx, - dst, dstCapacity, - src, srcSize, - dict, dictSize, - &cctxParams); + return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); } size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, @@ -3100,12 +5446,13 @@ size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, ZSTD_parameters params) { DEBUGLOG(4, "ZSTD_compress_advanced"); - FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams)); - return ZSTD_compress_internal(cctx, - dst, dstCapacity, - src, srcSize, - dict, dictSize, - params); + FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, ZSTD_NO_CLEVEL); + return ZSTD_compress_advanced_internal(cctx, + dst, dstCapacity, + src, srcSize, + dict, dictSize, + &cctx->simpleApiParams); } /* Internal */ @@ -3119,8 +5466,8 @@ size_t ZSTD_compress_advanced_internal( DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", (unsigned)srcSize); FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, - params, srcSize, ZSTDb_not_buffered) ); - return ZSTD_compressEnd(cctx, dst, dstCapacity, src, 
srcSize); + params, srcSize, ZSTDb_not_buffered) , ""); + return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); } size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, @@ -3129,10 +5476,13 @@ size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_parameters const params = ZSTD_getParams(compressionLevel, srcSize + (!srcSize), dict ? dictSize : 0); - ZSTD_CCtx_params cctxParams = ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, params); - assert(params.fParams.contentSizeFlag == 1); - return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctxParams); + { + ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0, ZSTD_cpm_noAttachDict); + assert(params.fParams.contentSizeFlag == 1); + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel); + } + DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctx->simpleApiParams); } size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, @@ -3150,10 +5500,17 @@ size_t ZSTD_compress(void* dst, size_t dstCapacity, int compressionLevel) { size_t result; +#if ZSTD_COMPRESS_HEAPMODE + ZSTD_CCtx* cctx = ZSTD_createCCtx(); + RETURN_ERROR_IF(!cctx, memory_allocation, "ZSTD_createCCtx failed"); + result = ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, compressionLevel); + ZSTD_freeCCtx(cctx); +#else ZSTD_CCtx ctxBody; ZSTD_initCCtx(&ctxBody, ZSTD_defaultCMem); result = ZSTD_compressCCtx(&ctxBody, dst, dstCapacity, src, srcSize, compressionLevel); ZSTD_freeCCtxContent(&ctxBody); /* can't free ctxBody itself, as it's on stack; free only heap content */ +#endif return result; } @@ -3169,14 +5526,17 @@ size_t ZSTD_estimateCDictSize_advanced( DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", 
(unsigned)sizeof(ZSTD_CDict)); return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) - + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + /* enableDedicatedDictSearch == 1 ensures that CDict estimation will not be too small + * in case we are using DDS with row-hash. */ + + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams), + /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); } size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel) { - ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, 0, dictSize); + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); return ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy); } @@ -3194,20 +5554,22 @@ static size_t ZSTD_initCDict_internal( const void* dictBuffer, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, - ZSTD_compressionParameters cParams) + ZSTD_CCtx_params params) { DEBUGLOG(3, "ZSTD_initCDict_internal (dictContentType:%u)", (unsigned)dictContentType); - assert(!ZSTD_checkCParams(cParams)); - cdict->matchState.cParams = cParams; + assert(!ZSTD_checkCParams(params.cParams)); + cdict->matchState.cParams = params.cParams; + cdict->matchState.dedicatedDictSearch = params.enableDedicatedDictSearch; if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) { cdict->dictContent = dictBuffer; } else { void *internalBuffer = ZSTD_cwksp_reserve_object(&cdict->workspace, ZSTD_cwksp_align(dictSize, sizeof(void*))); - RETURN_ERROR_IF(!internalBuffer, memory_allocation); + RETURN_ERROR_IF(!internalBuffer, memory_allocation, "NULL pointer!"); cdict->dictContent = internalBuffer; - memcpy(internalBuffer, dictBuffer, dictSize); + 
ZSTD_memcpy(internalBuffer, dictBuffer, dictSize); } cdict->dictContentSize = dictSize; + cdict->dictContentType = dictContentType; cdict->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cdict->workspace, HUF_WORKSPACE_SIZE); @@ -3217,23 +5579,21 @@ static size_t ZSTD_initCDict_internal( FORWARD_IF_ERROR(ZSTD_reset_matchState( &cdict->matchState, &cdict->workspace, - &cParams, + ¶ms.cParams, + params.useRowMatchFinder, ZSTDcrp_makeClean, ZSTDirp_reset, - ZSTD_resetTarget_CDict)); + ZSTD_resetTarget_CDict), ""); /* (Maybe) load the dictionary * Skips loading the dictionary if it is < 8 bytes. */ - { ZSTD_CCtx_params params; - memset(¶ms, 0, sizeof(params)); - params.compressionLevel = ZSTD_CLEVEL_DEFAULT; + { params.compressionLevel = ZSTD_CLEVEL_DEFAULT; params.fParams.contentSizeFlag = 1; - params.cParams = cParams; { size_t const dictID = ZSTD_compress_insertDictionary( - &cdict->cBlockState, &cdict->matchState, &cdict->workspace, + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, ¶ms, cdict->dictContent, cdict->dictContentSize, - dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); - FORWARD_IF_ERROR(dictID); + dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= (size_t)(U32)-1); cdict->dictID = (U32)dictID; } @@ -3242,66 +5602,132 @@ static size_t ZSTD_initCDict_internal( return 0; } -ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, - ZSTD_dictLoadMethod_e dictLoadMethod, - ZSTD_dictContentType_e dictContentType, - ZSTD_compressionParameters cParams, ZSTD_customMem customMem) +static ZSTD_CDict* +ZSTD_createCDict_advanced_internal(size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_compressionParameters cParams, + ZSTD_ParamSwitch_e useRowMatchFinder, + int enableDedicatedDictSearch, + ZSTD_customMem customMem) { - DEBUGLOG(3, "ZSTD_createCDict_advanced, mode %u", 
(unsigned)dictContentType); - if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; + DEBUGLOG(3, "ZSTD_createCDict_advanced_internal (dictSize=%u)", (unsigned)dictSize); { size_t const workspaceSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + - ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + + ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); - void* const workspace = ZSTD_malloc(workspaceSize, customMem); + void* const workspace = ZSTD_customMalloc(workspaceSize, customMem); ZSTD_cwksp ws; ZSTD_CDict* cdict; if (!workspace) { - ZSTD_free(workspace, customMem); + ZSTD_customFree(workspace, customMem); return NULL; } - ZSTD_cwksp_init(&ws, workspace, workspaceSize); + ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_dynamic_alloc); cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict)); assert(cdict != NULL); ZSTD_cwksp_move(&cdict->workspace, &ws); cdict->customMem = customMem; - cdict->compressionLevel = 0; /* signals advanced API usage */ + cdict->compressionLevel = ZSTD_NO_CLEVEL; /* signals advanced API usage */ + cdict->useRowMatchFinder = useRowMatchFinder; + return cdict; + } +} - if (ZSTD_isError( ZSTD_initCDict_internal(cdict, - dictBuffer, dictSize, - dictLoadMethod, dictContentType, - cParams) )) { - ZSTD_freeCDict(cdict); - return NULL; - } +ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams, + ZSTD_customMem customMem) +{ + ZSTD_CCtx_params cctxParams; + ZSTD_memset(&cctxParams, 0, sizeof(cctxParams)); + DEBUGLOG(3, "ZSTD_createCDict_advanced, dictSize=%u, mode=%u", (unsigned)dictSize, 
(unsigned)dictContentType); + ZSTD_CCtxParams_init(&cctxParams, 0); + cctxParams.cParams = cParams; + cctxParams.customMem = customMem; + return ZSTD_createCDict_advanced2( + dictBuffer, dictSize, + dictLoadMethod, dictContentType, + &cctxParams, customMem); +} + +ZSTD_CDict* ZSTD_createCDict_advanced2( + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + const ZSTD_CCtx_params* originalCctxParams, + ZSTD_customMem customMem) +{ + ZSTD_CCtx_params cctxParams = *originalCctxParams; + ZSTD_compressionParameters cParams; + ZSTD_CDict* cdict; - return cdict; + DEBUGLOG(3, "ZSTD_createCDict_advanced2, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + if (cctxParams.enableDedicatedDictSearch) { + cParams = ZSTD_dedicatedDictSearch_getCParams( + cctxParams.compressionLevel, dictSize); + ZSTD_overrideCParams(&cParams, &cctxParams.cParams); + } else { + cParams = ZSTD_getCParamsFromCCtxParams( + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + } + + if (!ZSTD_dedicatedDictSearch_isSupported(&cParams)) { + /* Fall back to non-DDSS params */ + cctxParams.enableDedicatedDictSearch = 0; + cParams = ZSTD_getCParamsFromCCtxParams( + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + } + + DEBUGLOG(3, "ZSTD_createCDict_advanced2: DedicatedDictSearch=%u", cctxParams.enableDedicatedDictSearch); + cctxParams.cParams = cParams; + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); + + cdict = ZSTD_createCDict_advanced_internal(dictSize, + dictLoadMethod, cctxParams.cParams, + cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, + customMem); + + if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cctxParams) )) { + ZSTD_freeCDict(cdict); + return NULL; } 
+ + return cdict; } ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_compressionParameters cParams = ZSTD_getCParams(compressionLevel, 0, dictSize); - ZSTD_CDict* cdict = ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, cParams, ZSTD_defaultCMem); if (cdict) - cdict->compressionLevel = compressionLevel == 0 ? ZSTD_CLEVEL_DEFAULT : compressionLevel; + cdict->compressionLevel = (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel; return cdict; } ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_compressionParameters cParams = ZSTD_getCParams(compressionLevel, 0, dictSize); - return ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, cParams, ZSTD_defaultCMem); + if (cdict) + cdict->compressionLevel = (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel; + return cdict; } size_t ZSTD_freeCDict(ZSTD_CDict* cdict) @@ -3311,7 +5737,7 @@ size_t ZSTD_freeCDict(ZSTD_CDict* cdict) int cdictInWorkspace = ZSTD_cwksp_owns_buffer(&cdict->workspace, cdict); ZSTD_cwksp_free(&cdict->workspace, cMem); if (!cdictInWorkspace) { - ZSTD_free(cdict, cMem); + ZSTD_customFree(cdict, cMem); } return 0; } @@ -3325,7 +5751,7 @@ size_t ZSTD_freeCDict(ZSTD_CDict* cdict) * workspaceSize: Use ZSTD_estimateCDictSize() * to determine how large workspace must be. * cParams : use ZSTD_getCParams() to transform a compression level - * into its relevants cParams. + * into its relevant cParams. 
* @return : pointer to ZSTD_CDict*, or NULL if error (size too small) * Note : there is no corresponding "free" function. * Since workspace was allocated externally, it must be freed externally. @@ -3337,32 +5763,40 @@ const ZSTD_CDict* ZSTD_initStaticCDict( ZSTD_dictContentType_e dictContentType, ZSTD_compressionParameters cParams) { - size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0); + ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); + /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */ + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0); size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + matchStateSize; ZSTD_CDict* cdict; + ZSTD_CCtx_params params; + DEBUGLOG(4, "ZSTD_initStaticCDict (dictSize==%u)", (unsigned)dictSize); if ((size_t)workspace & 7) return NULL; /* 8-aligned */ { ZSTD_cwksp ws; - ZSTD_cwksp_init(&ws, workspace, workspaceSize); + ZSTD_cwksp_init(&ws, workspace, workspaceSize, ZSTD_cwksp_static_alloc); cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict)); if (cdict == NULL) return NULL; ZSTD_cwksp_move(&cdict->workspace, &ws); } - DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u", - (unsigned)workspaceSize, (unsigned)neededSize, (unsigned)(workspaceSize < neededSize)); if (workspaceSize < neededSize) return NULL; + ZSTD_CCtxParams_init(¶ms, 0); + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; + cdict->compressionLevel = ZSTD_NO_CLEVEL; + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, dict, dictSize, dictLoadMethod, dictContentType, - cParams) 
)) + params) )) return NULL; return cdict; @@ -3374,59 +5808,101 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict) return cdict->matchState.cParams; } -/* ZSTD_compressBegin_usingCDict_advanced() : - * cdict must be != NULL */ -size_t ZSTD_compressBegin_usingCDict_advanced( +/*! ZSTD_getDictID_fromCDict() : + * Provides the dictID of the dictionary loaded into `cdict`. + * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict) +{ + if (cdict==NULL) return 0; + return cdict->dictID; +} + +/* ZSTD_compressBegin_usingCDict_internal() : + * Implementation of various ZSTD_compressBegin_usingCDict* functions. + */ +static size_t ZSTD_compressBegin_usingCDict_internal( ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) { - DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_advanced"); - RETURN_ERROR_IF(cdict==NULL, dictionary_wrong); - { ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_CCtx_params cctxParams; + DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_internal"); + RETURN_ERROR_IF(cdict==NULL, dictionary_wrong, "NULL pointer!"); + /* Initialize the cctxParams from the cdict */ + { + ZSTD_parameters params; + params.fParams = fParams; params.cParams = ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN - || cdict->compressionLevel == 0 ) - && (params.attachDictPref != ZSTD_dictForceLoad) ? + || cdict->compressionLevel == 0 ) ? ZSTD_getCParamsFromCDict(cdict) : ZSTD_getCParams(cdict->compressionLevel, pledgedSrcSize, cdict->dictContentSize); - /* Increase window log to fit the entire dictionary and source if the - * source size is known. 
Limit the increase to 19, which is the - * window log for compression level 1 with the largest source size. - */ - if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) { - U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19); - U32 const limitedSrcLog = limitedSrcSize > 1 ? ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1; - params.cParams.windowLog = MAX(params.cParams.windowLog, limitedSrcLog); - } - params.fParams = fParams; - return ZSTD_compressBegin_internal(cctx, - NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, - cdict, - ¶ms, pledgedSrcSize, - ZSTDb_not_buffered); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, cdict->compressionLevel); + } + /* Increase window log to fit the entire dictionary and source if the + * source size is known. Limit the increase to 19, which is the + * window log for compression level 1 with the largest source size. + */ + if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) { + U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19); + U32 const limitedSrcLog = limitedSrcSize > 1 ? ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1; + cctxParams.cParams.windowLog = MAX(cctxParams.cParams.windowLog, limitedSrcLog); } + return ZSTD_compressBegin_internal(cctx, + NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, + cdict, + &cctxParams, pledgedSrcSize, + ZSTDb_not_buffered); +} + + +/* ZSTD_compressBegin_usingCDict_advanced() : + * This function is DEPRECATED. 
+ * cdict must be != NULL */ +size_t ZSTD_compressBegin_usingCDict_advanced( + ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, + ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) +{ + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, pledgedSrcSize); } /* ZSTD_compressBegin_usingCDict() : - * pledgedSrcSize=0 means "unknown" - * if pledgedSrcSize>0, it will enable contentSizeFlag */ -size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + * cdict must be != NULL */ +size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) { ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; - DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u", !fParams.noDictIDFlag); - return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); } +size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) +{ + return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); +} + +/*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) +{ + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ + return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); +} + +/*! ZSTD_compress_usingCDict_advanced(): + * This function is DEPRECATED. 
+ */ size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) { - FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, srcSize)); /* will check if cdict != NULL */ - return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); } /*! ZSTD_compress_usingCDict() : @@ -3440,7 +5916,7 @@ size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) { ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; - return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); } @@ -3481,32 +5957,12 @@ size_t ZSTD_CStreamOutSize(void) return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; } -static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx, - const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType, - const ZSTD_CDict* const cdict, - ZSTD_CCtx_params params, unsigned long long const pledgedSrcSize) +static ZSTD_CParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) { - DEBUGLOG(4, "ZSTD_resetCStream_internal"); - /* Finalize the compression parameters */ - params.cParams = ZSTD_getCParamsFromCCtxParams(¶ms, pledgedSrcSize, dictSize); - /* params are supposed to be fully validated at this point */ - assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); - assert(!((dict) && (cdict))); /* either dict or cdict, not both */ - - FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, - dict, dictSize, dictContentType, ZSTD_dtlm_fast, - cdict, - ¶ms, pledgedSrcSize, - ZSTDb_buffered) ); - - cctx->inToCompress = 0; - cctx->inBuffPos = 0; 
- cctx->inBuffTarget = cctx->blockSize - + (cctx->blockSize == pledgedSrcSize); /* for small input: avoid automatic flush on reaching end of block, since it would require to add a 3-bytes null block to end frame */ - cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0; - cctx->streamStage = zcss_load; - cctx->frameEnded = 0; - return 0; /* ready to go */ + if (cdict != NULL && ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) + return ZSTD_cpm_attachDict; + else + return ZSTD_cpm_noAttachDict; } /* ZSTD_resetCStream(): @@ -3519,8 +5975,8 @@ size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pss) */ U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (unsigned)pledgedSrcSize); - FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) ); - FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) ); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); return 0; } @@ -3534,16 +5990,16 @@ size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize) { DEBUGLOG(4, "ZSTD_initCStream_internal"); - FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) ); - FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) ); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); zcs->requestedParams = *params; assert(!((dict) && (cdict))); /* either dict or cdict, not both */ if (dict) { - FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) ); + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); } else { /* Dictionary is cleared if !cdict */ - FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) ); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); } 
return 0; } @@ -3556,10 +6012,10 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize) { DEBUGLOG(4, "ZSTD_initCStream_usingCDict_advanced"); - FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) ); - FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) ); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); zcs->requestedParams.fParams = fParams; - FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) ); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); return 0; } @@ -3567,8 +6023,8 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict) { DEBUGLOG(4, "ZSTD_initCStream_usingCDict"); - FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) ); - FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) ); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); return 0; } @@ -3587,20 +6043,20 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, */ U64 const pledgedSrcSize = (pss==0 && params.fParams.contentSizeFlag==0) ? 
ZSTD_CONTENTSIZE_UNKNOWN : pss; DEBUGLOG(4, "ZSTD_initCStream_advanced"); - FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) ); - FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) ); - FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) ); - zcs->requestedParams = ZSTD_assignParamsToCCtxParams(&zcs->requestedParams, params); - FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) ); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , ""); + ZSTD_CCtxParams_setZstdParams(&zcs->requestedParams, ¶ms); + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); return 0; } size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel) { DEBUGLOG(4, "ZSTD_initCStream_usingDict"); - FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) ); - FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) ); - FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) ); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); return 0; } @@ -3612,19 +6068,19 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigne */ U64 const pledgedSrcSize = (pss==0) ? 
ZSTD_CONTENTSIZE_UNKNOWN : pss; DEBUGLOG(4, "ZSTD_initCStream_srcSize"); - FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) ); - FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) ); - FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) ); - FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) ); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); return 0; } size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) { DEBUGLOG(4, "ZSTD_initCStream"); - FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) ); - FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) ); - FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) ); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); return 0; } @@ -3632,44 +6088,54 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) { - size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; - if (hintInSize==0) hintInSize = cctx->blockSize; - return hintInSize; -} - -static size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, - const void* src, size_t srcSize) -{ - size_t const length = MIN(dstCapacity, srcSize); - if (length) memcpy(dst, src, length); - return length; + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + return cctx->blockSizeMax - cctx->stableIn_notConsumed; + } + assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); + { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; + if (hintInSize==0) 
hintInSize = cctx->blockSizeMax; + return hintInSize; + } } /** ZSTD_compressStream_generic(): * internal function for all *compressStream*() variants - * non-static, because can be called from zstdmt_compress.c - * @return : hint size for next input */ + * @return : hint size for next input to complete ongoing block */ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input, ZSTD_EndDirective const flushMode) { - const char* const istart = (const char*)input->src; - const char* const iend = istart + input->size; - const char* ip = istart + input->pos; - char* const ostart = (char*)output->dst; - char* const oend = ostart + output->size; - char* op = ostart + output->pos; + const char* const istart = (assert(input != NULL), (const char*)input->src); + const char* const iend = (istart != NULL) ? istart + input->size : istart; + const char* ip = (istart != NULL) ? istart + input->pos : istart; + char* const ostart = (assert(output != NULL), (char*)output->dst); + char* const oend = (ostart != NULL) ? ostart + output->size : ostart; + char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; U32 someMoreWork = 1; /* check expectations */ - DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); - assert(zcs->inBuff != NULL); - assert(zcs->inBuffSize > 0); - assert(zcs->outBuff != NULL); - assert(zcs->outBuffSize > 0); - assert(output->pos <= output->size); + DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); + assert(zcs != NULL); + if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { + assert(input->pos >= zcs->stableIn_notConsumed); + input->pos -= zcs->stableIn_notConsumed; + if (ip) ip -= zcs->stableIn_notConsumed; + zcs->stableIn_notConsumed = 0; + } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); + } + if (zcs->appliedParams.outBufferMode == ZSTD_bm_buffered) { + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } + if (input->src == NULL) assert(input->size == 0); assert(input->pos <= input->size); + if (output->dst == NULL) assert(output->size == 0); + assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); while (someMoreWork) { switch(zcs->streamStage) @@ -3679,26 +6145,29 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, case zcss_load: if ( (flushMode == ZSTD_e_end) - && ((size_t)(oend-op) >= ZSTD_compressBound(iend-ip)) /* enough dstCapacity */ + && ( (size_t)(oend-op) >= ZSTD_compressBound((size_t)(iend-ip)) /* Enough output space */ + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ && (zcs->inBuffPos == 0) ) { /* shortcut to compression pass directly into output buffer */ - size_t const cSize = ZSTD_compressEnd(zcs, - op, oend-op, ip, iend-ip); + size_t const cSize = ZSTD_compressEnd_public(zcs, + op, (size_t)(oend-op), + ip, (size_t)(iend-ip)); DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); - FORWARD_IF_ERROR(cSize); + 
FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); ip = iend; op += cSize; zcs->frameEnded = 1; ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); someMoreWork = 0; break; } - /* complete loading into inBuffer */ - { size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; + /* complete loading into inBuffer in buffered mode */ + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; size_t const loaded = ZSTD_limitCopy( zcs->inBuff + zcs->inBuffPos, toLoad, - ip, iend-ip); + ip, (size_t)(iend-ip)); zcs->inBuffPos += loaded; - ip += loaded; + if (ip) ip += loaded; if ( (flushMode == ZSTD_e_continue) && (zcs->inBuffPos < zcs->inBuffTarget) ) { /* not enough input to fill full block : stop here */ @@ -3709,34 +6178,62 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, /* empty */ someMoreWork = 0; break; } + } else { + assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); + if ( (flushMode == ZSTD_e_continue) + && ( (size_t)(iend - ip) < zcs->blockSizeMax) ) { + /* can't compress a full block : stop here */ + zcs->stableIn_notConsumed = (size_t)(iend - ip); + ip = iend; /* pretend to have consumed input */ + someMoreWork = 0; break; + } + if ( (flushMode == ZSTD_e_flush) + && (ip == iend) ) { + /* empty */ + someMoreWork = 0; break; + } } /* compress current block (note : this stage cannot be stopped in the middle) */ DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); - { void* cDst; + { int const inputBuffered = (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered); + void* cDst; size_t cSize; - size_t const iSize = zcs->inBuffPos - zcs->inToCompress; - size_t oSize = oend-op; - unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); - if (oSize >= ZSTD_compressBound(iSize)) + size_t oSize = (size_t)(oend-op); + size_t const iSize = inputBuffered ? 
zcs->inBuffPos - zcs->inToCompress + : MIN((size_t)(iend - ip), zcs->blockSizeMax); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) cDst = op; /* compress into output buffer, to skip flush stage */ else cDst = zcs->outBuff, oSize = zcs->outBuffSize; - cSize = lastBlock ? - ZSTD_compressEnd(zcs, cDst, oSize, - zcs->inBuff + zcs->inToCompress, iSize) : - ZSTD_compressContinue(zcs, cDst, oSize, - zcs->inBuff + zcs->inToCompress, iSize); - FORWARD_IF_ERROR(cSize); - zcs->frameEnded = lastBlock; - /* prepare next block */ - zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize; - if (zcs->inBuffTarget > zcs->inBuffSize) - zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize; - DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u", - (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize); - if (!lastBlock) - assert(zcs->inBuffTarget <= zcs->inBuffSize); - zcs->inToCompress = zcs->inBuffPos; + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? + ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : + ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; + /* prepare next block */ + zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSizeMax; + if (zcs->inBuffTarget > zcs->inBuffSize) + zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSizeMax; + DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u", + (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize); + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; + } else { /* !inputBuffered, hence ZSTD_bm_stable */ + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? 
+ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : + ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ + if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; + if (lastBlock) assert(ip == iend); + } if (cDst == op) { /* no need to flush */ op += cSize; if (zcs->frameEnded) { @@ -3750,15 +6247,17 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, zcs->outBuffFlushedSize = 0; zcs->streamStage = zcss_flush; /* pass-through to flush stage */ } - /* fall-through */ + ZSTD_FALLTHROUGH; case zcss_flush: DEBUGLOG(5, "flush stage"); + assert(zcs->appliedParams.outBufferMode == ZSTD_bm_buffered); { size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; size_t const flushed = ZSTD_limitCopy(op, (size_t)(oend-op), zcs->outBuff + zcs->outBuffFlushedSize, toFlush); DEBUGLOG(5, "toFlush: %u into %u ==> flushed: %u", (unsigned)toFlush, (unsigned)(oend-op), (unsigned)flushed); - op += flushed; + if (flushed) + op += flushed; zcs->outBuffFlushedSize += flushed; if (toFlush!=flushed) { /* flush not fully completed, presumably because dst is too small */ @@ -3782,8 +6281,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, } } - input->pos = ip - istart; - output->pos = op - ostart; + input->pos = (size_t)(ip - istart); + output->pos = (size_t)(op - ostart); if (zcs->frameEnded) return 0; return ZSTD_nextInputSizeHint(zcs); } @@ -3802,94 +6301,246 @@ static size_t ZSTD_nextInputSizeHint_MTorST(const ZSTD_CCtx* cctx) size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input) { - FORWARD_IF_ERROR( ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue) ); + FORWARD_IF_ERROR( ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue) , ""); return ZSTD_nextInputSizeHint_MTorST(zcs); } - -size_t ZSTD_compressStream2( 
ZSTD_CCtx* cctx, - ZSTD_outBuffer* output, - ZSTD_inBuffer* input, - ZSTD_EndDirective endOp) +/* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. + */ +static void +ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) { - DEBUGLOG(5, "ZSTD_compressStream2, endOp=%u ", (unsigned)endOp); - /* check conditions */ - RETURN_ERROR_IF(output->pos > output->size, GENERIC); - RETURN_ERROR_IF(input->pos > input->size, GENERIC); - assert(cctx!=NULL); + DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + cctx->expectedOutBufferSize = output->size - output->pos; + } +} - /* transparent initialization stage */ - if (cctx->streamStage == zcss_init) { - ZSTD_CCtx_params params = cctx->requestedParams; - ZSTD_prefixDict const prefixDict = cctx->prefixDict; - FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) ); /* Init the local dict if present. */ - memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* single usage */ - assert(prefixDict.dict==NULL || cctx->cdict==NULL); /* only one can be set */ - DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); - if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = input->size + 1; /* auto-fix pledgedSrcSize */ +/* Validate that the input/output buffers match the expectations set by + * ZSTD_setBufferExpectations. 
+ */ +static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + ZSTD_outBuffer const* output, + ZSTD_inBuffer const* input, + ZSTD_EndDirective endOp) +{ + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; + if (expect.src != input->src || expect.pos != input->pos) + RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } + (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) + RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; +} + +/* + * If @endOp == ZSTD_e_end, @inSize becomes pledgedSrcSize. + * Otherwise, it's ignored. + * @return: 0 on success, or a ZSTD_error code otherwise. + */ +static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, + size_t inSize) +{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ + ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* single usage */ + assert(prefixDict.dict==NULL || cctx->cdict==NULL); /* only one can be set */ + if (cctx->cdict && !cctx->localDict.cdict) { + /* Let the cdict's compression level take priority over the requested params. + * But do not take the cdict's compression level if the "cdict" is actually a localDict + * generated from ZSTD_initLocalDict(). + */ + params.compressionLevel = cctx->cdict->compressionLevel; + } + DEBUGLOG(4, "ZSTD_CCtx_init_compressStream2 : transparent init stage"); + if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ + + { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? 
cctx->cdict->dictContentSize : 0); + ZSTD_CParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, &params, cctx->pledgedSrcSizePlusOne - 1); params.cParams = ZSTD_getCParamsFromCCtxParams( - &cctx->requestedParams, cctx->pledgedSrcSizePlusOne-1, 0 /*dictSize*/); + &params, cctx->pledgedSrcSizePlusOne-1, + dictSize, mode); + } + params.postBlockSplitter = ZSTD_resolveBlockSplitterMode(params.postBlockSplitter, &params.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams); + params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); + params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); + params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); #ifdef ZSTD_MULTITHREAD - if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) { - params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */ + /* If external matchfinder is enabled, make sure to fail before checking job size (for consistency) */ + RETURN_ERROR_IF( + ZSTD_hasExtSeqProd(&params) && params.nbWorkers >= 1, + parameter_combination_unsupported, + "External sequence producer isn't supported with nbWorkers >= 1" + ); + + if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) { + params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */ + } + if (params.nbWorkers > 0) { +# if ZSTD_TRACE + cctx->traceCtx = (ZSTD_trace_compress_begin != NULL) ? 
ZSTD_trace_compress_begin(cctx) : 0; +# endif + /* mt context creation */ + if (cctx->mtctx == NULL) { + DEBUGLOG(4, "ZSTD_compressStream2: creating new mtctx for nbWorkers=%u", + params.nbWorkers); + cctx->mtctx = ZSTDMT_createCCtx_advanced((U32)params.nbWorkers, cctx->customMem, cctx->pool); + RETURN_ERROR_IF(cctx->mtctx == NULL, memory_allocation, "NULL pointer!"); } - if (params.nbWorkers > 0) { - /* mt context creation */ - if (cctx->mtctx == NULL) { - DEBUGLOG(4, "ZSTD_compressStream2: creating new mtctx for nbWorkers=%u", - params.nbWorkers); - cctx->mtctx = ZSTDMT_createCCtx_advanced((U32)params.nbWorkers, cctx->customMem); - RETURN_ERROR_IF(cctx->mtctx == NULL, memory_allocation); + /* mt compression */ + DEBUGLOG(4, "call ZSTDMT_initCStream_internal as nbWorkers=%u", params.nbWorkers); + FORWARD_IF_ERROR( ZSTDMT_initCStream_internal( + cctx->mtctx, + prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, + cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) , ""); + cctx->dictID = cctx->cdict ? cctx->cdict->dictID : 0; + cctx->dictContentSize = cctx->cdict ? 
cctx->cdict->dictContentSize : prefixDict.dictSize; + cctx->consumedSrcSize = 0; + cctx->producedCSize = 0; + cctx->streamStage = zcss_load; + cctx->appliedParams = params; + } else +#endif /* ZSTD_MULTITHREAD */ + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, ZSTD_dtlm_fast, + cctx->cdict, + ¶ms, pledgedSrcSize, + ZSTDb_buffered) , ""); + assert(cctx->appliedParams.nbWorkers == 0); + cctx->inToCompress = 0; + cctx->inBuffPos = 0; + if (cctx->appliedParams.inBufferMode == ZSTD_bm_buffered) { + /* for small input: avoid automatic flush on reaching end of block, since + * it would require to add a 3-bytes null block to end frame + */ + cctx->inBuffTarget = cctx->blockSizeMax + (cctx->blockSizeMax == pledgedSrcSize); + } else { + cctx->inBuffTarget = 0; + } + cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0; + cctx->streamStage = zcss_load; + cctx->frameEnded = 0; + } + return 0; +} + +/* @return provides a minimum amount of data remaining to be flushed from internal buffers + */ +size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp) +{ + DEBUGLOG(5, "ZSTD_compressStream2, endOp=%u ", (unsigned)endOp); + /* check conditions */ + RETURN_ERROR_IF(output->pos > output->size, dstSize_tooSmall, "invalid output buffer"); + RETURN_ERROR_IF(input->pos > input->size, srcSize_wrong, "invalid input buffer"); + RETURN_ERROR_IF((U32)endOp > (U32)ZSTD_e_end, parameter_outOfBound, "invalid endDirective"); + assert(cctx != NULL); + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { + size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ + size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; + if ( 
(cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ + && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ + && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ + if (cctx->stableIn_notConsumed) { /* not the first time */ + /* check stable source guarantees */ + RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); + RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); } - /* mt compression */ - DEBUGLOG(4, "call ZSTDMT_initCStream_internal as nbWorkers=%u", params.nbWorkers); - FORWARD_IF_ERROR( ZSTDMT_initCStream_internal( - cctx->mtctx, - prefixDict.dict, prefixDict.dictSize, ZSTD_dct_rawContent, - cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) ); - cctx->streamStage = zcss_load; - cctx->appliedParams.nbWorkers = params.nbWorkers; - } else -#endif - { FORWARD_IF_ERROR( ZSTD_resetCStream_internal(cctx, - prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, - cctx->cdict, - params, cctx->pledgedSrcSizePlusOne-1) ); - assert(cctx->streamStage == zcss_load); - assert(cctx->appliedParams.nbWorkers == 0); - } } + /* pretend input was consumed, to give a sense forward progress */ + input->pos = input->size; + /* save stable inBuffer, for later control, and flush/end */ + cctx->expectedInBuffer = *input; + /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ + cctx->stableIn_notConsumed += inputSize; + /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ + return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ + } + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), 
"compressStream2 initialization failed"); + ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } /* end of transparent initialization stage */ + FORWARD_IF_ERROR(ZSTD_checkBufferStability(cctx, output, input, endOp), "invalid buffers"); /* compression stage */ #ifdef ZSTD_MULTITHREAD if (cctx->appliedParams.nbWorkers > 0) { - int const forceMaxProgress = (endOp == ZSTD_e_flush || endOp == ZSTD_e_end); size_t flushMin; - assert(forceMaxProgress || endOp == ZSTD_e_continue /* Protection for a new flush type */); if (cctx->cParamsChanged) { ZSTDMT_updateCParams_whileCompressing(cctx->mtctx, &cctx->requestedParams); cctx->cParamsChanged = 0; } - do { + if (cctx->stableIn_notConsumed) { + assert(cctx->appliedParams.inBufferMode == ZSTD_bm_stable); + /* some early data was skipped - make it available for consumption */ + assert(input->pos >= cctx->stableIn_notConsumed); + input->pos -= cctx->stableIn_notConsumed; + cctx->stableIn_notConsumed = 0; + } + for (;;) { + size_t const ipos = input->pos; + size_t const opos = output->pos; flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp); + cctx->consumedSrcSize += (U64)(input->pos - ipos); + cctx->producedCSize += (U64)(output->pos - opos); if ( ZSTD_isError(flushMin) || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */ + if (flushMin == 0) + ZSTD_CCtx_trace(cctx, 0); ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only); } - FORWARD_IF_ERROR(flushMin); - } while (forceMaxProgress && flushMin != 0 && output->pos < output->size); + FORWARD_IF_ERROR(flushMin, "ZSTDMT_compressStream_generic failed"); + + if (endOp == ZSTD_e_continue) { + /* We only require some progress with ZSTD_e_continue, not maximal progress. + * We're done if we've consumed or produced any bytes, or either buffer is + * full. 
+ */ + if (input->pos != ipos || output->pos != opos || input->pos == input->size || output->pos == output->size) + break; + } else { + assert(endOp == ZSTD_e_flush || endOp == ZSTD_e_end); + /* We require maximal progress. We're done when the flush is complete or the + * output buffer is full. + */ + if (flushMin == 0 || output->pos == output->size) + break; + } + } DEBUGLOG(5, "completed ZSTD_compressStream2 delegating to ZSTDMT_compressStream_generic"); /* Either we don't require maximum forward progress, we've finished the * flush, or we are out of output space. */ - assert(!forceMaxProgress || flushMin == 0 || output->pos == output->size); + assert(endOp == ZSTD_e_continue || flushMin == 0 || output->pos == output->size); + ZSTD_setBufferExpectations(cctx, output, input); return flushMin; } -#endif - FORWARD_IF_ERROR( ZSTD_compressStream_generic(cctx, output, input, endOp) ); +#endif /* ZSTD_MULTITHREAD */ + FORWARD_IF_ERROR( ZSTD_compressStream_generic(cctx, output, input, endOp) , ""); DEBUGLOG(5, "completed ZSTD_compressStream2"); + ZSTD_setBufferExpectations(cctx, output, input); return cctx->outBuffContentSize - cctx->outBuffFlushedSize; /* remaining to flush */ } @@ -3899,52 +6550,1113 @@ size_t ZSTD_compressStream2_simpleArgs ( const void* src, size_t srcSize, size_t* srcPos, ZSTD_EndDirective endOp) { - ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; - ZSTD_inBuffer input = { src, srcSize, *srcPos }; + ZSTD_outBuffer output; + ZSTD_inBuffer input; + output.dst = dst; + output.size = dstCapacity; + output.pos = *dstPos; + input.src = src; + input.size = srcSize; + input.pos = *srcPos; /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ - size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); - *dstPos = output.pos; - *srcPos = input.pos; - return cErr; + { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); + *dstPos = output.pos; + *srcPos = input.pos; + return cErr; + } } size_t 
ZSTD_compress2(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { + ZSTD_bufferMode_e const originalInBufferMode = cctx->requestedParams.inBufferMode; + ZSTD_bufferMode_e const originalOutBufferMode = cctx->requestedParams.outBufferMode; + DEBUGLOG(4, "ZSTD_compress2 (srcSize=%u)", (unsigned)srcSize); ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only); + /* Enable stable input/output buffers. */ + cctx->requestedParams.inBufferMode = ZSTD_bm_stable; + cctx->requestedParams.outBufferMode = ZSTD_bm_stable; { size_t oPos = 0; size_t iPos = 0; size_t const result = ZSTD_compressStream2_simpleArgs(cctx, dst, dstCapacity, &oPos, src, srcSize, &iPos, ZSTD_e_end); - FORWARD_IF_ERROR(result); + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; + + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); if (result != 0) { /* compression not completed, due to lack of output space */ assert(oPos == dstCapacity); - RETURN_ERROR(dstSize_tooSmall); + RETURN_ERROR(dstSize_tooSmall, ""); } assert(iPos == srcSize); /* all input is expected consumed */ return oPos; } } +/* ZSTD_validateSequence() : + * @offBase : must use the format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ +static size_t +ZSTD_validateSequence(U32 offBase, U32 matchLength, U32 minMatch, + size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) +{ + U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. 
+ */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; + size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 3 : 4; + RETURN_ERROR_IF(offBase > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); + /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ + RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; +} + +/* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) +{ + U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { + offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { + offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { + offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { + offBase = REPCODE3_TO_OFFBASE; + } + return offBase; +} + +/* This function scans through an array of ZSTD_Sequence, + * storing the sequences it reads, until it reaches a block delimiter. + * Note that the block delimiter includes the last literals of the block. + * @blockSize must be == sum(sequence_lengths). + * @returns @blockSize on success, and a ZSTD_error otherwise. 
+ */ +static size_t +ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, + ZSTD_SequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, + const void* src, size_t blockSize, + ZSTD_ParamSwitch_e externalRepSearch) +{ + U32 idx = seqPos->idx; + U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; + Repcodes_t updatedRepcodes; + U32 dictSize; + + DEBUGLOG(5, "ZSTD_transferSequences_wBlockDelim (blockSize = %zu)", blockSize); + + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { + dictSize = (U32)cctx->prefixDict.dictSize; + } else { + dictSize = 0; + } + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); + for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; + U32 const matchLength = inSeqs[idx].matchLength; + U32 offBase; + + if (externalRepSearch == ZSTD_ps_disable) { + offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); + } else { + U32 const ll0 = (litLength == 0); + offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; + FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, + seqPos->posInSrc, + cctx->appliedParams.cParams.windowLog, dictSize, + ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } + RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. 
Try adjusting ZSTD_c_minMatch."); + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } + RETURN_ERROR_IF(idx == inSeqsSize, externalSequences_invalid, "Block delimiter not found."); + + /* If we skipped repcode search while parsing, we need to update repcodes now */ + assert(externalRepSearch != ZSTD_ps_auto); + assert(idx >= startIdx); + if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { + U32* const rep = updatedRepcodes.rep; + U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ + + if (lastSeqIdx >= startIdx + 2) { + rep[2] = inSeqs[lastSeqIdx - 2].offset; + rep[1] = inSeqs[lastSeqIdx - 1].offset; + rep[0] = inSeqs[lastSeqIdx].offset; + } else if (lastSeqIdx == startIdx + 1) { + rep[2] = rep[0]; + rep[1] = inSeqs[lastSeqIdx - 1].offset; + rep[0] = inSeqs[lastSeqIdx].offset; + } else { + assert(lastSeqIdx == startIdx); + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = inSeqs[lastSeqIdx].offset; + } + } + + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); + + if (inSeqs[idx].litLength) { + DEBUGLOG(6, "Storing last literals of size: %u", inSeqs[idx].litLength); + ZSTD_storeLastLiterals(&cctx->seqStore, ip, inSeqs[idx].litLength); + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } + RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; + return blockSize; +} + +/* + * This function attempts to scan through @blockSize bytes in @src + * represented by the sequences in @inSeqs, + * storing any (partial) sequences. + * + * Occasionally, we may want to reduce the actual number of bytes consumed from @src + * to avoid splitting a match, notably if it would produce a match smaller than MINMATCH. + * + * @returns the number of bytes consumed from @src, necessarily <= @blockSize. 
 * Otherwise, it may return a ZSTD error if something went wrong.
 */
static size_t
ZSTD_transferSequences_noDelim(ZSTD_CCtx* cctx,
                ZSTD_SequencePosition* seqPos,
                const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
                const void* src, size_t blockSize,
                ZSTD_ParamSwitch_e externalRepSearch)
{
    U32 idx = seqPos->idx;
    U32 startPosInSequence = seqPos->posInSequence;  /* byte offset already consumed inside inSeqs[idx] */
    U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize;
    size_t dictSize;
    const BYTE* const istart = (const BYTE*)(src);
    const BYTE* ip = istart;
    const BYTE* iend = istart + blockSize;  /* May be adjusted if we decide to process fewer than blockSize bytes */
    Repcodes_t updatedRepcodes;
    U32 bytesAdjustment = 0;    /* nb of bytes intentionally left out of this block */
    U32 finalMatchSplit = 0;    /* set when the last match of the block had to be split */

    /* TODO(embg) support fast parsing mode in noBlockDelim mode */
    (void)externalRepSearch;

    /* dictSize is only used for sequence validation below */
    if (cctx->cdict) {
        dictSize = cctx->cdict->dictContentSize;
    } else if (cctx->prefixDict.dict) {
        dictSize = cctx->prefixDict.dictSize;
    } else {
        dictSize = 0;
    }
    DEBUGLOG(5, "ZSTD_transferSequences_noDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize);
    DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
    ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t));
    while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) {
        const ZSTD_Sequence currSeq = inSeqs[idx];
        U32 litLength = currSeq.litLength;
        U32 matchLength = currSeq.matchLength;
        U32 const rawOffset = currSeq.offset;
        U32 offBase;

        /* Modify the sequence depending on where endPosInSequence lies */
        if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) {
            /* the whole (remaining part of the) sequence fits in this block */
            if (startPosInSequence >= litLength) {
                startPosInSequence -= litLength;
                litLength = 0;
                matchLength -= startPosInSequence;
            } else {
                litLength -= startPosInSequence;
            }
            /* Move to the next sequence */
            endPosInSequence -= currSeq.litLength + currSeq.matchLength;
            startPosInSequence = 0;
        } else {
            /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence
               does not reach the end of the match. So, we have to split the sequence */
            DEBUGLOG(6, "Require a split: diff: %u, idx: %u PIS: %u",
                     currSeq.litLength + currSeq.matchLength - endPosInSequence, idx, endPosInSequence);
            if (endPosInSequence > litLength) {
                U32 firstHalfMatchLength;
                litLength = startPosInSequence >= litLength ? 0 : litLength - startPosInSequence;
                firstHalfMatchLength = endPosInSequence - startPosInSequence - litLength;
                if (matchLength > blockSize && firstHalfMatchLength >= cctx->appliedParams.cParams.minMatch) {
                    /* Only ever split the match if it is larger than the block size */
                    U32 secondHalfMatchLength = currSeq.matchLength + currSeq.litLength - endPosInSequence;
                    if (secondHalfMatchLength < cctx->appliedParams.cParams.minMatch) {
                        /* Move the endPosInSequence backward so that it creates match of minMatch length */
                        endPosInSequence -= cctx->appliedParams.cParams.minMatch - secondHalfMatchLength;
                        bytesAdjustment = cctx->appliedParams.cParams.minMatch - secondHalfMatchLength;
                        firstHalfMatchLength -= bytesAdjustment;
                    }
                    matchLength = firstHalfMatchLength;
                    /* Flag that we split the last match - after storing the sequence, exit the loop,
                       but keep the value of endPosInSequence */
                    finalMatchSplit = 1;
                } else {
                    /* Move the position in sequence backwards so that we don't split match, and break to store
                     * the last literals. We use the original currSeq.litLength as a marker for where endPosInSequence
                     * should go. We prefer to do this whenever it is not necessary to split the match, or if doing so
                     * would cause the first half of the match to be too small
                     */
                    bytesAdjustment = endPosInSequence - currSeq.litLength;
                    endPosInSequence = currSeq.litLength;
                    break;
                }
            } else {
                /* This sequence ends inside the literals, break to store the last literals */
                break;
            }
        }
        /* Check if this offset can be represented with a repcode */
        {   U32 const ll0 = (litLength == 0);
            offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0);
            ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
        }

        if (cctx->appliedParams.validateSequences) {
            seqPos->posInSrc += litLength + matchLength;
            FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
                                                   cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)),
                                                   "Sequence validation failed");
        }
        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
        RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
                        "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
        ip += matchLength + litLength;
        if (!finalMatchSplit)
            idx++; /* Next Sequence */
    }
    DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
    assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength);
    seqPos->idx = idx;
    seqPos->posInSequence = endPosInSequence;
    ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t));

    iend -= bytesAdjustment;
    if (ip != iend) {
        /* Store any last literals */
        U32 const lastLLSize = (U32)(iend - ip);
        assert(ip <= iend);
        DEBUGLOG(6, "Storing last literals of size: %u", lastLLSize);
        ZSTD_storeLastLiterals(&cctx->seqStore, ip, lastLLSize);
        seqPos->posInSrc += lastLLSize;
    }

    return (size_t)(iend-istart);
}

/* @seqPos represents a position within @inSeqs,
 * it is read and updated by this function,
 * once the goal to produce a block of size @blockSize is reached.
 * @return: nb of bytes consumed from @src, necessarily <= @blockSize.
 */
typedef size_t (*ZSTD_SequenceCopier_f)(ZSTD_CCtx* cctx,
                ZSTD_SequencePosition* seqPos,
                const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
                const void* src, size_t blockSize,
                ZSTD_ParamSwitch_e externalRepSearch);

/* Select the transfer strategy matching the requested delimiter mode */
static ZSTD_SequenceCopier_f ZSTD_selectSequenceCopier(ZSTD_SequenceFormat_e mode)
{
    assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, (int)mode));
    if (mode == ZSTD_sf_explicitBlockDelimiters) {
        return ZSTD_transferSequences_wBlockDelim;
    }
    assert(mode == ZSTD_sf_noBlockDelimiters);
    return ZSTD_transferSequences_noDelim;
}

/* Discover the size of next block by searching for the delimiter.
 * Note that a block delimiter **must** exist in this mode,
 * otherwise it's an input error.
+ * The block size retrieved will be later compared to ensure it remains within bounds */ +static size_t +blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_SequencePosition seqPos) +{ + int end = 0; + size_t blockSize = 0; + size_t spos = seqPos.idx; + DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); + assert(spos <= inSeqsSize); + while (spos < inSeqsSize) { + end = (inSeqs[spos].offset == 0); + blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; + if (end) { + if (inSeqs[spos].matchLength != 0) + RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); + break; + } + spos++; + } + if (!end) + RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); + return blockSize; +} + +static size_t determine_blockSize(ZSTD_SequenceFormat_e mode, + size_t blockSize, size_t remaining, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + ZSTD_SequencePosition seqPos) +{ + DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); + if (mode == ZSTD_sf_noBlockDelimiters) { + /* Note: more a "target" block size */ + return MIN(remaining, blockSize); + } + assert(mode == ZSTD_sf_explicitBlockDelimiters); + { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); + FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); + if (explicitBlockSize > blockSize) + RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); + if (explicitBlockSize > remaining) + RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); + return explicitBlockSize; + } +} + +/* Compress all provided sequences, block-by-block. + * + * Returns the cumulative size of all compressed blocks (including their headers), + * otherwise a ZSTD error. 
+ */ +static size_t +ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) +{ + size_t cSize = 0; + size_t remaining = srcSize; + ZSTD_SequencePosition seqPos = {0, 0, 0}; + + const BYTE* ip = (BYTE const*)src; + BYTE* op = (BYTE*)dst; + ZSTD_SequenceCopier_f const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); + + DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize); + /* Special case: empty frame */ + if (remaining == 0) { + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "No room for empty frame block header"); + MEM_writeLE32(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + cSize += ZSTD_blockHeaderSize; + } + + while (remaining) { + size_t compressedSeqsSize; + size_t cBlockSize; + size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, + cctx->blockSizeMax, remaining, + inSeqs, inSeqsSize, seqPos); + U32 const lastBlock = (blockSize == remaining); + FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); + assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); + + blockSize = sequenceCopier(cctx, + &seqPos, inSeqs, inSeqsSize, + ip, blockSize, + cctx->appliedParams.searchForExternalRepcodes); + FORWARD_IF_ERROR(blockSize, "Bad sequence copy"); + + /* If blocks are too small, emit as a nocompress block */ + /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding + * additional 1. 
We need to revisit and change this logic to be more consistent */ + if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); + DEBUGLOG(5, "Block too small (%zu): data remains uncompressed: cSize=%zu", blockSize, cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; + remaining -= blockSize; + dstCapacity -= cBlockSize; + continue; + } + + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, + op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, + blockSize, + cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); + DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && + ZSTD_isRLE(ip, blockSize)) { + /* Note: don't emit the first block as RLE even if it qualifies because + * doing so will cause the decoder (cli <= v1.4.3 only) to throw an (invalid) error + * "should consume all input error." 
+ */ + compressedSeqsSize = 1; + } + + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); + DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ + ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); + if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + /* Write block header into beginning of block*/ + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; + DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; + + if (lastBlock) { + break; + } else { + ip += blockSize; + op += cBlockSize; + remaining -= blockSize; + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } + DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + + DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; +} + +size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) +{ + BYTE* op = (BYTE*)dst; + size_t cSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ + DEBUGLOG(4, "ZSTD_compressSequences (nbSeqs=%zu,dstCapacity=%zu)", inSeqsSize, dstCapacity); + assert(cctx != NULL); + 
FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); + + /* Begin writing output, starting with frame header */ + { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, + &cctx->appliedParams, srcSize, cctx->dictID); + op += frameHeaderSize; + assert(frameHeaderSize <= dstCapacity); + dstCapacity -= frameHeaderSize; + cSize += frameHeaderSize; + } + if (cctx->appliedParams.fParams.checksumFlag && srcSize) { + XXH64_update(&cctx->xxhState, src, srcSize); + } + + /* Now generate compressed blocks */ + { size_t const cBlocksSize = ZSTD_compressSequences_internal(cctx, + op, dstCapacity, + inSeqs, inSeqsSize, + src, srcSize); + FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); + cSize += cBlocksSize; + assert(cBlocksSize <= dstCapacity); + dstCapacity -= cBlocksSize; + } + + /* Complete with frame checksum, if needed */ + if (cctx->appliedParams.fParams.checksumFlag) { + U32 const checksum = (U32) XXH64_digest(&cctx->xxhState); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); + DEBUGLOG(4, "Write checksum : %08X", (unsigned)checksum); + MEM_writeLE32((char*)dst + cSize, checksum); + cSize += 4; + } + + DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; +} + + +#if defined(__AVX2__) + +#include /* AVX2 intrinsics */ + +/* + * Convert 2 sequences per iteration, using AVX2 intrinsics: + * - offset -> offBase = offset + 2 + * - litLength -> (U16) litLength + * - matchLength -> (U16)(matchLength - 3) + * - rep is ignored + * Store only 8 bytes per SeqDef (offBase[4], litLength[2], mlBase[2]). + * + * At the end, instead of extracting two __m128i, + * we use _mm256_permute4x64_epi64(..., 0xE8) to move lane2 into lane1, + * then store the lower 16 bytes in one go. + * + * @returns 0 on succes, with no long length detected + * @returns > 0 if there is one long length (> 65535), + * indicating the position, and type. 
+ */ +static size_t convertSequences_noRepcodes( + SeqDef* dstSeqs, + const ZSTD_Sequence* inSeqs, + size_t nbSequences) +{ + /* + * addition: + * For each 128-bit half: (offset+2, litLength+0, matchLength-3, rep+0) + */ + const __m256i addition = _mm256_setr_epi32( + ZSTD_REP_NUM, 0, -MINMATCH, 0, /* for sequence i */ + ZSTD_REP_NUM, 0, -MINMATCH, 0 /* for sequence i+1 */ + ); + + /* limit: check if there is a long length */ + const __m256i limit = _mm256_set1_epi32(65535); + + /* + * shuffle mask for byte-level rearrangement in each 128-bit half: + * + * Input layout (after addition) per 128-bit half: + * [ offset+2 (4 bytes) | litLength (4 bytes) | matchLength (4 bytes) | rep (4 bytes) ] + * We only need: + * offBase (4 bytes) = offset+2 + * litLength (2 bytes) = low 2 bytes of litLength + * mlBase (2 bytes) = low 2 bytes of (matchLength) + * => Bytes [0..3, 4..5, 8..9], zero the rest. + */ + const __m256i mask = _mm256_setr_epi8( + /* For the lower 128 bits => sequence i */ + 0, 1, 2, 3, /* offset+2 */ + 4, 5, /* litLength (16 bits) */ + 8, 9, /* matchLength (16 bits) */ + (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, + (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, + + /* For the upper 128 bits => sequence i+1 */ + 16,17,18,19, /* offset+2 */ + 20,21, /* litLength */ + 24,25, /* matchLength */ + (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, + (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80 + ); + + /* + * Next, we'll use _mm256_permute4x64_epi64(vshf, 0xE8). + * Explanation of 0xE8 = 11101000b => [lane0, lane2, lane2, lane3]. + * So the lower 128 bits become [lane0, lane2] => combining seq0 and seq1. 
+ */ +#define PERM_LANE_0X_E8 0xE8 /* [0,2,2,3] in lane indices */ + + size_t longLen = 0, i = 0; + + /* AVX permutation depends on the specific definition of target structures */ + ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); + ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8); + ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0); + ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4); + ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6); + + /* Process 2 sequences per loop iteration */ + for (; i + 1 < nbSequences; i += 2) { + /* Load 2 ZSTD_Sequence (32 bytes) */ + __m256i vin = _mm256_loadu_si256((const __m256i*)(const void*)&inSeqs[i]); + + /* Add {2, 0, -3, 0} in each 128-bit half */ + __m256i vadd = _mm256_add_epi32(vin, addition); + + /* Check for long length */ + __m256i ll_cmp = _mm256_cmpgt_epi32(vadd, limit); /* 0xFFFFFFFF for element > 65535 */ + int ll_res = _mm256_movemask_epi8(ll_cmp); + + /* Shuffle bytes so each half gives us the 8 bytes we need */ + __m256i vshf = _mm256_shuffle_epi8(vadd, mask); + /* + * Now: + * Lane0 = seq0's 8 bytes + * Lane1 = 0 + * Lane2 = seq1's 8 bytes + * Lane3 = 0 + */ + + /* Permute 64-bit lanes => move Lane2 down into Lane1. */ + __m256i vperm = _mm256_permute4x64_epi64(vshf, PERM_LANE_0X_E8); + /* + * Now the lower 16 bytes (Lane0+Lane1) = [seq0, seq1]. + * The upper 16 bytes are [Lane2, Lane3] = [seq1, 0], but we won't use them. 
+ */ + + /* Store only the lower 16 bytes => 2 SeqDef (8 bytes each) */ + _mm_storeu_si128((__m128i *)(void*)&dstSeqs[i], _mm256_castsi256_si128(vperm)); + /* + * This writes out 16 bytes total: + * - offset 0..7 => seq0 (offBase, litLength, mlBase) + * - offset 8..15 => seq1 (offBase, litLength, mlBase) + */ + + /* check (unlikely) long lengths > 65535 + * indices for lengths correspond to bits [4..7], [8..11], [20..23], [24..27] + * => combined mask = 0x0FF00FF0 + */ + if (UNLIKELY((ll_res & 0x0FF00FF0) != 0)) { + /* long length detected: let's figure out which one*/ + if (inSeqs[i].matchLength > 65535+MINMATCH) { + assert(longLen == 0); + longLen = i + 1; + } + if (inSeqs[i].litLength > 65535) { + assert(longLen == 0); + longLen = i + nbSequences + 1; + } + if (inSeqs[i+1].matchLength > 65535+MINMATCH) { + assert(longLen == 0); + longLen = i + 1 + 1; + } + if (inSeqs[i+1].litLength > 65535) { + assert(longLen == 0); + longLen = i + 1 + nbSequences + 1; + } + } + } + + /* Handle leftover if @nbSequences is odd */ + if (i < nbSequences) { + /* process last sequence */ + assert(i == nbSequences - 1); + dstSeqs[i].offBase = OFFSET_TO_OFFBASE(inSeqs[i].offset); + dstSeqs[i].litLength = (U16)inSeqs[i].litLength; + dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); + /* check (unlikely) long lengths > 65535 */ + if (UNLIKELY(inSeqs[i].matchLength > 65535+MINMATCH)) { + assert(longLen == 0); + longLen = i + 1; + } + if (UNLIKELY(inSeqs[i].litLength > 65535)) { + assert(longLen == 0); + longLen = i + nbSequences + 1; + } + } + + return longLen; +} + +/* the vector implementation could also be ported to SSSE3, + * but since this implementation is targeting modern systems (>= Sapphire Rapid), + * it's not useful to develop and maintain code for older pre-AVX2 platforms */ + +#else /* no AVX2 */ + +static size_t convertSequences_noRepcodes( + SeqDef* dstSeqs, + const ZSTD_Sequence* inSeqs, + size_t nbSequences) +{ + size_t longLen = 0; + size_t n; + for (n=0; n 
65535 */ + if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) { + assert(longLen == 0); + longLen = n + 1; + } + if (UNLIKELY(inSeqs[n].litLength > 65535)) { + assert(longLen == 0); + longLen = n + nbSequences + 1; + } + } + return longLen; +} + +#endif + +/* + * Precondition: Sequences must end on an explicit Block Delimiter + * @return: 0 on success, or an error code. + * Note: Sequence validation functionality has been disabled (removed). + * This is helpful to generate a lean main pipeline, improving performance. + * It may be re-inserted later. + */ +size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, + const ZSTD_Sequence* const inSeqs, size_t nbSequences, + int repcodeResolution) +{ + Repcodes_t updatedRepcodes; + size_t seqNb = 0; + + DEBUGLOG(5, "ZSTD_convertBlockSequences (nbSequences = %zu)", nbSequences); + + RETURN_ERROR_IF(nbSequences >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); + + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); + + /* check end condition */ + assert(nbSequences >= 1); + assert(inSeqs[nbSequences-1].matchLength == 0); + assert(inSeqs[nbSequences-1].offset == 0); + + /* Convert Sequences from public format to internal format */ + if (!repcodeResolution) { + size_t const longl = convertSequences_noRepcodes(cctx->seqStore.sequencesStart, inSeqs, nbSequences-1); + cctx->seqStore.sequences = cctx->seqStore.sequencesStart + nbSequences-1; + if (longl) { + DEBUGLOG(5, "long length"); + assert(cctx->seqStore.longLengthType == ZSTD_llt_none); + if (longl <= nbSequences-1) { + DEBUGLOG(5, "long match length detected at pos %zu", longl-1); + cctx->seqStore.longLengthType = ZSTD_llt_matchLength; + cctx->seqStore.longLengthPos = (U32)(longl-1); + } else { + DEBUGLOG(5, "long literals length detected at pos %zu", longl-nbSequences); + assert(longl <= 2* (nbSequences-1)); + cctx->seqStore.longLengthType = ZSTD_llt_literalLength; + 
cctx->seqStore.longLengthPos = (U32)(longl-(nbSequences-1)-1); + } + } + } else { + for (seqNb = 0; seqNb < nbSequences - 1 ; seqNb++) { + U32 const litLength = inSeqs[seqNb].litLength; + U32 const matchLength = inSeqs[seqNb].matchLength; + U32 const ll0 = (litLength == 0); + U32 const offBase = ZSTD_finalizeOffBase(inSeqs[seqNb].offset, updatedRepcodes.rep, ll0); + + DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + ZSTD_storeSeqOnly(&cctx->seqStore, litLength, offBase, matchLength); + ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + } + + /* If we skipped repcode search while parsing, we need to update repcodes now */ + if (!repcodeResolution && nbSequences > 1) { + U32* const rep = updatedRepcodes.rep; + + if (nbSequences >= 4) { + U32 lastSeqIdx = (U32)nbSequences - 2; /* index of last full sequence */ + rep[2] = inSeqs[lastSeqIdx - 2].offset; + rep[1] = inSeqs[lastSeqIdx - 1].offset; + rep[0] = inSeqs[lastSeqIdx].offset; + } else if (nbSequences == 3) { + rep[2] = rep[0]; + rep[1] = inSeqs[0].offset; + rep[0] = inSeqs[1].offset; + } else { + assert(nbSequences == 2); + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = inSeqs[0].offset; + } + } + + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); + + return 0; +} + +#if defined(ZSTD_ARCH_X86_AVX2) + +BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) +{ + size_t i; + __m256i const zeroVec = _mm256_setzero_si256(); + __m256i sumVec = zeroVec; /* accumulates match+lit in 32-bit lanes */ + ZSTD_ALIGNED(32) U32 tmp[8]; /* temporary buffer for reduction */ + size_t mSum = 0, lSum = 0; + ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); + + /* Process 2 structs (32 bytes) at a time */ + for (i = 0; i + 2 <= nbSeqs; i += 2) { + /* Load two consecutive ZSTD_Sequence (8×4 = 32 bytes) */ + __m256i data = _mm256_loadu_si256((const __m256i*)(const void*)&seqs[i]); + /* check end of block signal */ + __m256i cmp 
= _mm256_cmpeq_epi32(data, zeroVec); + int cmp_res = _mm256_movemask_epi8(cmp); + /* indices for match lengths correspond to bits [8..11], [24..27] + * => combined mask = 0x0F000F00 */ + ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); + if (cmp_res & 0x0F000F00) break; + /* Accumulate in sumVec */ + sumVec = _mm256_add_epi32(sumVec, data); + } + + /* Horizontal reduction */ + _mm256_store_si256((__m256i*)tmp, sumVec); + lSum = tmp[1] + tmp[5]; + mSum = tmp[2] + tmp[6]; + + /* Handle the leftover */ + for (; i < nbSeqs; i++) { + lSum += seqs[i].litLength; + mSum += seqs[i].matchLength; + if (seqs[i].matchLength == 0) break; /* end of block */ + } + + if (i==nbSeqs) { + /* reaching end of sequences: end of block signal was not present */ + BlockSummary bs; + bs.nbSequences = ERROR(externalSequences_invalid); + return bs; + } + { BlockSummary bs; + bs.nbSequences = i+1; + bs.blockSize = lSum + mSum; + bs.litSize = lSum; + return bs; + } +} + +#else + +BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) +{ + size_t totalMatchSize = 0; + size_t litSize = 0; + size_t n; + assert(seqs); + for (n=0; nappliedParams.searchForExternalRepcodes == ZSTD_ps_enable); + assert(cctx->appliedParams.searchForExternalRepcodes != ZSTD_ps_auto); + + DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals_internal: nbSeqs=%zu, litSize=%zu", nbSequences, litSize); + RETURN_ERROR_IF(nbSequences == 0, externalSequences_invalid, "Requires at least 1 end-of-block"); + + /* Special case: empty frame */ + if ((nbSequences == 1) && (inSeqs[0].litLength == 0)) { + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1); + RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "No room for empty frame block header"); + MEM_writeLE24(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + cSize += ZSTD_blockHeaderSize; + } + + while (nbSequences) { + size_t compressedSeqsSize, cBlockSize, conversionStatus; + BlockSummary const 
block = ZSTD_get1BlockSummary(inSeqs, nbSequences); + U32 const lastBlock = (block.nbSequences == nbSequences); + FORWARD_IF_ERROR(block.nbSequences, "Error while trying to determine nb of sequences for a block"); + assert(block.nbSequences <= nbSequences); + RETURN_ERROR_IF(block.litSize > litSize, externalSequences_invalid, "discrepancy: Sequences require more literals than present in buffer"); + ZSTD_resetSeqStore(&cctx->seqStore); + + conversionStatus = ZSTD_convertBlockSequences(cctx, + inSeqs, block.nbSequences, + repcodeResolution); + FORWARD_IF_ERROR(conversionStatus, "Bad sequence conversion"); + inSeqs += block.nbSequences; + nbSequences -= block.nbSequences; + remaining -= block.blockSize; + + /* Note: when blockSize is very small, other variant send it uncompressed. + * Here, we still send the sequences, because we don't have the original source to send it uncompressed. + * One could imagine in theory reproducing the source from the sequences, + * but that's complex and costly memory intensive, and goes against the objectives of this variant. 
*/ + + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + + compressedSeqsSize = ZSTD_entropyCompressSeqStore_internal( + op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, + literals, block.litSize, + &cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, + cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); + /* note: the spec forbids for any compressed block to be larger than maximum block size */ + if (compressedSeqsSize > cctx->blockSizeMax) compressedSeqsSize = 0; + DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + litSize -= block.litSize; + literals = (const char*)literals + block.litSize; + + /* Note: difficult to check source for RLE block when only Literals are provided, + * but it could be considered from analyzing the sequence directly */ + + if (compressedSeqsSize == 0) { + /* Sending uncompressed blocks is out of reach, because the source is not provided. + * In theory, one could use the sequences to regenerate the source, like a decompressor, + * but it's complex, and memory hungry, killing the purpose of this variant. + * Current outcome: generate an error code. 
+ */ + RETURN_ERROR(cannotProduce_uncompressedBlock, "ZSTD_compressSequencesAndLiterals cannot generate an uncompressed block"); + } else { + U32 cBlockHeader; + assert(compressedSeqsSize > 1); /* no RLE */ + /* Error checking and repcodes update */ + ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); + if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + /* Write block header into beginning of block*/ + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; + DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; + op += cBlockSize; + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + + if (lastBlock) { + assert(nbSequences == 0); + break; + } + } + + RETURN_ERROR_IF(litSize != 0, externalSequences_invalid, "literals must be entirely and exactly consumed"); + RETURN_ERROR_IF(remaining != 0, externalSequences_invalid, "Sequences must represent a total of exactly srcSize=%zu", srcSize); + DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; +} + +size_t +ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* literals, size_t litSize, size_t litCapacity, + size_t decompressedSize) +{ + BYTE* op = (BYTE*)dst; + size_t cSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ + DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals (dstCapacity=%zu)", dstCapacity); + assert(cctx != NULL); + if (litCapacity < litSize) { + RETURN_ERROR(workSpace_tooSmall, "literals buffer is not large enough: must be at least 8 bytes larger than litSize (risk of read out-of-bound)"); 
+ } + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, decompressedSize), "CCtx initialization failed"); + + if (cctx->appliedParams.blockDelimiters == ZSTD_sf_noBlockDelimiters) { + RETURN_ERROR(frameParameter_unsupported, "This mode is only compatible with explicit delimiters"); + } + if (cctx->appliedParams.validateSequences) { + RETURN_ERROR(parameter_unsupported, "This mode is not compatible with Sequence validation"); + } + if (cctx->appliedParams.fParams.checksumFlag) { + RETURN_ERROR(frameParameter_unsupported, "this mode is not compatible with frame checksum"); + } + + /* Begin writing output, starting with frame header */ + { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, + &cctx->appliedParams, decompressedSize, cctx->dictID); + op += frameHeaderSize; + assert(frameHeaderSize <= dstCapacity); + dstCapacity -= frameHeaderSize; + cSize += frameHeaderSize; + } + + /* Now generate compressed blocks */ + { size_t const cBlocksSize = ZSTD_compressSequencesAndLiterals_internal(cctx, + op, dstCapacity, + inSeqs, inSeqsSize, + literals, litSize, decompressedSize); + FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); + cSize += cBlocksSize; + assert(cBlocksSize <= dstCapacity); + dstCapacity -= cBlocksSize; + } + + DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; +} + /*====== Finalize ======*/ +static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) +{ + const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; + const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); + return stableInput ? zcs->expectedInBuffer : nullInput; +} + /*! 
ZSTD_flushStream() : * @return : amount of data remaining to flush */ size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) { - ZSTD_inBuffer input = { NULL, 0, 0 }; + ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + input.size = input.pos; /* do not ingest more input during flush */ return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); } - size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) { - ZSTD_inBuffer input = { NULL, 0, 0 }; + ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); - FORWARD_IF_ERROR( remainingToFlush ); + FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ /* single thread mode : attempt to calculate remaining to flush more precisely */ { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; @@ -3957,147 +7669,178 @@ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) /*-===== Pre-defined compression levels =====-*/ +#include "clevels.h" -#define ZSTD_MAX_CLEVEL 22 int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; } int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; } +int ZSTD_defaultCLevel(void) { return ZSTD_CLEVEL_DEFAULT; } -static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = { -{ /* "default" - for any srcSize > 256 KB */ - /* W, C, H, S, L, TL, strat */ - { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */ - { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */ - { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */ - { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */ - { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */ - { 21, 18, 19, 2, 5, 2, ZSTD_greedy }, /* level 5 */ - { 21, 19, 19, 3, 5, 4, ZSTD_greedy }, /* level 6 */ - { 21, 19, 19, 3, 5, 8, ZSTD_lazy }, /* level 7 */ - { 21, 19, 19, 3, 5, 16, ZSTD_lazy2 
}, /* level 8 */ - { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ - { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */ - { 22, 21, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 11 */ - { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 12 */ - { 22, 21, 22, 5, 5, 32, ZSTD_btlazy2 }, /* level 13 */ - { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ - { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ - { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */ - { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */ - { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */ - { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */ - { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */ - { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */ - { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */ -}, -{ /* for srcSize <= 256 KB */ - /* W, C, H, S, L, T, strat */ - { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ - { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */ - { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */ - { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */ - { 18, 16, 17, 2, 5, 2, ZSTD_greedy }, /* level 4.*/ - { 18, 18, 18, 3, 5, 2, ZSTD_greedy }, /* level 5.*/ - { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/ - { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */ - { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ - { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ - { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ - { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/ - { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/ - { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */ - { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ - { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/ - { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/ - { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/ - { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/ - { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ - { 18, 19, 19, 10, 
3,512, ZSTD_btultra2}, /* level 20.*/ - { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/ - { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/ -}, -{ /* for srcSize <= 128 KB */ - /* W, C, H, S, L, T, strat */ - { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ - { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */ - { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */ - { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */ - { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */ - { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */ - { 17, 17, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ - { 17, 17, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ - { 17, 17, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ - { 17, 17, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ - { 17, 17, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ - { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */ - { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */ - { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/ - { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ - { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/ - { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/ - { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/ - { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/ - { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/ - { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/ - { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ - { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/ -}, -{ /* for srcSize <= 16 KB */ - /* W, C, H, S, L, T, strat */ - { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ - { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */ - { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */ - { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */ - { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */ - { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/ - { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */ - { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 
*/ - { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/ - { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/ - { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/ - { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/ - { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/ - { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/ - { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/ - { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/ - { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/ - { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/ - { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/ - { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ - { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/ - { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ - { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/ -}, -}; +static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int const compressionLevel, size_t const dictSize) +{ + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, 0, dictSize, ZSTD_cpm_createCDict); + switch (cParams.strategy) { + case ZSTD_fast: + case ZSTD_dfast: + break; + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + cParams.hashLog += ZSTD_LAZY_DDSS_BUCKET_LOG; + break; + case ZSTD_btlazy2: + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + break; + } + return cParams; +} -/*! ZSTD_getCParams() : +static int ZSTD_dedicatedDictSearch_isSupported( + ZSTD_compressionParameters const* cParams) +{ + return (cParams->strategy >= ZSTD_greedy) + && (cParams->strategy <= ZSTD_lazy2) + && (cParams->hashLog > cParams->chainLog) + && (cParams->chainLog <= 24); +} + +/** + * Reverses the adjustment applied to cparams when enabling dedicated dict + * search. This is used to recover the params set to be used in the working + * context. (Otherwise, those tables would also grow.) 
+ */ +static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams) { + switch (cParams->strategy) { + case ZSTD_fast: + case ZSTD_dfast: + break; + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + cParams->hashLog -= ZSTD_LAZY_DDSS_BUCKET_LOG; + if (cParams->hashLog < ZSTD_HASHLOG_MIN) { + cParams->hashLog = ZSTD_HASHLOG_MIN; + } + break; + case ZSTD_btlazy2: + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + break; + } +} + +static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) +{ + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: + case ZSTD_cpm_createCDict: + break; + case ZSTD_cpm_attachDict: + dictSize = 0; + break; + default: + assert(0); + break; + } + { int const unknown = srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN; + size_t const addedSize = unknown && dictSize > 0 ? 500 : 0; + return unknown && dictSize == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : srcSizeHint+dictSize+addedSize; + } +} + +/*! ZSTD_getCParams_internal() : * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. - * Size values are optional, provide 0 if not known or unused */ -ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) + * Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown. + * Use dictSize == 0 for unknown or unused. + * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_CParamMode_e`. */ +static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) { - size_t const addedSize = srcSizeHint ? 0 : 500; - U64 const rSize = srcSizeHint+dictSize ? 
srcSizeHint+dictSize+addedSize : ZSTD_CONTENTSIZE_UNKNOWN; /* intentional overflow for srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN */ + U64 const rSize = ZSTD_getCParamRowSize(srcSizeHint, dictSize, mode); U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB); - int row = compressionLevel; - DEBUGLOG(5, "ZSTD_getCParams (cLevel=%i)", compressionLevel); + int row; + DEBUGLOG(5, "ZSTD_getCParams_internal (cLevel=%i)", compressionLevel); + + /* row */ if (compressionLevel == 0) row = ZSTD_CLEVEL_DEFAULT; /* 0 == default */ - if (compressionLevel < 0) row = 0; /* entry 0 is baseline for fast mode */ - if (compressionLevel > ZSTD_MAX_CLEVEL) row = ZSTD_MAX_CLEVEL; + else if (compressionLevel < 0) row = 0; /* entry 0 is baseline for fast mode */ + else if (compressionLevel > ZSTD_MAX_CLEVEL) row = ZSTD_MAX_CLEVEL; + else row = compressionLevel; + { ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row]; - if (compressionLevel < 0) cp.targetLength = (unsigned)(-compressionLevel); /* acceleration factor */ - return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize); /* refine parameters based on srcSize & dictSize */ + DEBUGLOG(5, "ZSTD_getCParams_internal selected tableID: %u row: %u strat: %u", tableID, row, (U32)cp.strategy); + /* acceleration factor */ + if (compressionLevel < 0) { + int const clampedCompressionLevel = MAX(ZSTD_minCLevel(), compressionLevel); + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ + return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); } } +/*! ZSTD_getCParams() : + * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. 
+ * Size values are optional, provide 0 if not known or unused */ +ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) +{ + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); +} + /*! ZSTD_getParams() : * same idea as ZSTD_getCParams() * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). * Fields of `ZSTD_frameParameters` are set to default values */ -ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { +static ZSTD_parameters +ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) +{ ZSTD_parameters params; - ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, srcSizeHint, dictSize); + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, mode); DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel); - memset(¶ms, 0, sizeof(params)); + ZSTD_memset(¶ms, 0, sizeof(params)); params.cParams = cParams; params.fParams.contentSizeFlag = 1; return params; } + +/*! ZSTD_getParams() : + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). 
+ * Fields of `ZSTD_frameParameters` are set to default values */ +ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) +{ + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); +} + +void ZSTD_registerSequenceProducer( + ZSTD_CCtx* zc, + void* extSeqProdState, + ZSTD_sequenceProducer_F extSeqProdFunc) +{ + assert(zc != NULL); + ZSTD_CCtxParams_registerSequenceProducer( + &zc->requestedParams, extSeqProdState, extSeqProdFunc + ); +} + +void ZSTD_CCtxParams_registerSequenceProducer( + ZSTD_CCtx_params* params, + void* extSeqProdState, + ZSTD_sequenceProducer_F extSeqProdFunc) +{ + assert(params != NULL); + if (extSeqProdFunc != NULL) { + params->extSeqProdFunc = extSeqProdFunc; + params->extSeqProdState = extSeqProdState; + } else { + params->extSeqProdFunc = NULL; + params->extSeqProdState = NULL; + } +} + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_compress_internal.h b/vendor/github.com/DataDog/zstd/zstd_compress_internal.h index 14036f8..a96783d 100644 --- a/vendor/github.com/DataDog/zstd/zstd_compress_internal.h +++ b/vendor/github.com/DataDog/zstd/zstd_compress_internal.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -23,11 +24,8 @@ #ifdef ZSTD_MULTITHREAD # include "zstdmt_compress.h" #endif - -#if defined (__cplusplus) -extern "C" { -#endif - +#include "bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ +#include "zstd_preSplit.h" /* ZSTD_SLIPBLOCK_WORKSPACESIZE */ /*-************************************* * Constants @@ -39,7 +37,7 @@ extern "C" { It's not a big deal though : candidate will just be sorted again. 
Additionally, candidate position 1 will be lost. But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. - The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. + The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy. This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ @@ -64,7 +62,7 @@ typedef struct { } ZSTD_localDict; typedef struct { - U32 CTable[HUF_CTABLE_SIZE_U32(255)]; + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(255)]; HUF_repeat repeatMode; } ZSTD_hufCTables_t; @@ -82,29 +80,159 @@ typedef struct { ZSTD_fseCTables_t fse; } ZSTD_entropyCTables_t; +/*********************************************** +* Sequences * +***********************************************/ +typedef struct SeqDef_s { + U32 offBase; /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */ + U16 litLength; + U16 mlBase; /* mlBase == matchLength - MINMATCH */ +} SeqDef; + +/* Controls whether seqStore has a single "long" litLength or matchLength. See SeqStore_t. */ +typedef enum { + ZSTD_llt_none = 0, /* no longLengthType */ + ZSTD_llt_literalLength = 1, /* represents a long literal */ + ZSTD_llt_matchLength = 2 /* represents a long match */ +} ZSTD_longLengthType_e; + +typedef struct { + SeqDef* sequencesStart; + SeqDef* sequences; /* ptr to end of sequences */ + BYTE* litStart; + BYTE* lit; /* ptr to end of literals */ + BYTE* llCode; + BYTE* mlCode; + BYTE* ofCode; + size_t maxNbSeq; + size_t maxNbLit; + + /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength + * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment + * the existing value of the litLength or matchLength by 0x10000. 
+ */ + ZSTD_longLengthType_e longLengthType; + U32 longLengthPos; /* Index of the sequence to apply long length modification to */ +} SeqStore_t; + typedef struct { - U32 off; - U32 len; + U32 litLength; + U32 matchLength; +} ZSTD_SequenceLength; + +/** + * Returns the ZSTD_SequenceLength for the given sequences. It handles the decoding of long sequences + * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength. + */ +MEM_STATIC ZSTD_SequenceLength ZSTD_getSequenceLength(SeqStore_t const* seqStore, SeqDef const* seq) +{ + ZSTD_SequenceLength seqLen; + seqLen.litLength = seq->litLength; + seqLen.matchLength = seq->mlBase + MINMATCH; + if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { + seqLen.litLength += 0x10000; + } + if (seqStore->longLengthType == ZSTD_llt_matchLength) { + seqLen.matchLength += 0x10000; + } + } + return seqLen; +} + +const SeqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +int ZSTD_seqToCodes(const SeqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + + +/*********************************************** +* Entropy buffer statistics structs and funcs * +***********************************************/ +/** ZSTD_hufCTablesMetadata_t : + * Stores Literals Block Type for a super-block in hType, and + * huffman tree description in hufDesBuffer. + * hufDesSize refers to the size of huffman tree description in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */ +typedef struct { + SymbolEncodingType_e hType; + BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; + size_t hufDesSize; +} ZSTD_hufCTablesMetadata_t; + +/** ZSTD_fseCTablesMetadata_t : + * Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and + * fse tables in fseTablesBuffer. + * fseTablesSize refers to the size of fse tables in bytes. 
+ * This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() */ +typedef struct { + SymbolEncodingType_e llType; + SymbolEncodingType_e ofType; + SymbolEncodingType_e mlType; + BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; + size_t fseTablesSize; + size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ +} ZSTD_fseCTablesMetadata_t; + +typedef struct { + ZSTD_hufCTablesMetadata_t hufMetadata; + ZSTD_fseCTablesMetadata_t fseMetadata; +} ZSTD_entropyCTablesMetadata_t; + +/** ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * @return : 0 on success or error code */ +size_t ZSTD_buildBlockEntropyStats( + const SeqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize); + +/********************************* +* Compression internals structs * +*********************************/ + +typedef struct { + U32 off; /* Offset sumtype code for the match, using ZSTD_storeSeq() format */ + U32 len; /* Raw length of match */ } ZSTD_match_t; typedef struct { - int price; - U32 off; - U32 mlen; - U32 litlen; - U32 rep[ZSTD_REP_NUM]; + U32 offset; /* Offset of sequence */ + U32 litLength; /* Length of literals prior to match */ + U32 matchLength; /* Raw length of match */ +} rawSeq; + +typedef struct { + rawSeq* seq; /* The start of the sequences */ + size_t pos; /* The index in seq where reading stopped. pos <= size. */ + size_t posInSequence; /* The position within the sequence at seq[pos] where reading + stopped. posInSequence <= seq[pos].litLength + seq[pos].matchLength */ + size_t size; /* The number of sequences. <= capacity. 
*/ + size_t capacity; /* The capacity starting from `seq` pointer */ +} RawSeqStore_t; + +UNUSED_ATTR static const RawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + +typedef struct { + int price; /* price from beginning of segment to this position */ + U32 off; /* offset of previous match */ + U32 mlen; /* length of previous match */ + U32 litlen; /* nb of literals since previous match */ + U32 rep[ZSTD_REP_NUM]; /* offset history after previous match */ } ZSTD_optimal_t; typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; +#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3) typedef struct { /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ unsigned* litFreq; /* table of literals statistics, of size 256 */ unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ - ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ - ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ + ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_SIZE */ + ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */ U32 litSum; /* nb of literals */ U32 litLengthSum; /* nb of litLength codes */ @@ -116,7 +244,7 @@ typedef struct { U32 offCodeSumBasePrice; /* to compare to log2(offreq) */ ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, or follow a pre-defined cost structure */ const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */ - ZSTD_literalCompressionMode_e literalCompressionMode; + ZSTD_ParamSwitch_e literalCompressionMode; } optState_t; typedef struct { @@ -125,15 +253,24 @@ typedef struct { } ZSTD_compressedBlockState_t; typedef struct { - BYTE const* nextSrc; /* next block here to continue on 
current prefix */ - BYTE const* base; /* All regular indexes relative to this position */ - BYTE const* dictBase; /* extDict indexes relative to this position */ - U32 dictLimit; /* below that point, need extDict */ - U32 lowLimit; /* below that point, no more valid data */ + BYTE const* nextSrc; /* next block here to continue on current prefix */ + BYTE const* base; /* All regular indexes relative to this position */ + BYTE const* dictBase; /* extDict indexes relative to this position */ + U32 dictLimit; /* below that point, need extDict */ + U32 lowLimit; /* below that point, no more valid data */ + U32 nbOverflowCorrections; /* Number of times overflow correction has run since + * ZSTD_window_init(). Useful for debugging coredumps + * and for ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY. + */ } ZSTD_window_t; -typedef struct ZSTD_matchState_t ZSTD_matchState_t; -struct ZSTD_matchState_t { +#define ZSTD_WINDOW_START_INDEX 2 + +typedef struct ZSTD_MatchState_t ZSTD_MatchState_t; + +#define ZSTD_ROW_HASH_CACHE_SIZE 8 /* Size of prefetching hash cache for row-based matchfinder */ + +struct ZSTD_MatchState_t { ZSTD_window_t window; /* State for window round buffer management */ U32 loadedDictEnd; /* index of end of dictionary, within context's referential. * When loadedDictEnd != 0, a dictionary is in use, and still valid. @@ -144,18 +281,44 @@ struct ZSTD_matchState_t { */ U32 nextToUpdate; /* index from which to continue table update */ U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ + BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. 
*/ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ + U64 hashSalt; /* For row-based matchFinder: salts the hash for reuse of tag table */ + U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + U32* hashTable; U32* hashTable3; U32* chainTable; + + int forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ + + int dedicatedDictSearch; /* Indicates whether this matchState is using the + * dedicated dictionary search structure. + */ optState_t opt; /* optimal parser state */ - const ZSTD_matchState_t* dictMatchState; + const ZSTD_MatchState_t* dictMatchState; ZSTD_compressionParameters cParams; + const RawSeqStore_t* ldmSeqStore; + + /* Controls prefetching in some dictMatchState matchfinders. + * This behavior is controlled from the cctx ms. + * This parameter has no effect in the cdict ms. */ + int prefetchCDictTables; + + /* When == 0, lazy match finders insert every position. + * When != 0, lazy match finders only insert positions they search. + * This allows them to skip much faster over incompressible data, + * at a small cost to compression ratio. + */ + int lazySkipping; }; typedef struct { ZSTD_compressedBlockState_t* prevCBlock; ZSTD_compressedBlockState_t* nextCBlock; - ZSTD_matchState_t matchState; + ZSTD_MatchState_t matchState; } ZSTD_blockState_t; typedef struct { @@ -163,16 +326,26 @@ typedef struct { U32 checksum; } ldmEntry_t; +typedef struct { + BYTE const* split; + U32 hash; + U32 checksum; + ldmEntry_t* bucket; +} ldmMatchCandidate_t; + +#define LDM_BATCH_SIZE 64 + typedef struct { ZSTD_window_t window; /* State for the window round buffer management */ ldmEntry_t* hashTable; + U32 loadedDictEnd; BYTE* bucketOffsets; /* Next position in bucket to insert entry */ - U64 hashPower; /* Used to compute the rolling hash. 
- * Depends on ldmParams.minMatchLength */ + size_t splitIndices[LDM_BATCH_SIZE]; + ldmMatchCandidate_t matchCandidates[LDM_BATCH_SIZE]; } ldmState_t; typedef struct { - U32 enableLdm; /* 1 if enable long distance matching */ + ZSTD_ParamSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. ZSTD_ps_auto by default */ U32 hashLog; /* Log size of hashTable */ U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */ U32 minMatchLength; /* Minimum match length */ @@ -180,19 +353,6 @@ typedef struct { U32 windowLog; /* Window log for the LDM */ } ldmParams_t; -typedef struct { - U32 offset; - U32 litLength; - U32 matchLength; -} rawSeq; - -typedef struct { - rawSeq* seq; /* The start of the sequences */ - size_t pos; /* The position where reading stopped. <= size. */ - size_t size; /* The number of sequences. <= capacity. */ - size_t capacity; /* The capacity starting from `seq` pointer */ -} rawSeqStore_t; - typedef struct { int collectSequences; ZSTD_Sequence* seqStart; @@ -216,7 +376,7 @@ struct ZSTD_CCtx_params_s { * There is no guarantee that hint is close to actual source size */ ZSTD_dictAttachPref_e attachDictPref; - ZSTD_literalCompressionMode_e literalCompressionMode; + ZSTD_ParamSwitch_e literalCompressionMode; /* Multithreading: used to pass parameters to mtctx */ int nbWorkers; @@ -227,36 +387,123 @@ struct ZSTD_CCtx_params_s { /* Long distance matching parameters */ ldmParams_t ldmParams; + /* Dedicated dict search algorithm trigger */ + int enableDedicatedDictSearch; + + /* Input/output buffer modes */ + ZSTD_bufferMode_e inBufferMode; + ZSTD_bufferMode_e outBufferMode; + + /* Sequence compression API */ + ZSTD_SequenceFormat_e blockDelimiters; + int validateSequences; + + /* Block splitting + * @postBlockSplitter executes split analysis after sequences are produced, + * it's more accurate but consumes more resources. + * @preBlockSplitter_level splits before knowing sequences, + * it's more approximative but also cheaper. 
+ * Valid @preBlockSplitter_level values range from 0 to 6 (included). + * 0 means auto, 1 means do not split, + * then levels are sorted in increasing cpu budget, from 2 (fastest) to 6 (slowest). + * Highest @preBlockSplitter_level combines well with @postBlockSplitter. + */ + ZSTD_ParamSwitch_e postBlockSplitter; + int preBlockSplitter_level; + + /* Adjust the max block size*/ + size_t maxBlockSize; + + /* Param for deciding whether to use row-based matchfinder */ + ZSTD_ParamSwitch_e useRowMatchFinder; + + /* Always load a dictionary in ext-dict mode (not prefix mode)? */ + int deterministicRefPrefix; + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ ZSTD_customMem customMem; + + /* Controls prefetching in some dictMatchState matchfinders */ + ZSTD_ParamSwitch_e prefetchCDictTables; + + /* Controls whether zstd will fall back to an internal matchfinder + * if the external matchfinder returns an error code. */ + int enableMatchFinderFallback; + + /* Parameters for the external sequence producer API. + * Users set these parameters through ZSTD_registerSequenceProducer(). + * It is not possible to set these parameters individually through the public API. */ + void* extSeqProdState; + ZSTD_sequenceProducer_F extSeqProdFunc; + + /* Controls repcode search in external sequence parsing */ + ZSTD_ParamSwitch_e searchForExternalRepcodes; }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ +#define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) +#define ENTROPY_WORKSPACE_SIZE (HUF_WORKSPACE_SIZE + COMPRESS_SEQUENCES_WORKSPACE_SIZE) +#define TMP_WORKSPACE_SIZE (MAX(ENTROPY_WORKSPACE_SIZE, ZSTD_SLIPBLOCK_WORKSPACESIZE)) + +/** + * Indicates whether this compression proceeds directly from user-provided + * source buffer to user-provided destination buffer (ZSTDb_not_buffered), or + * whether the context needs to buffer the input/output (ZSTDb_buffered). 
+ */ +typedef enum { + ZSTDb_not_buffered, + ZSTDb_buffered +} ZSTD_buffered_policy_e; + +/** + * Struct that contains all elements of block splitter that should be allocated + * in a wksp. + */ +#define ZSTD_MAX_NB_BLOCK_SPLITS 196 +typedef struct { + SeqStore_t fullSeqStoreChunk; + SeqStore_t firstHalfSeqStore; + SeqStore_t secondHalfSeqStore; + SeqStore_t currSeqStore; + SeqStore_t nextSeqStore; + + U32 partitions[ZSTD_MAX_NB_BLOCK_SPLITS]; + ZSTD_entropyCTablesMetadata_t entropyMetadata; +} ZSTD_blockSplitCtx; + struct ZSTD_CCtx_s { ZSTD_compressionStage_e stage; int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */ int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ ZSTD_CCtx_params requestedParams; ZSTD_CCtx_params appliedParams; + ZSTD_CCtx_params simpleApiParams; /* Param storage used by the simple API - not sticky. Must only be used in top-level simple API functions for storage. 
*/ U32 dictID; + size_t dictContentSize; ZSTD_cwksp workspace; /* manages buffer for dynamic allocations */ - size_t blockSize; + size_t blockSizeMax; unsigned long long pledgedSrcSizePlusOne; /* this way, 0 (default) == unknown */ unsigned long long consumedSrcSize; unsigned long long producedCSize; XXH64_state_t xxhState; ZSTD_customMem customMem; + ZSTD_threadPool* pool; size_t staticSize; SeqCollector seqCollector; int isFirstBlock; + int initialized; - seqStore_t seqStore; /* sequences storage ptrs */ + SeqStore_t seqStore; /* sequences storage ptrs */ ldmState_t ldmState; /* long distance matching state */ rawSeq* ldmSequences; /* Storage for the ldm output sequences */ size_t maxNbLdmSequences; - rawSeqStore_t externSeqStore; /* Mutable reference to external sequences */ + RawSeqStore_t externSeqStore; /* Mutable reference to external sequences */ ZSTD_blockState_t blockState; - U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */ + void* tmpWorkspace; /* used as substitute of stack space - must be aligned for S64 type */ + size_t tmpWkspSize; + + /* Whether we are streaming or not */ + ZSTD_buffered_policy_e bufferedPolicy; /* streaming */ char* inBuff; @@ -271,6 +518,11 @@ struct ZSTD_CCtx_s { ZSTD_cStreamStage streamStage; U32 frameEnded; + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; + size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + /* Dictionary */ ZSTD_localDict localDict; const ZSTD_CDict* cdict; @@ -280,17 +532,54 @@ struct ZSTD_CCtx_s { #ifdef ZSTD_MULTITHREAD ZSTDMT_CCtx* mtctx; #endif -}; -typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; + /* Tracing */ +#if ZSTD_TRACE + ZSTD_TraceCtx traceCtx; +#endif -typedef enum { ZSTD_noDict = 0, ZSTD_extDict = 1, ZSTD_dictMatchState = 2 } ZSTD_dictMode_e; + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; + /* 
Buffer for output from external sequence producer */ + ZSTD_Sequence* extSeqBuf; + size_t extSeqBufCapacity; +}; -typedef size_t (*ZSTD_blockCompressor) ( - ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; +typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + +typedef enum { + ZSTD_noDict = 0, + ZSTD_extDict = 1, + ZSTD_dictMatchState = 2, + ZSTD_dedicatedDictSearch = 3 +} ZSTD_dictMode_e; + +typedef enum { + ZSTD_cpm_noAttachDict = 0, /* Compression with ZSTD_noDict or ZSTD_extDict. + * In this mode we use both the srcSize and the dictSize + * when selecting and adjusting parameters. + */ + ZSTD_cpm_attachDict = 1, /* Compression with ZSTD_dictMatchState or ZSTD_dedicatedDictSearch. + * In this mode we only take the srcSize into account when selecting + * and adjusting parameters. + */ + ZSTD_cpm_createCDict = 2, /* Creating a CDict. + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ + ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. + */ +} ZSTD_CParamMode_e; + +typedef size_t (*ZSTD_BlockCompressor_f) ( + ZSTD_MatchState_t* bs, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode); +ZSTD_BlockCompressor_f ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_ParamSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); MEM_STATIC U32 ZSTD_LLcode(U32 litLength) @@ -336,6 +625,52 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) return 1; } +/* ZSTD_selectAddr: + * @return index >= lowLimit ? 
candidate : backup, + * tries to force branchless codegen. */ +MEM_STATIC const BYTE* +ZSTD_selectAddr(U32 index, U32 lowLimit, const BYTE* candidate, const BYTE* backup) +{ +#if defined(__GNUC__) && defined(__x86_64__) + __asm__ ( + "cmp %1, %2\n" + "cmova %3, %0\n" + : "+r"(candidate) + : "r"(index), "r"(lowLimit), "r"(backup) + ); + return candidate; +#else + return index >= lowLimit ? candidate : backup; +#endif +} + +/* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. + * Returns the size of the block */ +MEM_STATIC size_t +ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) +{ + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); + DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); + ZSTD_memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize); + return ZSTD_blockHeaderSize + srcSize; +} + +MEM_STATIC size_t +ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) +{ + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); + RETURN_ERROR_IF(dstCapacity < 4, dstSize_tooSmall, ""); + MEM_writeLE24(op, cBlockHeader); + op[3] = src; + return 4; +} + + /* ZSTD_minGain() : * minimum compression required * to generate a compress block or a compressed literals section. @@ -344,16 +679,33 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) { U32 const minlog = (strat>=ZSTD_btultra) ? 
(U32)(strat) - 1 : 6; ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); - assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); + assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); return (srcSize >> minlog) + 2; } +MEM_STATIC int ZSTD_literalsCompressionIsDisabled(const ZSTD_CCtx_params* cctxParams) +{ + switch (cctxParams->literalCompressionMode) { + case ZSTD_ps_enable: + return 0; + case ZSTD_ps_disable: + return 1; + default: + assert(0 /* impossible: pre-validated */); + ZSTD_FALLTHROUGH; + case ZSTD_ps_auto: + return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0); + } +} + /*! ZSTD_safecopyLiterals() : * memcpy() function that won't read beyond more than WILDCOPY_OVERLENGTH bytes past ilimit_w. * Only called when the sequence ends past ilimit_w, so it only needs to be optimized for single * large copies. */ -static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w) { +static void +ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w) +{ assert(iend > ilimit_w); if (ip <= ilimit_w) { ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap); @@ -363,14 +715,69 @@ static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const ie while (ip < iend) *op++ = *ip++; } + +#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) +#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) +#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) +#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ +#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) +#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) +#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) +#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) +#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ + +/*! 
ZSTD_storeSeqOnly() : + * Store a sequence (litlen, litPtr, offBase and matchLength) into SeqStore_t. + * Literals themselves are not copied, but @litPtr is updated. + * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +*/ +HINT_INLINE UNUSED_ATTR void +ZSTD_storeSeqOnly(SeqStore_t* seqStorePtr, + size_t litLength, + U32 offBase, + size_t matchLength) +{ + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); + + /* literal Length */ + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (UNLIKELY(litLength>0xFFFF)) { + assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ + seqStorePtr->longLengthType = ZSTD_llt_literalLength; + seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ + seqStorePtr->sequences[0].offBase = offBase; + + /* match Length */ + assert(matchLength <= ZSTD_BLOCKSIZE_MAX); + assert(matchLength >= MINMATCH); + { size_t const mlBase = matchLength - MINMATCH; + if (UNLIKELY(mlBase>0xFFFF)) { + assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ + seqStorePtr->longLengthType = ZSTD_llt_matchLength; + seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].mlBase = (U16)mlBase; + } + + seqStorePtr->sequences++; +} + /*! ZSTD_storeSeq() : - * Store a sequence (litlen, litPtr, offCode and mlBase) into seqStore_t. - * `offCode` : distance to match + ZSTD_REP_MOVE (values <= ZSTD_REP_MOVE are repCodes). - * `mlBase` : matchLength - MINMATCH - * Allowed to overread literals up to litLimit. + * Store a sequence (litlen, litPtr, offBase and matchLength) into SeqStore_t. + * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). 
+ * @matchLength : must be >= MINMATCH + * Allowed to over-read literals up to litLimit. */ -HINT_INLINE UNUSED_ATTR -void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* literals, const BYTE* litLimit, U32 offCode, size_t mlBase) +HINT_INLINE UNUSED_ATTR void +ZSTD_storeSeq(SeqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, + U32 offBase, + size_t matchLength) { BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; BYTE const* const litEnd = literals + litLength; @@ -378,8 +785,8 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* litera static const BYTE* g_start = NULL; if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ { U32 const pos = (U32)((const BYTE*)literals - g_start); - DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", - pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offCode); + DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", + pos, (U32)litLength, (U32)matchLength, (U32)offBase); } #endif assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); @@ -389,9 +796,9 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* litera assert(literals + litLength <= litLimit); if (litEnd <= litLimit_w) { /* Common case we can use wildcopy. - * First copy 16 bytes, because literals are likely short. - */ - assert(WILDCOPY_OVERLENGTH >= 16); + * First copy 16 bytes, because literals are likely short. 
+ */ + ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); ZSTD_copy16(seqStorePtr->lit, literals); if (litLength > 16) { ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); @@ -401,101 +808,50 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* litera } seqStorePtr->lit += litLength; - /* literal Length */ - if (litLength>0xFFFF) { - assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ - seqStorePtr->longLengthID = 1; - seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); - } - seqStorePtr->sequences[0].litLength = (U16)litLength; - - /* match offset */ - seqStorePtr->sequences[0].offset = offCode + 1; + ZSTD_storeSeqOnly(seqStorePtr, litLength, offBase, matchLength); +} - /* match Length */ - if (mlBase>0xFFFF) { - assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ - seqStorePtr->longLengthID = 2; - seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); +/* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) + * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ +MEM_STATIC void +ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) +{ + if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ + U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? 
rep[1] : rep[2]; + rep[1] = rep[0]; + rep[0] = currentOffset; + } else { /* repCode == 0 */ + /* nothing to do */ + } } - seqStorePtr->sequences[0].matchLength = (U16)mlBase; +} - seqStorePtr->sequences++; +typedef struct repcodes_s { + U32 rep[3]; +} Repcodes_t; + +MEM_STATIC Repcodes_t +ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) +{ + Repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); + ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; } /*-************************************* * Match length counter ***************************************/ -static unsigned ZSTD_NbCommonBytes (size_t val) -{ - if (MEM_isLittleEndian()) { - if (MEM_64bits()) { -# if defined(_MSC_VER) && defined(_WIN64) - unsigned long r = 0; - _BitScanForward64( &r, (U64)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 4) - return (__builtin_ctzll((U64)val) >> 3); -# else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, - 0, 3, 1, 3, 1, 4, 2, 7, - 0, 2, 3, 6, 1, 5, 3, 5, - 1, 3, 4, 4, 2, 5, 6, 7, - 7, 0, 1, 2, 3, 3, 4, 6, - 2, 6, 5, 5, 3, 4, 5, 6, - 7, 1, 2, 4, 6, 4, 4, 5, - 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; -# endif - } else { /* 32 bits */ -# if defined(_MSC_VER) - unsigned long r=0; - _BitScanForward( &r, (U32)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_ctz((U32)val) >> 3); -# else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, - 3, 2, 2, 1, 3, 2, 0, 1, - 3, 3, 1, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif - } - } else { /* Big Endian CPU */ - if (MEM_64bits()) { -# if defined(_MSC_VER) && defined(_WIN64) - unsigned long r = 0; - _BitScanReverse64( &r, val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 4) - return 
(__builtin_clzll(val) >> 3); -# else - unsigned r; - const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ - if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -# endif - } else { /* 32 bits */ -# if defined(_MSC_VER) - unsigned long r = 0; - _BitScanReverse( &r, (unsigned long)val ); - return (unsigned)(r>>3); -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (__builtin_clz((U32)val) >> 3); -# else - unsigned r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; -# endif - } } -} - - MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) { const BYTE* const pStart = pIn; @@ -529,8 +885,8 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, size_t const matchLength = ZSTD_count(ip, match, vEnd); if (match + matchLength != mEnd) return matchLength; DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", matchLength); - DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match); - DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip); + DEBUGLOG(7, "distance from match beginning to end dictionary = %i", (int)(mEnd - match)); + DEBUGLOG(7, "distance from current pos to end buffer = %i", (int)(iEnd - ip)); DEBUGLOG(7, "next byte : ip==%02X, istart==%02X", ip[matchLength], *iStart); DEBUGLOG(7, "final match length = %zu", matchLength + ZSTD_count(ip+matchLength, iStart, iEnd)); return matchLength + ZSTD_count(ip+matchLength, iStart, iEnd); @@ -541,31 +897,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, * Hashes ***************************************/ static const U32 prime3bytes = 506832829U; -static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } -MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return 
ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ +static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } +MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ +MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } static const U32 prime4bytes = 2654435761U; -static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } -static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } +static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } +static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } +static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } static const U64 prime5bytes = 889523592379ULL; -static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } -static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } +static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } +static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } +static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } static const U64 prime6bytes = 227718039650203ULL; -static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } -static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } +static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } +static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return 
ZSTD_hash6(MEM_readLE64(p), h, 0); } +static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } static const U64 prime7bytes = 58295818150454627ULL; -static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } -static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } +static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } +static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } +static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; -static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } -static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } +static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; } +static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } +static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } + -MEM_STATIC size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) +MEM_STATIC FORCE_INLINE_ATTR +size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) { + /* Although some of these hashes do support hBits up to 64, some do not. + * To be on the safe side, always avoid hBits > 32. */ + assert(hBits <= 32); + switch(mls) { default: @@ -577,6 +945,24 @@ MEM_STATIC size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) } } +MEM_STATIC FORCE_INLINE_ATTR +size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { + /* Although some of these hashes do support hBits up to 64, some do not. + * To be on the safe side, always avoid hBits > 32. 
*/ + assert(hBits <= 32); + + switch(mls) + { + default: + case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); + case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); + case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); + case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); + case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); + } +} + + /** ZSTD_ipow() : * Return base^exponent. */ @@ -638,11 +1024,12 @@ MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toRemove, BYTE toAdd, U64 /*-************************************* * Round buffer management ***************************************/ -#if (ZSTD_WINDOWLOG_MAX_64 > 31) -# error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX" -#endif -/* Max current allowed */ -#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX)) +/* Max @current value allowed: + * In 32-bit mode: we want to avoid crossing the 2 GB limit, + * reducing risks of side effects in case of signed operations on indexes. + * In 64-bit mode: we want to ensure that adding the maximum job size (512 MB) + * doesn't overflow U32 index capacity (4 GB) */ +#define ZSTD_CURRENT_MAX (MEM_64bits() ? 3500U MB : 2000U MB) /* Maximum chunk size before overflow correction needs to be called again */ #define ZSTD_CHUNKSIZE_MAX \ ( ((U32)-1) /* Maximum ending current index */ \ @@ -661,6 +1048,13 @@ MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window) window->dictLimit = end; } +MEM_STATIC U32 ZSTD_window_isEmpty(ZSTD_window_t const window) +{ + return window.dictLimit == ZSTD_WINDOW_START_INDEX && + window.lowLimit == ZSTD_WINDOW_START_INDEX && + (window.nextSrc - window.base) == ZSTD_WINDOW_START_INDEX; +} + /** * ZSTD_window_hasExtDict(): * Returns non-zero if the window has a non-empty extDict. @@ -675,25 +1069,81 @@ MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window) * Inspects the provided matchState and figures out what dictMode should be * passed to the compressor. 
*/ -MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms) +MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_MatchState_t *ms) { return ZSTD_window_hasExtDict(ms->window) ? ZSTD_extDict : ms->dictMatchState != NULL ? - ZSTD_dictMatchState : + (ms->dictMatchState->dedicatedDictSearch ? ZSTD_dedicatedDictSearch : ZSTD_dictMatchState) : ZSTD_noDict; } +/* Defining this macro to non-zero tells zstd to run the overflow correction + * code much more frequently. This is very inefficient, and should only be + * used for tests and fuzzers. + */ +#ifndef ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY +# ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +# define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 1 +# else +# define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 0 +# endif +#endif + +/** + * ZSTD_window_canOverflowCorrect(): + * Returns non-zero if the indices are large enough for overflow correction + * to work correctly without impacting compression ratio. + */ +MEM_STATIC U32 ZSTD_window_canOverflowCorrect(ZSTD_window_t const window, + U32 cycleLog, + U32 maxDist, + U32 loadedDictEnd, + void const* src) +{ + U32 const cycleSize = 1u << cycleLog; + U32 const curr = (U32)((BYTE const*)src - window.base); + U32 const minIndexToOverflowCorrect = cycleSize + + MAX(maxDist, cycleSize) + + ZSTD_WINDOW_START_INDEX; + + /* Adjust the min index to backoff the overflow correction frequency, + * so we don't waste too much CPU in overflow correction. If this + * computation overflows we don't really care, we just need to make + * sure it is at least minIndexToOverflowCorrect. + */ + U32 const adjustment = window.nbOverflowCorrections + 1; + U32 const adjustedIndex = MAX(minIndexToOverflowCorrect * adjustment, + minIndexToOverflowCorrect); + U32 const indexLargeEnough = curr > adjustedIndex; + + /* Only overflow correct early if the dictionary is invalidated already, + * so we don't hurt compression ratio. 
+ */ + U32 const dictionaryInvalidated = curr > maxDist + loadedDictEnd; + + return indexLargeEnough && dictionaryInvalidated; +} + /** * ZSTD_window_needOverflowCorrection(): * Returns non-zero if the indices are getting too large and need overflow * protection. */ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, + U32 cycleLog, + U32 maxDist, + U32 loadedDictEnd, + void const* src, void const* srcEnd) { - U32 const current = (U32)((BYTE const*)srcEnd - window.base); - return current > ZSTD_CURRENT_MAX; + U32 const curr = (U32)((BYTE const*)srcEnd - window.base); + if (ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { + if (ZSTD_window_canOverflowCorrect(window, cycleLog, maxDist, loadedDictEnd, src)) { + return 1; + } + } + return curr > ZSTD_CURRENT_MAX; } /** @@ -704,9 +1154,10 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, * * The least significant cycleLog bits of the indices must remain the same, * which may be 0. Every index up to maxDist in the past must be valid. - * NOTE: (maxDist & cycleMask) must be zero. */ -MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, +MEM_STATIC +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, U32 maxDist, void const* src) { /* preemptive overflow correction: @@ -728,19 +1179,51 @@ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, * 3. 
(cctx->lowLimit + 1<<windowLog) < 1<<32: * windowLog <= 31 ==> 3<<29 + 1<<windowLog < 7<<29 < 1<<32 */ - U32 const cycleMask = (1U << cycleLog) - 1; - U32 const current = (U32)((BYTE const*)src - window->base); - U32 const newCurrent = (current & cycleMask) + maxDist; - U32 const correction = current - newCurrent; - assert((maxDist & cycleMask) == 0); - assert(current > newCurrent); - /* Loose bound, should be around 1<<29 (see above) */ - assert(correction > 1<<28); + U32 const cycleSize = 1u << cycleLog; + U32 const cycleMask = cycleSize - 1; + U32 const curr = (U32)((BYTE const*)src - window->base); + U32 const currentCycle = curr & cycleMask; + /* Ensure newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX. */ + U32 const currentCycleCorrection = currentCycle < ZSTD_WINDOW_START_INDEX + ? MAX(cycleSize, ZSTD_WINDOW_START_INDEX) + : 0; + U32 const newCurrent = currentCycle + + currentCycleCorrection + + MAX(maxDist, cycleSize); + U32 const correction = curr - newCurrent; + /* maxDist must be a power of two so that: + * (newCurrent & cycleMask) == (curr & cycleMask) + * This is required to not corrupt the chains / binary tree. + */ + assert((maxDist & (maxDist - 1)) == 0); + assert((curr & cycleMask) == (newCurrent & cycleMask)); + assert(curr > newCurrent); + if (!ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { + /* Loose bound, should be around 1<<29 (see above) */ + assert(correction > 1<<28); + } window->base += correction; window->dictBase += correction; - window->lowLimit -= correction; - window->dictLimit -= correction; + if (window->lowLimit < correction + ZSTD_WINDOW_START_INDEX) { + window->lowLimit = ZSTD_WINDOW_START_INDEX; + } else { + window->lowLimit -= correction; + } + if (window->dictLimit < correction + ZSTD_WINDOW_START_INDEX) { + window->dictLimit = ZSTD_WINDOW_START_INDEX; + } else { + window->dictLimit -= correction; + } + + /* Ensure we can still reference the full window. */ + assert(newCurrent >= maxDist); + assert(newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX); + /* Ensure that lowLimit and dictLimit didn't underflow.
*/ + assert(window->lowLimit <= newCurrent); + assert(window->dictLimit <= newCurrent); + + ++window->nbOverflowCorrections; DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction, window->lowLimit); @@ -775,7 +1258,7 @@ ZSTD_window_enforceMaxDist(ZSTD_window_t* window, const void* blockEnd, U32 maxDist, U32* loadedDictEndPtr, - const ZSTD_matchState_t** dictMatchStatePtr) + const ZSTD_MatchState_t** dictMatchStatePtr) { U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base); U32 const loadedDictEnd = (loadedDictEndPtr != NULL) ? *loadedDictEndPtr : 0; @@ -820,7 +1303,7 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, const void* blockEnd, U32 maxDist, U32* loadedDictEndPtr, - const ZSTD_matchState_t** dictMatchStatePtr) + const ZSTD_MatchState_t** dictMatchStatePtr) { assert(loadedDictEndPtr != NULL); assert(dictMatchStatePtr != NULL); @@ -830,10 +1313,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); assert(blockEndIdx >= loadedDictEnd); - if (blockEndIdx > loadedDictEnd + maxDist) { + if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { /* On reaching window size, dictionaries are invalidated. * For simplification, if window size is reached anywhere within next block, * the dictionary is invalidated for the full block. + * + * We also have to invalidate the dictionary if ZSTD_window_update() has detected + * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. + * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use + * dictMatchState, so setting it to NULL is not a problem. 
*/ DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); *loadedDictEndPtr = 0; @@ -844,6 +1332,17 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, } } } } +MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { + ZSTD_memset(window, 0, sizeof(*window)); + window->base = (BYTE const*)" "; + window->dictBase = (BYTE const*)" "; + ZSTD_STATIC_ASSERT(ZSTD_DUBT_UNSORTED_MARK < ZSTD_WINDOW_START_INDEX); /* Start above ZSTD_DUBT_UNSORTED_MARK */ + window->dictLimit = ZSTD_WINDOW_START_INDEX; /* start from >0, so that 1st position is valid */ + window->lowLimit = ZSTD_WINDOW_START_INDEX; /* it ensures first and later CCtx usages compress the same */ + window->nextSrc = window->base + ZSTD_WINDOW_START_INDEX; /* see issue #1241 */ + window->nbOverflowCorrections = 0; +} + /** * ZSTD_window_update(): * Updates the window by appending [src, src + srcSize) to the window. @@ -851,14 +1350,21 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, * forget about the extDict. Handles overlap of the prefix and extDict. * Returns non-zero if the segment is contiguous. 
*/ -MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, - void const* src, size_t srcSize) +MEM_STATIC +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_window_update(ZSTD_window_t* window, + const void* src, size_t srcSize, + int forceNonContiguous) { BYTE const* const ip = (BYTE const*)src; U32 contiguous = 1; DEBUGLOG(5, "ZSTD_window_update"); + if (srcSize == 0) + return contiguous; + assert(window->base != NULL); + assert(window->dictBase != NULL); /* Check if blocks follow each other */ - if (src != window->nextSrc) { + if (src != window->nextSrc || forceNonContiguous) { /* not contiguous */ size_t const distanceFromBase = (size_t)(window->nextSrc - window->base); DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit); @@ -867,7 +1373,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, window->dictLimit = (U32)distanceFromBase; window->dictBase = window->base; window->base = ip - distanceFromBase; - // ms->nextToUpdate = window->dictLimit; + /* ms->nextToUpdate = window->dictLimit; */ if (window->dictLimit - window->lowLimit < HASH_READ_SIZE) window->lowLimit = window->dictLimit; /* too small extDict */ contiguous = 0; } @@ -875,24 +1381,55 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */ if ( (ip+srcSize > window->dictBase + window->lowLimit) & (ip < window->dictBase + window->dictLimit)) { - ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase; - U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx; + size_t const highInputIdx = (size_t)((ip + srcSize) - window->dictBase); + U32 const lowLimitMax = (highInputIdx > (size_t)window->dictLimit) ? 
window->dictLimit : (U32)highInputIdx; + assert(highInputIdx < UINT_MAX); window->lowLimit = lowLimitMax; DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit); } return contiguous; } -MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 current, unsigned windowLog) +/** + * Returns the lowest allowed match index. It may either be in the ext-dict or the prefix. + */ +MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_MatchState_t* ms, U32 curr, unsigned windowLog) +{ + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.lowLimit; + U32 const withinWindow = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; + U32 const isDictionary = (ms->loadedDictEnd != 0); + /* When using a dictionary the entire dictionary is valid if a single byte of the dictionary + * is within the window. We invalidate the dictionary (and set loadedDictEnd to 0) when it isn't + * valid for the entire block. So this check is sufficient to find the lowest valid match index. + */ + U32 const matchLowest = isDictionary ? lowestValid : withinWindow; + return matchLowest; +} + +/** + * Returns the lowest allowed match index in the prefix. + */ +MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_MatchState_t* ms, U32 curr, unsigned windowLog) { U32 const maxDistance = 1U << windowLog; - U32 const lowestValid = ms->window.lowLimit; - U32 const withinWindow = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid; + U32 const lowestValid = ms->window.dictLimit; + U32 const withinWindow = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; U32 const isDictionary = (ms->loadedDictEnd != 0); + /* When computing the lowest prefix index we need to take the dictionary into account to handle + * the edge case where the dictionary and the source are contiguous in memory. + */ U32 const matchLowest = isDictionary ? 
lowestValid : withinWindow; return matchLowest; } +/* index_safety_check: + * intentional underflow : ensure repIndex isn't overlapping dict + prefix + * @return 1 if values are not overlapping, + * 0 otherwise */ +MEM_STATIC int ZSTD_index_overlap_check(const U32 prefixLowestIndex, const U32 repIndex) { + return ((U32)((prefixLowestIndex-1) - repIndex) >= 3); +} /* debug functions */ @@ -926,11 +1463,76 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) #endif +/* Short Cache */ -#if defined (__cplusplus) +/* Normally, zstd matchfinders follow this flow: + * 1. Compute hash at ip + * 2. Load index from hashTable[hash] + * 3. Check if *ip == *(base + index) + * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. + * + * Short cache is an optimization which allows us to avoid step 3 most of the time + * when the data doesn't actually match. With short cache, the flow becomes: + * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. + * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. + * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. + * + * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to + * dictMatchState matchfinders. + */ +#define ZSTD_SHORT_CACHE_TAG_BITS 8 +#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) + +/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. + * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. 
*/ +MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { + size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; + U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); + assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); + hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; } -#endif +/* Helper function for short cache matchfinders. + * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */ +MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { + U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; + U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; + return tag1 == tag2; +} + +/* =============================================================== + * Shared internal declarations + * These prototypes may be called from sources not in lib/compress + * =============================================================== */ + +/* ZSTD_loadCEntropy() : + * dict : must point at beginning of a valid zstd dictionary. 
+ * return : size of dictionary header (size of magic number + dict ID + entropy tables) + * assumptions : magic number supposed already checked + * and dictSize >= 8 */ +size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + const void* const dict, size_t dictSize); + +void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs); + +typedef struct { + U32 idx; /* Index in array of ZSTD_Sequence */ + U32 posInSequence; /* Position within sequence at idx */ + size_t posInSrc; /* Number of bytes given by sequences provided so far */ +} ZSTD_SequencePosition; + +/* for benchmark */ +size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, + const ZSTD_Sequence* const inSeqs, size_t nbSequences, + int const repcodeResolution); + +typedef struct { + size_t nbSequences; + size_t blockSize; + size_t litSize; +} BlockSummary; + +BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs); /* ============================================================== * Private declarations @@ -940,9 +1542,10 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) /* ZSTD_getCParamsFromCCtxParams() : * cParams are built depending on compressionLevel, src size hints, * LDM and manually set compression parameters. + * Note: srcSizeHint == 0 means 0! */ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( - const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize); + const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); /*! ZSTD_initCStream_internal() : * Private use only. Init streaming operation. @@ -954,7 +1557,7 @@ size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize); -void ZSTD_resetSeqStore(seqStore_t* ssPtr); +void ZSTD_resetSeqStore(SeqStore_t* ssPtr); /*! 
ZSTD_getCParamsFromCDict() : * as the name implies */ @@ -993,11 +1596,44 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity); * This cannot be used when long range matching is enabled. * Zstd will use these sequences, and pass the literals to a secondary block * compressor. - * @return : An error code on failure. * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory * access and data corruption. */ -size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); +void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); + +/** ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); + +/** ZSTD_CCtx_trace() : + * Trace the end of a compression call. + */ +void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + +/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */ +MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) { + return params->extSeqProdFunc != NULL; +} + +/* =============================================================== + * Deprecated definitions that are still used internally to avoid + * deprecation warnings. These functions are exactly equivalent to + * their public variants, but avoid the deprecation warnings. 
+ * =============================================================== */ + +size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + +size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + +size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + +size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); #endif /* ZSTD_COMPRESS_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_compress_literals.c b/vendor/github.com/DataDog/zstd/zstd_compress_literals.c index 6c13331..8fab4f4 100644 --- a/vendor/github.com/DataDog/zstd/zstd_compress_literals.c +++ b/vendor/github.com/DataDog/zstd/zstd_compress_literals.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -13,12 +14,37 @@ ***************************************/ #include "zstd_compress_literals.h" + +/* ************************************************************** +* Debug Traces +****************************************************************/ +#if DEBUGLEVEL >= 2 + +static size_t showHexa(const void* src, size_t srcSize) +{ + const BYTE* const ip = (const BYTE*)src; + size_t u; + for (u=0; u31) + (srcSize>4095); - RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall); + DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); + + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); switch(flSize) { @@ -35,16 +61,31 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, assert(0); } - memcpy(ostart + flSize, src, srcSize); + ZSTD_memcpy(ostart + flSize, src, srcSize); + DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); return srcSize + flSize; } +static int allBytesIdentical(const void* src, size_t srcSize) +{ + assert(srcSize >= 1); + assert(src != NULL); + { const BYTE b = ((const BYTE*)src)[0]; + size_t p; + for (p=1; p31) + (srcSize>4095); - (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ + assert(dstCapacity >= 4); (void)dstCapacity; + assert(allBytesIdentical(src, srcSize)); switch(flSize) { @@ -62,66 +103,103 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* } ostart[flSize] = *(const BYTE*)src; + DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); return flSize+1; } -size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, - ZSTD_hufCTables_t* nextHuf, - ZSTD_strategy strategy, int disableLiteralCompression, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - void* 
entropyWorkspace, size_t entropyWorkspaceSize, - const int bmi2) +/* ZSTD_minLiteralsToCompress() : + * returns minimal amount of literals + * for literal compression to even be attempted. + * Minimum is made tighter as compression strategy increases. + */ +static size_t +ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) +{ + assert((int)strategy >= 0); + assert((int)strategy <= 9); + /* btultra2 : min 8 bytes; + * then 2x larger for each successive compression strategy + * max threshold 64 bytes */ + { int const shift = MIN(9-(int)strategy, 3); + size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; + DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); + return mintc; + } +} + +size_t ZSTD_compressLiterals ( + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, + const ZSTD_hufCTables_t* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_strategy strategy, + int disableLiteralCompression, + int suspectUncompressible, + int bmi2) { - size_t const minGain = ZSTD_minGain(srcSize, strategy); size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); BYTE* const ostart = (BYTE*)dst; U32 singleStream = srcSize < 256; - symbolEncodingType_e hType = set_compressed; + SymbolEncodingType_e hType = set_compressed; size_t cLitSize; - DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i)", - disableLiteralCompression); + DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", + disableLiteralCompression, (U32)srcSize, dstCapacity); + + DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); /* Prepare nextEntropy assuming reusing the existing table */ - memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); if (disableLiteralCompression) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); - /* small ? 
don't even attempt compression (speed opt) */ -# define COMPRESS_LITERALS_SIZE_MIN 63 - { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; - if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); - } + /* if too small, don't even attempt compression (speed opt) */ + if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); { HUF_repeat repeat = prevHuf->repeatMode; - int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; + int const flags = 0 + | (bmi2 ? HUF_flags_bmi2 : 0) + | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) + | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) + | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); + + typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); + huf_compress_f huf_compress; if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; - cLitSize = singleStream ? - HUF_compress1X_repeat( - ostart+lhSize, dstCapacity-lhSize, src, srcSize, - 255, 11, entropyWorkspace, entropyWorkspaceSize, - (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) : - HUF_compress4X_repeat( - ostart+lhSize, dstCapacity-lhSize, src, srcSize, - 255, 11, entropyWorkspace, entropyWorkspaceSize, - (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2); + huf_compress = singleStream ? 
HUF_compress1X_repeat : HUF_compress4X_repeat; + cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, + src, srcSize, + HUF_SYMBOLVALUE_MAX, LitHufLog, + entropyWorkspace, entropyWorkspaceSize, + (HUF_CElt*)nextHuf->CTable, + &repeat, flags); + DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); if (repeat != HUF_repeat_none) { /* reused the existing table */ + DEBUGLOG(5, "reusing statistics from previous huffman block"); hType = set_repeat; } } - if ((cLitSize==0) | (cLitSize >= srcSize - minGain) | ERR_isError(cLitSize)) { - memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); - } + { size_t const minGain = ZSTD_minGain(srcSize, strategy); + if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + } } if (cLitSize==1) { - memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); - } + /* A return value of 1 signals that the alphabet consists of a single symbol. + * However, in some rare circumstances, it could be the compressed size (a single byte). + * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. + * (it's also necessary to not generate statistics). + * Therefore, in such a case, actively check that all bytes are identical. 
*/ + if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); + } } if (hType == set_compressed) { /* using a newly constructed table */ @@ -132,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, switch(lhSize) { case 3: /* 2 - 2 - 10 - 10 */ - { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); MEM_writeLE24(ostart, lhc); break; } case 4: /* 2 - 2 - 14 - 14 */ + assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); MEM_writeLE32(ostart, lhc); break; } case 5: /* 2 - 2 - 18 - 18 */ + assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); MEM_writeLE32(ostart, lhc); ostart[4] = (BYTE)(cLitSize >> 10); @@ -150,5 +231,8 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, default: /* not possible : lhSize is {3,4,5} */ assert(0); } + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)srcSize, (U32)(lhSize+cLitSize)); return lhSize+cLitSize; } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_compress_literals.h b/vendor/github.com/DataDog/zstd/zstd_compress_literals.h index 97273d7..80cb1db 100644 --- a/vendor/github.com/DataDog/zstd/zstd_compress_literals.h +++ b/vendor/github.com/DataDog/zstd/zstd_compress_literals.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -16,14 +17,26 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); +/* ZSTD_compressRleLiteralsBlock() : + * Conditions : + * - All bytes in @src are identical + * - dstCapacity >= 4 */ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); -size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, - ZSTD_hufCTables_t* nextHuf, - ZSTD_strategy strategy, int disableLiteralCompression, - void* dst, size_t dstCapacity, +/* ZSTD_compressLiterals(): + * @entropyWorkspace: must be aligned on 4-bytes boundaries + * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE + * @suspectUncompressible: sampling checks, to potentially skip huffman coding + */ +size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize, void* entropyWorkspace, size_t entropyWorkspaceSize, - const int bmi2); + const ZSTD_hufCTables_t* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_strategy strategy, int disableLiteralCompression, + int suspectUncompressible, + int bmi2); #endif /* ZSTD_COMPRESS_LITERALS_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_compress_sequences.c b/vendor/github.com/DataDog/zstd/zstd_compress_sequences.c index 0ff7a26..0496ab8 100644 --- a/vendor/github.com/DataDog/zstd/zstd_compress_sequences.c +++ b/vendor/github.com/DataDog/zstd/zstd_compress_sequences.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -50,6 +51,19 @@ static unsigned ZSTD_getFSEMaxSymbolValue(FSE_CTable const* ctable) { return maxSymbolValue; } +/** + * Returns true if we should use ncount=-1 else we should + * use ncount=1 for low probability symbols instead. + */ +static unsigned ZSTD_useLowProbCount(size_t const nbSeq) +{ + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on + * compressibility. + */ + return nbSeq >= 2048; +} + /** * Returns the cost in bytes of encoding the normalized count header. * Returns an error if any of the helper functions return an error. @@ -60,7 +74,7 @@ static size_t ZSTD_NCountCost(unsigned const* count, unsigned const max, BYTE wksp[FSE_NCOUNTBOUND]; S16 norm[MaxSeq + 1]; const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max); - FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq, max)); + FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq, max, ZSTD_useLowProbCount(nbSeq)), ""); return FSE_writeNCount(wksp, sizeof(wksp), norm, max, tableLog); } @@ -72,6 +86,8 @@ static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t { unsigned cost = 0; unsigned s; + + assert(total > 0); for (s = 0; s <= max; ++s) { unsigned norm = (unsigned)((256 * count[s]) / total); if (count[s] != 0 && norm == 0) @@ -86,7 +102,7 @@ static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t * Returns the cost in bits of encoding the distribution in count using ctable. * Returns an error if ctable cannot represent all the symbols in count. 
*/ -static size_t ZSTD_fseBitCost( +size_t ZSTD_fseBitCost( FSE_CTable const* ctable, unsigned const* count, unsigned const max) @@ -96,18 +112,22 @@ static size_t ZSTD_fseBitCost( unsigned s; FSE_CState_t cstate; FSE_initCState(&cstate, ctable); - RETURN_ERROR_IF(ZSTD_getFSEMaxSymbolValue(ctable) < max, GENERIC, - "Repeat FSE_CTable has maxSymbolValue %u < %u", + if (ZSTD_getFSEMaxSymbolValue(ctable) < max) { + DEBUGLOG(5, "Repeat FSE_CTable has maxSymbolValue %u < %u", ZSTD_getFSEMaxSymbolValue(ctable), max); + return ERROR(GENERIC); + } for (s = 0; s <= max; ++s) { unsigned const tableLog = cstate.stateLog; unsigned const badCost = (tableLog + 1) << kAccuracyLog; unsigned const bitCost = FSE_bitCost(cstate.symbolTT, tableLog, s, kAccuracyLog); if (count[s] == 0) continue; - RETURN_ERROR_IF(bitCost >= badCost, GENERIC, - "Repeat FSE_CTable has Prob[%u] == 0", s); - cost += count[s] * bitCost; + if (bitCost >= badCost) { + DEBUGLOG(5, "Repeat FSE_CTable has Prob[%u] == 0", s); + return ERROR(GENERIC); + } + cost += (size_t)count[s] * bitCost; } return cost >> kAccuracyLog; } @@ -117,15 +137,15 @@ static size_t ZSTD_fseBitCost( * table described by norm. The max symbol support by norm is assumed >= max. * norm must be valid for every symbol with non-zero probability in count. */ -static size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, - unsigned const* count, unsigned const max) +size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, + unsigned const* count, unsigned const max) { unsigned const shift = 8 - accuracyLog; size_t cost = 0; unsigned s; assert(accuracyLog <= 8); for (s = 0; s <= max; ++s) { - unsigned const normAcc = norm[s] != -1 ? norm[s] : 1; + unsigned const normAcc = (norm[s] != -1) ? 
(unsigned)norm[s] : 1; unsigned const norm256 = normAcc << shift; assert(norm256 > 0); assert(norm256 < 256); @@ -134,20 +154,20 @@ static size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, return cost >> 8; } -symbolEncodingType_e +SymbolEncodingType_e ZSTD_selectEncodingType( FSE_repeat* repeatMode, unsigned const* count, unsigned const max, size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, FSE_CTable const* prevCTable, short const* defaultNorm, U32 defaultNormLog, - ZSTD_defaultPolicy_e const isDefaultAllowed, + ZSTD_DefaultPolicy_e const isDefaultAllowed, ZSTD_strategy const strategy) { ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0); if (mostFrequent == nbSeq) { *repeatMode = FSE_repeat_none; if (isDefaultAllowed && nbSeq <= 2) { - /* Prefer set_basic over set_rle when there are 2 or less symbols, + /* Prefer set_basic over set_rle when there are 2 or fewer symbols, * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. * If basic encoding isn't possible, always choose RLE. 
*/ @@ -215,9 +235,14 @@ ZSTD_selectEncodingType( return set_compressed; } +typedef struct { + S16 norm[MaxSeq + 1]; + U32 wksp[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(MaxSeq, MaxFSELog)]; +} ZSTD_BuildCTableWksp; + size_t ZSTD_buildCTable(void* dst, size_t dstCapacity, - FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, + FSE_CTable* nextCTable, U32 FSELog, SymbolEncodingType_e type, unsigned* count, U32 max, const BYTE* codeTable, size_t nbSeq, const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, @@ -230,18 +255,18 @@ ZSTD_buildCTable(void* dst, size_t dstCapacity, switch (type) { case set_rle: - FORWARD_IF_ERROR(FSE_buildCTable_rle(nextCTable, (BYTE)max)); - RETURN_ERROR_IF(dstCapacity==0, dstSize_tooSmall); + FORWARD_IF_ERROR(FSE_buildCTable_rle(nextCTable, (BYTE)max), ""); + RETURN_ERROR_IF(dstCapacity==0, dstSize_tooSmall, "not enough space"); *op = codeTable[0]; return 1; case set_repeat: - memcpy(nextCTable, prevCTable, prevCTableSize); + ZSTD_memcpy(nextCTable, prevCTable, prevCTableSize); return 0; case set_basic: - FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, entropyWorkspace, entropyWorkspaceSize)); /* note : could be pre-calculated */ + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, entropyWorkspace, entropyWorkspaceSize), ""); /* note : could be pre-calculated */ return 0; case set_compressed: { - S16 norm[MaxSeq + 1]; + ZSTD_BuildCTableWksp* wksp = (ZSTD_BuildCTableWksp*)entropyWorkspace; size_t nbSeq_1 = nbSeq; const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max); if (count[codeTable[nbSeq-1]] > 1) { @@ -249,14 +274,17 @@ ZSTD_buildCTable(void* dst, size_t dstCapacity, nbSeq_1--; } assert(nbSeq_1 > 1); - FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max)); - { size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */ - FORWARD_IF_ERROR(NCountSize); - 
FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, norm, max, tableLog, entropyWorkspace, entropyWorkspaceSize)); + assert(entropyWorkspaceSize >= sizeof(ZSTD_BuildCTableWksp)); + (void)entropyWorkspaceSize; + FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "FSE_normalizeCount failed"); + assert(oend >= op); + { size_t const NCountSize = FSE_writeNCount(op, (size_t)(oend - op), wksp->norm, max, tableLog); /* overflow protected */ + FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed"); + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), "FSE_buildCTable_wksp failed"); return NCountSize; } } - default: assert(0); RETURN_ERROR(GENERIC); + default: assert(0); RETURN_ERROR(GENERIC, "impossible to reach"); } } @@ -266,7 +294,7 @@ ZSTD_encodeSequences_body( FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, - seqDef const* sequences, size_t nbSeq, int longOffsets) + SeqDef const* sequences, size_t nbSeq, int longOffsets) { BIT_CStream_t blockStream; FSE_CState_t stateMatchLength; @@ -286,19 +314,19 @@ ZSTD_encodeSequences_body( FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]); BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]); if (MEM_32bits()) BIT_flushBits(&blockStream); - BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]); + BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTable[nbSeq-1]]); if (MEM_32bits()) BIT_flushBits(&blockStream); if (longOffsets) { U32 const ofBits = ofCodeTable[nbSeq-1]; - int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); + unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); if (extraBits) { - BIT_addBits(&blockStream, 
sequences[nbSeq-1].offset, extraBits); + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, extraBits); BIT_flushBits(&blockStream); } - BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits, + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase >> extraBits, ofBits - extraBits); } else { - BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]); + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, ofCodeTable[nbSeq-1]); } BIT_flushBits(&blockStream); @@ -312,8 +340,8 @@ ZSTD_encodeSequences_body( U32 const mlBits = ML_bits[mlCode]; DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u", (unsigned)sequences[n].litLength, - (unsigned)sequences[n].matchLength + MINMATCH, - (unsigned)sequences[n].offset); + (unsigned)sequences[n].mlBase + MINMATCH, + (unsigned)sequences[n].offBase); /* 32b*/ /* 64b*/ /* (7)*/ /* (7)*/ FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); /* 15 */ /* 15 */ @@ -324,18 +352,18 @@ ZSTD_encodeSequences_body( BIT_flushBits(&blockStream); /* (7)*/ BIT_addBits(&blockStream, sequences[n].litLength, llBits); if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream); - BIT_addBits(&blockStream, sequences[n].matchLength, mlBits); + BIT_addBits(&blockStream, sequences[n].mlBase, mlBits); if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream); if (longOffsets) { - int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); + unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); if (extraBits) { - BIT_addBits(&blockStream, sequences[n].offset, extraBits); + BIT_addBits(&blockStream, sequences[n].offBase, extraBits); BIT_flushBits(&blockStream); /* (7)*/ } - BIT_addBits(&blockStream, sequences[n].offset >> extraBits, + BIT_addBits(&blockStream, sequences[n].offBase >> extraBits, ofBits - extraBits); /* 31 */ } else { - BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */ + BIT_addBits(&blockStream, sequences[n].offBase, 
ofBits); /* 31 */ } BIT_flushBits(&blockStream); /* (7)*/ DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr - blockStream.ptr)); @@ -360,7 +388,7 @@ ZSTD_encodeSequences_default( FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, - seqDef const* sequences, size_t nbSeq, int longOffsets) + SeqDef const* sequences, size_t nbSeq, int longOffsets) { return ZSTD_encodeSequences_body(dst, dstCapacity, CTable_MatchLength, mlCodeTable, @@ -372,13 +400,13 @@ ZSTD_encodeSequences_default( #if DYNAMIC_BMI2 -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t ZSTD_encodeSequences_bmi2( void* dst, size_t dstCapacity, FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, - seqDef const* sequences, size_t nbSeq, int longOffsets) + SeqDef const* sequences, size_t nbSeq, int longOffsets) { return ZSTD_encodeSequences_body(dst, dstCapacity, CTable_MatchLength, mlCodeTable, @@ -394,7 +422,7 @@ size_t ZSTD_encodeSequences( FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, - seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) + SeqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) { DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity); #if DYNAMIC_BMI2 @@ -413,3 +441,5 @@ size_t ZSTD_encodeSequences( CTable_LitLength, llCodeTable, sequences, nbSeq, longOffsets); } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_compress_sequences.h b/vendor/github.com/DataDog/zstd/zstd_compress_sequences.h index 57e8e36..ac1b42e 100644 --- 
a/vendor/github.com/DataDog/zstd/zstd_compress_sequences.h +++ b/vendor/github.com/DataDog/zstd/zstd_compress_sequences.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,26 +12,27 @@ #ifndef ZSTD_COMPRESS_SEQUENCES_H #define ZSTD_COMPRESS_SEQUENCES_H +#include "zstd_compress_internal.h" /* SeqDef */ #include "fse.h" /* FSE_repeat, FSE_CTable */ -#include "zstd_internal.h" /* symbolEncodingType_e, ZSTD_strategy */ +#include "zstd_internal.h" /* SymbolEncodingType_e, ZSTD_strategy */ typedef enum { ZSTD_defaultDisallowed = 0, ZSTD_defaultAllowed = 1 -} ZSTD_defaultPolicy_e; +} ZSTD_DefaultPolicy_e; -symbolEncodingType_e +SymbolEncodingType_e ZSTD_selectEncodingType( FSE_repeat* repeatMode, unsigned const* count, unsigned const max, size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, FSE_CTable const* prevCTable, short const* defaultNorm, U32 defaultNormLog, - ZSTD_defaultPolicy_e const isDefaultAllowed, + ZSTD_DefaultPolicy_e const isDefaultAllowed, ZSTD_strategy const strategy); size_t ZSTD_buildCTable(void* dst, size_t dstCapacity, - FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, + FSE_CTable* nextCTable, U32 FSELog, SymbolEncodingType_e type, unsigned* count, U32 max, const BYTE* codeTable, size_t nbSeq, const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, @@ -42,6 +44,15 @@ size_t ZSTD_encodeSequences( FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, - seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2); + SeqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2); +size_t ZSTD_fseBitCost( + FSE_CTable const* ctable, + unsigned const* count, + unsigned 
const max); + +size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, + unsigned const* count, unsigned const max); #endif /* ZSTD_COMPRESS_SEQUENCES_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_compress_superblock.c b/vendor/github.com/DataDog/zstd/zstd_compress_superblock.c new file mode 100644 index 0000000..582b2d1 --- /dev/null +++ b/vendor/github.com/DataDog/zstd/zstd_compress_superblock.c @@ -0,0 +1,691 @@ +#ifndef USE_EXTERNAL_ZSTD +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + /*-************************************* + * Dependencies + ***************************************/ +#include "zstd_compress_superblock.h" + +#include "zstd_internal.h" /* ZSTD_getSequenceLength */ +#include "hist.h" /* HIST_countFast_wksp */ +#include "zstd_compress_internal.h" /* ZSTD_[huf|fse|entropy]CTablesMetadata_t */ +#include "zstd_compress_sequences.h" +#include "zstd_compress_literals.h" + +/** ZSTD_compressSubBlock_literal() : + * Compresses literals section for a sub-block. + * When we have to write the Huffman table we will sometimes choose a header + * size larger than necessary. This is because we have to pick the header size + * before we know the table size + compressed size, so we have a bound on the + * table size. If we guessed incorrectly, we fall back to uncompressed literals. + * + * We write the header when writeEntropy=1 and set entropyWritten=1 when we succeeded + * in writing the header, otherwise it is set to 0. + * + * hufMetadata->hType has literals block type info. + * If it is set_basic, all sub-blocks literals section will be Raw_Literals_Block. 
+ * If it is set_rle, all sub-blocks literals section will be RLE_Literals_Block. + * If it is set_compressed, first sub-block's literals section will be Compressed_Literals_Block + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block + * Or 0 if unable to compress. + * Or error code */ +static size_t +ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + const ZSTD_hufCTablesMetadata_t* hufMetadata, + const BYTE* literals, size_t litSize, + void* dst, size_t dstSize, + const int bmi2, int writeEntropy, int* entropyWritten) +{ + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart + lhSize; + U32 const singleStream = lhSize == 3; + SymbolEncodingType_e hType = writeEntropy ? 
hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; + if (litSize == 0 || hufMetadata->hType == set_basic) { + DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal"); + return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize); + } else if (hufMetadata->hType == set_rle) { + DEBUGLOG(5, "ZSTD_compressSubBlock_literal using rle literal"); + return ZSTD_compressRleLiteralsBlock(dst, dstSize, literals, litSize); + } + + assert(litSize > 0); + assert(hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat); + + if (writeEntropy && hufMetadata->hType == set_compressed) { + ZSTD_memcpy(op, hufMetadata->hufDesBuffer, hufMetadata->hufDesSize); + op += hufMetadata->hufDesSize; + cLitSize += hufMetadata->hufDesSize; + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + + { int const flags = bmi2 ? HUF_flags_bmi2 : 0; + const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags) + : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { + DEBUGLOG(5, "Failed to write entropy tables %s", ZSTD_getErrorName(cSize)); + return 0; + } + /* If we expand and we aren't writing a header then emit uncompressed */ + if (!writeEntropy && cLitSize >= litSize) { + DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal because uncompressible"); + return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize); + } + /* If we are writing headers then allow expansion that doesn't change our header size. 
*/ + if (lhSize < (size_t)(3 + (cLitSize >= 1 KB) + (cLitSize >= 16 KB))) { + assert(cLitSize > litSize); + DEBUGLOG(5, "Literals expanded beyond allowed header size"); + return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize); + } + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (cSize=%zu)", cSize); + } + + /* Build header */ + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ + { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ + { U32 const lhc = hType + (2 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ + { U32 const lhc = hType + (3 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); + break; + } + default: /* not possible : lhSize is {3,4,5} */ + assert(0); + } + *entropyWritten = 1; + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); + return (size_t)(op-ostart); +} + +static size_t +ZSTD_seqDecompressedSize(SeqStore_t const* seqStore, + const SeqDef* sequences, size_t nbSeqs, + size_t litSize, int lastSubBlock) +{ + size_t matchLengthSum = 0; + size_t litLengthSum = 0; + size_t n; + for (n=0; nllType, fseMetadata->ofType, and fseMetadata->mlType have + * symbol compression modes for the super-block. + * The first successfully compressed block will have these in its header. + * We set entropyWritten=1 when we succeed in compressing the sequences. + * The following sub-blocks will always have repeat mode. + * @return : compressed size of sequences section of a sub-block + * Or 0 if it is unable to compress + * Or error code. 
*/ +static size_t +ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, + const SeqDef* sequences, size_t nbSeq, + const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + const int bmi2, int writeEntropy, int* entropyWritten) +{ + const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + BYTE* seqHead; + + DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (nbSeq=%zu, writeEntropy=%d, longOffsets=%d)", nbSeq, writeEntropy, longOffsets); + + *entropyWritten = 0; + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, + dstSize_tooSmall, ""); + if (nbSeq < 128) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; + else + op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + if (nbSeq==0) { + return (size_t)(op - ostart); + } + + /* seqHead : flags for FSE encoding type */ + seqHead = op++; + + DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (seqHeadSize=%u)", (unsigned)(op-ostart)); + + if (writeEntropy) { + const U32 LLtype = fseMetadata->llType; + const U32 Offtype = fseMetadata->ofType; + const U32 MLtype = fseMetadata->mlType; + DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (fseTablesSize=%zu)", fseMetadata->fseTablesSize); + *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); + ZSTD_memcpy(op, fseMetadata->fseTablesBuffer, fseMetadata->fseTablesSize); + op += fseMetadata->fseTablesSize; + } else { + const U32 repeat = set_repeat; + *seqHead = (BYTE)((repeat<<6) + (repeat<<4) + (repeat<<2)); + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( + op, (size_t)(oend - op), + fseTables->matchlengthCTable, mlCode, + fseTables->offcodeCTable, ofCode, + fseTables->litlengthCTable, llCode, + 
sequences, nbSeq, + longOffsets, bmi2); + FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed"); + op += bitstreamSize; + /* zstd versions <= 1.3.4 mistakenly report corruption when + * FSE_readNCount() receives a buffer < 4 bytes. + * Fixed by https://github.com/facebook/zstd/pull/1146. + * This can happen when the last set_compressed table present is 2 + * bytes and the bitstream is only one byte. + * In this exceedingly rare case, we will simply emit an uncompressed + * block, since it isn't worth optimizing. + */ +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (writeEntropy && fseMetadata->lastCountSize && fseMetadata->lastCountSize + bitstreamSize < 4) { + /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ + assert(fseMetadata->lastCountSize + bitstreamSize == 3); + DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by " + "emitting an uncompressed block."); + return 0; + } +#endif + DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (bitstreamSize=%zu)", bitstreamSize); + } + + /* zstd versions <= 1.4.0 mistakenly report error when + * sequences section body size is less than 3 bytes. + * Fixed by https://github.com/facebook/zstd/pull/1664. + * This can happen when the previous sequences section block is compressed + * with rle mode and the current block's sequences section is compressed + * with repeat mode where sequences section body size can be 1 byte. + */ +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (op-seqHead < 4) { + DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.4.0 by emitting " + "an uncompressed block when sequences are < 4 bytes"); + return 0; + } +#endif + + *entropyWritten = 1; + return (size_t)(op - ostart); +} + +/** ZSTD_compressSubBlock() : + * Compresses a single sub-block. + * @return : compressed size of the sub-block + * Or 0 if it failed to compress. 
*/ +static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + const SeqDef* sequences, size_t nbSeq, + const BYTE* literals, size_t litSize, + const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + const int bmi2, + int writeLitEntropy, int writeSeqEntropy, + int* litEntropyWritten, int* seqEntropyWritten, + U32 lastBlock) +{ + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart + ZSTD_blockHeaderSize; + DEBUGLOG(5, "ZSTD_compressSubBlock (litSize=%zu, nbSeq=%zu, writeLitEntropy=%d, writeSeqEntropy=%d, lastBlock=%d)", + litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); + { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, + &entropyMetadata->hufMetadata, literals, litSize, + op, (size_t)(oend-op), + bmi2, writeLitEntropy, litEntropyWritten); + FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); + if (cLitSize == 0) return 0; + op += cLitSize; + } + { size_t cSeqSize = ZSTD_compressSubBlock_sequences(&entropy->fse, + &entropyMetadata->fseMetadata, + sequences, nbSeq, + llCode, mlCode, ofCode, + cctxParams, + op, (size_t)(oend-op), + bmi2, writeSeqEntropy, seqEntropyWritten); + FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed"); + if (cSeqSize == 0) return 0; + op += cSeqSize; + } + /* Write block header */ + { size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize; + U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(ostart, cBlockHeader24); + } + return (size_t)(op-ostart); +} + +static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, + const ZSTD_hufCTables_t* huf, + const ZSTD_hufCTablesMetadata_t* hufMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + unsigned* const countWksp = 
(unsigned*)workspace; + unsigned maxSymbolValue = 255; + size_t literalSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ + + if (hufMetadata->hType == set_basic) return litSize; + else if (hufMetadata->hType == set_rle) return 1; + else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) { + size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize); + if (ZSTD_isError(largest)) return litSize; + { size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue); + if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize; + return cLitSizeEstimate + literalSectionHeaderSize; + } } + assert(0); /* impossible */ + return 0; +} + +static size_t ZSTD_estimateSubBlockSize_symbolType(SymbolEncodingType_e type, + const BYTE* codeTable, unsigned maxCode, + size_t nbSeq, const FSE_CTable* fseCTable, + const U8* additionalBits, + short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, + void* workspace, size_t wkspSize) +{ + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; + const BYTE* const ctStart = ctp; + const BYTE* const ctEnd = ctStart + nbSeq; + size_t cSymbolTypeSizeEstimateInBits = 0; + unsigned max = maxCode; + + HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */ + if (type == set_basic) { + /* We selected this encoding type, so it must be valid. */ + assert(max <= defaultMax); + cSymbolTypeSizeEstimateInBits = max <= defaultMax + ? 
ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max) + : ERROR(GENERIC); + } else if (type == set_rle) { + cSymbolTypeSizeEstimateInBits = 0; + } else if (type == set_compressed || type == set_repeat) { + cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max); + } + if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) return nbSeq * 10; + while (ctp < ctEnd) { + if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp]; + else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */ + ctp++; + } + return cSymbolTypeSizeEstimateInBits / 8; +} + +static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + size_t const sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ + size_t cSeqSizeEstimate = 0; + if (nbSeq == 0) return sequencesSectionHeaderSize; + cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, MaxOff, + nbSeq, fseTables->offcodeCTable, NULL, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->llType, llCodeTable, MaxLL, + nbSeq, fseTables->litlengthCTable, LL_bits, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, MaxML, + nbSeq, fseTables->matchlengthCTable, ML_bits, + ML_defaultNorm, ML_defaultNormLog, MaxML, + workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; +} + +typedef struct { + size_t estLitSize; + size_t estBlockSize; +} EstimatedBlockSize; +static EstimatedBlockSize 
ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, + int writeLitEntropy, int writeSeqEntropy) +{ + EstimatedBlockSize ebs; + ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize, + &entropy->huf, &entropyMetadata->hufMetadata, + workspace, wkspSize, writeLitEntropy); + ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); + ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize; + return ebs; +} + +static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) +{ + if (fseMetadata->llType == set_compressed || fseMetadata->llType == set_rle) + return 1; + if (fseMetadata->mlType == set_compressed || fseMetadata->mlType == set_rle) + return 1; + if (fseMetadata->ofType == set_compressed || fseMetadata->ofType == set_rle) + return 1; + return 0; +} + +static size_t countLiterals(SeqStore_t const* seqStore, const SeqDef* sp, size_t seqCount) +{ + size_t n, total = 0; + assert(sp != NULL); + for (n=0; n %zu bytes", seqCount, (const void*)sp, total); + return total; +} + +#define BYTESCALE 256 + +static size_t sizeBlockSequences(const SeqDef* sp, size_t nbSeqs, + size_t targetBudget, size_t avgLitCost, size_t avgSeqCost, + int firstSubBlock) +{ + size_t n, budget = 0, inSize=0; + /* entropy headers */ + size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */ + assert(firstSubBlock==0 || firstSubBlock==1); + budget += headerSize; + + /* first sequence => at least one sequence*/ + budget += sp[0].litLength * avgLitCost + avgSeqCost; + if (budget > targetBudget) return 1; + inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH); 
+ + /* loop over sequences */ + for (n=1; n targetBudget) + /* though continue to expand until the sub-block is deemed compressible */ + && (budget < inSize * BYTESCALE) ) + break; + } + + return n; +} + +/** ZSTD_compressSubBlock_multi() : + * Breaks super-block into multiple sub-blocks and compresses them. + * Entropy will be written into the first block. + * The following blocks use repeat_mode to compress. + * Sub-blocks are all compressed, except the last one when beneficial. + * @return : compressed size of the super block (which features multiple ZSTD blocks) + * or 0 if it failed to compress. */ +static size_t ZSTD_compressSubBlock_multi(const SeqStore_t* seqStorePtr, + const ZSTD_compressedBlockState_t* prevCBlock, + ZSTD_compressedBlockState_t* nextCBlock, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const int bmi2, U32 lastBlock, + void* workspace, size_t wkspSize) +{ + const SeqDef* const sstart = seqStorePtr->sequencesStart; + const SeqDef* const send = seqStorePtr->sequences; + const SeqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */ + size_t const nbSeqs = (size_t)(send - sstart); + const BYTE* const lstart = seqStorePtr->litStart; + const BYTE* const lend = seqStorePtr->lit; + const BYTE* lp = lstart; + size_t const nbLiterals = (size_t)(lend - lstart); + BYTE const* ip = (BYTE const*)src; + BYTE const* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + const BYTE* llCodePtr = seqStorePtr->llCode; + const BYTE* mlCodePtr = seqStorePtr->mlCode; + const BYTE* ofCodePtr = seqStorePtr->ofCode; + size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */ + size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize); + int writeLitEntropy = 
(entropyMetadata->hufMetadata.hType == set_compressed); + int writeSeqEntropy = 1; + + DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)", + (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart)); + + /* let's start by a general estimation for the full block */ + if (nbSeqs > 0) { + EstimatedBlockSize const ebs = + ZSTD_estimateSubBlockSize(lp, nbLiterals, + ofCodePtr, llCodePtr, mlCodePtr, nbSeqs, + &nextCBlock->entropy, entropyMetadata, + workspace, wkspSize, + writeLitEntropy, writeSeqEntropy); + /* quick estimation */ + size_t const avgLitCost = nbLiterals ? (ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE; + size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs; + const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1); + size_t n, avgBlockBudget, blockBudgetSupp=0; + avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks; + DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes", + (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE, + (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE); + /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately + * this will result in the production of a single uncompressed block covering @srcSize.*/ + if (ebs.estBlockSize > srcSize) return 0; + + /* compress and write sub-blocks */ + assert(nbSubBlocks>0); + for (n=0; n < nbSubBlocks-1; n++) { + /* determine nb of sequences for current sub-block + nbLiterals from next sequence */ + size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp), + avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0); + /* if reached last sequence : break to last sub-block (simplification) */ + assert(seqCount <= (size_t)(send-sp)); + if (sp + 
seqCount == send) break; + assert(seqCount > 0); + /* compress sub-block */ + { int litEntropyWritten = 0; + int seqEntropyWritten = 0; + size_t litSize = countLiterals(seqStorePtr, sp, seqCount); + const size_t decompressedSize = + ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0); + size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, + sp, seqCount, + lp, litSize, + llCodePtr, mlCodePtr, ofCodePtr, + cctxParams, + op, (size_t)(oend-op), + bmi2, writeLitEntropy, writeSeqEntropy, + &litEntropyWritten, &seqEntropyWritten, + 0); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); + + /* check compressibility, update state components */ + if (cSize > 0 && cSize < decompressedSize) { + DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes", + (unsigned)decompressedSize, (unsigned)cSize); + assert(ip + decompressedSize <= iend); + ip += decompressedSize; + lp += litSize; + op += cSize; + llCodePtr += seqCount; + mlCodePtr += seqCount; + ofCodePtr += seqCount; + /* Entropy only needs to be written once */ + if (litEntropyWritten) { + writeLitEntropy = 0; + } + if (seqEntropyWritten) { + writeSeqEntropy = 0; + } + sp += seqCount; + blockBudgetSupp = 0; + } } + /* otherwise : do not compress yet, coalesce current sub-block with following one */ + } + } /* if (nbSeqs > 0) */ + + /* write last block */ + DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp)); + { int litEntropyWritten = 0; + int seqEntropyWritten = 0; + size_t litSize = (size_t)(lend - lp); + size_t seqCount = (size_t)(send - sp); + const size_t decompressedSize = + ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1); + size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, + sp, seqCount, + lp, litSize, + llCodePtr, mlCodePtr, ofCodePtr, + cctxParams, + op, (size_t)(oend-op), + bmi2, writeLitEntropy, writeSeqEntropy, + &litEntropyWritten, &seqEntropyWritten, + lastBlock); + 
FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); + + /* update pointers, the nb of literals borrowed from next sequence must be preserved */ + if (cSize > 0 && cSize < decompressedSize) { + DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes", + (unsigned)decompressedSize, (unsigned)cSize); + assert(ip + decompressedSize <= iend); + ip += decompressedSize; + lp += litSize; + op += cSize; + llCodePtr += seqCount; + mlCodePtr += seqCount; + ofCodePtr += seqCount; + /* Entropy only needs to be written once */ + if (litEntropyWritten) { + writeLitEntropy = 0; + } + if (seqEntropyWritten) { + writeSeqEntropy = 0; + } + sp += seqCount; + } + } + + + if (writeLitEntropy) { + DEBUGLOG(5, "Literal entropy tables were never written"); + ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); + } + if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { + /* If we haven't written our entropy tables, then we've violated our contract and + * must emit an uncompressed block. 
+ */ + DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block"); + return 0; + } + + if (ip < iend) { + /* some data left : last part of the block sent uncompressed */ + size_t const rSize = (size_t)((iend - ip)); + size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock); + DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize)); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + assert(cSize != 0); + op += cSize; + /* We have to regenerate the repcodes because we've skipped some sequences */ + if (sp < send) { + const SeqDef* seq; + Repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { + ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } + } + + DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u", + (unsigned)(op-ostart)); + return (size_t)(op-ostart); +} + +size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + unsigned lastBlock) +{ + ZSTD_entropyCTablesMetadata_t entropyMetadata; + + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + &entropyMetadata, + zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */), ""); + + return ZSTD_compressSubBlock_multi(&zc->seqStore, + zc->blockState.prevCBlock, + zc->blockState.nextCBlock, + &entropyMetadata, + &zc->appliedParams, + dst, dstCapacity, + src, srcSize, + zc->bmi2, lastBlock, + zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */); +} + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_compress_superblock.h b/vendor/github.com/DataDog/zstd/zstd_compress_superblock.h new 
file mode 100644 index 0000000..fdc4b16 --- /dev/null +++ b/vendor/github.com/DataDog/zstd/zstd_compress_superblock.h @@ -0,0 +1,35 @@ +#ifndef USE_EXTERNAL_ZSTD +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_COMPRESS_ADVANCED_H +#define ZSTD_COMPRESS_ADVANCED_H + +/*-************************************* +* Dependencies +***************************************/ + +#include "zstd.h" /* ZSTD_CCtx */ + +/*-************************************* +* Target Compressed Block Size +***************************************/ + +/* ZSTD_compressSuperBlock() : + * Used to compress a super block when targetCBlockSize is being used. + * The given block will be compressed into multiple sub blocks that are around targetCBlockSize. */ +size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + void const* src, size_t srcSize, + unsigned lastBlock); + +#endif /* ZSTD_COMPRESS_ADVANCED_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_ctx.go b/vendor/github.com/DataDog/zstd/zstd_ctx.go new file mode 100644 index 0000000..c4a0889 --- /dev/null +++ b/vendor/github.com/DataDog/zstd/zstd_ctx.go @@ -0,0 +1,144 @@ +package zstd + +/* +#include "zstd.h" +*/ +import "C" +import ( + "bytes" + "io/ioutil" + "runtime" + "unsafe" +) + +type Ctx interface { + // Compress src into dst. If you have a buffer to use, you can pass it to + // prevent allocation. If it is too small, or if nil is passed, a new buffer + // will be allocated and returned. 
+ Compress(dst, src []byte) ([]byte, error) + + // CompressLevel is the same as Compress but you can pass a compression level + CompressLevel(dst, src []byte, level int) ([]byte, error) + + // Decompress src into dst. If you have a buffer to use, you can pass it to + // prevent allocation. If it is too small, or if nil is passed, a new buffer + // will be allocated and returned. + Decompress(dst, src []byte) ([]byte, error) + + // DecompressInto decompresses src into dst. Unlike Decompress, DecompressInto + // requires that dst be sufficiently large to hold the decompressed payload. + // DecompressInto may be used when the caller knows the size of the decompressed + // payload before attempting decompression. + // + // It returns the number of bytes copied and an error if any is encountered. If + // dst is too small, DecompressInto errors. + DecompressInto(dst, src []byte) (int, error) +} + +type ctx struct { + cctx *C.ZSTD_CCtx + dctx *C.ZSTD_DCtx +} + +// Create a new ZStd Context. +// When compressing/decompressing many times, it is recommended to allocate a +// context just once, and re-use it for each successive compression operation. +// This will make workload friendlier for system's memory. +// Note : re-using context is just a speed / resource optimization. +// It doesn't change the compression ratio, which remains identical. +// Note 2 : In multi-threaded environments, +// use one different context per thread for parallel execution. 
+// +func NewCtx() Ctx { + c := &ctx{ + cctx: C.ZSTD_createCCtx(), + dctx: C.ZSTD_createDCtx(), + } + + runtime.SetFinalizer(c, finalizeCtx) + return c +} + +func (c *ctx) Compress(dst, src []byte) ([]byte, error) { + return c.CompressLevel(dst, src, DefaultCompression) +} + +func (c *ctx) CompressLevel(dst, src []byte, level int) ([]byte, error) { + bound := CompressBound(len(src)) + if cap(dst) >= bound { + dst = dst[0:bound] // Reuse dst buffer + } else { + dst = make([]byte, bound) + } + + // We need unsafe.Pointer(&src[0]) in the Cgo call to avoid "Go pointer to Go pointer" panics. + // This means we need to special case empty input. See: + // https://github.com/golang/go/issues/14210#issuecomment-346402945 + var cWritten C.size_t + if len(src) == 0 { + cWritten = C.ZSTD_compressCCtx( + c.cctx, + unsafe.Pointer(&dst[0]), + C.size_t(len(dst)), + unsafe.Pointer(nil), + C.size_t(0), + C.int(level)) + } else { + cWritten = C.ZSTD_compressCCtx( + c.cctx, + unsafe.Pointer(&dst[0]), + C.size_t(len(dst)), + unsafe.Pointer(&src[0]), + C.size_t(len(src)), + C.int(level)) + } + + written := int(cWritten) + // Check if the return is an Error code + if err := getError(written); err != nil { + return nil, err + } + return dst[:written], nil +} + +func (c *ctx) Decompress(dst, src []byte) ([]byte, error) { + if len(src) == 0 { + return []byte{}, ErrEmptySlice + } + + bound := decompressSizeHint(src) + if cap(dst) >= bound { + dst = dst[0:cap(dst)] + } else { + dst = make([]byte, bound) + } + + written, err := c.DecompressInto(dst, src) + if err == nil { + return dst[:written], nil + } + if !IsDstSizeTooSmallError(err) { + return nil, err + } + + // We failed getting a dst buffer of correct size, use stream API + r := NewReader(bytes.NewReader(src)) + defer r.Close() + return ioutil.ReadAll(r) +} + +func (c *ctx) DecompressInto(dst, src []byte) (int, error) { + written := int(C.ZSTD_decompressDCtx( + c.dctx, + unsafe.Pointer(&dst[0]), + C.size_t(len(dst)), + 
unsafe.Pointer(&src[0]), + C.size_t(len(src)))) + err := getError(written) + return written, err +} + +func finalizeCtx(c *ctx) { + C.ZSTD_freeCCtx(c.cctx) + C.ZSTD_freeDCtx(c.dctx) +} diff --git a/vendor/github.com/DataDog/zstd/zstd_cwksp.h b/vendor/github.com/DataDog/zstd/zstd_cwksp.h index fc9765b..e32ce26 100644 --- a/vendor/github.com/DataDog/zstd/zstd_cwksp.h +++ b/vendor/github.com/DataDog/zstd/zstd_cwksp.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -14,26 +15,15 @@ /*-************************************* * Dependencies ***************************************/ +#include "allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ #include "zstd_internal.h" - -#if defined (__cplusplus) -extern "C" { -#endif +#include "portability_macros.h" +#include "compiler.h" /* ZS2_isPower2 */ /*-************************************* * Constants ***************************************/ -/* define "workspace is too large" as this number of times larger than needed */ -#define ZSTD_WORKSPACETOOLARGE_FACTOR 3 - -/* when workspace is continuously too large - * during at least this number of times, - * context's memory usage is considered wasteful, - * because it's sized to handle a worst case scenario which rarely happens. 
- * In which case, resize it down to free some memory */ -#define ZSTD_WORKSPACETOOLARGE_MAXDURATION 128 - /* Since the workspace is effectively its own little malloc implementation / * arena, when we run under ASAN, we should similarly insert redzones between * each internal element of the workspace, so ASAN will catch overruns that @@ -45,15 +35,30 @@ extern "C" { #define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128 #endif + +/* Set our tables and aligneds to align by 64 bytes */ +#define ZSTD_CWKSP_ALIGNMENT_BYTES 64 + /*-************************************* * Structures ***************************************/ typedef enum { ZSTD_cwksp_alloc_objects, - ZSTD_cwksp_alloc_buffers, - ZSTD_cwksp_alloc_aligned + ZSTD_cwksp_alloc_aligned_init_once, + ZSTD_cwksp_alloc_aligned, + ZSTD_cwksp_alloc_buffers } ZSTD_cwksp_alloc_phase_e; +/** + * Used to describe whether the workspace is statically allocated (and will not + * necessarily ever be freed), or if it's dynamically allocated and we can + * expect a well-formed caller to free this. + */ +typedef enum { + ZSTD_cwksp_dynamic_alloc, + ZSTD_cwksp_static_alloc +} ZSTD_cwksp_static_alloc_e; + /** * Zstd fits all its internal datastructures into a single continuous buffer, * so that it only needs to perform a single OS allocation (or so that a buffer @@ -94,15 +99,15 @@ typedef enum { * * Workspace Layout: * - * [ ... workspace ... ] - * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] + * [ ... workspace ... ] + * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] * * The various objects that live in the workspace are divided into the * following categories, and are allocated separately: * * - Static objects: this is optionally the enclosing ZSTD_CCtx or ZSTD_CDict, * so that literally everything fits in a single buffer. 
Note: if present, - * this must be the first object in the workspace, since ZSTD_free{CCtx, + * this must be the first object in the workspace, since ZSTD_customFree{CCtx, * CDict}() rely on a pointer comparison to see whether one or two frees are * required. * @@ -117,10 +122,20 @@ typedef enum { * - Tables: these are any of several different datastructures (hash tables, * chain tables, binary trees) that all respect a common format: they are * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). - * Their sizes depend on the cparams. + * Their sizes depend on the cparams. These tables are 64-byte aligned. * - * - Aligned: these buffers are used for various purposes that require 4 byte - * alignment, but don't require any initialization before they're used. + * - Init once: these buffers require to be initialized at least once before + * use. They should be used when we want to skip memory initialization + * while not triggering memory checkers (like Valgrind) when reading from + * from this memory without writing to it first. + * These buffers should be used carefully as they might contain data + * from previous compressions. + * Buffers are aligned to 64 bytes. + * + * - Aligned: these buffers don't require any initialization before they're + * used. The user of the buffer should make sure they write into a buffer + * location before reading from it. + * Buffers are aligned to 64 bytes. * * - Buffers: these buffers are used for various purposes that don't require * any alignment or initialization before they're used. This means they can @@ -132,9 +147,9 @@ typedef enum { * correctly packed into the workspace buffer. That order is: * * 1. Objects - * 2. Buffers - * 3. Aligned - * 4. Tables + * 2. Init once / Tables + * 3. Aligned / Tables + * 4. Buffers / Tables * * Attempts to reserve objects of different types out of order will fail. 
*/ @@ -146,10 +161,12 @@ typedef struct { void* tableEnd; void* tableValidEnd; void* allocStart; + void* initOnceStart; - int allocFailed; + BYTE allocFailed; int workspaceOversizedDuration; ZSTD_cwksp_alloc_phase_e phase; + ZSTD_cwksp_static_alloc_e isStatic; } ZSTD_cwksp; /*-************************************* @@ -157,6 +174,7 @@ typedef struct { ***************************************/ MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); +MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { (void)ws; @@ -166,14 +184,29 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { assert(ws->tableEnd <= ws->allocStart); assert(ws->tableValidEnd <= ws->allocStart); assert(ws->allocStart <= ws->workspaceEnd); + assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); + assert(ws->workspace <= ws->initOnceStart); +#if ZSTD_MEMORY_SANITIZER + { + intptr_t const offset = __msan_test_shadow(ws->initOnceStart, + (U8*)ZSTD_cwksp_initialAllocStart(ws) - (U8*)ws->initOnceStart); + (void)offset; +#if defined(ZSTD_MSAN_PRINT) + if(offset!=-1) { + __msan_print_shadow((U8*)ws->initOnceStart + offset - 8, 32); + } +#endif + assert(offset==-1); + }; +#endif } /** * Align must be a power of 2. */ -MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { +MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t align) { size_t const mask = align - 1; - assert((align & mask) == 0); + assert(ZSTD_isPower2(align)); return (size + mask) & ~mask; } @@ -186,64 +219,81 @@ MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { * Since tables aren't currently redzoned, you don't need to call through this * to figure out how much space you need for the matchState tables. Everything * else is though. + * + * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned64_alloc_size(). 
*/ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { -#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + if (size == 0) + return 0; +#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) return size + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE; #else return size; #endif } -MEM_STATIC void ZSTD_cwksp_internal_advance_phase( - ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) { - assert(phase >= ws->phase); - if (phase > ws->phase) { - if (ws->phase < ZSTD_cwksp_alloc_buffers && - phase >= ZSTD_cwksp_alloc_buffers) { - ws->tableValidEnd = ws->objectEnd; - } - if (ws->phase < ZSTD_cwksp_alloc_aligned && - phase >= ZSTD_cwksp_alloc_aligned) { - /* If unaligned allocations down from a too-large top have left us - * unaligned, we need to realign our alloc ptr. Technically, this - * can consume space that is unaccounted for in the neededSpace - * calculation. However, I believe this can only happen when the - * workspace is too large, and specifically when it is too large - * by a larger margin than the space that will be consumed. */ - /* TODO: cleaner, compiler warning friendly way to do this??? */ - ws->allocStart = (BYTE*)ws->allocStart - ((size_t)ws->allocStart & (sizeof(U32)-1)); - if (ws->allocStart < ws->tableValidEnd) { - ws->tableValidEnd = ws->allocStart; - } - } - ws->phase = phase; - } +MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size, size_t alignment) { + return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, alignment)); } /** - * Returns whether this object/buffer/etc was allocated in this workspace. + * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes. + * Used to determine the number of bytes required for a given "aligned". 
*/ -MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) { - return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); +MEM_STATIC size_t ZSTD_cwksp_aligned64_alloc_size(size_t size) { + return ZSTD_cwksp_aligned_alloc_size(size, ZSTD_CWKSP_ALIGNMENT_BYTES); } /** - * Internal function. Do not use directly. + * Returns the amount of additional space the cwksp must allocate + * for internal purposes (currently only alignment). */ -MEM_STATIC void* ZSTD_cwksp_reserve_internal( - ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) { - void* alloc; - void* bottom = ws->tableEnd; - ZSTD_cwksp_internal_advance_phase(ws, phase); - alloc = (BYTE *)ws->allocStart - bytes; +MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { + /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES + * bytes to align the beginning of tables section and end of buffers; + */ + size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; +} -#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) - /* over-reserve space */ - alloc = (BYTE *)alloc - 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE; -#endif - DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", +/** + * Return the number of additional bytes required to align a pointer to the given number of bytes. + * alignBytes must be a power of two. + */ +MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) { + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; + assert(ZSTD_isPower2(alignBytes)); + assert(bytes < alignBytes); + return bytes; +} + +/** + * Returns the initial value for allocStart which is used to determine the position from + * which we can allocate from the end of the workspace. 
+ */ +MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) +{ + char* endPtr = (char*)ws->workspaceEnd; + assert(ZSTD_isPower2(ZSTD_CWKSP_ALIGNMENT_BYTES)); + endPtr = endPtr - ((size_t)endPtr % ZSTD_CWKSP_ALIGNMENT_BYTES); + return (void*)endPtr; +} + +/** + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, + * which counts from the end of the wksp (as opposed to the object/table segment). + * + * Returns a pointer to the beginning of that space. + */ +MEM_STATIC void* +ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes) +{ + void* const alloc = (BYTE*)ws->allocStart - bytes; + void* const bottom = ws->tableEnd; + DEBUGLOG(5, "cwksp: reserving [0x%p]:%zd bytes; %zd bytes remaining", alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); ZSTD_cwksp_assert_internal_consistency(ws); assert(alloc >= bottom); @@ -252,16 +302,88 @@ MEM_STATIC void* ZSTD_cwksp_reserve_internal( ws->allocFailed = 1; return NULL; } + /* the area is reserved from the end of wksp. + * If it overlaps with tableValidEnd, it voids guarantees on values' range */ if (alloc < ws->tableValidEnd) { ws->tableValidEnd = alloc; } ws->allocStart = alloc; + return alloc; +} + +/** + * Moves the cwksp to the next phase, and does any necessary allocations. + * cwksp initialization must necessarily go through each phase in order. + * Returns a 0 on success, or zstd error + */ +MEM_STATIC size_t +ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) +{ + assert(phase >= ws->phase); + if (phase > ws->phase) { + /* Going from allocating objects to allocating initOnce / tables */ + if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && + phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; + ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + + { /* Align the start of the tables to 64 bytes. 
Use [0, 63] bytes */ + void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); + void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); + ws->objectEnd = objectEnd; + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; + } + } + } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } + return 0; +} -#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) +/** + * Returns whether this object/buffer/etc was allocated in this workspace. + */ +MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) +{ + return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); +} + +/** + * Internal function. Do not use directly. + */ +MEM_STATIC void* +ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) +{ + void* alloc; + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase)) || bytes == 0) { + return NULL; + } + +#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + /* over-reserve space */ + bytes += 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE; +#endif + + alloc = ZSTD_cwksp_reserve_internal_buffer_space(ws, bytes); + +#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on * either size. 
*/ - alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE; - __asan_unpoison_memory_region(alloc, bytes); + if (alloc) { + alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE; + if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) { + /* We need to keep the redzone poisoned while unpoisoning the bytes that + * are actually allocated. */ + __asan_unpoison_memory_region(alloc, bytes - 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE); + } + } #endif return alloc; @@ -270,33 +392,79 @@ MEM_STATIC void* ZSTD_cwksp_reserve_internal( /** * Reserves and returns unaligned memory. */ -MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) { +MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) +{ return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); } /** - * Reserves and returns memory sized on and aligned on sizeof(unsigned). + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). + * This memory has been initialized at least once in the past. + * This doesn't mean it has been initialized this time, and it might contain data from previous + * operations. + * The main usage is for algorithms that might need read access into uninitialized memory. + * The algorithm must maintain safety under these conditions and must make sure it doesn't + * leak any of the past data (directly or in side channels). 
*/ -MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) { - assert((bytes & (sizeof(U32)-1)) == 0); - return ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, sizeof(U32)), ZSTD_cwksp_alloc_aligned); +MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) +{ + size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); + void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); + assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + if(ptr && ptr < ws->initOnceStart) { + /* We assume the memory following the current allocation is either: + * 1. Not usable as initOnce memory (end of workspace) + * 2. Another initOnce buffer that has been allocated before (and so was previously memset) + * 3. An ASAN redzone, in which case we don't want to write on it + * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. + * Note that we assume here that MSAN and ASAN cannot run in the same time. */ + ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); + ws->initOnceStart = ptr; + } +#if ZSTD_MEMORY_SANITIZER + assert(__msan_test_shadow(ptr, bytes) == -1); +#endif + return ptr; +} + +/** + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). + */ +MEM_STATIC void* ZSTD_cwksp_reserve_aligned64(ZSTD_cwksp* ws, size_t bytes) +{ + void* const ptr = ZSTD_cwksp_reserve_internal(ws, + ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), + ZSTD_cwksp_alloc_aligned); + assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + return ptr; } /** - * Aligned on sizeof(unsigned). These buffers have the special property that - * their values remain constrained, allowing us to re-use them without + * Aligned on 64 bytes. These buffers have the special property that + * their values remain constrained, allowing us to reuse them without * memset()-ing them. 
*/ -MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { - const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; - void* alloc = ws->tableEnd; - void* end = (BYTE *)alloc + bytes; - void* top = ws->allocStart; +MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) +{ + const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + + /* We can only start allocating tables after we are done reserving space for objects at the + * start of the workspace */ + if(ws->phase < phase) { + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { + return NULL; + } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; + top = ws->allocStart; DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining", alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); assert((bytes & (sizeof(U32)-1)) == 0); - ZSTD_cwksp_internal_advance_phase(ws, phase); ZSTD_cwksp_assert_internal_consistency(ws); assert(end <= top); if (end > top) { @@ -306,35 +474,41 @@ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { } ws->tableEnd = end; -#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) - __asan_unpoison_memory_region(alloc, bytes); +#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) { + __asan_unpoison_memory_region(alloc, bytes); + } #endif + assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); return alloc; } /** * Aligned on sizeof(void*). 
+ * Note : should happen only once, at workspace first initialization */ -MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) { - size_t roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*)); +MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) +{ + size_t const roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*)); void* alloc = ws->objectEnd; void* end = (BYTE*)alloc + roundedBytes; -#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) +#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) /* over-reserve space */ end = (BYTE *)end + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE; #endif - DEBUGLOG(5, + DEBUGLOG(4, "cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes remaining", alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - roundedBytes); - assert(((size_t)alloc & (sizeof(void*)-1)) == 0); - assert((bytes & (sizeof(void*)-1)) == 0); + assert((size_t)alloc % ZSTD_ALIGNOF(void*) == 0); + assert(bytes % ZSTD_ALIGNOF(void*) == 0); ZSTD_cwksp_assert_internal_consistency(ws); /* we must be in the first phase, no advance is possible */ if (ws->phase != ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd) { - DEBUGLOG(4, "cwksp: object alloc failed!"); + DEBUGLOG(3, "cwksp: object alloc failed!"); ws->allocFailed = 1; return NULL; } @@ -342,27 +516,52 @@ MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) { ws->tableEnd = end; ws->tableValidEnd = end; -#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) +#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on * either size. 
*/ - alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE; - __asan_unpoison_memory_region(alloc, bytes); + alloc = (BYTE*)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE; + if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) { + __asan_unpoison_memory_region(alloc, bytes); + } #endif return alloc; } +/** + * with alignment control + * Note : should happen only once, at workspace first initialization + */ +MEM_STATIC void* ZSTD_cwksp_reserve_object_aligned(ZSTD_cwksp* ws, size_t byteSize, size_t alignment) +{ + size_t const mask = alignment - 1; + size_t const surplus = (alignment > sizeof(void*)) ? alignment - sizeof(void*) : 0; + void* const start = ZSTD_cwksp_reserve_object(ws, byteSize + surplus); + if (start == NULL) return NULL; + if (surplus == 0) return start; + assert(ZSTD_isPower2(alignment)); + return (void*)(((size_t)start + surplus) & ~mask); +} -MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) { +MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) +{ DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty"); -#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) - /* To validate that the table re-use logic is sound, and that we don't +#if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) + /* To validate that the table reuse logic is sound, and that we don't * access table space that we haven't cleaned, we re-"poison" the table - * space every time we mark it dirty. */ + * space every time we mark it dirty. + * Since tableValidEnd space and initOnce space may overlap we don't poison + * the initOnce portion as it break its promise. This means that this poisoning + * check isn't always applied fully. 
*/ { size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd; assert(__msan_test_shadow(ws->objectEnd, size) == -1); - __msan_poison(ws->objectEnd, size); + if((BYTE*)ws->tableValidEnd < (BYTE*)ws->initOnceStart) { + __msan_poison(ws->objectEnd, size); + } else { + assert(ws->initOnceStart >= ws->objectEnd); + __msan_poison(ws->objectEnd, (BYTE*)ws->initOnceStart - (BYTE*)ws->objectEnd); + } } #endif @@ -390,7 +589,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { assert(ws->tableValidEnd >= ws->objectEnd); assert(ws->tableValidEnd <= ws->allocStart); if (ws->tableValidEnd < ws->tableEnd) { - memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); + ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); } ZSTD_cwksp_mark_tables_clean(ws); } @@ -399,11 +598,16 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { * Invalidates table allocations. * All other allocations remain valid. */ -MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) { +MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) +{ DEBUGLOG(4, "cwksp: clearing tables!"); -#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) - { +#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + /* We don't do this when the workspace is statically allocated, because + * when that is the case, we have no capability to hook into the end of the + * workspace's lifecycle to unpoison the memory. 
+ */ + if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) { size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd; __asan_poison_memory_region(ws->objectEnd, size); } @@ -420,77 +624,96 @@ MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) { MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { DEBUGLOG(4, "cwksp: clearing!"); -#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) - /* To validate that the context re-use logic is sound, and that we don't +#if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) + /* To validate that the context reuse logic is sound, and that we don't * access stuff that this compression hasn't initialized, we re-"poison" - * the workspace (or at least the non-static, non-table parts of it) - * every time we start a new compression. */ + * the workspace except for the areas in which we expect memory reuse + * without initialization (objects, valid tables area and init once + * memory). */ { - size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->tableValidEnd; - __msan_poison(ws->tableValidEnd, size); + if((BYTE*)ws->tableValidEnd < (BYTE*)ws->initOnceStart) { + size_t size = (BYTE*)ws->initOnceStart - (BYTE*)ws->tableValidEnd; + __msan_poison(ws->tableValidEnd, size); + } } #endif -#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) - { +#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + /* We don't do this when the workspace is statically allocated, because + * when that is the case, we have no capability to hook into the end of the + * workspace's lifecycle to unpoison the memory. 
+ */ + if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) { size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->objectEnd; __asan_poison_memory_region(ws->objectEnd, size); } #endif ws->tableEnd = ws->objectEnd; - ws->allocStart = ws->workspaceEnd; + ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); ws->allocFailed = 0; - if (ws->phase > ZSTD_cwksp_alloc_buffers) { - ws->phase = ZSTD_cwksp_alloc_buffers; + if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { + ws->phase = ZSTD_cwksp_alloc_aligned_init_once; } ZSTD_cwksp_assert_internal_consistency(ws); } +MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { + return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); +} + +MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { + return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) + + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); +} + /** * The provided workspace takes ownership of the buffer [start, start+size). * Any existing values in the workspace are ignored (the previously managed * buffer, if present, must be separately freed). 
*/ -MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size) { +MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_cwksp_static_alloc_e isStatic) { DEBUGLOG(4, "cwksp: init'ing workspace with %zd bytes", size); assert(((size_t)start & (sizeof(void*)-1)) == 0); /* ensure correct alignment */ ws->workspace = start; ws->workspaceEnd = (BYTE*)start + size; ws->objectEnd = ws->workspace; ws->tableValidEnd = ws->objectEnd; + ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; ZSTD_cwksp_clear(ws); ws->workspaceOversizedDuration = 0; ZSTD_cwksp_assert_internal_consistency(ws); } MEM_STATIC size_t ZSTD_cwksp_create(ZSTD_cwksp* ws, size_t size, ZSTD_customMem customMem) { - void* workspace = ZSTD_malloc(size, customMem); + void* workspace = ZSTD_customMalloc(size, customMem); DEBUGLOG(4, "cwksp: creating new workspace with %zd bytes", size); - RETURN_ERROR_IF(workspace == NULL, memory_allocation); - ZSTD_cwksp_init(ws, workspace, size); + RETURN_ERROR_IF(workspace == NULL, memory_allocation, "NULL pointer!"); + ZSTD_cwksp_init(ws, workspace, size, ZSTD_cwksp_dynamic_alloc); return 0; } MEM_STATIC void ZSTD_cwksp_free(ZSTD_cwksp* ws, ZSTD_customMem customMem) { void *ptr = ws->workspace; DEBUGLOG(4, "cwksp: freeing workspace"); - memset(ws, 0, sizeof(ZSTD_cwksp)); - ZSTD_free(ptr, customMem); +#if ZSTD_MEMORY_SANITIZER && !defined(ZSTD_MSAN_DONT_POISON_WORKSPACE) + if (ptr != NULL && customMem.customFree != NULL) { + __msan_unpoison(ptr, ZSTD_cwksp_sizeof(ws)); + } +#endif + ZSTD_memset(ws, 0, sizeof(ZSTD_cwksp)); + ZSTD_customFree(ptr, customMem); } /** * Moves the management of a workspace from one cwksp to another. The src cwksp - * is left in an invalid state (src must be re-init()'ed before its used again). + * is left in an invalid state (src must be re-init()'ed before it's used again). 
*/ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) { *dst = *src; - memset(src, 0, sizeof(ZSTD_cwksp)); -} - -MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { - return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); + ZSTD_memset(src, 0, sizeof(ZSTD_cwksp)); } MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { @@ -501,6 +724,18 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { * Functions Checking Free Space ***************************************/ +/* ZSTD_alignmentSpaceWithinBounds() : + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { + /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice + * the alignment bytes difference between estimation and actual usage */ + return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && + ZSTD_cwksp_used(ws) <= estimatedSpace; +} + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) { return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd); } @@ -528,8 +763,6 @@ MEM_STATIC void ZSTD_cwksp_bump_oversized_duration( } } -#if defined (__cplusplus) -} -#endif - #endif /* ZSTD_CWKSP_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_ddict.c b/vendor/github.com/DataDog/zstd/zstd_ddict.c index 0af3d23..c77a9df 100644 --- a/vendor/github.com/DataDog/zstd/zstd_ddict.c +++ b/vendor/github.com/DataDog/zstd/zstd_ddict.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -14,18 +15,18 @@ /*-******************************************************* * Dependencies *********************************************************/ -#include /* memcpy, memmove, memset */ +#include "allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ +#include "zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ #include "cpu.h" /* bmi2 */ #include "mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "fse.h" -#define HUF_STATIC_LINKING_ONLY #include "huf.h" #include "zstd_decompress_internal.h" #include "zstd_ddict.h" #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) -# include "zstd_legacy.h" +#include "zstd_legacy.h" #endif @@ -65,6 +66,10 @@ void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) dctx->virtualStart = ddict->dictContent; dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize; dctx->previousDstEnd = dctx->dictEnd; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + dctx->dictContentBeginForFuzzing = dctx->prefixStart; + dctx->dictContentEndForFuzzing = dctx->previousDstEnd; +#endif if (ddict->entropyPresent) { dctx->litEntropy = 1; dctx->fseEntropy = 1; @@ -107,7 +112,7 @@ ZSTD_loadEntropy_intoDDict(ZSTD_DDict* ddict, /* load entropy tables */ RETURN_ERROR_IF(ZSTD_isError(ZSTD_loadDEntropy( &ddict->entropy, ddict->dictContent, ddict->dictSize)), - dictionary_corrupted); + dictionary_corrupted, ""); ddict->entropyPresent = 1; return 0; } @@ -123,17 +128,17 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, ddict->dictContent = dict; if (!dict) dictSize = 0; } else { - void* const internalBuffer = ZSTD_malloc(dictSize, ddict->cMem); + void* const internalBuffer = ZSTD_customMalloc(dictSize, ddict->cMem); ddict->dictBuffer = internalBuffer; ddict->dictContent = internalBuffer; if (!internalBuffer) return ERROR(memory_allocation); - memcpy(internalBuffer, dict, dictSize); + 
ZSTD_memcpy(internalBuffer, dict, dictSize); } ddict->dictSize = dictSize; - ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ + ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ /* parse dictionary content */ - FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) ); + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); return 0; } @@ -143,9 +148,9 @@ ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, ZSTD_customMem customMem) { - if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; - { ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem); + { ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_customMalloc(sizeof(ZSTD_DDict), customMem); if (ddict == NULL) return NULL; ddict->cMem = customMem; { size_t const initResult = ZSTD_initDDict_internal(ddict, @@ -194,7 +199,7 @@ const ZSTD_DDict* ZSTD_initStaticDDict( if ((size_t)sBuffer & 7) return NULL; /* 8-aligned */ if (sBufferSize < neededSpace) return NULL; if (dictLoadMethod == ZSTD_dlm_byCopy) { - memcpy(ddict+1, dict, dictSize); /* local copy */ + ZSTD_memcpy(ddict+1, dict, dictSize); /* local copy */ dict = ddict+1; } if (ZSTD_isError( ZSTD_initDDict_internal(ddict, @@ -209,8 +214,8 @@ size_t ZSTD_freeDDict(ZSTD_DDict* ddict) { if (ddict==NULL) return 0; /* support free on NULL */ { ZSTD_customMem const cMem = ddict->cMem; - ZSTD_free(ddict->dictBuffer, cMem); - ZSTD_free(ddict, cMem); + ZSTD_customFree(ddict->dictBuffer, cMem); + ZSTD_customFree(ddict, cMem); return 0; } } @@ -236,5 +241,7 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) { if (ddict==NULL) return 0; - return ZSTD_getDictID_fromDict(ddict->dictContent, 
ddict->dictSize); + return ddict->dictID; } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_ddict.h b/vendor/github.com/DataDog/zstd/zstd_ddict.h index 0479d11..97e432e 100644 --- a/vendor/github.com/DataDog/zstd/zstd_ddict.h +++ b/vendor/github.com/DataDog/zstd/zstd_ddict.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -15,7 +16,7 @@ /*-******************************************************* * Dependencies *********************************************************/ -#include /* size_t */ +#include "zstd_deps.h" /* size_t */ #include "zstd.h" /* ZSTD_DDict, and several public functions */ @@ -42,3 +43,5 @@ void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); #endif /* ZSTD_DDICT_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_decompress.c b/vendor/github.com/DataDog/zstd/zstd_decompress.c index dd4591b..8b96357 100644 --- a/vendor/github.com/DataDog/zstd/zstd_decompress.c +++ b/vendor/github.com/DataDog/zstd/zstd_decompress.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -55,23 +56,166 @@ /*-******************************************************* * Dependencies *********************************************************/ -#include /* memcpy, memmove, memset */ -#include "cpu.h" /* bmi2 */ +#include "zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ +#include "allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ +#include "error_private.h" +#include "zstd_internal.h" /* blockProperties_t */ #include "mem.h" /* low level memory routines */ +#include "bits.h" /* ZSTD_highbit32 */ #define FSE_STATIC_LINKING_ONLY #include "fse.h" -#define HUF_STATIC_LINKING_ONLY #include "huf.h" -#include "zstd_internal.h" /* blockProperties_t */ +#include "xxhash.h" /* XXH64_reset, XXH64_update, XXH64_digest, XXH64 */ #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) -# include "zstd_legacy.h" +#include "zstd_legacy.h" #endif + +/************************************* + * Multiple DDicts Hashset internals * + *************************************/ + +#define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. + * Currently, that means a 0.75 load factor. + * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded + * the load factor of the ddict hash set. 
+ */ + +#define DDICT_HASHSET_TABLE_BASE_SIZE 64 +#define DDICT_HASHSET_RESIZE_FACTOR 2 + +/* Hash function to determine starting position of dict insertion within the table + * Returns an index between [0, hashSet->ddictPtrTableSize] + */ +static size_t ZSTD_DDictHashSet_getIndex(const ZSTD_DDictHashSet* hashSet, U32 dictID) { + const U64 hash = XXH64(&dictID, sizeof(U32), 0); + /* DDict ptr table size is a multiple of 2, use size - 1 as mask to get index within [0, hashSet->ddictPtrTableSize) */ + return hash & (hashSet->ddictPtrTableSize - 1); +} + +/* Adds DDict to a hashset without resizing it. + * If inserting a DDict with a dictID that already exists in the set, replaces the one in the set. + * Returns 0 if successful, or a zstd error code if something went wrong. + */ +static size_t ZSTD_DDictHashSet_emplaceDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict) { + const U32 dictID = ZSTD_getDictID_fromDDict(ddict); + size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID); + const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1; + RETURN_ERROR_IF(hashSet->ddictPtrCount == hashSet->ddictPtrTableSize, GENERIC, "Hash set is full!"); + DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx); + while (hashSet->ddictPtrTable[idx] != NULL) { + /* Replace existing ddict if inserting ddict with same dictID */ + if (ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]) == dictID) { + DEBUGLOG(4, "DictID already exists, replacing rather than adding"); + hashSet->ddictPtrTable[idx] = ddict; + return 0; + } + idx &= idxRangeMask; + idx++; + } + DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx); + hashSet->ddictPtrTable[idx] = ddict; + hashSet->ddictPtrCount++; + return 0; +} + +/* Expands hash table by factor of DDICT_HASHSET_RESIZE_FACTOR and + * rehashes all values, allocates new table, frees old table. + * Returns 0 on success, otherwise a zstd error code. 
+ */ +static size_t ZSTD_DDictHashSet_expand(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) { + size_t newTableSize = hashSet->ddictPtrTableSize * DDICT_HASHSET_RESIZE_FACTOR; + const ZSTD_DDict** newTable = (const ZSTD_DDict**)ZSTD_customCalloc(sizeof(ZSTD_DDict*) * newTableSize, customMem); + const ZSTD_DDict** oldTable = hashSet->ddictPtrTable; + size_t oldTableSize = hashSet->ddictPtrTableSize; + size_t i; + + DEBUGLOG(4, "Expanding DDict hash table! Old size: %zu new size: %zu", oldTableSize, newTableSize); + RETURN_ERROR_IF(!newTable, memory_allocation, "Expanded hashset allocation failed!"); + hashSet->ddictPtrTable = newTable; + hashSet->ddictPtrTableSize = newTableSize; + hashSet->ddictPtrCount = 0; + for (i = 0; i < oldTableSize; ++i) { + if (oldTable[i] != NULL) { + FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, oldTable[i]), ""); + } + } + ZSTD_customFree((void*)oldTable, customMem); + DEBUGLOG(4, "Finished re-hash"); + return 0; +} + +/* Fetches a DDict with the given dictID + * Returns the ZSTD_DDict* with the requested dictID. If it doesn't exist, then returns NULL. + */ +static const ZSTD_DDict* ZSTD_DDictHashSet_getDDict(ZSTD_DDictHashSet* hashSet, U32 dictID) { + size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID); + const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1; + DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx); + for (;;) { + size_t currDictID = ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]); + if (currDictID == dictID || currDictID == 0) { + /* currDictID == 0 implies a NULL ddict entry */ + break; + } else { + idx &= idxRangeMask; /* Goes to start of table when we reach the end */ + idx++; + } + } + DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx); + return hashSet->ddictPtrTable[idx]; +} + +/* Allocates space for and returns a ddict hash set + * The hash set's ZSTD_DDict* table has all values automatically set to NULL to begin with. 
+ * Returns NULL if allocation failed. + */ +static ZSTD_DDictHashSet* ZSTD_createDDictHashSet(ZSTD_customMem customMem) { + ZSTD_DDictHashSet* ret = (ZSTD_DDictHashSet*)ZSTD_customMalloc(sizeof(ZSTD_DDictHashSet), customMem); + DEBUGLOG(4, "Allocating new hash set"); + if (!ret) + return NULL; + ret->ddictPtrTable = (const ZSTD_DDict**)ZSTD_customCalloc(DDICT_HASHSET_TABLE_BASE_SIZE * sizeof(ZSTD_DDict*), customMem); + if (!ret->ddictPtrTable) { + ZSTD_customFree(ret, customMem); + return NULL; + } + ret->ddictPtrTableSize = DDICT_HASHSET_TABLE_BASE_SIZE; + ret->ddictPtrCount = 0; + return ret; +} + +/* Frees the table of ZSTD_DDict* within a hashset, then frees the hashset itself. + * Note: The ZSTD_DDict* within the table are NOT freed. + */ +static void ZSTD_freeDDictHashSet(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) { + DEBUGLOG(4, "Freeing ddict hash set"); + if (hashSet && hashSet->ddictPtrTable) { + ZSTD_customFree((void*)hashSet->ddictPtrTable, customMem); + } + if (hashSet) { + ZSTD_customFree(hashSet, customMem); + } +} + +/* Public function: Adds a DDict into the ZSTD_DDictHashSet, possibly triggering a resize of the hash set. + * Returns 0 on success, or a ZSTD error. 
+ */ +static size_t ZSTD_DDictHashSet_addDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict, ZSTD_customMem customMem) { + DEBUGLOG(4, "Adding dict ID: %u to hashset with - Count: %zu Tablesize: %zu", ZSTD_getDictID_fromDDict(ddict), hashSet->ddictPtrCount, hashSet->ddictPtrTableSize); + if (hashSet->ddictPtrCount * DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT / hashSet->ddictPtrTableSize * DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT != 0) { + FORWARD_IF_ERROR(ZSTD_DDictHashSet_expand(hashSet, customMem), ""); + } + FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, ddict), ""); + return 0; +} + /*-************************************************************* * Context management ***************************************************************/ @@ -94,11 +238,21 @@ static size_t ZSTD_startingInputLength(ZSTD_format_e format) return startingInputLength; } +static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) +{ + assert(dctx->streamStage == zdss_init); + dctx->format = ZSTD_f_zstd1; + dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; + dctx->disableHufAsm = 0; + dctx->maxBlockSizeParam = 0; +} + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) { - dctx->format = ZSTD_f_zstd1; /* ZSTD_decompressBegin() invokes ZSTD_startingInputLength() with argument dctx->format */ dctx->staticSize = 0; - dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; dctx->ddict = NULL; dctx->ddictLocal = NULL; dctx->dictEnd = NULL; @@ -108,10 +262,21 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) dctx->inBuffSize = 0; dctx->outBuffSize = 0; dctx->streamStage = zdss_init; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) dctx->legacyContext = NULL; dctx->previousLegacyVersion = 0; +#endif dctx->noForwardProgress = 0; - dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + dctx->oversizedDuration = 0; + 
dctx->isFrameDecompression = 1; +#if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); +#endif + dctx->ddictSet = NULL; + ZSTD_DCtx_resetParameters(dctx); +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + dctx->dictContentEndForFuzzing = NULL; +#endif } ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize) @@ -127,11 +292,10 @@ ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize) return dctx; } -ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) -{ - if (!customMem.customAlloc ^ !customMem.customFree) return NULL; +static ZSTD_DCtx* ZSTD_createDCtx_internal(ZSTD_customMem customMem) { + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; - { ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_malloc(sizeof(*dctx), customMem); + { ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_customMalloc(sizeof(*dctx), customMem); if (!dctx) return NULL; dctx->customMem = customMem; ZSTD_initDCtx_internal(dctx); @@ -139,10 +303,15 @@ ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) } } +ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) +{ + return ZSTD_createDCtx_internal(customMem); +} + ZSTD_DCtx* ZSTD_createDCtx(void) { DEBUGLOG(3, "ZSTD_createDCtx"); - return ZSTD_createDCtx_advanced(ZSTD_defaultCMem); + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); } static void ZSTD_clearDict(ZSTD_DCtx* dctx) @@ -159,13 +328,17 @@ size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx) RETURN_ERROR_IF(dctx->staticSize, memory_allocation, "not compatible with static DCtx"); { ZSTD_customMem const cMem = dctx->customMem; ZSTD_clearDict(dctx); - ZSTD_free(dctx->inBuff, cMem); + ZSTD_customFree(dctx->inBuff, cMem); dctx->inBuff = NULL; #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) if (dctx->legacyContext) ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion); #endif - ZSTD_free(dctx, cMem); + if (dctx->ddictSet) { + ZSTD_freeDDictHashSet(dctx->ddictSet, cMem); + dctx->ddictSet = NULL; + } + 
ZSTD_customFree(dctx, cMem); return 0; } } @@ -174,7 +347,30 @@ size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx) void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx) { size_t const toCopy = (size_t)((char*)(&dstDCtx->inBuff) - (char*)dstDCtx); - memcpy(dstDCtx, srcDCtx, toCopy); /* no need to copy workspace */ + ZSTD_memcpy(dstDCtx, srcDCtx, toCopy); /* no need to copy workspace */ +} + +/* Given a dctx with a digested frame params, re-selects the correct ZSTD_DDict based on + * the requested dict ID from the frame. If there exists a reference to the correct ZSTD_DDict, then + * accordingly sets the ddict to be used to decompress the frame. + * + * If no DDict is found, then no action is taken, and the ZSTD_DCtx::ddict remains as-is. + * + * ZSTD_d_refMultipleDDicts must be enabled for this function to be called. + */ +static void ZSTD_DCtx_selectFrameDDict(ZSTD_DCtx* dctx) { + assert(dctx->refMultipleDDicts && dctx->ddictSet); + DEBUGLOG(4, "Adjusting DDict based on requested dict ID from frame"); + if (dctx->ddict) { + const ZSTD_DDict* frameDDict = ZSTD_DDictHashSet_getDDict(dctx->ddictSet, dctx->fParams.dictID); + if (frameDDict) { + DEBUGLOG(4, "DDict found!"); + ZSTD_clearDict(dctx); + dctx->dictID = dctx->fParams.dictID; + dctx->ddict = frameDDict; + dctx->dictUses = ZSTD_use_indefinitely; + } + } } @@ -200,6 +396,19 @@ unsigned ZSTD_isFrame(const void* buffer, size_t size) return 0; } +/*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + */ +unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size) +{ + if (size < ZSTD_FRAMEIDSIZE) return 0; + { U32 const magic = MEM_readLE32(buffer); + if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1; + } + return 0; +} + /** ZSTD_frameHeaderSize_internal() : * srcSize must be large enough to reach header size fields. 
* note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless. @@ -208,7 +417,7 @@ unsigned ZSTD_isFrame(const void* buffer, size_t size) static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format) { size_t const minInputSize = ZSTD_startingInputLength(format); - RETURN_ERROR_IF(srcSize < minInputSize, srcSize_wrong); + RETURN_ERROR_IF(srcSize < minInputSize, srcSize_wrong, ""); { BYTE const fhd = ((const BYTE*)src)[minInputSize-1]; U32 const dictID= fhd & 3; @@ -235,28 +444,54 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless * @return : 0, `zfhPtr` is correctly filled, * >0, `srcSize` is too small, value is wanted `srcSize` amount, - * or an error code, which can be tested using ZSTD_isError() */ -size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) +** or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_getFrameHeader_advanced(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) { const BYTE* ip = (const BYTE*)src; size_t const minInputSize = ZSTD_startingInputLength(format); - memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ - if (srcSize < minInputSize) return minInputSize; - RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); + DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); + if (srcSize > 0) { + /* note : technically could be considered an assert(), since it's an invalid entry */ + RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); + } + if (srcSize < minInputSize) { + if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { + /* when receiving less than 
@minInputSize bytes, + * control these bytes at least correspond to a supported magic number + * in order to error out early if they don't. + **/ + size_t const toCopy = MIN(4, srcSize); + unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); + assert(src != NULL); + ZSTD_memcpy(hbuf, src, toCopy); + if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { + /* not a zstd frame : let's check if it's a skippable frame */ + MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); + ZSTD_memcpy(hbuf, src, toCopy); + if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { + RETURN_ERROR(prefix_unknown, + "first bytes don't correspond to any supported magic number"); + } } } + return minInputSize; + } + + ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ if ( (format != ZSTD_f_zstd1_magicless) && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ if (srcSize < ZSTD_SKIPPABLEHEADERSIZE) return ZSTD_SKIPPABLEHEADERSIZE; /* magic number + frame length */ - memset(zfhPtr, 0, sizeof(*zfhPtr)); - zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); + ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); zfhPtr->frameType = ZSTD_skippableFrame; + zfhPtr->dictID = MEM_readLE32(src) - ZSTD_MAGIC_SKIPPABLE_START; + zfhPtr->headerSize = ZSTD_SKIPPABLEHEADERSIZE; + zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); return 0; } - RETURN_ERROR(prefix_unknown); + RETURN_ERROR(prefix_unknown, ""); } /* ensure there is enough `srcSize` to fully read/decode frame header */ @@ -280,13 +515,15 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s if (!singleSegment) { BYTE const wlByte = ip[pos++]; U32 const windowLog = (wlByte >> 3) + 
ZSTD_WINDOWLOG_ABSOLUTEMIN; - RETURN_ERROR_IF(windowLog > ZSTD_WINDOWLOG_MAX, frameParameter_windowTooLarge); + RETURN_ERROR_IF(windowLog > ZSTD_WINDOWLOG_MAX, frameParameter_windowTooLarge, ""); windowSize = (1ULL << windowLog); windowSize += (windowSize >> 3) * (wlByte&7); } switch(dictIDSizeCode) { - default: assert(0); /* impossible */ + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; case 0 : break; case 1 : dictID = ip[pos]; pos++; break; case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break; @@ -294,7 +531,9 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s } switch(fcsID) { - default: assert(0); /* impossible */ + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; case 0 : if (singleSegment) frameContentSize = ip[pos]; break; case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break; case 2 : frameContentSize = MEM_readLE32(ip+pos); break; @@ -318,12 +557,11 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s * @return : 0, `zfhPtr` is correctly filled, * >0, `srcSize` is too small, value is wanted `srcSize` amount, * or an error code, which can be tested using ZSTD_isError() */ -size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize) +size_t ZSTD_getFrameHeader(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize) { return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); } - /** ZSTD_getFrameContentSize() : * compatible with legacy mode * @return : decompressed size of the single frame pointed to be `src` if known, otherwise @@ -337,7 +575,7 @@ unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize) return ret == 0 ? 
ZSTD_CONTENTSIZE_UNKNOWN : ret; } #endif - { ZSTD_frameHeader zfh; + { ZSTD_FrameHeader zfh; if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0) return ZSTD_CONTENTSIZE_ERROR; if (zfh.frameType == ZSTD_skippableFrame) { @@ -352,23 +590,57 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) size_t const skippableHeaderSize = ZSTD_SKIPPABLEHEADERSIZE; U32 sizeU32; - RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong); + RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, - frameParameter_unsupported); - { - size_t const skippableSize = skippableHeaderSize + sizeU32; - RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong); + frameParameter_unsupported, ""); + { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); return skippableSize; } } +/*! ZSTD_readSkippableFrame() : + * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, + unsigned* magicVariant, /* optional, can be NULL */ + const void* src, size_t srcSize) +{ + RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + + { U32 const magicNumber = MEM_readLE32(src); + size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); + size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; + + /* check input validity */ + RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); + RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); + RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); + + /* deliver payload */ + if (skippableContentSize > 0 && dst != NULL) + ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); + if (magicVariant != NULL) + *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; + return skippableContentSize; + } +} + /** ZSTD_findDecompressedSize() : - * compatible with legacy mode * `srcSize` must be the exact length of some number of ZSTD compressed and/or * skippable frames - * @return : decompressed size of the frames contained */ + * note: compatible with legacy mode + * @return : decompressed size of the frames contained */ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) { unsigned long long totalDstSize = 0; @@ -378,9 +650,7 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { size_t const skippableSize = readSkippableFrameSize(src, srcSize); - if (ZSTD_isError(skippableSize)) { - return ZSTD_CONTENTSIZE_ERROR; - } + if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; assert(skippableSize <= srcSize); src = (const BYTE *)src + skippableSize; @@ -388,17 +658,17 @@ unsigned long long 
ZSTD_findDecompressedSize(const void* src, size_t srcSize) continue; } - { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); - if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; + { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); + if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; - /* check for overflow */ - if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; - totalDstSize += ret; + if (totalDstSize + fcs < totalDstSize) + return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ + totalDstSize += fcs; } + /* skip to next frame */ { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); - if (ZSTD_isError(frameSrcSize)) { - return ZSTD_CONTENTSIZE_ERROR; - } + if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(frameSrcSize <= srcSize); src = (const BYTE *)src + frameSrcSize; srcSize -= frameSrcSize; @@ -428,20 +698,29 @@ unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize) /** ZSTD_decodeFrameHeader() : * `headerSize` must be the size provided by ZSTD_frameHeaderSize(). + * If multiple DDict references are enabled, also will choose the correct DDict to use. * @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */ static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize) { size_t const result = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format); if (ZSTD_isError(result)) return result; /* invalid header */ RETURN_ERROR_IF(result>0, srcSize_wrong, "headerSize too small"); + + /* Reference DDict requested by frame if dctx references multiple ddicts */ + if (dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts && dctx->ddictSet) { + ZSTD_DCtx_selectFrameDDict(dctx); + } + #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION /* Skip the dictID check in fuzzing mode, because it makes the search * harder. 
*/ RETURN_ERROR_IF(dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID), - dictionary_wrong); + dictionary_wrong, ""); #endif - if (dctx->fParams.checksumFlag) XXH64_reset(&dctx->xxhState, 0); + dctx->validateChecksum = (dctx->fParams.checksumFlag && !dctx->forceIgnoreChecksum) ? 1 : 0; + if (dctx->validateChecksum) XXH64_reset(&dctx->xxhState, 0); + dctx->processedCSize += headerSize; return 0; } @@ -453,17 +732,17 @@ static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) return frameSizeInfo; } -static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) +static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format) { ZSTD_frameSizeInfo frameSizeInfo; - memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) - if (ZSTD_isLegacy(src, srcSize)) + if (format == ZSTD_f_zstd1 && ZSTD_isLegacy(src, srcSize)) return ZSTD_findFrameSizeInfoLegacy(src, srcSize); #endif - if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE) && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); assert(ZSTD_isError(frameSizeInfo.compressedSize) || @@ -474,10 +753,10 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize const BYTE* const ipstart = ip; size_t remainingSize = srcSize; size_t nbBlocks = 0; - ZSTD_frameHeader zfh; + ZSTD_FrameHeader zfh; /* Extract Frame Header */ - { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); + { size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format); if (ZSTD_isError(ret)) return ZSTD_errorFrameSizeInfo(ret); if (ret > 0) @@ -511,28 +790,31 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize ip += 4; } - 
frameSizeInfo.compressedSize = ip - ipstart; + frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) ? zfh.frameContentSize - : nbBlocks * zfh.blockSizeMax; + : (unsigned long long)nbBlocks * zfh.blockSizeMax; return frameSizeInfo; } } +static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) { + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format); + return frameSizeInfo.compressedSize; +} + /** ZSTD_findFrameCompressedSize() : - * compatible with legacy mode - * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame - * `srcSize` must be at least as large as the frame contained - * @return : the compressed size of the frame starting at `src` */ + * See docs in zstd.h + * Note: compatible with legacy mode */ size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) { - ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); - return frameSizeInfo.compressedSize; + return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1); } /** ZSTD_decompressBound() : * compatible with legacy mode - * `src` must point to the start of a ZSTD frame or a skippeable frame + * `src` must point to the start of a ZSTD frame or a skippable frame * `srcSize` must be at least as large as the frame contained * @return : the maximum decompressed size of the compressed source */ @@ -541,7 +823,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) unsigned long long bound = 0; /* Iterate over each frame */ while (srcSize > 0) { - ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); size_t const compressedSize = frameSizeInfo.compressedSize; unsigned long long const 
decompressedBound = frameSizeInfo.decompressedBound; if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) @@ -554,28 +836,59 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) return bound; } +size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) +{ + size_t margin = 0; + unsigned maxBlockSize = 0; -/*-************************************************************* - * Frame decoding - ***************************************************************/ + /* Iterate over each frame */ + while (srcSize > 0) { + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + ZSTD_FrameHeader zfh; + FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) + return ERROR(corruption_detected); + + if (zfh.frameType == ZSTD_frame) { + /* Add the frame header to our margin */ + margin += zfh.headerSize; + /* Add the checksum to our margin */ + margin += zfh.checksumFlag ? 4 : 0; + /* Add 3 bytes per block */ + margin += 3 * frameSizeInfo.nbBlocks; + + /* Compute the max block size */ + maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); + } else { + assert(zfh.frameType == ZSTD_skippableFrame); + /* Add the entire skippable frame size to our margin. 
*/ + margin += compressedSize; + } -void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst) -{ - if (dst != dctx->previousDstEnd) { /* not contiguous */ - dctx->dictEnd = dctx->previousDstEnd; - dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); - dctx->prefixStart = dst; - dctx->previousDstEnd = dst; + assert(srcSize >= compressedSize); + src = (const BYTE*)src + compressedSize; + srcSize -= compressedSize; } + + /* Add the max block size back to the margin. */ + margin += maxBlockSize; + + return margin; } +/*-************************************************************* + * Frame decoding + ***************************************************************/ + /** ZSTD_insertBlock() : * insert `src` block into `dctx` history. Useful to track uncompressed blocks. */ size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize) { DEBUGLOG(5, "ZSTD_insertBlock: %u bytes", (unsigned)blockSize); - ZSTD_checkContinuity(dctx, blockStart); + ZSTD_checkContinuity(dctx, blockStart, blockSize); dctx->previousDstEnd = (const char*)blockStart + blockSize; return blockSize; } @@ -585,12 +898,12 @@ static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize) { DEBUGLOG(5, "ZSTD_copyRawBlock"); + RETURN_ERROR_IF(srcSize > dstCapacity, dstSize_tooSmall, ""); if (dst == NULL) { if (srcSize == 0) return 0; - RETURN_ERROR(dstBuffer_null); + RETURN_ERROR(dstBuffer_null, ""); } - RETURN_ERROR_IF(srcSize > dstCapacity, dstSize_tooSmall); - memcpy(dst, src, srcSize); + ZSTD_memmove(dst, src, srcSize); return srcSize; } @@ -598,15 +911,41 @@ static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity, BYTE b, size_t regenSize) { + RETURN_ERROR_IF(regenSize > dstCapacity, dstSize_tooSmall, ""); if (dst == NULL) { if (regenSize == 0) return 0; - RETURN_ERROR(dstBuffer_null); + RETURN_ERROR(dstBuffer_null, ""); } - RETURN_ERROR_IF(regenSize > dstCapacity, 
dstSize_tooSmall); - memset(dst, b, regenSize); + ZSTD_memset(dst, b, regenSize); return regenSize; } +static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, int streaming) +{ +#if ZSTD_TRACE + if (dctx->traceCtx && ZSTD_trace_decompress_end != NULL) { + ZSTD_Trace trace; + ZSTD_memset(&trace, 0, sizeof(trace)); + trace.version = ZSTD_VERSION_NUMBER; + trace.streaming = streaming; + if (dctx->ddict) { + trace.dictionaryID = ZSTD_getDictID_fromDDict(dctx->ddict); + trace.dictionarySize = ZSTD_DDict_dictSize(dctx->ddict); + trace.dictionaryIsCold = dctx->ddictIsCold; + } + trace.uncompressedSize = (size_t)uncompressedSize; + trace.compressedSize = (size_t)compressedSize; + trace.dctx = dctx; + ZSTD_trace_decompress_end(dctx->traceCtx, &trace); + } +#else + (void)dctx; + (void)uncompressedSize; + (void)compressedSize; + (void)streaming; +#endif +} + /*! ZSTD_decompressFrame() : * @dctx must be properly initialized @@ -616,9 +955,10 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void** srcPtr, size_t *srcSizePtr) { - const BYTE* ip = (const BYTE*)(*srcPtr); - BYTE* const ostart = (BYTE* const)dst; - BYTE* const oend = ostart + dstCapacity; + const BYTE* const istart = (const BYTE*)(*srcPtr); + const BYTE* ip = istart; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = dstCapacity != 0 ? 
ostart + dstCapacity : ostart; BYTE* op = ostart; size_t remainingSrcSize = *srcSizePtr; @@ -627,20 +967,25 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, /* check */ RETURN_ERROR_IF( remainingSrcSize < ZSTD_FRAMEHEADERSIZE_MIN(dctx->format)+ZSTD_blockHeaderSize, - srcSize_wrong); + srcSize_wrong, ""); /* Frame Header */ { size_t const frameHeaderSize = ZSTD_frameHeaderSize_internal( ip, ZSTD_FRAMEHEADERSIZE_PREFIX(dctx->format), dctx->format); if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize; RETURN_ERROR_IF(remainingSrcSize < frameHeaderSize+ZSTD_blockHeaderSize, - srcSize_wrong); - FORWARD_IF_ERROR( ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize) ); + srcSize_wrong, ""); + FORWARD_IF_ERROR( ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize) , ""); ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; } + /* Shrink the blockSizeMax if enabled */ + if (dctx->maxBlockSizeParam != 0) + dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam); + /* Loop on each block */ while (1) { + BYTE* oBlockEnd = oend; size_t decodedSize; blockProperties_t blockProperties; size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties); @@ -648,28 +993,51 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, ip += ZSTD_blockHeaderSize; remainingSrcSize -= ZSTD_blockHeaderSize; - RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong); + RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, ""); + + if (ip >= op && ip < oBlockEnd) { + /* We are decompressing in-place. Limit the output pointer so that we + * don't overwrite the block that we are currently reading. This will + * fail decompression if the input & output pointers aren't spaced + * far enough apart. + * + * This is important to set, even when the pointers are far enough + * apart, because ZSTD_decompressBlock_internal() can decide to store + * literals in the output buffer, after the block it is decompressing. 
+ * Since we don't want anything to overwrite our input, we have to tell + * ZSTD_decompressBlock_internal to never write past ip. + * + * See ZSTD_allocateLiteralsBuffer() for reference. + */ + oBlockEnd = op + (ip - op); + } switch(blockProperties.blockType) { case bt_compressed: - decodedSize = ZSTD_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize, /* frame */ 1); + assert(dctx->isFrameDecompression == 1); + decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming); break; case bt_raw : - decodedSize = ZSTD_copyRawBlock(op, oend-op, ip, cBlockSize); + /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */ + decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize); break; case bt_rle : - decodedSize = ZSTD_setRleBlock(op, oend-op, *ip, blockProperties.origSize); + decodedSize = ZSTD_setRleBlock(op, (size_t)(oBlockEnd-op), *ip, blockProperties.origSize); break; case bt_reserved : default: - RETURN_ERROR(corruption_detected); + RETURN_ERROR(corruption_detected, "invalid block type"); } - - if (ZSTD_isError(decodedSize)) return decodedSize; - if (dctx->fParams.checksumFlag) + FORWARD_IF_ERROR(decodedSize, "Block decompression failure"); + DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize); + if (dctx->validateChecksum) { XXH64_update(&dctx->xxhState, op, decodedSize); - op += decodedSize; + } + if (decodedSize) /* support dst = NULL,0 */ { + op += decodedSize; + } + assert(ip != NULL); ip += cBlockSize; remainingSrcSize -= cBlockSize; if (blockProperties.lastBlock) break; @@ -677,25 +1045,30 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, if (dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) { RETURN_ERROR_IF((U64)(op-ostart) != dctx->fParams.frameContentSize, - corruption_detected); + corruption_detected, ""); } if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */ - U32 const checkCalc = 
(U32)XXH64_digest(&dctx->xxhState); - U32 checkRead; - RETURN_ERROR_IF(remainingSrcSize<4, checksum_wrong); - checkRead = MEM_readLE32(ip); - RETURN_ERROR_IF(checkRead != checkCalc, checksum_wrong); + RETURN_ERROR_IF(remainingSrcSize<4, checksum_wrong, ""); + if (!dctx->forceIgnoreChecksum) { + U32 const checkCalc = (U32)XXH64_digest(&dctx->xxhState); + U32 checkRead; + checkRead = MEM_readLE32(ip); + RETURN_ERROR_IF(checkRead != checkCalc, checksum_wrong, ""); + } ip += 4; remainingSrcSize -= 4; } - + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); /* Allow caller to get size read */ + DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %i, consuming %i bytes of input", (int)(op-ostart), (int)(ip - (const BYTE*)*srcPtr)); *srcPtr = ip; *srcSizePtr = remainingSrcSize; - return op-ostart; + return (size_t)(op-ostart); } -static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict, size_t dictSize, @@ -715,7 +1088,7 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, while (srcSize >= ZSTD_startingInputLength(dctx->format)) { #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) - if (ZSTD_isLegacy(src, srcSize)) { + if (dctx->format == ZSTD_f_zstd1 && ZSTD_isLegacy(src, srcSize)) { size_t decodedSize; size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize); if (ZSTD_isError(frameSize)) return frameSize; @@ -725,7 +1098,16 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize); if (ZSTD_isError(decodedSize)) return decodedSize; - assert(decodedSize <=- dstCapacity); + { + unsigned long long const expectedSize = ZSTD_getFrameContentSize(src, srcSize); + RETURN_ERROR_IF(expectedSize == ZSTD_CONTENTSIZE_ERROR, corruption_detected, 
"Corrupted frame header!"); + if (expectedSize != ZSTD_CONTENTSIZE_UNKNOWN) { + RETURN_ERROR_IF(expectedSize != decodedSize, corruption_detected, + "Frame header size does not match decoded size!"); + } + } + + assert(decodedSize <= dstCapacity); dst = (BYTE*)dst + decodedSize; dstCapacity -= decodedSize; @@ -736,28 +1118,29 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, } #endif - { U32 const magicNumber = MEM_readLE32(src); - DEBUGLOG(4, "reading magic number %08X (expecting %08X)", - (unsigned)magicNumber, ZSTD_MAGICNUMBER); + if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) { + U32 const magicNumber = MEM_readLE32(src); + DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + /* skippable frame detected : skip it */ size_t const skippableSize = readSkippableFrameSize(src, srcSize); - FORWARD_IF_ERROR(skippableSize); + FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); assert(skippableSize <= srcSize); src = (const BYTE *)src + skippableSize; srcSize -= skippableSize; - continue; + continue; /* check next frame */ } } if (ddict) { /* we were called from ZSTD_decompress_usingDDict */ - FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(dctx, ddict)); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(dctx, ddict), ""); } else { /* this will initialize correctly with no dict if dict == NULL, so * use this in all cases but ddict */ - FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize)); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize), ""); } - ZSTD_checkContinuity(dctx, dst); + ZSTD_checkContinuity(dctx, dst, dstCapacity); { const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity, &src, &srcSize); @@ -765,18 +1148,17 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, (ZSTD_getErrorCode(res) == ZSTD_error_prefix_unknown) && (moreThan1Frame==1), srcSize_wrong, - "at least one frame successfully 
completed, but following " - "bytes are garbage: it's more likely to be a srcSize error, " - "specifying more bytes than compressed size of frame(s). This " - "error message replaces ERROR(prefix_unknown), which would be " - "confusing, as the first header is actually correct. Note that " - "one could be unlucky, it might be a corruption error instead, " - "happening right at the place where we expect zstd magic " - "bytes. But this is _much_ less likely than a srcSize field " - "error."); + "At least one frame successfully completed, " + "but following bytes are garbage: " + "it's more likely to be a srcSize error, " + "specifying more input bytes than size of frame(s). " + "Note: one could be unlucky, it might be a corruption error instead, " + "happening right at the place where we expect zstd magic bytes. " + "But this is _much_ less likely than a srcSize field error."); if (ZSTD_isError(res)) return res; assert(res <= dstCapacity); - dst = (BYTE*)dst + res; + if (res != 0) + dst = (BYTE*)dst + res; dstCapacity -= res; } moreThan1Frame = 1; @@ -784,7 +1166,7 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, RETURN_ERROR_IF(srcSize, srcSize_wrong, "input not entirely consumed"); - return (BYTE*)dst - (BYTE*)dststart; + return (size_t)((BYTE*)dst - (BYTE*)dststart); } size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx, @@ -801,7 +1183,7 @@ static ZSTD_DDict const* ZSTD_getDDict(ZSTD_DCtx* dctx) switch (dctx->dictUses) { default: assert(0 /* Impossible */); - /* fall-through */ + ZSTD_FALLTHROUGH; case ZSTD_dont_use: ZSTD_clearDict(dctx); return NULL; @@ -823,8 +1205,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr { #if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1) size_t regenSize; - ZSTD_DCtx* const dctx = ZSTD_createDCtx(); - RETURN_ERROR_IF(dctx==NULL, memory_allocation); + ZSTD_DCtx* const dctx = ZSTD_createDCtx_internal(ZSTD_defaultCMem); + RETURN_ERROR_IF(dctx==NULL, memory_allocation, "NULL pointer!"); regenSize 
= ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize); ZSTD_freeDCtx(dctx); return regenSize; @@ -842,12 +1224,32 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr ****************************************/ size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } +/** + * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we + * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce + * output, and avoid copying the input. + * + * @param inputSize - The total amount of input that the caller currently has. + */ +static size_t ZSTD_nextSrcSizeToDecompressWithInputSize(ZSTD_DCtx* dctx, size_t inputSize) { + if (!(dctx->stage == ZSTDds_decompressBlock || dctx->stage == ZSTDds_decompressLastBlock)) + return dctx->expected; + if (dctx->bType != bt_raw) + return dctx->expected; + return BOUNDED(1, inputSize, dctx->expected); +} + ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { switch(dctx->stage) { default: /* should not happen */ assert(0); + ZSTD_FALLTHROUGH; case ZSTDds_getFrameHeaderSize: + ZSTD_FALLTHROUGH; case ZSTDds_decodeFrameHeader: return ZSTDnit_frameHeader; case ZSTDds_decodeBlockHeader: @@ -859,6 +1261,7 @@ ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { case ZSTDds_checkChecksum: return ZSTDnit_checksum; case ZSTDds_decodeSkippableHeader: + ZSTD_FALLTHROUGH; case ZSTDds_skipFrame: return ZSTDnit_skippableFrame; } @@ -874,8 +1277,10 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c { DEBUGLOG(5, "ZSTD_decompressContinue (srcSize:%u)", (unsigned)srcSize); /* Sanity check */ - RETURN_ERROR_IF(srcSize != dctx->expected, srcSize_wrong, "not allowed"); - if (dstCapacity) ZSTD_checkContinuity(dctx, dst); + RETURN_ERROR_IF(srcSize != 
ZSTD_nextSrcSizeToDecompressWithInputSize(dctx, srcSize), srcSize_wrong, "not allowed"); + ZSTD_checkContinuity(dctx, dst, dstCapacity); + + dctx->processedCSize += srcSize; switch (dctx->stage) { @@ -884,22 +1289,22 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c if (dctx->format == ZSTD_f_zstd1) { /* allows header */ assert(srcSize >= ZSTD_FRAMEIDSIZE); /* to read skippable magic number */ if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ - memcpy(dctx->headerBuffer, src, srcSize); + ZSTD_memcpy(dctx->headerBuffer, src, srcSize); dctx->expected = ZSTD_SKIPPABLEHEADERSIZE - srcSize; /* remaining to load to get full skippable frame header */ dctx->stage = ZSTDds_decodeSkippableHeader; return 0; } } dctx->headerSize = ZSTD_frameHeaderSize_internal(src, srcSize, dctx->format); if (ZSTD_isError(dctx->headerSize)) return dctx->headerSize; - memcpy(dctx->headerBuffer, src, srcSize); + ZSTD_memcpy(dctx->headerBuffer, src, srcSize); dctx->expected = dctx->headerSize - srcSize; dctx->stage = ZSTDds_decodeFrameHeader; return 0; case ZSTDds_decodeFrameHeader: assert(src != NULL); - memcpy(dctx->headerBuffer + (dctx->headerSize - srcSize), src, srcSize); - FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize)); + ZSTD_memcpy(dctx->headerBuffer + (dctx->headerSize - srcSize), src, srcSize); + FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize), ""); dctx->expected = ZSTD_blockHeaderSize; dctx->stage = ZSTDds_decodeBlockHeader; return 0; @@ -940,51 +1345,68 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c { case bt_compressed: DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); - rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1); + assert(dctx->isFrameDecompression == 1); + rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, 
srcSize, is_streaming); + dctx->expected = 0; /* Streaming not supported */ break; case bt_raw : + assert(srcSize <= dctx->expected); rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize); + FORWARD_IF_ERROR(rSize, "ZSTD_copyRawBlock failed"); + assert(rSize == srcSize); + dctx->expected -= rSize; break; case bt_rle : rSize = ZSTD_setRleBlock(dst, dstCapacity, *(const BYTE*)src, dctx->rleSize); + dctx->expected = 0; /* Streaming not supported */ break; case bt_reserved : /* should never happen */ default: - RETURN_ERROR(corruption_detected); + RETURN_ERROR(corruption_detected, "invalid block type"); } - if (ZSTD_isError(rSize)) return rSize; + FORWARD_IF_ERROR(rSize, ""); RETURN_ERROR_IF(rSize > dctx->fParams.blockSizeMax, corruption_detected, "Decompressed Block Size Exceeds Maximum"); DEBUGLOG(5, "ZSTD_decompressContinue: decoded size from block : %u", (unsigned)rSize); dctx->decodedSize += rSize; - if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, dst, rSize); + if (dctx->validateChecksum) XXH64_update(&dctx->xxhState, dst, rSize); + dctx->previousDstEnd = (char*)dst + rSize; + + /* Stay on the same stage until we are finished streaming the block. 
*/ + if (dctx->expected > 0) { + return rSize; + } if (dctx->stage == ZSTDds_decompressLastBlock) { /* end of frame */ DEBUGLOG(4, "ZSTD_decompressContinue: decoded size from frame : %u", (unsigned)dctx->decodedSize); RETURN_ERROR_IF( dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN && dctx->decodedSize != dctx->fParams.frameContentSize, - corruption_detected); + corruption_detected, ""); if (dctx->fParams.checksumFlag) { /* another round for frame checksum */ dctx->expected = 4; dctx->stage = ZSTDds_checkChecksum; } else { + ZSTD_DCtx_trace_end(dctx, dctx->decodedSize, dctx->processedCSize, /* streaming */ 1); dctx->expected = 0; /* ends here */ dctx->stage = ZSTDds_getFrameHeaderSize; } } else { dctx->stage = ZSTDds_decodeBlockHeader; dctx->expected = ZSTD_blockHeaderSize; - dctx->previousDstEnd = (char*)dst + rSize; } return rSize; } case ZSTDds_checkChecksum: assert(srcSize == 4); /* guaranteed by dctx->expected */ - { U32 const h32 = (U32)XXH64_digest(&dctx->xxhState); - U32 const check32 = MEM_readLE32(src); - DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", (unsigned)h32, (unsigned)check32); - RETURN_ERROR_IF(check32 != h32, checksum_wrong); + { + if (dctx->validateChecksum) { + U32 const h32 = (U32)XXH64_digest(&dctx->xxhState); + U32 const check32 = MEM_readLE32(src); + DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", (unsigned)h32, (unsigned)check32); + RETURN_ERROR_IF(check32 != h32, checksum_wrong, ""); + } + ZSTD_DCtx_trace_end(dctx, dctx->decodedSize, dctx->processedCSize, /* streaming */ 1); dctx->expected = 0; dctx->stage = ZSTDds_getFrameHeaderSize; return 0; @@ -993,7 +1415,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c case ZSTDds_decodeSkippableHeader: assert(src != NULL); assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); - memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + 
assert(dctx->format != ZSTD_f_zstd1_magicless); + ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ dctx->stage = ZSTDds_skipFrame; return 0; @@ -1005,7 +1428,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c default: assert(0); /* impossible */ - RETURN_ERROR(GENERIC); /* some compiler require default to do something */ + RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ } } @@ -1016,6 +1439,10 @@ static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dict dctx->virtualStart = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); dctx->prefixStart = dict; dctx->previousDstEnd = (const char*)dict + dictSize; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + dctx->dictContentBeginForFuzzing = dctx->prefixStart; + dctx->dictContentEndForFuzzing = dctx->previousDstEnd; +#endif return 0; } @@ -1029,7 +1456,7 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, const BYTE* dictPtr = (const BYTE*)dict; const BYTE* const dictEnd = dictPtr + dictSize; - RETURN_ERROR_IF(dictSize <= 8, dictionary_corrupted); + RETURN_ERROR_IF(dictSize <= 8, dictionary_corrupted, "dict is too small"); assert(MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY); /* dict must be valid */ dictPtr += 8; /* skip header = magic + dictID */ @@ -1042,66 +1469,72 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, /* in minimal huffman, we always use X1 variants */ size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, dictPtr, dictEnd - dictPtr, - workspace, workspaceSize); + workspace, workspaceSize, /* flags */ 0); #else size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, - dictPtr, dictEnd - dictPtr, - workspace, workspaceSize); + 
dictPtr, (size_t)(dictEnd - dictPtr), + workspace, workspaceSize, /* flags */ 0); #endif - RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted); + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); dictPtr += hSize; } { short offcodeNCount[MaxOff+1]; unsigned offcodeMaxValue = MaxOff, offcodeLog; - size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); - RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted); - RETURN_ERROR_IF(offcodeMaxValue > MaxOff, dictionary_corrupted); - RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted); + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeMaxValue > MaxOff, dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); ZSTD_buildFSETable( entropy->OFTable, offcodeNCount, offcodeMaxValue, OF_base, OF_bits, - offcodeLog); + offcodeLog, + entropy->workspace, sizeof(entropy->workspace), + /* bmi2 */0); dictPtr += offcodeHeaderSize; } { short matchlengthNCount[MaxML+1]; unsigned matchlengthMaxValue = MaxML, matchlengthLog; - size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); - RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted); - RETURN_ERROR_IF(matchlengthMaxValue > MaxML, dictionary_corrupted); - RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted); + size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthMaxValue > MaxML, dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > 
MLFSELog, dictionary_corrupted, ""); ZSTD_buildFSETable( entropy->MLTable, matchlengthNCount, matchlengthMaxValue, ML_base, ML_bits, - matchlengthLog); + matchlengthLog, + entropy->workspace, sizeof(entropy->workspace), + /* bmi2 */ 0); dictPtr += matchlengthHeaderSize; } { short litlengthNCount[MaxLL+1]; unsigned litlengthMaxValue = MaxLL, litlengthLog; - size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); - RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted); - RETURN_ERROR_IF(litlengthMaxValue > MaxLL, dictionary_corrupted); - RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted); + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthMaxValue > MaxLL, dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); ZSTD_buildFSETable( entropy->LLTable, litlengthNCount, litlengthMaxValue, LL_base, LL_bits, - litlengthLog); + litlengthLog, + entropy->workspace, sizeof(entropy->workspace), + /* bmi2 */ 0); dictPtr += litlengthHeaderSize; } - RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted); + RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, ""); { int i; size_t const dictContentSize = (size_t)(dictEnd - (dictPtr+12)); for (i=0; i<3; i++) { U32 const rep = MEM_readLE32(dictPtr); dictPtr += 4; RETURN_ERROR_IF(rep==0 || rep > dictContentSize, - dictionary_corrupted); + dictionary_corrupted, ""); entropy->rep[i] = rep; } } - return dictPtr - (const BYTE*)dict; + return (size_t)(dictPtr - (const BYTE*)dict); } static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) @@ -1115,7 +1548,7 @@ static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict /* load entropy 
tables */ { size_t const eSize = ZSTD_loadDEntropy(&dctx->entropy, dict, dictSize); - RETURN_ERROR_IF(ZSTD_isError(eSize), dictionary_corrupted); + RETURN_ERROR_IF(ZSTD_isError(eSize), dictionary_corrupted, ""); dict = (const char*)dict + eSize; dictSize -= eSize; } @@ -1128,18 +1561,24 @@ static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) { assert(dctx != NULL); +#if ZSTD_TRACE + dctx->traceCtx = (ZSTD_trace_decompress_begin != NULL) ? ZSTD_trace_decompress_begin(dctx) : 0; +#endif dctx->expected = ZSTD_startingInputLength(dctx->format); /* dctx->format must be properly set */ dctx->stage = ZSTDds_getFrameHeaderSize; + dctx->processedCSize = 0; dctx->decodedSize = 0; dctx->previousDstEnd = NULL; dctx->prefixStart = NULL; dctx->virtualStart = NULL; dctx->dictEnd = NULL; - dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ + dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ dctx->litEntropy = dctx->fseEntropy = 0; dctx->dictID = 0; + dctx->bType = bt_reserved; + dctx->isFrameDecompression = 1; ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); - memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ + ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ dctx->LLTptr = dctx->entropy.LLTable; dctx->MLTptr = dctx->entropy.MLTable; dctx->OFTptr = dctx->entropy.OFTable; @@ -1149,11 +1588,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) { - FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) ); + FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , ""); if (dict && dictSize) RETURN_ERROR_IF( ZSTD_isError(ZSTD_decompress_insertDictionary(dctx, dict, dictSize)), - dictionary_corrupted); + dictionary_corrupted, 
""); return 0; } @@ -1172,7 +1611,7 @@ size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) DEBUGLOG(4, "DDict is %s", dctx->ddictIsCold ? "~cold~" : "hot!"); } - FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) ); + FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , ""); if (ddict) { /* NULL ddict is equivalent to no dictionary */ ZSTD_copyDDictParameters(dctx, ddict); } @@ -1196,7 +1635,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) * This could for one of the following reasons : * - The frame does not require a dictionary (most common case). * - The frame was built with dictID intentionally removed. - * Needed dictionary is a hidden information. + * Needed dictionary is a hidden piece of information. * Note : this use case also happens when using a non-conformant dictionary. * - `srcSize` is too small, and as a result, frame header could not be decoded. * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. @@ -1205,7 +1644,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) * ZSTD_getFrameHeader(), which will provide a more precise error code. 
*/ unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) { - ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; + ZSTD_FrameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); if (ZSTD_isError(hError)) return 0; return zfp.dictID; @@ -1234,7 +1673,7 @@ size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, ZSTD_DStream* ZSTD_createDStream(void) { DEBUGLOG(3, "ZSTD_createDStream"); - return ZSTD_createDStream_advanced(ZSTD_defaultCMem); + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); } ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize) @@ -1244,7 +1683,7 @@ ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize) ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem) { - return ZSTD_createDCtx_advanced(customMem); + return ZSTD_createDCtx_internal(customMem); } size_t ZSTD_freeDStream(ZSTD_DStream* zds) @@ -1263,11 +1702,11 @@ size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) { - RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong); + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); ZSTD_clearDict(dctx); if (dict && dictSize != 0) { dctx->ddictLocal = ZSTD_createDDict_advanced(dict, dictSize, dictLoadMethod, dictContentType, dctx->customMem); - RETURN_ERROR_IF(dctx->ddictLocal == NULL, memory_allocation); + RETURN_ERROR_IF(dctx->ddictLocal == NULL, memory_allocation, "NULL pointer!"); dctx->ddict = dctx->ddictLocal; dctx->dictUses = ZSTD_use_indefinitely; } @@ -1286,7 +1725,7 @@ size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSi size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) { - FORWARD_IF_ERROR(ZSTD_DCtx_loadDictionary_advanced(dctx, prefix, prefixSize, ZSTD_dlm_byRef, dictContentType)); + 
FORWARD_IF_ERROR(ZSTD_DCtx_loadDictionary_advanced(dctx, prefix, prefixSize, ZSTD_dlm_byRef, dictContentType), ""); dctx->dictUses = ZSTD_use_once; return 0; } @@ -1303,8 +1742,8 @@ size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSiz size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize) { DEBUGLOG(4, "ZSTD_initDStream_usingDict"); - FORWARD_IF_ERROR( ZSTD_DCtx_reset(zds, ZSTD_reset_session_only) ); - FORWARD_IF_ERROR( ZSTD_DCtx_loadDictionary(zds, dict, dictSize) ); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(zds, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_loadDictionary(zds, dict, dictSize) , ""); return ZSTD_startingInputLength(zds->format); } @@ -1312,7 +1751,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di size_t ZSTD_initDStream(ZSTD_DStream* zds) { DEBUGLOG(4, "ZSTD_initDStream"); - return ZSTD_initDStream_usingDDict(zds, NULL); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); + FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); + return ZSTD_startingInputLength(zds->format); } /* ZSTD_initDStream_usingDDict() : @@ -1320,8 +1761,9 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) * this function cannot fail */ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) { - FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) ); - FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) ); + DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); return ZSTD_startingInputLength(dctx->format); } @@ -1330,18 +1772,29 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) * this function cannot fail */ size_t ZSTD_resetDStream(ZSTD_DStream* dctx) { - FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only)); + DEBUGLOG(4, "ZSTD_resetDStream"); + 
FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); return ZSTD_startingInputLength(dctx->format); } size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) { - RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong); + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); ZSTD_clearDict(dctx); if (ddict) { dctx->ddict = ddict; dctx->dictUses = ZSTD_use_indefinitely; + if (dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts) { + if (dctx->ddictSet == NULL) { + dctx->ddictSet = ZSTD_createDDictHashSet(dctx->customMem); + if (!dctx->ddictSet) { + RETURN_ERROR(memory_allocation, "Failed to allocate memory for hash set!"); + } + } + assert(!dctx->staticSize); /* Impossible: ddictSet cannot have been allocated if static dctx */ + FORWARD_IF_ERROR(ZSTD_DDictHashSet_addDDict(dctx->ddictSet, ddict, dctx->customMem), ""); + } } return 0; } @@ -1354,16 +1807,16 @@ size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize) ZSTD_bounds const bounds = ZSTD_dParam_getBounds(ZSTD_d_windowLogMax); size_t const min = (size_t)1 << bounds.lowerBound; size_t const max = (size_t)1 << bounds.upperBound; - RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong); - RETURN_ERROR_IF(maxWindowSize < min, parameter_outOfBound); - RETURN_ERROR_IF(maxWindowSize > max, parameter_outOfBound); + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + RETURN_ERROR_IF(maxWindowSize < min, parameter_outOfBound, ""); + RETURN_ERROR_IF(maxWindowSize > max, parameter_outOfBound, ""); dctx->maxWindowSize = maxWindowSize; return 0; } size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format) { - return ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, format); + return ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, (int)format); } ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) @@ -1379,6 +1832,27 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) bounds.upperBound = (int)ZSTD_f_zstd1_magicless; 
ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless); return bounds; + case ZSTD_d_stableOutBuffer: + bounds.lowerBound = (int)ZSTD_bm_buffered; + bounds.upperBound = (int)ZSTD_bm_stable; + return bounds; + case ZSTD_d_forceIgnoreChecksum: + bounds.lowerBound = (int)ZSTD_d_validateChecksum; + bounds.upperBound = (int)ZSTD_d_ignoreChecksum; + return bounds; + case ZSTD_d_refMultipleDDicts: + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; + case ZSTD_d_disableHuffmanAssembly: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + case ZSTD_d_maxBlockSize: + bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; + bounds.upperBound = ZSTD_BLOCKSIZE_MAX; + return bounds; + default:; } bounds.error = ERROR(parameter_unsupported); @@ -1398,12 +1872,41 @@ static int ZSTD_dParam_withinBounds(ZSTD_dParameter dParam, int value) } #define CHECK_DBOUNDS(p,v) { \ - RETURN_ERROR_IF(!ZSTD_dParam_withinBounds(p, v), parameter_outOfBound); \ + RETURN_ERROR_IF(!ZSTD_dParam_withinBounds(p, v), parameter_outOfBound, ""); \ +} + +size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value) +{ + switch (param) { + case ZSTD_d_windowLogMax: + *value = (int)ZSTD_highbit32((U32)dctx->maxWindowSize); + return 0; + case ZSTD_d_format: + *value = (int)dctx->format; + return 0; + case ZSTD_d_stableOutBuffer: + *value = (int)dctx->outBufferMode; + return 0; + case ZSTD_d_forceIgnoreChecksum: + *value = (int)dctx->forceIgnoreChecksum; + return 0; + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; + case ZSTD_d_disableHuffmanAssembly: + *value = (int)dctx->disableHufAsm; + return 0; + case ZSTD_d_maxBlockSize: + *value = dctx->maxBlockSizeParam; + return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); } size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value) { - RETURN_ERROR_IF(dctx->streamStage != zdss_init, 
stage_wrong); + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); switch(dParam) { case ZSTD_d_windowLogMax: if (value == 0) value = ZSTD_WINDOWLOG_LIMIT_DEFAULT; @@ -1414,9 +1917,32 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value CHECK_DBOUNDS(ZSTD_d_format, value); dctx->format = (ZSTD_format_e)value; return 0; + case ZSTD_d_stableOutBuffer: + CHECK_DBOUNDS(ZSTD_d_stableOutBuffer, value); + dctx->outBufferMode = (ZSTD_bufferMode_e)value; + return 0; + case ZSTD_d_forceIgnoreChecksum: + CHECK_DBOUNDS(ZSTD_d_forceIgnoreChecksum, value); + dctx->forceIgnoreChecksum = (ZSTD_forceIgnoreChecksum_e)value; + return 0; + case ZSTD_d_refMultipleDDicts: + CHECK_DBOUNDS(ZSTD_d_refMultipleDDicts, value); + if (dctx->staticSize != 0) { + RETURN_ERROR(parameter_unsupported, "Static dctx does not support multiple DDicts!"); + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; + case ZSTD_d_disableHuffmanAssembly: + CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); + dctx->disableHufAsm = value != 0; + return 0; + case ZSTD_d_maxBlockSize: + if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value); + dctx->maxBlockSizeParam = value; + return 0; default:; } - RETURN_ERROR(parameter_unsupported); + RETURN_ERROR(parameter_unsupported, ""); } size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) @@ -1425,13 +1951,13 @@ size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) || (reset == ZSTD_reset_session_and_parameters) ) { dctx->streamStage = zdss_init; dctx->noForwardProgress = 0; + dctx->isFrameDecompression = 1; } if ( (reset == ZSTD_reset_parameters) || (reset == ZSTD_reset_session_and_parameters) ) { - RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong); + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); ZSTD_clearDict(dctx); - dctx->format = ZSTD_f_zstd1; - dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; + ZSTD_DCtx_resetParameters(dctx); } 
return 0; } @@ -1442,17 +1968,29 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) return ZSTD_sizeof_DCtx(dctx); } -size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) +static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax) { - size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); - unsigned long long const neededRBSize = windowSize + blockSize + (WILDCOPY_OVERLENGTH * 2); + size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax); + /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block + * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing + * the block at the beginning of the output buffer, and maintain a full window. + * + * We need another blockSize worth of buffer so that we can store split + * literals at the end of the block without overwriting the extDict window. 
+ */ + unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2); unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); size_t const minRBSize = (size_t) neededSize; RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, - frameParameter_windowTooLarge); + frameParameter_windowTooLarge, ""); return minRBSize; } +size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) +{ + return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX); +} + size_t ZSTD_estimateDStreamSize(size_t windowSize) { size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); @@ -1464,37 +2002,102 @@ size_t ZSTD_estimateDStreamSize(size_t windowSize) size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize) { U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; /* note : should be user-selectable, but requires an additional parameter (or a dctx) */ - ZSTD_frameHeader zfh; + ZSTD_FrameHeader zfh; size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize); if (ZSTD_isError(err)) return err; - RETURN_ERROR_IF(err>0, srcSize_wrong); + RETURN_ERROR_IF(err>0, srcSize_wrong, ""); RETURN_ERROR_IF(zfh.windowSize > windowSizeMax, - frameParameter_windowTooLarge); + frameParameter_windowTooLarge, ""); return ZSTD_estimateDStreamSize((size_t)zfh.windowSize); } /* ***** Decompression ***** */ -MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize) +static int ZSTD_DCtx_isOverflow(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize) { - size_t const length = MIN(dstCapacity, srcSize); - memcpy(dst, src, length); - return length; + return (zds->inBuffSize + zds->outBuffSize) >= (neededInBuffSize + neededOutBuffSize) * ZSTD_WORKSPACETOOLARGE_FACTOR; } +static void ZSTD_DCtx_updateOversizedDuration(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize) +{ 
+ if (ZSTD_DCtx_isOverflow(zds, neededInBuffSize, neededOutBuffSize)) + zds->oversizedDuration++; + else + zds->oversizedDuration = 0; +} + +static int ZSTD_DCtx_isOversizedTooLong(ZSTD_DStream* zds) +{ + return zds->oversizedDuration >= ZSTD_WORKSPACETOOLARGE_MAXDURATION; +} + +/* Checks that the output buffer hasn't changed if ZSTD_obm_stable is used. */ +static size_t ZSTD_checkOutBuffer(ZSTD_DStream const* zds, ZSTD_outBuffer const* output) +{ + ZSTD_outBuffer const expect = zds->expectedOutBuffer; + /* No requirement when ZSTD_obm_stable is not enabled. */ + if (zds->outBufferMode != ZSTD_bm_stable) + return 0; + /* Any buffer is allowed in zdss_init, this must be the same for every other call until + * the context is reset. + */ + if (zds->streamStage == zdss_init) + return 0; + /* The buffer must match our expectation exactly. */ + if (expect.dst == output->dst && expect.pos == output->pos && expect.size == output->size) + return 0; + RETURN_ERROR(dstBuffer_wrong, "ZSTD_d_stableOutBuffer enabled but output differs!"); +} + +/* Calls ZSTD_decompressContinue() with the right parameters for ZSTD_decompressStream() + * and updates the stage and the output buffer state. This call is extracted so it can be + * used both when reading directly from the ZSTD_inBuffer, and in buffered input mode. + * NOTE: You must break after calling this function since the streamStage is modified. + */ +static size_t ZSTD_decompressContinueStream( + ZSTD_DStream* zds, char** op, char* oend, + void const* src, size_t srcSize) { + int const isSkipFrame = ZSTD_isSkipFrame(zds); + if (zds->outBufferMode == ZSTD_bm_buffered) { + size_t const dstSize = isSkipFrame ? 
0 : zds->outBuffSize - zds->outStart; + size_t const decodedSize = ZSTD_decompressContinue(zds, + zds->outBuff + zds->outStart, dstSize, src, srcSize); + FORWARD_IF_ERROR(decodedSize, ""); + if (!decodedSize && !isSkipFrame) { + zds->streamStage = zdss_read; + } else { + zds->outEnd = zds->outStart + decodedSize; + zds->streamStage = zdss_flush; + } + } else { + /* Write directly into the output buffer */ + size_t const dstSize = isSkipFrame ? 0 : (size_t)(oend - *op); + size_t const decodedSize = ZSTD_decompressContinue(zds, *op, dstSize, src, srcSize); + FORWARD_IF_ERROR(decodedSize, ""); + *op += decodedSize; + /* Flushing is not needed. */ + zds->streamStage = zdss_read; + assert(*op <= oend); + assert(zds->outBufferMode == ZSTD_bm_stable); + } + return 0; +} size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input) { - const char* const istart = (const char*)(input->src) + input->pos; - const char* const iend = (const char*)(input->src) + input->size; + const char* const src = (const char*)input->src; + const char* const istart = input->pos != 0 ? src + input->pos : src; + const char* const iend = input->size != 0 ? src + input->size : src; const char* ip = istart; - char* const ostart = (char*)(output->dst) + output->pos; - char* const oend = (char*)(output->dst) + output->size; + char* const dst = (char*)output->dst; + char* const ostart = output->pos != 0 ? dst + output->pos : dst; + char* const oend = output->size != 0 ? dst + output->size : dst; char* op = ostart; U32 someMoreWork = 1; DEBUGLOG(5, "ZSTD_decompressStream"); + assert(zds != NULL); RETURN_ERROR_IF( input->pos > input->size, srcSize_wrong, @@ -1506,6 +2109,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB "forbidden. 
out: pos: %u vs size: %u", (U32)output->pos, (U32)output->size); DEBUGLOG(5, "input size : %u", (U32)(input->size - input->pos)); + FORWARD_IF_ERROR(ZSTD_checkOutBuffer(zds, output), ""); while (someMoreWork) { switch(zds->streamStage) @@ -1514,9 +2118,12 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB DEBUGLOG(5, "stage zdss_init => transparent reset "); zds->streamStage = zdss_loadHeader; zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) zds->legacyVersion = 0; +#endif zds->hostageByte = 0; - /* fall-through */ + zds->expectedOutBuffer = *output; + ZSTD_FALLTHROUGH; case zdss_loadHeader : DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip)); @@ -1530,7 +2137,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB } } #endif { size_t const hSize = ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format); - DEBUGLOG(5, "header size : %u", (U32)hSize); + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } if (ZSTD_isError(hSize)) { #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart); @@ -1543,7 +2152,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB "legacy support is incompatible with static dctx"); FORWARD_IF_ERROR(ZSTD_initLegacyStream(&zds->legacyContext, zds->previousLegacyVersion, legacyVersion, - dict, dictSize)); + dict, dictSize), ""); zds->legacyVersion = zds->previousLegacyVersion = legacyVersion; { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, legacyVersion, output, input); if (hint==0) zds->streamStage = zdss_init; /* or stay in stage zdss_loadHeader */ @@ -1558,43 +2167,59 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB assert(iend >= ip); if (toLoad > remainingInput) { /* 
not enough input to load full header */ if (remainingInput > 0) { - memcpy(zds->headerBuffer + zds->lhSize, ip, remainingInput); + ZSTD_memcpy(zds->headerBuffer + zds->lhSize, ip, remainingInput); zds->lhSize += remainingInput; } input->pos = input->size; + /* check first few bytes */ + FORWARD_IF_ERROR( + ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), + "First few bytes detected incorrect" ); + /* return hint input size */ return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ } assert(ip != NULL); - memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); zds->lhSize = hSize; ip += toLoad; + ZSTD_memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); zds->lhSize = hSize; ip += toLoad; break; } } /* check for single-pass mode opportunity */ - if (zds->fParams.frameContentSize && zds->fParams.windowSize /* skippable frame if == 0 */ + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && zds->fParams.frameType != ZSTD_skippableFrame && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { - size_t const cSize = ZSTD_findFrameCompressedSize(istart, iend-istart); + size_t const cSize = ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format); if (cSize <= (size_t)(iend-istart)) { /* shortcut : using single-pass mode */ - size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, oend-op, istart, cSize, ZSTD_getDDict(zds)); + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); if (ZSTD_isError(decompressedSize)) return decompressedSize; - DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") + DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()"); + assert(istart != NULL); ip = istart + cSize; - op += decompressedSize; + op = op ? 
op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ zds->expected = 0; zds->streamStage = zdss_init; someMoreWork = 0; break; } } + /* Check output buffer is large enough for ZSTD_odm_stable. */ + if (zds->outBufferMode == ZSTD_bm_stable + && zds->fParams.frameType != ZSTD_skippableFrame + && zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && (U64)(size_t)(oend-op) < zds->fParams.frameContentSize) { + RETURN_ERROR(dstSize_tooSmall, "ZSTD_obm_stable passed but ZSTD_outBuffer is too small"); + } + /* Consume header (see ZSTDds_decodeFrameHeader) */ DEBUGLOG(4, "Consume header"); - FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds))); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); - if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + if (zds->format == ZSTD_f_zstd1 + && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); zds->stage = ZSTDds_skipFrame; } else { - FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize)); + FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize), ""); zds->expected = ZSTD_blockHeaderSize; zds->stage = ZSTDds_decodeBlockHeader; } @@ -1605,40 +2230,50 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB (U32)(zds->maxWindowSize >> 10) ); zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, - frameParameter_windowTooLarge); + frameParameter_windowTooLarge, ""); + if (zds->maxBlockSizeParam != 0) + zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam); /* Adapt buffer sizes to frame header instructions */ { size_t const neededInBuffSize 
= MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); - size_t const neededOutBuffSize = ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize); - if ((zds->inBuffSize < neededInBuffSize) || (zds->outBuffSize < neededOutBuffSize)) { - size_t const bufferSize = neededInBuffSize + neededOutBuffSize; - DEBUGLOG(4, "inBuff : from %u to %u", - (U32)zds->inBuffSize, (U32)neededInBuffSize); - DEBUGLOG(4, "outBuff : from %u to %u", - (U32)zds->outBuffSize, (U32)neededOutBuffSize); - if (zds->staticSize) { /* static DCtx */ - DEBUGLOG(4, "staticSize : %u", (U32)zds->staticSize); - assert(zds->staticSize >= sizeof(ZSTD_DCtx)); /* controlled at init */ - RETURN_ERROR_IF( - bufferSize > zds->staticSize - sizeof(ZSTD_DCtx), - memory_allocation); - } else { - ZSTD_free(zds->inBuff, zds->customMem); - zds->inBuffSize = 0; - zds->outBuffSize = 0; - zds->inBuff = (char*)ZSTD_malloc(bufferSize, zds->customMem); - RETURN_ERROR_IF(zds->inBuff == NULL, memory_allocation); - } - zds->inBuffSize = neededInBuffSize; - zds->outBuff = zds->inBuff + zds->inBuffSize; - zds->outBuffSize = neededOutBuffSize; - } } + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered + ? 
ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); + + { int const tooSmall = (zds->inBuffSize < neededInBuffSize) || (zds->outBuffSize < neededOutBuffSize); + int const tooLarge = ZSTD_DCtx_isOversizedTooLong(zds); + + if (tooSmall || tooLarge) { + size_t const bufferSize = neededInBuffSize + neededOutBuffSize; + DEBUGLOG(4, "inBuff : from %u to %u", + (U32)zds->inBuffSize, (U32)neededInBuffSize); + DEBUGLOG(4, "outBuff : from %u to %u", + (U32)zds->outBuffSize, (U32)neededOutBuffSize); + if (zds->staticSize) { /* static DCtx */ + DEBUGLOG(4, "staticSize : %u", (U32)zds->staticSize); + assert(zds->staticSize >= sizeof(ZSTD_DCtx)); /* controlled at init */ + RETURN_ERROR_IF( + bufferSize > zds->staticSize - sizeof(ZSTD_DCtx), + memory_allocation, ""); + } else { + ZSTD_customFree(zds->inBuff, zds->customMem); + zds->inBuffSize = 0; + zds->outBuffSize = 0; + zds->inBuff = (char*)ZSTD_customMalloc(bufferSize, zds->customMem); + RETURN_ERROR_IF(zds->inBuff == NULL, memory_allocation, ""); + } + zds->inBuffSize = neededInBuffSize; + zds->outBuff = zds->inBuff + zds->inBuffSize; + zds->outBuffSize = neededOutBuffSize; + } } } zds->streamStage = zdss_read; - /* fall-through */ + ZSTD_FALLTHROUGH; case zdss_read: DEBUGLOG(5, "stage zdss_read"); - { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds); + { size_t const neededInSize = ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip)); DEBUGLOG(5, "neededInSize = %u", (U32)neededInSize); if (neededInSize==0) { /* end of frame */ zds->streamStage = zdss_init; @@ -1646,59 +2281,56 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB break; } if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ - int const isSkipFrame = ZSTD_isSkipFrame(zds); - size_t const decodedSize = ZSTD_decompressContinue(zds, - 
zds->outBuff + zds->outStart, (isSkipFrame ? 0 : zds->outBuffSize - zds->outStart), - ip, neededInSize); - if (ZSTD_isError(decodedSize)) return decodedSize; + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); + assert(ip != NULL); ip += neededInSize; - if (!decodedSize && !isSkipFrame) break; /* this was just a header */ - zds->outEnd = zds->outStart + decodedSize; - zds->streamStage = zdss_flush; + /* Function modifies the stage so we must break */ break; } } if (ip==iend) { someMoreWork = 0; break; } /* no more input */ zds->streamStage = zdss_load; - /* fall-through */ + ZSTD_FALLTHROUGH; case zdss_load: { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds); size_t const toLoad = neededInSize - zds->inPos; int const isSkipFrame = ZSTD_isSkipFrame(zds); size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. */ + assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); if (isSkipFrame) { loadedSize = MIN(toLoad, (size_t)(iend-ip)); } else { RETURN_ERROR_IF(toLoad > zds->inBuffSize - zds->inPos, corruption_detected, "should never happen"); - loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, iend-ip); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } + if (loadedSize != 0) { + /* ip may be NULL */ + ip += loadedSize; + zds->inPos += loadedSize; } - ip += loadedSize; - zds->inPos += loadedSize; if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ /* decode loaded input */ - { size_t const decodedSize = ZSTD_decompressContinue(zds, - zds->outBuff + zds->outStart, zds->outBuffSize - zds->outStart, - zds->inBuff, neededInSize); - if (ZSTD_isError(decodedSize)) return decodedSize; - zds->inPos = 0; /* input is consumed */ - if (!decodedSize && !isSkipFrame) { zds->streamStage = zdss_read; break; } /* this was just a header */ - zds->outEnd = 
zds->outStart + decodedSize; - } } - zds->streamStage = zdss_flush; - /* fall-through */ - + zds->inPos = 0; /* input is consumed */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, zds->inBuff, neededInSize), ""); + /* Function modifies the stage so we must break */ + break; + } case zdss_flush: - { size_t const toFlushSize = zds->outEnd - zds->outStart; - size_t const flushedSize = ZSTD_limitCopy(op, oend-op, zds->outBuff + zds->outStart, toFlushSize); - op += flushedSize; + { + size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); + + op = op ? op + flushedSize : op; + zds->outStart += flushedSize; if (flushedSize == toFlushSize) { /* flush completed */ zds->streamStage = zdss_read; if ( (zds->outBuffSize < zds->fParams.frameContentSize) - && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", (int)(zds->outBuffSize - zds->outStart), (U32)zds->fParams.blockSizeMax); @@ -1712,17 +2344,21 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB default: assert(0); /* impossible */ - RETURN_ERROR(GENERIC); /* some compiler require default to do something */ + RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ } } /* result */ input->pos = (size_t)(ip - (const char*)(input->src)); output->pos = (size_t)(op - (char*)(output->dst)); + + /* Update the expected output buffer for ZSTD_obm_stable. 
*/ + zds->expectedOutBuffer = *output; + if ((ip==istart) && (op==ostart)) { /* no forward progress */ zds->noForwardProgress ++; if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { - RETURN_ERROR_IF(op==oend, dstSize_tooSmall); - RETURN_ERROR_IF(ip==iend, srcSize_wrong); + RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); + RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); assert(0); } } else { @@ -1759,11 +2395,19 @@ size_t ZSTD_decompressStream_simpleArgs ( void* dst, size_t dstCapacity, size_t* dstPos, const void* src, size_t srcSize, size_t* srcPos) { - ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; - ZSTD_inBuffer input = { src, srcSize, *srcPos }; - /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ - size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); - *dstPos = output.pos; - *srcPos = input.pos; - return cErr; + ZSTD_outBuffer output; + ZSTD_inBuffer input; + output.dst = dst; + output.size = dstCapacity; + output.pos = *dstPos; + input.src = src; + input.size = srcSize; + input.pos = *srcPos; + { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); + *dstPos = output.pos; + *srcPos = input.pos; + return cErr; + } } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_decompress_block.c b/vendor/github.com/DataDog/zstd/zstd_decompress_block.c index 767e5f9..c454eb7 100644 --- a/vendor/github.com/DataDog/zstd/zstd_decompress_block.c +++ b/vendor/github.com/DataDog/zstd/zstd_decompress_block.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -14,18 +15,18 @@ /*-******************************************************* * Dependencies *********************************************************/ -#include /* memcpy, memmove, memset */ +#include "zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ #include "compiler.h" /* prefetch */ #include "cpu.h" /* bmi2 */ #include "mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "fse.h" -#define HUF_STATIC_LINKING_ONLY #include "huf.h" #include "zstd_internal.h" #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ #include "zstd_decompress_block.h" +#include "bits.h" /* ZSTD_highbit32 */ /*_******************************************************* * Macros @@ -44,19 +45,26 @@ /*_******************************************************* * Memory operations **********************************************************/ -static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); } +static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } /*-************************************************************* * Block decoding ***************************************************************/ +static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) +{ + size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; + assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); + return blockSizeMax; +} + /*! 
ZSTD_getcBlockSize() : * Provides the size of compressed block from block header `src` */ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr) { - RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong); + RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, ""); { U32 const cBlockHeader = MEM_readLE24(src); U32 const cSize = cBlockHeader >> 3; @@ -64,41 +72,95 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3); bpPtr->origSize = cSize; /* only useful for RLE */ if (bpPtr->blockType == bt_rle) return 1; - RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected); + RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, ""); return cSize; } } +/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */ +static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) +{ + size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); + assert(litSize <= blockSizeMax); + assert(dctx->isFrameDecompression || streaming == not_streaming); + assert(expectedWriteSize <= blockSizeMax); + if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { + /* If we aren't streaming, we can just put the literals after the output + * of the current block. We don't need to worry about overwriting the + * extDict of our window, because it doesn't exist. + * So if we have space after the end of the block, just put it there. 
+ */ + dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_in_dst; + } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { + /* Literals fit entirely within the extra buffer, put them there to avoid + * having to split the literals. + */ + dctx->litBuffer = dctx->litExtraBuffer; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_not_in_dst; + } else { + assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); + /* Literals must be split between the output block and the extra lit + * buffer. We fill the extra lit buffer with the tail of the literals, + * and put the rest of the literals at the end of the block, with + * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. + * This MUST not write more than our maxBlockSize beyond dst, because in + * streaming mode, that could overwrite part of our extDict window. + */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; + } else { + /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; + assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); + } +} -/* Hidden declaration for fullbench */ -size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - const void* src, size_t srcSize); /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. 
If there is room to do so, it will be stored in full in the excess dst space after where the current + * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being + * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write. + * * @return : nb of bytes read from src (< srcSize ) * note : symbol not declared but exposed for fullbench */ -size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */ +static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) { DEBUGLOG(5, "ZSTD_decodeLiteralsBlock"); - RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected); + RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); { const BYTE* const istart = (const BYTE*) src; - symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); + SymbolEncodingType_e const litEncType = (SymbolEncodingType_e)(istart[0] & 3); + size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); switch(litEncType) { case set_repeat: DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block"); - RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted); - /* fall-through */ + RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, ""); + ZSTD_FALLTHROUGH; case set_compressed: - RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); + RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); { size_t lhSize, litSize, litCSize; U32 singleStream=0; U32 const lhlCode = (istart[0] >> 2) & 3; U32 const lhc = MEM_readLE32(istart); size_t hufSuccess; + size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + int const flags = 0 + | 
(ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) + | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0); switch(lhlCode) { case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ @@ -121,8 +183,15 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, litCSize = (lhc >> 22) + ((size_t)istart[4] << 10); break; } - RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected); - RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected); + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + if (!singleStream) + RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, + "Not enough literals (%zu) for the 4-streams mode (min %u)", + litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); /* prefetch huffman table if cold */ if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { @@ -131,13 +200,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, if (litEncType==set_repeat) { if (singleStream) { - hufSuccess = HUF_decompress1X_usingDTable_bmi2( + hufSuccess = HUF_decompress1X_usingDTable( dctx->litBuffer, litSize, istart+lhSize, litCSize, - dctx->HUFptr, dctx->bmi2); + dctx->HUFptr, flags); } else { - hufSuccess = HUF_decompress4X_usingDTable_bmi2( + assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); + hufSuccess = HUF_decompress4X_usingDTable( dctx->litBuffer, litSize, istart+lhSize, litCSize, - dctx->HUFptr, dctx->bmi2); + dctx->HUFptr, flags); } } else { if (singleStream) { @@ -145,34 +215,43 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, hufSuccess = HUF_decompress1X_DCtx_wksp( dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->workspace, - 
sizeof(dctx->workspace)); + sizeof(dctx->workspace), flags); #else - hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( + hufSuccess = HUF_decompress1X1_DCtx_wksp( dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->workspace, - sizeof(dctx->workspace), dctx->bmi2); + sizeof(dctx->workspace), flags); #endif } else { - hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( + hufSuccess = HUF_decompress4X_hufOnly_wksp( dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->workspace, - sizeof(dctx->workspace), dctx->bmi2); + sizeof(dctx->workspace), flags); } } + if (dctx->litBufferLocation == ZSTD_split) + { + assert(litSize > ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; + assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); + } - RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected); + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; dctx->litEntropy = 1; if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable; - memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); return litCSize + lhSize; } case set_basic: { size_t litSize, lhSize; U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); switch(lhlCode) { case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ @@ -185,27 +264,42 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, break; case 3: lhSize = 3; + RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); litSize = MEM_readLE24(istart) >> 4; 
break; } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ - RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected); - memcpy(dctx->litBuffer, istart+lhSize, litSize); + RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, ""); + if (dctx->litBufferLocation == ZSTD_split) + { + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + } + else + { + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize); + } dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; - memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); return lhSize+litSize; } /* direct reference into compressed stream */ dctx->litPtr = istart+lhSize; dctx->litSize = litSize; + dctx->litBufferEnd = dctx->litPtr + litSize; + dctx->litBufferLocation = ZSTD_not_in_dst; return lhSize+litSize; } case set_rle: { U32 const lhlCode = ((istart[0]) >> 2) & 3; size_t litSize, lhSize; + size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); switch(lhlCode) { case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ @@ -214,16 +308,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, break; case 1: lhSize = 2; + RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); litSize = MEM_readLE16(istart) >> 4; break; case 3: lhSize = 3; + RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); litSize = MEM_readLE24(istart) >> 4; - 
RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); break; } - RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected); - memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH); + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (dctx->litBufferLocation == ZSTD_split) + { + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE); + } + else + { + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize); + } dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; return lhSize+1; @@ -234,14 +340,26 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, } } +/* Hidden declaration for fullbench */ +size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, + void* dst, size_t dstCapacity); +size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, + void* dst, size_t dstCapacity) +{ + dctx->isFrameDecompression = 0; + return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); +} + /* Default FSE distribution tables. 
* These are pre-calculated FSE decoding tables using default distributions as defined in specification : - * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions * They were generated programmatically with following method : * - start from default distributions, present in /lib/common/zstd_internal.h * - generate tables normally, using ZSTD_buildFSETable() * - printout the content of tables - * - pretify output, report below, test with fuzzer to ensure it's correct */ + * - prettify output, report below, test with fuzzer to ensure it's correct */ /* Default FSE distribution table for Literal Lengths */ static const ZSTD_seqSymbol LL_defaultDTable[(1<nbBits = 0; cell->nextState = 0; assert(nbAddBits < 255); - cell->nbAdditionalBits = (BYTE)nbAddBits; + cell->nbAdditionalBits = nbAddBits; cell->baseValue = baseValue; } @@ -364,23 +482,26 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB * generate FSE decoding table for one symbol (ll, ml or off) * cannot fail if input is valid => * all inputs are presumed validated at this stage */ -void -ZSTD_buildFSETable(ZSTD_seqSymbol* dt, +FORCE_INLINE_TEMPLATE +void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, - unsigned tableLog) + const U32* baseValue, const U8* nbAdditionalBits, + unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_seqSymbol* const tableDecode = dt+1; - U16 symbolNext[MaxSeq+1]; - U32 const maxSV1 = maxSymbolValue + 1; U32 const tableSize = 1 << tableLog; - U32 highThreshold = tableSize-1; + + U16* symbolNext = (U16*)wksp; + BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1); + U32 highThreshold = tableSize - 1; + /* Sanity Checks */ assert(maxSymbolValue <= MaxSeq); assert(tableLog <= MaxFSELog); - + assert(wkspSize >= 
ZSTD_BUILD_FSE_TABLE_WKSP_SIZE); + (void)wkspSize; /* Init, lay down lowprob symbols */ { ZSTD_seqSymbol_header DTableH; DTableH.tableLog = tableLog; @@ -396,34 +517,128 @@ ZSTD_buildFSETable(ZSTD_seqSymbol* dt, assert(normalizedCounter[s]>=0); symbolNext[s] = (U16)normalizedCounter[s]; } } } - memcpy(dt, &DTableH, sizeof(DTableH)); + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); } /* Spread symbols */ - { U32 const tableMask = tableSize-1; + assert(tableSize <= 512); + /* Specialized symbol spreading for the case when there are + * no low probability (-1 count) symbols. When compressing + * small blocks we avoid low probability symbols to hit this + * case, since header decoding speed matters more. + */ + if (highThreshold == tableSize - 1) { + size_t const tableMask = tableSize-1; + size_t const step = FSE_TABLESTEP(tableSize); + /* First lay down the symbols in order. + * We use a uint64_t to lay down 8 bytes at a time. This reduces branch + * misses since small blocks generally have small table logs, so nearly + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ + { + U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; + for (s=0; s=0); + pos += (size_t)n; + } + } + /* Now we spread those positions across the table. + * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. + * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; + size_t s; + size_t const unroll = 2; + assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ + for (s = 0; s < (size_t)tableSize; s += unroll) { + size_t u; + for (u = 0; u < unroll; ++u) { + size_t const uPosition = (position + (u * step)) & tableMask; + tableDecode[uPosition].baseValue = spread[s + u]; + } + position = (position + (unroll * step)) & tableMask; + } + assert(position == 0); + } + } else { + U32 const tableMask = tableSize-1; U32 const step = FSE_TABLESTEP(tableSize); U32 s, position = 0; for (s=0; s highThreshold) position = (position + step) & tableMask; /* lowprob area */ + while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ } } assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ } /* Build Decoding table */ - { U32 u; + { + U32 u; for (u=0; u max, corruption_detected); + RETURN_ERROR_IF(!srcSize, srcSize_wrong, ""); + RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, ""); { U32 const symbol = *(const BYTE*)src; U32 const baseline = baseValue[symbol]; - U32 const nbBits = nbAdditionalBits[symbol]; + U8 const nbBits = nbAdditionalBits[symbol]; ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits); } *DTablePtr = DTableSpace; @@ -453,7 +669,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb *DTablePtr = defaultTable; return 0; case set_repeat: - RETURN_ERROR_IF(!flagRepeatTable, corruption_detected); + RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, ""); /* prefetch FSE table if used */ if (ddictIsCold && (nbSeq > 24 /* heuristic */)) { const void* const pStart = *DTablePtr; @@ -465,9 +681,9 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb { unsigned tableLog; S16 norm[MaxSeq+1]; size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize); - RETURN_ERROR_IF(FSE_isError(headerSize), 
corruption_detected); - RETURN_ERROR_IF(tableLog > maxLog, corruption_detected); - ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog); + RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, ""); + RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, ""); + ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2); *DTablePtr = DTableSpace; return headerSize; } @@ -480,38 +696,42 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, const void* src, size_t srcSize) { - const BYTE* const istart = (const BYTE* const)src; + const BYTE* const istart = (const BYTE*)src; const BYTE* const iend = istart + srcSize; const BYTE* ip = istart; int nbSeq; DEBUGLOG(5, "ZSTD_decodeSeqHeaders"); /* check */ - RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong); + RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, ""); /* SeqHead */ nbSeq = *ip++; - if (!nbSeq) { - *nbSeqPtr=0; - RETURN_ERROR_IF(srcSize != 1, srcSize_wrong); - return 1; - } if (nbSeq > 0x7F) { if (nbSeq == 0xFF) { - RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong); - nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2; + RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); + nbSeq = MEM_readLE16(ip) + LONGNBSEQ; + ip+=2; } else { - RETURN_ERROR_IF(ip >= iend, srcSize_wrong); + RETURN_ERROR_IF(ip >= iend, srcSize_wrong, ""); nbSeq = ((nbSeq-0x80)<<8) + *ip++; } } *nbSeqPtr = nbSeq; + if (nbSeq == 0) { + /* No sequence : section ends immediately */ + RETURN_ERROR_IF(ip != iend, corruption_detected, + "extraneous data present in the Sequences section"); + return (size_t)(ip - istart); + } + /* FSE table descriptors */ - RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong); /* minimum possible size: 1 byte for symbol encoding types */ - { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); - symbolEncodingType_e const OFtype = 
(symbolEncodingType_e)((*ip >> 4) & 3); - symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); + RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ + RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ + { SymbolEncodingType_e const LLtype = (SymbolEncodingType_e)(*ip >> 6); + SymbolEncodingType_e const OFtype = (SymbolEncodingType_e)((*ip >> 4) & 3); + SymbolEncodingType_e const MLtype = (SymbolEncodingType_e)((*ip >> 2) & 3); ip++; /* Build DTables */ @@ -520,8 +740,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, ip, iend-ip, LL_base, LL_bits, LL_defaultDTable, dctx->fseEntropy, - dctx->ddictIsCold, nbSeq); - RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected); + dctx->ddictIsCold, nbSeq, + dctx->workspace, sizeof(dctx->workspace), + ZSTD_DCtx_get_bmi2(dctx)); + RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += llhSize; } @@ -530,8 +752,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, ip, iend-ip, OF_base, OF_bits, OF_defaultDTable, dctx->fseEntropy, - dctx->ddictIsCold, nbSeq); - RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected); + dctx->ddictIsCold, nbSeq, + dctx->workspace, sizeof(dctx->workspace), + ZSTD_DCtx_get_bmi2(dctx)); + RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += ofhSize; } @@ -540,8 +764,10 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, ip, iend-ip, ML_base, ML_bits, ML_defaultDTable, dctx->fseEntropy, - dctx->ddictIsCold, nbSeq); - RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected); + dctx->ddictIsCold, nbSeq, + dctx->workspace, sizeof(dctx->workspace), + ZSTD_DCtx_get_bmi2(dctx)); + RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += mlhSize; } } @@ -554,7 +780,6 @@ typedef struct { size_t 
litLength; size_t matchLength; size_t offset; - const BYTE* match; } seq_t; typedef struct { @@ -568,9 +793,6 @@ typedef struct { ZSTD_fseState stateOffb; ZSTD_fseState stateML; size_t prevOffset[ZSTD_REP_NUM]; - const BYTE* prefixStart; - const BYTE* dictEnd; - size_t pos; } seqState_t; /*! ZSTD_overlapCopy8() : @@ -580,7 +802,7 @@ typedef struct { * Precondition: *ip <= *op * Postcondition: *op - *op >= 8 */ -static void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) { +HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) { assert(*ip <= *op); if (offset < 8) { /* close range match, overlap */ @@ -613,7 +835,7 @@ static void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) { * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart. * The src buffer must be before the dst buffer. */ -static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { +static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { ptrdiff_t const diff = op - ip; BYTE* const oend = op + length; @@ -629,6 +851,7 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_ /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */ assert(length >= 8); ZSTD_overlapCopy8(&op, &ip, diff); + length -= 8; assert(op - ip >= 8); assert(op <= oend); } @@ -643,12 +866,35 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_ assert(oend > oend_w); ZSTD_wildcopy(op, ip, oend_w - op, ovtype); ip += oend_w - op; - op = oend_w; + op += oend_w - op; } /* Handle the leftovers. 
*/ while (op < oend) *op++ = *ip++; } +/* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + + if (length < 8 || diff > -8) { + /* Handle short lengths, close overlaps, and dst not before src. */ + while (op < oend) *op++ = *ip++; + return; + } + + if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) { + ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap); + ip += oend - WILDCOPY_OVERLENGTH - op; + op += oend - WILDCOPY_OVERLENGTH - op; + } + + /* Handle the leftovers. */ + while (op < oend) *op++ = *ip++; +} + /* ZSTD_execSequenceEnd(): * This version handles cases that are near the end of the output buffer. It requires * more careful checks to make sure there is no overflow. By separating out these hard @@ -658,22 +904,23 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_ * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). 
*/ FORCE_NOINLINE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_execSequenceEnd(BYTE* op, - BYTE* const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, - const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) { BYTE* const oLitEnd = op + sequence.litLength; size_t const sequenceLength = sequence.litLength + sequence.matchLength; - BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ const BYTE* const iLitEnd = *litPtr + sequence.litLength; const BYTE* match = oLitEnd - sequence.offset; BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; - /* bounds checks */ - assert(oLitEnd < oMatchEnd); - RETURN_ERROR_IF(oMatchEnd > oend, dstSize_tooSmall, "last match must fit within dstBuffer"); - RETURN_ERROR_IF(iLitEnd > litLimit, corruption_detected, "try to read beyond literal buffer"); + /* bounds checks : careful of address space overflow in 32-bit mode */ + RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); + RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); + assert(op < op + sequenceLength); + assert(oLitEnd < op + sequenceLength); /* copy literals */ ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap); @@ -683,42 +930,109 @@ size_t ZSTD_execSequenceEnd(BYTE* op, /* copy Match */ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix */ - RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected); - match = dictEnd - (prefixStart-match); + RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); + match = dictEnd - (prefixStart - match); if (match + 
sequence.matchLength <= dictEnd) { - memmove(oLitEnd, match, sequence.matchLength); + ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; } /* span extDict & currentPrefixSegment */ { size_t const length1 = dictEnd - match; - memmove(oLitEnd, match, length1); - op = oLitEnd + length1; - sequence.matchLength -= length1; - match = prefixStart; - } } + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } + ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); + return sequenceLength; +} + +/* ZSTD_execSequenceEndSplitLitBuffer(): + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. + */ +FORCE_NOINLINE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + + /* bounds checks : careful of address space overflow in 32-bit mode */ + RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); + RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); + assert(op < op + sequenceLength); + assert(oLitEnd < op + sequenceLength); + + /* copy literals */ + RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer"); + ZSTD_safecopyDstBeforeSrc(op, *litPtr, 
sequence.litLength); + op = oLitEnd; + *litPtr = iLitEnd; + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix */ + RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); + match = dictEnd - (prefixStart - match); + if (match + sequence.matchLength <= dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); return sequenceLength; } HINT_INLINE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_execSequence(BYTE* op, - BYTE* const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, - const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) { BYTE* const oLitEnd = op + sequence.litLength; size_t const sequenceLength = sequence.litLength + sequence.matchLength; BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ - BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; + BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */ const BYTE* const iLitEnd = *litPtr + sequence.litLength; const BYTE* match = oLitEnd - sequence.offset; - /* Errors and uncommon cases handled here. 
*/ - assert(oLitEnd < oMatchEnd); - if (iLitEnd > litLimit || oMatchEnd > oend_w) + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); + +#if defined(__aarch64__) + /* prefetch sequence starting from match that will be used for copy later */ + PREFETCH_L1(match); +#endif + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend + * - 32-bit mode and the match length overflows + */ + if (UNLIKELY( + iLitEnd > litLimit || + oMatchEnd > oend_w || + (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ + assert(op <= oLitEnd /* No overflow */); + assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); + assert(oMatchEnd <= oend /* No underflow */); assert(iLitEnd <= litLimit /* Literal length is in bounds */); assert(oLitEnd <= oend_w /* Can wildcopy literals */); assert(oMatchEnd <= oend_w /* Can wildcopy matches */); @@ -729,7 +1043,100 @@ size_t ZSTD_execSequence(BYTE* op, */ assert(WILDCOPY_OVERLENGTH >= 16); ZSTD_copy16(op, (*litPtr)); - if (sequence.litLength > 16) { + if (UNLIKELY(sequence.litLength > 16)) { + ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap); + } + op = oLitEnd; + *litPtr = iLitEnd; /* update for next sequence */ + + /* Copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix -> go into extDict */ + RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); + match = dictEnd + (match - prefixStart); + if (match + sequence.matchLength <= dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + 
ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } + /* Match within prefix of 1 or more bytes */ + assert(op <= oMatchEnd); + assert(oMatchEnd <= oend_w); + assert(match >= prefixStart); + assert(sequence.matchLength >= 1); + + /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy + * without overlap checking. + */ + if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { + /* We bet on a full wildcopy for matches, since we expect matches to be + * longer than literals (in general). In silesia, ~10% of matches are longer + * than 16 bytes. + */ + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); + return sequenceLength; + } + assert(sequence.offset < WILDCOPY_VECLEN); + + /* Copy 8 bytes and spread the offset to be >= 8. */ + ZSTD_overlapCopy8(&op, &match, sequence.offset); + + /* If the match length is > 8 bytes, then continue with the wildcopy. */ + if (sequence.matchLength > 8) { + assert(op < oMatchEnd); + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst); + } + return sequenceLength; +} + +HINT_INLINE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within 
WILDCOPY_OVERLIMIT of oend + * - 32-bit mode and the match length overflows + */ + if (UNLIKELY( + iLitEnd > litLimit || + oMatchEnd > oend_w || + (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) + return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); + + /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ + assert(op <= oLitEnd /* No overflow */); + assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); + assert(oMatchEnd <= oend /* No underflow */); + assert(iLitEnd <= litLimit /* Literal length is in bounds */); + assert(oLitEnd <= oend_w /* Can wildcopy literals */); + assert(oMatchEnd <= oend_w /* Can wildcopy matches */); + + /* Copy Literals: + * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. + * We likely don't need the full 32-byte wildcopy. + */ + assert(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(op, (*litPtr)); + if (UNLIKELY(sequence.litLength > 16)) { ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap); } op = oLitEnd; @@ -738,15 +1145,15 @@ size_t ZSTD_execSequence(BYTE* op, /* Copy Match */ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix -> go into extDict */ - RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected); + RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); match = dictEnd + (match - prefixStart); if (match + sequence.matchLength <= dictEnd) { - memmove(oLitEnd, match, sequence.matchLength); + ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; } /* span extDict & currentPrefixSegment */ { size_t const length1 = dictEnd - match; - memmove(oLitEnd, match, length1); + ZSTD_memmove(oLitEnd, match, length1); op = oLitEnd + length1; sequence.matchLength -= length1; match = prefixStart; @@ -760,7 +1167,7 @@ size_t 
ZSTD_execSequence(BYTE* op, /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy * without overlap checking. */ - if (sequence.offset >= WILDCOPY_VECLEN) { + if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { /* We bet on a full wildcopy for matches, since we expect matches to be * longer than literals (in general). In silesia, ~10% of matches are longer * than 16 bytes. @@ -781,6 +1188,7 @@ size_t ZSTD_execSequence(BYTE* op, return sequenceLength; } + static void ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt) { @@ -794,16 +1202,14 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqS } FORCE_INLINE_TEMPLATE void -ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD) +ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits) { - ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state]; - U32 const nbBits = DInfo.nbBits; size_t const lowBits = BIT_readBits(bitD, nbBits); - DStatePtr->state = DInfo.nextState + lowBits; + DStatePtr->state = nextState + lowBits; } /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum - * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) + * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 * bits before reloading. This value is the maximum number of bytes we read * after reloading when we are decoding long offsets. 
*/ @@ -814,145 +1220,474 @@ ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD) typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; -#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG +/** + * ZSTD_decodeSequence(): + * @p longOffsets : tells the decoder to reload more bit while decoding large offsets + * only used in 32-bit mode + * @return : Sequence (litL + matchL + offset) + */ FORCE_INLINE_TEMPLATE seq_t -ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) +ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) { seq_t seq; - U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits; - U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits; - U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits; - U32 const totalBits = llBits+mlBits+ofBits; - U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue; - U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue; - U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue; - - /* sequence */ - { size_t offset; - if (!ofBits) - offset = 0; - else { - ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); - ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); - assert(ofBits <= MaxOff); - if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { - U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); - offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); - BIT_reloadDStream(&seqState->DStream); - if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); - assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ + /* + * ZSTD_seqSymbol is a 64 bits wide structure. 
+ * It can be loaded in one operation + * and its fields extracted by simply shifting or bit-extracting on aarch64. + * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh + * operations that cause performance drop. This can be avoided by using this + * ZSTD_memcpy hack. + */ +#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) + ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; + ZSTD_seqSymbol* const llDInfo = &llDInfoS; + ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; + ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; + ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); + ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); + ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); +#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; +#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; + BYTE const llBits = llDInfo->nbAdditionalBits; + BYTE const mlBits = mlDInfo->nbAdditionalBits; + BYTE const ofBits = ofDInfo->nbAdditionalBits; + BYTE const totalBits = llBits+mlBits+ofBits; + + U16 const llNext = llDInfo->nextState; + U16 const mlNext = mlDInfo->nextState; + U16 const ofNext = ofDInfo->nextState; + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; + + assert(llBits <= MaxLLBits); + assert(mlBits <= MaxMLBits); + assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only + * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. 
+ */ + + /* sequence */ + { size_t offset; + if (ofBits > 1) { + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); + ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); + ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { + /* Always read extra bits, this keeps the logic simple, + * avoids branches, and avoids accidentally reading 0 bits. + */ + U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); + offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); + } + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; } else { - offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); - } + U32 const ll0 = (llDInfo->baseValue == 0); + if (LIKELY((ofBits == 0))) { + offset = seqState->prevOffset[ll0]; + seqState->prevOffset[1] = seqState->prevOffset[!ll0]; + seqState->prevOffset[0] = offset; + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } } } + seq.offset = offset; } - if (ofBits <= 1) { - offset += (llBase==0); - if (offset) { - size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; - temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ - if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset = temp; - } else { /* offset == 0 */ - offset = seqState->prevOffset[0]; - } - } else { - seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset; + if (mlBits > 0) + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) + BIT_reloadDStream(&seqState->DStream); + if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + /* Ensure there are enough bits to read the rest of data in 64-bit mode. 
*/ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + + if (llBits > 0) + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) + BIT_reloadDStream(&seqState->DStream); + + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + + if (!isLastSeq) { + /* don't update FSE state for last Sequence */ + ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ + BIT_reloadDStream(&seqState->DStream); } - seq.offset = offset; } - seq.matchLength = mlBase - + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/) : 0); /* <= 16 bits */ - if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) - BIT_reloadDStream(&seqState->DStream); - if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) - BIT_reloadDStream(&seqState->DStream); - /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ - ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); - - seq.litLength = llBase - + ((llBits>0) ? 
BIT_readBitsFast(&seqState->DStream, llBits/*>0*/) : 0); /* <= 16 bits */ - if (MEM_32bits()) - BIT_reloadDStream(&seqState->DStream); - - DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", - (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + return seq; +} - /* ANS state update */ - ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ - ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +#if DEBUGLEVEL >= 1 +static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) +{ + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ + if (dctx->dictContentEndForFuzzing == NULL) return 0; + /* Dictionary is our prefix. */ + if (prefixStart == dctx->dictContentBeginForFuzzing) return 1; + /* Dictionary is not our ext-dict. */ + if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0; + /* Dictionary is not within our window size. */ + if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0; + /* Dictionary is active. 
*/ + return 1; +} +#endif - return seq; +static void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) +{ +#if DEBUGLEVEL >= 1 + if (dctx->isFrameDecompression) { + size_t const windowSize = dctx->fParams.windowSize; + size_t const sequenceSize = seq.litLength + seq.matchLength; + BYTE const* const oLitEnd = op + seq.litLength; + DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + assert(op <= oend); + assert((size_t)(oend - op) >= sequenceSize); + assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); + if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { + size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); + /* Offset must be within the dictionary. */ + assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); + assert(seq.offset <= windowSize + dictSize); + } else { + /* Offset must be within our window. 
*/ + assert(seq.offset <= windowSize); + } + } +#else + (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; +#endif } +#endif + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + FORCE_INLINE_TEMPLATE size_t DONT_VECTORIZE -ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, +ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset) { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; - BYTE* const oend = ostart + maxDstSize; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; - const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* litBufferEnd = dctx->litBufferEnd; const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); - DEBUGLOG(5, "ZSTD_decompressSequences_body"); + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); - /* Regen sequences */ + /* Literals are split between internal buffer & output buffer */ if (nbSeq) { seqState_t seqState; dctx->fseEntropy = 1; { U32 i; for (i=0; ientropy.rep[i]; } RETURN_ERROR_IF( ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)), - corruption_detected); + corruption_detected, ""); ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); ZSTD_STATIC_ASSERT( BIT_DStream_unfinished < BIT_DStream_completed && BIT_DStream_endOfBuffer < BIT_DStream_completed && BIT_DStream_completed < BIT_DStream_overflow); - for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= 
BIT_DStream_completed) && nbSeq ; ) { - nbSeq--; - { seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); - size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); + /* decompress without overrunning litPtr begins */ + { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression + * speed swings based on the alignment of the decompression loop. This + * performance swing is caused by parts of the decompression loop falling + * out of the DSB. The entire decompression loop should fit in the DSB, + * when it can't we get much worse performance. You can measure if you've + * hit the good case or the bad case with this perf command for some + * compressed file test.zst: + * + * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ + * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst + * + * If you see most cycles served out of the MITE you've hit the bad case. + * If you see most cycles served out of the DSB you've hit the good case. + * If it is pretty even then you may be in an okay case. + * + * This issue has been reproduced on the following CPUs: + * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 + * Use Instruments->Counters to get DSB/MITE cycles. + * I never got performance swings, but I was able to + * go from the good case of mostly DSB to half of the + * cycles served from MITE. 
+ * - Coffeelake: Intel i9-9900k + * - Coffeelake: Intel i7-9700k + * + * I haven't been able to reproduce the instability or DSB misses on any + * of the following CPUS: + * - Haswell + * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH + * - Skylake + * + * Alignment is done for each of the three major decompression loops: + * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer + * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer + * - ZSTD_decompressSequences_body + * Alignment choices are made to minimize large swings on bad cases and influence on performance + * from changes external to this code, rather than to overoptimize on the current commit. + * + * If you are seeing performance stability this script can help test. + * It tests on 4 commits in zstd where I saw performance change. + * + * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 + */ +#if defined(__GNUC__) && defined(__x86_64__) + __asm__(".p2align 6"); +# if __GNUC__ >= 7 + /* good for gcc-7, gcc-9, and gcc-11 */ + __asm__("nop"); + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 4"); +# if __GNUC__ == 8 || __GNUC__ == 10 + /* good for gcc-8 and gcc-10 */ + __asm__("nop"); + __asm__(".p2align 3"); +# endif +# endif +#endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ + for ( ; nbSeq; nbSeq--) { + sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + if (litPtr + sequence.litLength > dctx->litBufferEnd) break; + { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if 
(UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + } } + DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); + + /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); + if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + } + nbSeq--; + } + } + + if (nbSeq > 0) { + /* there is remaining lit from extra buffer */ + +#if defined(__GNUC__) && defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +# if __GNUC__ != 7 + /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */ + __asm__(".p2align 4"); + __asm__("nop"); + __asm__(".p2align 3"); +# elif __GNUC__ >= 11 + __asm__(".p2align 3"); +# else + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 3"); +# endif +#endif + + for ( ; nbSeq ; nbSeq--) { + seq_t const sequence 
= ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; op += oneSeqSize; - } } + } + } /* check if reached exact end */ - DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); - RETURN_ERROR_IF(nbSeq, corruption_detected); - RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected); + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); + DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); + RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); /* save reps for next block */ { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } } /* last literal segment */ - { size_t const lastLLSize = litEnd - litPtr; - RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall); - memcpy(op, litPtr, lastLLSize); - op += lastLLSize; + if (dctx->litBufferLocation == ZSTD_split) { + /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ + size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); + DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } + litPtr = 
dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; } + /* copy last literals from internal buffer */ + { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); + DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } } - return op-ostart; + DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); + return (size_t)(op - ostart); +} + +FORCE_INLINE_TEMPLATE size_t +DONT_VECTORIZE +ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? 
ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); + DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + + /* Regen sequences */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; + { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } + RETURN_ERROR_IF( + ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)), + corruption_detected, ""); + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + +#if defined(__GNUC__) && defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +# if __GNUC__ >= 7 + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 3"); +# else + __asm__(".p2align 4"); + __asm__("nop"); + __asm__(".p2align 3"); +# endif +#endif + + for ( ; nbSeq ; nbSeq--) { + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + } + + /* check if reached exact end */ + assert(nbSeq == 0); + RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next 
block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + { size_t const lastLLSize = (size_t)(litEnd - litPtr); + DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } } + + DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); + return (size_t)(op - ostart); } static size_t @@ -963,89 +1698,38 @@ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, { return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } -#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ - +static size_t +ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT -FORCE_INLINE_TEMPLATE seq_t -ZSTD_decodeSequenceLong(seqState_t* seqState, ZSTD_longOffset_e const longOffsets) -{ - seq_t seq; - U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits; - U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits; - U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits; - U32 const totalBits = llBits+mlBits+ofBits; - U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue; - U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue; - U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue; - - /* sequence */ - { size_t offset; - if (!ofBits) - offset = 0; - else { - ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); - 
ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); - assert(ofBits <= MaxOff); - if (MEM_32bits() && longOffsets) { - U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32-1); - offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); - if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream); - if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); - } else { - offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); - } - } - if (ofBits <= 1) { - offset += (llBase==0); - if (offset) { - size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; - temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ - if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset = temp; - } else { - offset = seqState->prevOffset[0]; - } - } else { - seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset; - } - seq.offset = offset; - } +FORCE_INLINE_TEMPLATE - seq.matchLength = mlBase + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ - if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) - BIT_reloadDStream(&seqState->DStream); - if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) - BIT_reloadDStream(&seqState->DStream); - /* Verify that there is enough bits to read the rest of the data in 64-bit mode. */ - ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); - - seq.litLength = llBase + ((llBits>0) ? 
BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */ - if (MEM_32bits()) - BIT_reloadDStream(&seqState->DStream); - - { size_t const pos = seqState->pos + seq.litLength; - const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart; - seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. - * No consequence though : no memory access will occur, overly large offset will be detected in ZSTD_execSequenceLong() */ - seqState->pos = pos + seq.matchLength; +size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEnd) +{ + prefetchPos += sequence.litLength; + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; + /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. + * No consequence though : memory address is only used for prefetching, not for dereferencing */ + const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ } - - /* ANS state update */ - ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ - ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ - - return seq; + return prefetchPos + sequence.matchLength; } +/* This decoding function employs prefetching + * to reduce latency impact of cache misses. 
+ * It's generally employed when block contains a significant portion of long-distance matches + * or when coupled with a "cold" dictionary */ FORCE_INLINE_TEMPLATE size_t ZSTD_decompressSequencesLong_body( ZSTD_DCtx* dctx, @@ -1055,61 +1739,127 @@ ZSTD_decompressSequencesLong_body( { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; - BYTE* const oend = ostart + maxDstSize; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; - const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* litBufferEnd = dctx->litBufferEnd; const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); /* Regen sequences */ if (nbSeq) { -#define STORED_SEQS 4 +#define STORED_SEQS 8 #define STORED_SEQS_MASK (STORED_SEQS-1) -#define ADVANCED_SEQS 4 +#define ADVANCED_SEQS STORED_SEQS seq_t sequences[STORED_SEQS]; int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); seqState_t seqState; int seqNb; + size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */ + dctx->fseEntropy = 1; { int i; for (i=0; ientropy.rep[i]; } - seqState.prefixStart = prefixStart; - seqState.pos = (size_t)(op-prefixStart); - seqState.dictEnd = dictEnd; + assert(dst != NULL); assert(iend >= ip); RETURN_ERROR_IF( ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)), - corruption_detected); + corruption_detected, ""); ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); /* prepare in advance */ - for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= 
BIT_DStream_completed) && (seqNblitBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) + { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + + prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); + sequences[seqNb & STORED_SEQS_MASK] = sequence; + op += oneSeqSize; + } } + else + { + /* lit buffer is either wholly contained in first or second split, or not split at all*/ + size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + + prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); + sequences[seqNb & STORED_SEQS_MASK] = sequence; + op += oneSeqSize; + } } - RETURN_ERROR_IF(seqNblitBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; + } + } + else + { + size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; + } } /* save reps for next block */ @@ -1117,13 +1867,25 @@ ZSTD_decompressSequencesLong_body( } /* last literal segment */ - { size_t const lastLLSize = litEnd - litPtr; - RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall); - memcpy(op, litPtr, lastLLSize); - op += lastLLSize; + if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + } + { size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } } - return op-ostart; + return (size_t)(op - ostart); } static size_t @@ -1141,7 +1903,7 @@ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, #if DYNAMIC_BMI2 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t DONT_VECTORIZE ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, @@ -1150,10 +1912,19 @@ ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, { return 
ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } +static BMI2_TARGET_ATTRIBUTE size_t +DONT_VECTORIZE +ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, @@ -1165,12 +1936,6 @@ ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, #endif /* DYNAMIC_BMI2 */ -typedef size_t (*ZSTD_decompressSequences_t)( - ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, - const ZSTD_longOffset_e isLongOffset); - #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG static size_t ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, @@ -1179,11 +1944,24 @@ ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, { DEBUGLOG(5, "ZSTD_decompressSequences"); #if DYNAMIC_BMI2 - if (dctx->bmi2) { + if (ZSTD_DCtx_get_bmi2(dctx)) { return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } #endif - return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); +} +static size_t +ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset) +{ + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); +#if DYNAMIC_BMI2 + if 
(ZSTD_DCtx_get_bmi2(dctx)) { + return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } +#endif + return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ @@ -1202,7 +1980,7 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, { DEBUGLOG(5, "ZSTD_decompressSequencesLong"); #if DYNAMIC_BMI2 - if (dctx->bmi2) { + if (ZSTD_DCtx_get_bmi2(dctx)) { return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } #endif @@ -1211,56 +1989,101 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ +/** + * @returns The total size of the history referenceable by zstd, including + * both the prefix and the extDict. At @p op any offset larger than this + * is invalid. + */ +static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) +{ + return (size_t)(op - virtualStart); +} -#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ - !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) -/* ZSTD_getLongOffsetsShare() : +typedef struct { + unsigned longOffsetShare; + unsigned maxNbAdditionalBits; +} ZSTD_OffsetInfo; + +/* ZSTD_getOffsetInfo() : * condition : offTable must be valid * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) - * compared to maximum possible of (1< 22) total += 1; - } + ZSTD_OffsetInfo info = {0, 0}; + /* If nbSeq == 0, then the offTable is uninitialized, but we have + * no sequences, so both values should be 0. 
+ */ + if (nbSeq != 0) { + const void* ptr = offTable; + U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; + const ZSTD_seqSymbol* table = offTable + 1; + U32 const max = 1 << tableLog; + U32 u; + DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); + + assert(max <= (1 << OffFSELog)); /* max not too large */ + for (u=0; u 22) info.longOffsetShare += 1; + } - assert(tableLog <= OffFSELog); - total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + assert(tableLog <= OffFSELog); + info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } - return total; + return info; } -#endif +/** + * @returns The maximum offset we can decode in one read of our bitstream, without + * reloading more bits in the middle of the offset bits read. Any offsets larger + * than this must use the long offset decoder. + */ +static size_t ZSTD_maxShortOffset(void) +{ + if (MEM_64bits()) { + /* We can decode any offset without reloading bits. + * This might change if the max window size grows. + */ + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); + return (size_t)-1; + } else { + /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. + * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. + * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. + */ + size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; + size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; + assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); + return maxOffset; + } +} size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, - const void* src, size_t srcSize, const int frame) + const void* src, size_t srcSize, const streaming_operation streaming) { /* blockType == blockCompressed */ const BYTE* ip = (const BYTE*)src; - /* isLongOffset must be true if there are long offsets. - * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. 
- * We don't expect that to be the case in 64-bit mode. - * In block mode, window size is not known, so we have to be conservative. - * (note: but it could be evaluated from current-lowLimit) - */ - ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); - DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); - - RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong); + DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); + + /* Note : the wording of the specification + * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). + * This generally does not happen, as it makes little sense, + * since an uncompressed block would feature same size and have no decompression cost. + * Also, note that decoder from reference libzstd before < v1.5.4 + * would consider this edge case as an error. + * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) + * for broader compatibility with the deployed ecosystem of zstd decoders */ + RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); /* Decode literals section */ - { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); - DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); + DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); if (ZSTD_isError(litCSize)) return litCSize; ip += litCSize; srcSize -= litCSize; @@ -1268,6 +2091,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, /* Build Decoding Tables */ { + /* Compute the maximum block size, which must also work when !frame and fParams are unset. + * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. 
+ */ + size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); + size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); + /* isLongOffset must be true if there are long offsets. + * Offsets are long if they are larger than ZSTD_maxShortOffset(). + * We don't expect that to be the case in 64-bit mode. + * + * We check here to see if our history is large enough to allow long offsets. + * If it isn't, then we can't possible have (valid) long offsets. If the offset + * is invalid, then it is okay to read it incorrectly. + * + * If isLongOffsets is true, then we will later check our decoding table to see + * if it is even possible to generate long offsets. + */ + ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); /* These macros control at build-time which decompressor implementation * we use. If neither is defined, we do some inspection and dispatch at * runtime. @@ -1275,6 +2115,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) int usePrefetchDecoder = dctx->ddictIsCold; +#else + /* Set to 1 to avoid computing offset info if we don't need to. + * Otherwise this value is ignored. + */ + int usePrefetchDecoder = 1; #endif int nbSeq; size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); @@ -1282,42 +2127,86 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, ip += seqHSize; srcSize -= seqHSize; -#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ - !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) - if ( !usePrefetchDecoder - && (!frame || (dctx->fParams.windowSize > (1<<24))) - && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ - U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); - U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ - usePrefetchDecoder = (shareLongOffsets >= minShare); + RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, + "invalid dst"); + + /* If we could potentially have long offsets, or we might want to use the prefetch decoder, + * compute information about the share of long offsets, and the maximum nbAdditionalBits. + * NOTE: could probably use a larger nbSeq limit + */ + if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { + ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); + if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { + /* If isLongOffset, but the maximum number of additional bits that we see in our table is small + * enough, then we know it is impossible to have too long an offset in this block, so we can + * use the regular offset decoder. + */ + isLongOffset = ZSTD_lo_isRegularOffset; + } + if (!usePrefetchDecoder) { + U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ + usePrefetchDecoder = (info.longOffsetShare >= minShare); + } } -#endif dctx->ddictIsCold = 0; #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) - if (usePrefetchDecoder) + if (usePrefetchDecoder) { +#else + (void)usePrefetchDecoder; + { #endif #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); #endif + } #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG /* else */ - return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + if (dctx->litBufferLocation == ZSTD_split) + return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + else + return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); #endif } } -size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize) +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) +{ + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ + dctx->dictEnd = dctx->previousDstEnd; + dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); + dctx->prefixStart = dst; + dctx->previousDstEnd = dst; + } +} + + +size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) { size_t dSize; - ZSTD_checkContinuity(dctx, dst); - dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0); + dctx->isFrameDecompression = 0; + ZSTD_checkContinuity(dctx, dst, dstCapacity); + dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); + FORWARD_IF_ERROR(dSize, ""); dctx->previousDstEnd = (char*)dst + dSize; return dSize; } + + +/* 
NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ +size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); +} + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_decompress_block.h b/vendor/github.com/DataDog/zstd/zstd_decompress_block.h index 7e92960..b727de2 100644 --- a/vendor/github.com/DataDog/zstd/zstd_decompress_block.h +++ b/vendor/github.com/DataDog/zstd/zstd_decompress_block.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -15,7 +16,7 @@ /*-******************************************************* * Dependencies *********************************************************/ -#include /* size_t */ +#include "zstd_deps.h" /* size_t */ #include "zstd.h" /* DCtx, and some public functions */ #include "zstd_internal.h" /* blockProperties_t, and some public functions */ #include "zstd_decompress_internal.h" /* ZSTD_seqSymbol */ @@ -33,6 +34,12 @@ */ + /* Streaming state is used to inform allocation of the literal buffer */ +typedef enum { + not_streaming = 0, + is_streaming = 1 +} streaming_operation; + /* ZSTD_decompressBlock_internal() : * decompress block, starting at `src`, * into destination buffer `dst`. 
@@ -41,19 +48,29 @@ */ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, - const void* src, size_t srcSize, const int frame); + const void* src, size_t srcSize, const streaming_operation streaming); /* ZSTD_buildFSETable() : * generate FSE decoding table for one symbol (ll, ml or off) * this function must be called with valid parameters only * (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.) * in which case it cannot fail. + * The workspace must be 4-byte aligned and at least ZSTD_BUILD_FSE_TABLE_WKSP_SIZE bytes, which is + * defined in zstd_decompress_internal.h. * Internal use only. */ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, - unsigned tableLog); + const U32* baseValue, const U8* nbAdditionalBits, + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + +/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */ +size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); #endif /* ZSTD_DEC_BLOCK_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_decompress_internal.h b/vendor/github.com/DataDog/zstd/zstd_decompress_internal.h index ccbdfa0..9db61ff 100644 --- a/vendor/github.com/DataDog/zstd/zstd_decompress_internal.h +++ b/vendor/github.com/DataDog/zstd/zstd_decompress_internal.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -20,33 +21,33 @@ * Dependencies *********************************************************/ #include "mem.h" /* BYTE, U16, U32 */ -#include "zstd_internal.h" /* ZSTD_seqSymbol */ +#include "zstd_internal.h" /* constants : MaxLL, MaxML, MaxOff, LLFSELog, etc. */ /*-******************************************************* * Constants *********************************************************/ -static const U32 LL_base[MaxLL+1] = { +static UNUSED_ATTR const U32 LL_base[MaxLL+1] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000, 0x10000 }; -static const U32 OF_base[MaxOff+1] = { +static UNUSED_ATTR const U32 OF_base[MaxOff+1] = { 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD }; -static const U32 OF_bits[MaxOff+1] = { +static UNUSED_ATTR const U8 OF_bits[MaxOff+1] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }; -static const U32 ML_base[MaxML+1] = { +static UNUSED_ATTR const U32 ML_base[MaxML+1] = { 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, @@ -73,12 +74,17 @@ static const U32 ML_base[MaxML+1] = { #define SEQSYMBOL_TABLE_SIZE(log) (1 + (1 << (log))) +#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) +#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) +#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + typedef struct { ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ ZSTD_seqSymbol 
OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ - HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ + HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; } ZSTD_entropyDTables_t; typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader, @@ -95,6 +101,29 @@ typedef enum { ZSTD_use_once = 1 /* Use the dictionary once and set to ZSTD_dont_use */ } ZSTD_dictUses_e; +/* Hashset for storing references to multiple ZSTD_DDict within ZSTD_DCtx */ +typedef struct { + const ZSTD_DDict** ddictPtrTable; + size_t ddictPtrTableSize; + size_t ddictPtrCount; +} ZSTD_DDictHashSet; + +#ifndef ZSTD_DECODER_INTERNAL_BUFFER +# define ZSTD_DECODER_INTERNAL_BUFFER (1 << 16) +#endif + +#define ZSTD_LBMIN 64 +#define ZSTD_LBMAX (128 << 10) + +/* extra buffer, compensates when dst is not large enough to store litBuffer */ +#define ZSTD_LITBUFFEREXTRASIZE BOUNDED(ZSTD_LBMIN, ZSTD_DECODER_INTERNAL_BUFFER, ZSTD_LBMAX) + +typedef enum { + ZSTD_not_in_dst = 0, /* Stored entirely within litExtraBuffer */ + ZSTD_in_dst = 1, /* Stored entirely within dst (in memory after current output write) */ + ZSTD_split = 2 /* Split between litExtraBuffer and dst */ +} ZSTD_litLocation_e; + struct ZSTD_DCtx_s { const ZSTD_seqSymbol* LLTptr; @@ -108,7 +137,8 @@ struct ZSTD_DCtx_s const void* virtualStart; /* virtual start of previous segment if it was just before current one */ const void* dictEnd; /* end of previous segment */ size_t expected; - ZSTD_frameHeader fParams; + ZSTD_FrameHeader fParams; + U64 processedCSize; U64 decodedSize; blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header 
decoding and block decompression stages */ ZSTD_dStage stage; @@ -117,12 +147,17 @@ struct ZSTD_DCtx_s XXH64_state_t xxhState; size_t headerSize; ZSTD_format_e format; + ZSTD_forceIgnoreChecksum_e forceIgnoreChecksum; /* User specified: if == 1, will ignore checksums in compressed frame. Default == 0 */ + U32 validateChecksum; /* if == 1, will validate checksum. Is == 1 if (fParams.checksumFlag == 1) and (forceIgnoreChecksum == 0). */ const BYTE* litPtr; ZSTD_customMem customMem; size_t litSize; size_t rleSize; size_t staticSize; + int isFrameDecompression; +#if DYNAMIC_BMI2 int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ +#endif /* dictionary */ ZSTD_DDict* ddictLocal; @@ -130,6 +165,10 @@ struct ZSTD_DCtx_s U32 dictID; int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */ ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. 
Default == 0 (disabled) */ + int disableHufAsm; + int maxBlockSizeParam; /* streaming */ ZSTD_dStreamStage streamStage; @@ -142,17 +181,44 @@ struct ZSTD_DCtx_s size_t outStart; size_t outEnd; size_t lhSize; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) void* legacyContext; U32 previousLegacyVersion; U32 legacyVersion; +#endif U32 hostageByte; int noForwardProgress; + ZSTD_bufferMode_e outBufferMode; + ZSTD_outBuffer expectedOutBuffer; /* workspace */ - BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH]; + BYTE* litBuffer; + const BYTE* litBufferEnd; + ZSTD_litLocation_e litBufferLocation; + BYTE litExtraBuffer[ZSTD_LITBUFFEREXTRASIZE + WILDCOPY_OVERLENGTH]; /* literal buffer can be split between storage within dst and within this scratch buffer */ BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; + + size_t oversizedDuration; + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + void const* dictContentBeginForFuzzing; + void const* dictContentEndForFuzzing; +#endif + + /* Tracing */ +#if ZSTD_TRACE + ZSTD_TraceCtx traceCtx; +#endif }; /* typedef'd to ZSTD_DCtx within "zstd.h" */ +MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) { +#if DYNAMIC_BMI2 + return dctx->bmi2; +#else + (void)dctx; + return 0; +#endif +} /*-******************************************************* * Shared internal functions @@ -160,7 +226,7 @@ struct ZSTD_DCtx_s /*! ZSTD_loadDEntropy() : * dict : must point at beginning of a valid zstd dictionary. - * @return : size of entropy tables read */ + * @return : size of dictionary header (size of magic number + dict ID + entropy tables) */ size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, const void* const dict, size_t const dictSize); @@ -169,7 +235,9 @@ size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, * If yes, do nothing (continue on current segment). * If not, classify previous segment as "external dictionary", and start a new segment. * This function cannot fail. 
*/ -void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst); +void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize); #endif /* ZSTD_DECOMPRESS_INTERNAL_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_deps.h b/vendor/github.com/DataDog/zstd/zstd_deps.h new file mode 100644 index 0000000..fab9477 --- /dev/null +++ b/vendor/github.com/DataDog/zstd/zstd_deps.h @@ -0,0 +1,126 @@ +#ifndef USE_EXTERNAL_ZSTD +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* This file provides common libc dependencies that zstd requires. + * The purpose is to allow replacing this file with a custom implementation + * to compile zstd without libc support. + */ + +/* Need: + * NULL + * INT_MAX + * UINT_MAX + * ZSTD_memcpy() + * ZSTD_memset() + * ZSTD_memmove() + */ +#ifndef ZSTD_DEPS_COMMON +#define ZSTD_DEPS_COMMON + +/* Even though we use qsort_r only for the dictionary builder, the macro + * _GNU_SOURCE has to be declared *before* the inclusion of any standard + * header and the script 'combine.sh' combines the whole zstd source code + * in a single file. + */ +#if defined(__linux) || defined(__linux__) || defined(linux) || defined(__gnu_linux__) || \ + defined(__CYGWIN__) || defined(__MSYS__) +#if !defined(_GNU_SOURCE) && !defined(__ANDROID__) /* NDK doesn't ship qsort_r(). 
*/ +#define _GNU_SOURCE +#endif +#endif + +#include +#include +#include + +#if defined(__GNUC__) && __GNUC__ >= 4 +# define ZSTD_memcpy(d,s,l) __builtin_memcpy((d),(s),(l)) +# define ZSTD_memmove(d,s,l) __builtin_memmove((d),(s),(l)) +# define ZSTD_memset(p,v,l) __builtin_memset((p),(v),(l)) +#else +# define ZSTD_memcpy(d,s,l) memcpy((d),(s),(l)) +# define ZSTD_memmove(d,s,l) memmove((d),(s),(l)) +# define ZSTD_memset(p,v,l) memset((p),(v),(l)) +#endif + +#endif /* ZSTD_DEPS_COMMON */ + +/* Need: + * ZSTD_malloc() + * ZSTD_free() + * ZSTD_calloc() + */ +#ifdef ZSTD_DEPS_NEED_MALLOC +#ifndef ZSTD_DEPS_MALLOC +#define ZSTD_DEPS_MALLOC + +#include + +#define ZSTD_malloc(s) malloc(s) +#define ZSTD_calloc(n,s) calloc((n), (s)) +#define ZSTD_free(p) free((p)) + +#endif /* ZSTD_DEPS_MALLOC */ +#endif /* ZSTD_DEPS_NEED_MALLOC */ + +/* + * Provides 64-bit math support. + * Need: + * U64 ZSTD_div64(U64 dividend, U32 divisor) + */ +#ifdef ZSTD_DEPS_NEED_MATH64 +#ifndef ZSTD_DEPS_MATH64 +#define ZSTD_DEPS_MATH64 + +#define ZSTD_div64(dividend, divisor) ((dividend) / (divisor)) + +#endif /* ZSTD_DEPS_MATH64 */ +#endif /* ZSTD_DEPS_NEED_MATH64 */ + +/* Need: + * assert() + */ +#ifdef ZSTD_DEPS_NEED_ASSERT +#ifndef ZSTD_DEPS_ASSERT +#define ZSTD_DEPS_ASSERT + +#include + +#endif /* ZSTD_DEPS_ASSERT */ +#endif /* ZSTD_DEPS_NEED_ASSERT */ + +/* Need: + * ZSTD_DEBUG_PRINT() + */ +#ifdef ZSTD_DEPS_NEED_IO +#ifndef ZSTD_DEPS_IO +#define ZSTD_DEPS_IO + +#include +#define ZSTD_DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__) + +#endif /* ZSTD_DEPS_IO */ +#endif /* ZSTD_DEPS_NEED_IO */ + +/* Only requested when is known to be present. 
+ * Need: + * intptr_t + */ +#ifdef ZSTD_DEPS_NEED_STDINT +#ifndef ZSTD_DEPS_STDINT +#define ZSTD_DEPS_STDINT + +#include + +#endif /* ZSTD_DEPS_STDINT */ +#endif /* ZSTD_DEPS_NEED_STDINT */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_double_fast.c b/vendor/github.com/DataDog/zstd/zstd_double_fast.c index a661a48..9a7cb22 100644 --- a/vendor/github.com/DataDog/zstd/zstd_double_fast.c +++ b/vendor/github.com/DataDog/zstd/zstd_double_fast.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,8 +12,49 @@ #include "zstd_compress_internal.h" #include "zstd_double_fast.h" +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR -void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_fillDoubleHashTableForCDict(ZSTD_MatchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashLarge = ms->hashTable; + U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + U32 const mls = cParams->minMatch; + U32* const hashSmall = ms->chainTable; + U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + + /* Always insert every fastHashFillStep position into the hash tables. + * Insert the other positions into the large hash table if their entry + * is empty. 
+ */ + for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { + U32 const curr = (U32)(ip - base); + U32 i; + for (i = 0; i < fastHashFillStep; ++i) { + size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); + size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); + if (i == 0) { + ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); + } + if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { + ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); + } + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; + } } +} + +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_fillDoubleHashTableForCCtx(ZSTD_MatchState_t* ms, void const* end, ZSTD_dictTableLoadMethod_e dtlm) { const ZSTD_compressionParameters* const cParams = &ms->cParams; @@ -31,27 +73,263 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, * is empty. */ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { - U32 const current = (U32)(ip - base); + U32 const curr = (U32)(ip - base); U32 i; for (i = 0; i < fastHashFillStep; ++i) { size_t const smHash = ZSTD_hashPtr(ip + i, hBitsS, mls); size_t const lgHash = ZSTD_hashPtr(ip + i, hBitsL, 8); if (i == 0) - hashSmall[smHash] = current + i; + hashSmall[smHash] = curr + i; if (i == 0 || hashLarge[lgHash] == 0) - hashLarge[lgHash] = current + i; + hashLarge[lgHash] = curr + i; /* Only load extra positions for ZSTD_dtlm_full */ if (dtlm == ZSTD_dtlm_fast) break; - } } + } } +} + +void ZSTD_fillDoubleHashTable(ZSTD_MatchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp) +{ + if (tfp == ZSTD_tfp_forCDict) { + ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); + } else { + ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); + } +} + + +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_doubleFast_noDict_generic( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 
rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) +{ + ZSTD_compressionParameters const* cParams = &ms->cParams; + U32* const hashLong = ms->hashTable; + const U32 hBitsL = cParams->hashLog; + U32* const hashSmall = ms->chainTable; + const U32 hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + /* presumes that, if there is a dictionary, it must be using Attach mode */ + const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); + const BYTE* const prefixLowest = base + prefixLowestIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; + U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; + U32 curr; + + /* how many positions to search before increasing step size */ + const size_t kStepIncr = 1 << kSearchStrength; + /* the position at which to increment the step size if no match is found */ + const BYTE* nextStep; + size_t step; /* the current step size */ + + size_t hl0; /* the long hash at ip */ + size_t hl1; /* the long hash at ip1 */ + + U32 idxl0; /* the long match index for ip */ + U32 idxl1; /* the long match index for ip1 */ + + const BYTE* matchl0; /* the long match for ip */ + const BYTE* matchs0; /* the short match for ip */ + const BYTE* matchl1; /* the long match for ip1 */ + const BYTE* matchs0_safe; /* matchs0 or safe address */ + + const BYTE* ip = istart; /* the current position */ + const BYTE* ip1; /* the next position */ + /* Array of ~random data, should have low probability of matching data + * we load from here instead of from tables, if matchl0/matchl1 are + * invalid indices. Used to avoid unpredictable branches. 
*/ + const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic"); + + /* init */ + ip += ((ip - prefixLowest) == 0); + { + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; + if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* Outer Loop: one iteration per match found and stored */ + while (1) { + step = 1; + nextStep = ip + kStepIncr; + ip1 = ip + step; + + if (ip1 > ilimit) { + goto _cleanup; + } + + hl0 = ZSTD_hashPtr(ip, hBitsL, 8); + idxl0 = hashLong[hl0]; + matchl0 = base + idxl0; + + /* Inner Loop: one iteration per search / position */ + do { + const size_t hs0 = ZSTD_hashPtr(ip, hBitsS, mls); + const U32 idxs0 = hashSmall[hs0]; + curr = (U32)(ip-base); + matchs0 = base + idxs0; + + hashLong[hl0] = hashSmall[hs0] = curr; /* update hash tables */ + + /* check noDict repcode */ + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + + hl1 = ZSTD_hashPtr(ip1, hBitsL, 8); + + /* idxl0 > prefixLowestIndex is a (somewhat) unpredictable branch. + * However expression below complies into conditional move. Since + * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex + * if there is a match, all branches become predictable. 
*/ + { const BYTE* const matchl0_safe = ZSTD_selectAddr(idxl0, prefixLowestIndex, matchl0, &dummy[0]); + + /* check prefix long match */ + if (MEM_read64(matchl0_safe) == MEM_read64(ip) && matchl0_safe == matchl0) { + mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; + offset = (U32)(ip-matchl0); + while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ + goto _match_found; + } } + + idxl1 = hashLong[hl1]; + matchl1 = base + idxl1; + + /* Same optimization as matchl0 above */ + matchs0_safe = ZSTD_selectAddr(idxs0, prefixLowestIndex, matchs0, &dummy[0]); + + /* check prefix short match */ + if(MEM_read32(matchs0_safe) == MEM_read32(ip) && matchs0_safe == matchs0) { + goto _search_next_long; + } + + if (ip1 >= nextStep) { + PREFETCH_L1(ip1 + 64); + PREFETCH_L1(ip1 + 128); + step++; + nextStep += kStepIncr; + } + ip = ip1; + ip1 += step; + + hl0 = hl1; + idxl0 = idxl1; + matchl0 = matchl1; + #if defined(__aarch64__) + PREFETCH_L1(ip+256); + #endif + } while (ip1 <= ilimit); + +_cleanup: + /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), + * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ + offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved1; + rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + +_search_next_long: + + /* short match found: let's check for a longer one */ + mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; + offset = (U32)(ip - matchs0); + + /* check long match at +1 position */ + if ((idxl1 > prefixLowestIndex) && (MEM_read64(matchl1) == MEM_read64(ip1))) { + size_t const l1len = ZSTD_count(ip1+8, matchl1+8, iend) + 8; + if (l1len > mLength) { + /* use the long match instead */ + ip = ip1; + mLength = l1len; + offset = (U32)(ip-matchl1); + matchs0 = matchl1; + } + } + + while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* complete backward */ + + /* fall-through */ + +_match_found: /* requires ip, offset, mLength */ + offset_2 = offset_1; + offset_1 = offset; + + if (step < 4) { + /* It is unsafe to write this value back to the hashtable when ip1 is + * greater than or equal to the new ip we will have after we're done + * processing this match. Rather than perform that test directly + * (ip1 >= ip + mLength), which costs speed in practice, we do a simpler + * more predictable test. The minmatch even if we take a short match is + * 4 bytes, so as long as step, the distance between ip and ip1 + * (initially) is less than 4, we know ip1 < new ip. 
*/ + hashLong[hl1] = (U32)(ip1 - base); + } + + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + +_match_stored: + /* match found */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Complementary insertion */ + /* done after iLimit test, as candidates could be > iend-8 */ + { U32 const indexToInsert = curr+2; + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base); + } + + /* check immediate repcode */ + while ( (ip <= ilimit) + && ( (offset_2>0) + & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ + } + } + } } FORCE_INLINE_TEMPLATE -size_t ZSTD_compressBlock_doubleFast_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, - U32 const mls /* template */, ZSTD_dictMode_e const dictMode) + U32 const mls /* template */) { ZSTD_compressionParameters const* cParams = &ms->cParams; U32* const hashLong = ms->hashTable; @@ -63,63 +341,45 @@ size_t ZSTD_compressBlock_doubleFast_generic( const BYTE* ip = istart; const BYTE* anchor = istart; const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); - const U32 lowestValid = ms->window.dictLimit; - const U32 maxDistance = 1U << cParams->windowLog; /* presumes that, if there is a dictionary, it must be using Attach mode */ - const U32 prefixLowestIndex = (endIndex - lowestValid > maxDistance) ? endIndex - maxDistance : lowestValid; + const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); const BYTE* const prefixLowest = base + prefixLowestIndex; const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; U32 offset_1=rep[0], offset_2=rep[1]; - U32 offsetSaved = 0; - - const ZSTD_matchState_t* const dms = ms->dictMatchState; - const ZSTD_compressionParameters* const dictCParams = - dictMode == ZSTD_dictMatchState ? - &dms->cParams : NULL; - const U32* const dictHashLong = dictMode == ZSTD_dictMatchState ? - dms->hashTable : NULL; - const U32* const dictHashSmall = dictMode == ZSTD_dictMatchState ? - dms->chainTable : NULL; - const U32 dictStartIndex = dictMode == ZSTD_dictMatchState ? - dms->window.dictLimit : 0; - const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ? - dms->window.base : NULL; - const BYTE* const dictStart = dictMode == ZSTD_dictMatchState ? 
- dictBase + dictStartIndex : NULL; - const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ? - dms->window.nextSrc : NULL; - const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ? - prefixLowestIndex - (U32)(dictEnd - dictBase) : - 0; - const U32 dictHBitsL = dictMode == ZSTD_dictMatchState ? - dictCParams->hashLog : hBitsL; - const U32 dictHBitsS = dictMode == ZSTD_dictMatchState ? - dictCParams->chainLog : hBitsS; - const U32 dictAndPrefixLength = (U32)(ip - prefixLowest + dictEnd - dictStart); - - DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_generic"); - - assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState); + + const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; + const U32* const dictHashLong = dms->hashTable; + const U32* const dictHashSmall = dms->chainTable; + const U32 dictStartIndex = dms->window.dictLimit; + const BYTE* const dictBase = dms->window.base; + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); + const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); /* if a dictionary is attached, it must be within window range */ - if (dictMode == ZSTD_dictMatchState) { - assert(lowestValid + maxDistance >= endIndex); + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + + if (ms->prefetchCDictTables) { + size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); + size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); + PREFETCH_AREA(dictHashLong, hashTableBytes); + PREFETCH_AREA(dictHashSmall, 
chainTableBytes); } /* init */ ip += (dictAndPrefixLength == 0); - if (dictMode == ZSTD_noDict) { - U32 const maxRep = (U32)(ip - prefixLowest); - if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; - if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; - } - if (dictMode == ZSTD_dictMatchState) { - /* dictMatchState repCode checks don't currently handle repCode == 0 - * disabling. */ - assert(offset_1 <= dictAndPrefixLength); - assert(offset_2 <= dictAndPrefixLength); - } + + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); /* Main Search Loop */ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ @@ -127,69 +387,60 @@ size_t ZSTD_compressBlock_doubleFast_generic( U32 offset; size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); - size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); - size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); - U32 const current = (U32)(ip-base); + size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); + size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); + U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; + U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; + int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); + int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); U32 const matchIndexL = hashLong[h2]; U32 matchIndexS = hashSmall[h]; const BYTE* matchLong = base + matchIndexL; const BYTE* match = base + matchIndexS; - const U32 repIndex = current + 1 - offset_1; - const BYTE* repMatch = (dictMode == ZSTD_dictMatchState - && repIndex < prefixLowestIndex) ? 
+ const U32 repIndex = curr + 1 - offset_1; + const BYTE* repMatch = (repIndex < prefixLowestIndex) ? dictBase + (repIndex - dictIndexDelta) : base + repIndex; - hashLong[h2] = hashSmall[h] = current; /* update hash tables */ + hashLong[h2] = hashSmall[h] = curr; /* update hash tables */ - /* check dictMatchState repcode */ - if (dictMode == ZSTD_dictMatchState - && ((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + /* check repcode */ + if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); - goto _match_stored; - } - - /* check noDict repcode */ - if ( dictMode == ZSTD_noDict - && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { - mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; - ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); goto _match_stored; } - if (matchIndexL > prefixLowestIndex) { + if ((matchIndexL >= prefixLowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { /* check prefix long match */ - if (MEM_read64(matchLong) == MEM_read64(ip)) { - mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; - offset = (U32)(ip-matchLong); - while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ - goto _match_found; - } - } else if (dictMode == ZSTD_dictMatchState) { + mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; + offset = (U32)(ip-matchLong); + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } else if 
(dictTagsMatchL) { /* check dictMatchState long match */ - U32 const dictMatchIndexL = dictHashLong[dictHL]; + U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; const BYTE* dictMatchL = dictBase + dictMatchIndexL; assert(dictMatchL < dictEnd); if (dictMatchL > dictStart && MEM_read64(dictMatchL) == MEM_read64(ip)) { mLength = ZSTD_count_2segments(ip+8, dictMatchL+8, iend, dictEnd, prefixLowest) + 8; - offset = (U32)(current - dictMatchIndexL - dictIndexDelta); + offset = (U32)(curr - dictMatchIndexL - dictIndexDelta); while (((ip>anchor) & (dictMatchL>dictStart)) && (ip[-1] == dictMatchL[-1])) { ip--; dictMatchL--; mLength++; } /* catch up */ goto _match_found; } } if (matchIndexS > prefixLowestIndex) { - /* check prefix short match */ + /* short match candidate */ if (MEM_read32(match) == MEM_read32(ip)) { goto _search_next_long; } - } else if (dictMode == ZSTD_dictMatchState) { + } else if (dictTagsMatchS) { /* check dictMatchState short match */ - U32 const dictMatchIndexS = dictHashSmall[dictHS]; + U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; match = dictBase + dictMatchIndexS; matchIndexS = dictMatchIndexS + dictIndexDelta; @@ -198,42 +449,44 @@ size_t ZSTD_compressBlock_doubleFast_generic( } } ip += ((ip-anchor) >> kSearchStrength) + 1; +#if defined(__aarch64__) + PREFETCH_L1(ip+256); +#endif continue; _search_next_long: - { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); - size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); U32 const matchIndexL3 = hashLong[hl3]; + U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; + int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); const BYTE* matchL3 = base + matchIndexL3; - hashLong[hl3] = current + 1; + hashLong[hl3] = curr + 1; /* check prefix long +1 match */ - if (matchIndexL3 > 
prefixLowestIndex) { - if (MEM_read64(matchL3) == MEM_read64(ip+1)) { - mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; - ip++; - offset = (U32)(ip-matchL3); - while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ - goto _match_found; - } - } else if (dictMode == ZSTD_dictMatchState) { + if ((matchIndexL3 >= prefixLowestIndex) && (MEM_read64(matchL3) == MEM_read64(ip+1))) { + mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; + ip++; + offset = (U32)(ip-matchL3); + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } else if (dictTagsMatchL3) { /* check dict long +1 match */ - U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; + U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; assert(dictMatchL3 < dictEnd); if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { mLength = ZSTD_count_2segments(ip+1+8, dictMatchL3+8, iend, dictEnd, prefixLowest) + 8; ip++; - offset = (U32)(current + 1 - dictMatchIndexL3 - dictIndexDelta); + offset = (U32)(curr + 1 - dictMatchIndexL3 - dictIndexDelta); while (((ip>anchor) & (dictMatchL3>dictStart)) && (ip[-1] == dictMatchL3[-1])) { ip--; dictMatchL3--; mLength++; } /* catch up */ goto _match_found; } } } /* if no long +1 match, explore the short match we found */ - if (dictMode == ZSTD_dictMatchState && matchIndexS < prefixLowestIndex) { + if (matchIndexS < prefixLowestIndex) { mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4; - offset = (U32)(current - matchIndexS); + offset = (U32)(curr - matchIndexS); while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ } else { mLength = ZSTD_count(ip+4, match+4, iend) + 4; @@ -241,13 +494,11 @@ size_t ZSTD_compressBlock_doubleFast_generic( 
while (((ip>anchor) & (match>prefixLowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ } - /* fall-through */ - _match_found: offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); _match_stored: /* match found */ @@ -257,7 +508,7 @@ size_t ZSTD_compressBlock_doubleFast_generic( if (ip <= ilimit) { /* Complementary insertion */ /* done after iLimit test, as candidates could be > iend-8 */ - { U32 const indexToInsert = current+2; + { U32 const indexToInsert = curr+2; hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; @@ -265,56 +516,58 @@ size_t ZSTD_compressBlock_doubleFast_generic( } /* check immediate repcode */ - if (dictMode == ZSTD_dictMatchState) { - while (ip <= ilimit) { - U32 const current2 = (U32)(ip-base); - U32 const repIndex2 = current2 - offset_2; - const BYTE* repMatch2 = dictMode == ZSTD_dictMatchState - && repIndex2 < prefixLowestIndex ? - dictBase - dictIndexDelta + repIndex2 : - base + repIndex2; - if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) - && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { - const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; - U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; - ip += repLength2; - anchor = ip; - continue; - } - break; - } } - - if (dictMode == ZSTD_noDict) { - while ( (ip <= ilimit) - && ( (offset_2>0) - & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { - /* store sequence */ - size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; - U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, rLength-MINMATCH); - ip += rLength; + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ? + dictBase + repIndex2 - dictIndexDelta : + base + repIndex2; + if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex2)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; anchor = ip; - continue; /* faster when present ... (?) 
*/ - } } } + continue; + } + break; + } + } } /* while (ip < ilimit) */ /* save reps for next block */ - rep[0] = offset_1 ? offset_1 : offsetSaved; - rep[1] = offset_2 ? offset_2 : offsetSaved; + rep[0] = offset_1; + rep[1] = offset_2; /* Return the last literals size */ return (size_t)(iend - anchor); } +#define ZSTD_GEN_DFAST_FN(dictMode, mls) \ + static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( \ + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ + return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \ + } + +ZSTD_GEN_DFAST_FN(noDict, 4) +ZSTD_GEN_DFAST_FN(noDict, 5) +ZSTD_GEN_DFAST_FN(noDict, 6) +ZSTD_GEN_DFAST_FN(noDict, 7) + +ZSTD_GEN_DFAST_FN(dictMatchState, 4) +ZSTD_GEN_DFAST_FN(dictMatchState, 5) +ZSTD_GEN_DFAST_FN(dictMatchState, 6) +ZSTD_GEN_DFAST_FN(dictMatchState, 7) + size_t ZSTD_compressBlock_doubleFast( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { const U32 mls = ms->cParams.minMatch; @@ -322,19 +575,19 @@ size_t ZSTD_compressBlock_doubleFast( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, 
src, srcSize); } } size_t ZSTD_compressBlock_doubleFast_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { const U32 mls = ms->cParams.minMatch; @@ -342,19 +595,21 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore, rep, src, srcSize); } } -static size_t ZSTD_compressBlock_doubleFast_extDict_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_doubleFast_extDict_generic( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, U32 const mls /* template */) { @@ -384,7 +639,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( /* if extDict is invalidated due to maxDistance, switch to "regular" variant */ if (prefixStartIndex == dictStartIndex) - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, mls, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSize); /* Search Loop */ while (ip < ilimit) { /* < 
instead of <=, because (ip+1) */ @@ -398,31 +653,31 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( const BYTE* const matchLongBase = matchLongIndex < prefixStartIndex ? dictBase : base; const BYTE* matchLong = matchLongBase + matchLongIndex; - const U32 current = (U32)(ip-base); - const U32 repIndex = current + 1 - offset_1; /* offset_1 expected <= current +1 */ + const U32 curr = (U32)(ip-base); + const U32 repIndex = curr + 1 - offset_1; /* offset_1 expected <= curr +1 */ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; size_t mLength; - hashSmall[hSmall] = hashLong[hLong] = current; /* update hash table */ + hashSmall[hSmall] = hashLong[hLong] = curr; /* update hash table */ - if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ - & (repIndex > dictStartIndex)) + if (((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) + & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); } else { if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; const BYTE* const lowMatchPtr = matchLongIndex < prefixStartIndex ? 
dictStart : prefixStart; U32 offset; mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, prefixStart) + 8; - offset = current - matchLongIndex; + offset = curr - matchLongIndex; while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); @@ -430,24 +685,24 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( const BYTE* const match3Base = matchIndex3 < prefixStartIndex ? dictBase : base; const BYTE* match3 = match3Base + matchIndex3; U32 offset; - hashLong[h3] = current + 1; + hashLong[h3] = curr + 1; if ( (matchIndex3 > dictStartIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) { const BYTE* const matchEnd = matchIndex3 < prefixStartIndex ? dictEnd : iend; const BYTE* const lowMatchPtr = matchIndex3 < prefixStartIndex ? dictStart : prefixStart; mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, prefixStart) + 8; ip++; - offset = current+1 - matchIndex3; + offset = curr+1 - matchIndex3; while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */ } else { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? 
dictStart : prefixStart; mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; - offset = current - matchIndex; + offset = curr - matchIndex; while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ } offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); } else { ip += ((ip-anchor) >> kSearchStrength) + 1; @@ -461,7 +716,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( if (ip <= ilimit) { /* Complementary insertion */ /* done after iLimit test, as candidates could be > iend-8 */ - { U32 const indexToInsert = current+2; + { U32 const indexToInsert = curr+2; hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; @@ -473,13 +728,13 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( U32 const current2 = (U32)(ip-base); U32 const repIndex2 = current2 - offset_2; const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; - if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ - & (repIndex2 > dictStartIndex)) + if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) + & (offset_2 <= current2 - dictStartIndex)) && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; ip += repLength2; @@ -497,9 +752,13 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( return (size_t)(iend - anchor); } +ZSTD_GEN_DFAST_FN(extDict, 4) +ZSTD_GEN_DFAST_FN(extDict, 5) +ZSTD_GEN_DFAST_FN(extDict, 6) +ZSTD_GEN_DFAST_FN(extDict, 7) size_t ZSTD_compressBlock_doubleFast_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { U32 const mls = ms->cParams.minMatch; @@ -507,12 +766,16 @@ size_t ZSTD_compressBlock_doubleFast_extDict( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); + return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); + return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); + return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); } } + +#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_double_fast.h 
b/vendor/github.com/DataDog/zstd/zstd_double_fast.h index 4fa31ac..79d0175 100644 --- a/vendor/github.com/DataDog/zstd/zstd_double_fast.h +++ b/vendor/github.com/DataDog/zstd/zstd_double_fast.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,28 +12,34 @@ #ifndef ZSTD_DOUBLE_FAST_H #define ZSTD_DOUBLE_FAST_H -#if defined (__cplusplus) -extern "C" { -#endif - #include "mem.h" /* U32 */ #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ -void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, - void const* end, ZSTD_dictTableLoadMethod_e dtlm); +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + +void ZSTD_fillDoubleHashTable(ZSTD_MatchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_doubleFast( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_doubleFast_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_doubleFast_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); - -#if defined (__cplusplus) -} -#endif +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict +#else +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL +#define 
ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL +#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL +#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ #endif /* ZSTD_DOUBLE_FAST_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_errors.h b/vendor/github.com/DataDog/zstd/zstd_errors.h index 92a3433..5ac47dd 100644 --- a/vendor/github.com/DataDog/zstd/zstd_errors.h +++ b/vendor/github.com/DataDog/zstd/zstd_errors.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -15,24 +16,32 @@ extern "C" { #endif -/*===== dependency =====*/ -#include /* size_t */ - - /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ -#ifndef ZSTDERRORLIB_VISIBILITY -# if defined(__GNUC__) && (__GNUC__ >= 4) -# define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default"))) +#ifndef ZSTDERRORLIB_VISIBLE + /* Backwards compatibility with old macro name */ +# ifdef ZSTDERRORLIB_VISIBILITY +# define ZSTDERRORLIB_VISIBLE ZSTDERRORLIB_VISIBILITY +# elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__) +# define ZSTDERRORLIB_VISIBLE __attribute__ ((visibility ("default"))) +# else +# define ZSTDERRORLIB_VISIBLE +# endif +#endif + +#ifndef ZSTDERRORLIB_HIDDEN +# if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__) +# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) # else -# define ZSTDERRORLIB_VISIBILITY +# define ZSTDERRORLIB_HIDDEN # endif #endif + #if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) -# define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY +# define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBLE #elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) -# define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBILITY /* It isn't 
required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +# define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBLE /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ #else -# define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY +# define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE #endif /*-********************************************* @@ -58,14 +67,18 @@ typedef enum { ZSTD_error_frameParameter_windowTooLarge = 16, ZSTD_error_corruption_detected = 20, ZSTD_error_checksum_wrong = 22, + ZSTD_error_literals_headerWrong = 24, ZSTD_error_dictionary_corrupted = 30, ZSTD_error_dictionary_wrong = 32, ZSTD_error_dictionaryCreation_failed = 34, ZSTD_error_parameter_unsupported = 40, + ZSTD_error_parameter_combination_unsupported = 41, ZSTD_error_parameter_outOfBound = 42, ZSTD_error_tableLog_tooLarge = 44, ZSTD_error_maxSymbolValue_tooLarge = 46, ZSTD_error_maxSymbolValue_tooSmall = 48, + ZSTD_error_cannotProduce_uncompressedBlock = 49, + ZSTD_error_stabilityCondition_notRespected = 50, ZSTD_error_stage_wrong = 60, ZSTD_error_init_missing = 62, ZSTD_error_memory_allocation = 64, @@ -73,16 +86,18 @@ typedef enum { ZSTD_error_dstSize_tooSmall = 70, ZSTD_error_srcSize_wrong = 72, ZSTD_error_dstBuffer_null = 74, + ZSTD_error_noForwardProgress_destFull = 80, + ZSTD_error_noForwardProgress_inputEmpty = 82, /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ ZSTD_error_frameIndex_tooLarge = 100, ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, + ZSTD_error_sequenceProducer_failed = 106, + ZSTD_error_externalSequences_invalid = 107, ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ } ZSTD_ErrorCode; -/*! 
ZSTD_getErrorCode() : - convert a `size_t` function result into a `ZSTD_ErrorCode` enum type, - which can be used to compare with enum list published above */ -ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */ @@ -91,3 +106,5 @@ ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /**< Sa #endif #endif /* ZSTD_ERRORS_H_398273423 */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_fast.c b/vendor/github.com/DataDog/zstd/zstd_fast.c index 6dbefee..ea02c12 100644 --- a/vendor/github.com/DataDog/zstd/zstd_fast.c +++ b/vendor/github.com/DataDog/zstd/zstd_fast.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,8 +12,46 @@ #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ #include "zstd_fast.h" +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_fillHashTableForCDict(ZSTD_MatchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + U32 const mls = cParams->minMatch; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + + /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. + * Feel free to remove this assert if there's a good reason! 
*/ + assert(dtlm == ZSTD_dtlm_full); -void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ + for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { + U32 const curr = (U32)(ip - base); + { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); + ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } + + if (dtlm == ZSTD_dtlm_fast) continue; + /* Only load extra positions for ZSTD_dtlm_full */ + { U32 p; + for (p = 1; p < fastHashFillStep; ++p) { + size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); + if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ + ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); + } } } } +} + +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_fillHashTableForCCtx(ZSTD_MatchState_t* ms, const void* const end, ZSTD_dictTableLoadMethod_e dtlm) { @@ -25,177 +64,426 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; const U32 fastHashFillStep = 3; + /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. + * Feel free to remove this assert if there's a good reason! */ + assert(dtlm == ZSTD_dtlm_fast); + /* Always insert every fastHashFillStep position into the hash table. * Insert the other positions if their hash entry is empty. 
*/ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { - U32 const current = (U32)(ip - base); + U32 const curr = (U32)(ip - base); size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls); - hashTable[hash0] = current; + hashTable[hash0] = curr; if (dtlm == ZSTD_dtlm_fast) continue; /* Only load extra positions for ZSTD_dtlm_full */ { U32 p; for (p = 1; p < fastHashFillStep; ++p) { size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls); if (hashTable[hash] == 0) { /* not yet filled */ - hashTable[hash] = current + p; + hashTable[hash] = curr + p; } } } } } +void ZSTD_fillHashTable(ZSTD_MatchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp) +{ + if (tfp == ZSTD_tfp_forCDict) { + ZSTD_fillHashTableForCDict(ms, end, dtlm); + } else { + ZSTD_fillHashTableForCCtx(ms, end, dtlm); + } +} + + +typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit); + +static int +ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) +{ + /* Array of ~random data, should have low probability of matching data. + * Load from here if the index is invalid. + * Used to avoid unpredictable branches. */ + static const BYTE dummy[] = {0x12,0x34,0x56,0x78}; + + /* currentIdx >= lowLimit is a (somewhat) unpredictable branch. + * However expression below compiles into conditional move. + */ + const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy); + /* Note: this used to be written as : return test1 && test2; + * Unfortunately, once inlined, these tests become branches, + * in which case it becomes critical that they are executed in the right order (test1 then test2). + * So we have to write these tests in a specific manner to ensure their ordering. 
+ */ + if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0; + /* force ordering of these tests, which matters once the function is inlined, as they become branches */ +#if defined(__GNUC__) + __asm__(""); +#endif + return matchIdx >= idxLowLimit; +} + +static int +ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) +{ + /* using a branch instead of a cmov, + * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true, + * aka almost all candidates are within range */ + U32 mval; + if (matchIdx >= idxLowLimit) { + mval = MEM_read32(matchAddress); + } else { + mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */ + } + + return (MEM_read32(currentPtr) == mval); +} -FORCE_INLINE_TEMPLATE size_t -ZSTD_compressBlock_fast_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + +/** + * If you squint hard enough (and ignore repcodes), the search operation at any + * given position is broken into 4 stages: + * + * 1. Hash (map position to hash value via input read) + * 2. Lookup (map hash val to index via hashtable read) + * 3. Load (map index to value at that position via input read) + * 4. Compare + * + * Each of these steps involves a memory read at an address which is computed + * from the previous step. This means these steps must be sequenced and their + * latencies are cumulative. + * + * Rather than do 1->2->3->4 sequentially for a single position before moving + * onto the next, this implementation interleaves these operations across the + * next few positions: + * + * R = Repcode Read & Compare + * H = Hash + * T = Table Lookup + * M = Match Read & Compare + * + * Pos | Time --> + * ----+------------------- + * N | ... M + * N+1 | ... TM + * N+2 | R H T M + * N+3 | H TM + * N+4 | R H T M + * N+5 | H ... + * N+6 | R ... + * + * This is very much analogous to the pipelining of execution in a CPU. 
And just + * like a CPU, we have to dump the pipeline when we find a match (i.e., take a + * branch). + * + * When this happens, we throw away our current state, and do the following prep + * to re-enter the loop: + * + * Pos | Time --> + * ----+------------------- + * N | H T + * N+1 | H + * + * This is also the work we do at the beginning to enter the loop initially. + */ +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_fast_noDict_generic( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, - U32 const mls) + U32 const mls, int useCmov) { const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; U32 const hlog = cParams->hashLog; - /* support stepSize of 0 */ - size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */ const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; - /* We check ip0 (ip + 0) and ip1 (ip + 1) each loop */ - const BYTE* ip0 = istart; - const BYTE* ip1; - const BYTE* anchor = istart; const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); - const U32 maxDistance = 1U << cParams->windowLog; - const U32 validStartIndex = ms->window.dictLimit; - const U32 prefixStartIndex = (endIndex - validStartIndex > maxDistance) ? 
endIndex - maxDistance : validStartIndex; + const U32 prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); const BYTE* const prefixStart = base + prefixStartIndex; const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; - U32 offset_1=rep[0], offset_2=rep[1]; - U32 offsetSaved = 0; - /* init */ + const BYTE* anchor = istart; + const BYTE* ip0 = istart; + const BYTE* ip1; + const BYTE* ip2; + const BYTE* ip3; + U32 current0; + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; + U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ + U32 matchIdx; /* match idx for ip0 */ + + U32 offcode; + const BYTE* match0; + size_t mLength; + + /* ip0 and ip1 are always adjacent. The targetLength skipping and + * uncompressibility acceleration is applied to every other position, + * matching the behavior of #1562. step therefore represents the gap + * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */ + size_t step; + const BYTE* nextStep; + const size_t kStepIncr = (1 << (kSearchStrength - 1)); + const ZSTD_match4Found matchFound = useCmov ? 
ZSTD_match4Found_cmov : ZSTD_match4Found_branch; + DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); ip0 += (ip0 == prefixStart); + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; + if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; + if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +_start: /* Requires: ip0 */ + + step = stepSize; + nextStep = ip0 + kStepIncr; + + /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ip1 = ip0 + 1; - { U32 const maxRep = (U32)(ip0 - prefixStart); - if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; - if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; + ip2 = ip0 + step; + ip3 = ip2 + 1; + + if (ip3 >= ilimit) { + goto _cleanup; } - /* Main Search Loop */ - while (ip1 < ilimit) { /* < instead of <=, because check at ip0+2 */ - size_t mLength; - BYTE const* ip2 = ip0 + 2; - size_t const h0 = ZSTD_hashPtr(ip0, hlog, mls); - U32 const val0 = MEM_read32(ip0); - size_t const h1 = ZSTD_hashPtr(ip1, hlog, mls); - U32 const val1 = MEM_read32(ip1); - U32 const current0 = (U32)(ip0-base); - U32 const current1 = (U32)(ip1-base); - U32 const matchIndex0 = hashTable[h0]; - U32 const matchIndex1 = hashTable[h1]; - BYTE const* repMatch = ip2-offset_1; - const BYTE* match0 = base + matchIndex0; - const BYTE* match1 = base + matchIndex1; - U32 offcode; - hashTable[h0] = current0; /* update hash table */ - hashTable[h1] = current1; /* update hash table */ - - assert(ip0 + 1 == ip1); - - if ((offset_1 > 0) & (MEM_read32(repMatch) == MEM_read32(ip2))) { - mLength = ip2[-1] == repMatch[-1] ? 
1 : 0; - ip0 = ip2 - mLength; - match0 = repMatch - mLength; - offcode = 0; + hash0 = ZSTD_hashPtr(ip0, hlog, mls); + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + + matchIdx = hashTable[hash0]; + + do { + /* load repcode match for ip[2]*/ + const U32 rval = MEM_read32(ip2 - rep_offset1); + + /* write back hash table entry */ + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + + /* check repcode at ip[2] */ + if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) { + ip0 = ip2; + match0 = ip0 - rep_offset1; + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; + offcode = REPCODE1_TO_OFFBASE; + mLength += 4; + + /* Write next hash table entry: it's already calculated. + * This write is known to be safe because ip1 is before the + * repcode (ip2). */ + hashTable[hash1] = (U32)(ip1 - base); + goto _match; } - if ((matchIndex0 > prefixStartIndex) && MEM_read32(match0) == val0) { - /* found a regular match */ + + if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { + /* Write next hash table entry (it's already calculated). + * This write is known to be safe because the ip1 == ip0 + 1, + * so searching will resume after ip1 */ + hashTable[hash1] = (U32)(ip1 - base); + goto _offset; } - if ((matchIndex1 > prefixStartIndex) && MEM_read32(match1) == val1) { - /* found a regular match after one literal */ - ip0 = ip1; - match0 = match1; + + /* lookup ip[1] */ + matchIdx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 = ip1; + ip1 = ip2; + ip2 = ip3; + + /* write back hash table entry */ + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + + if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { + /* Write next hash table entry, since it's already calculated */ + if (step <= 4) { + /* Avoid writing an index if it's >= position where search will resume. 
+ * The minimum possible match has length 4, so search can resume at ip0 + 4. + */ + hashTable[hash1] = (U32)(ip1 - base); + } goto _offset; } - { size_t const step = ((size_t)(ip0-anchor) >> (kSearchStrength - 1)) + stepSize; - assert(step >= 2); - ip0 += step; - ip1 += step; - continue; + + /* lookup ip[1] */ + matchIdx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 = ip1; + ip1 = ip2; + ip2 = ip0 + step; + ip3 = ip1 + step; + + /* calculate step */ + if (ip2 >= nextStep) { + step++; + PREFETCH_L1(ip1 + 64); + PREFETCH_L1(ip1 + 128); + nextStep += kStepIncr; } -_offset: /* Requires: ip0, match0 */ - /* Compute the offset code */ - offset_2 = offset_1; - offset_1 = (U32)(ip0-match0); - offcode = offset_1 + ZSTD_REP_MOVE; - mLength = 0; - /* Count the backwards match length */ - while (((ip0>anchor) & (match0>prefixStart)) - && (ip0[-1] == match0[-1])) { ip0--; match0--; mLength++; } /* catch up */ + } while (ip3 < ilimit); + +_cleanup: + /* Note that there are probably still a couple positions one could search. + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + + /* When the repcodes are outside of the prefix, we set them to zero before the loop. + * When the offsets are still zero, we need to restore them after the block to have a correct + * repcode history. If only one offset was invalid, it is easy. The tricky case is when both + * offsets were invalid. We need to figure out which offset to refill with. + * - If both offsets are zero they are in the same order. + * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. + * - If only one is zero, we need to decide which offset to restore. + * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. + * - It is impossible for rep_offset2 to be non-zero. 
+ * + * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then + * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. + */ + offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ + rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; + rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + +_offset: /* Requires: ip0, idx */ + + /* Compute the offset code. */ + match0 = base + matchIdx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); + offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ + while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) { + ip0--; + match0--; + mLength++; + } _match: /* Requires: ip0, match0, offcode */ - /* Count the forward length */ - mLength += ZSTD_count(ip0+mLength+4, match0+mLength+4, iend) + 4; - ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, offcode, mLength-MINMATCH); - /* match found */ - ip0 += mLength; - anchor = ip0; - ip1 = ip0 + 1; - if (ip0 <= ilimit) { - /* Fill Table */ - assert(base+current0+2 > istart); /* check base overflow */ - hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + /* Count the forward length. */ + mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend); + + ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); + + ip0 += mLength; + anchor = ip0; - while ( ((ip0 <= ilimit) & (offset_2>0)) /* offset_2==0 means offset_2 is invalidated */ - && (MEM_read32(ip0) == MEM_read32(ip0 - offset_2)) ) { + /* Fill table and check for immediate repcode. 
*/ + if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+current0+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */ + while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) { /* store sequence */ - size_t const rLength = ZSTD_count(ip0+4, ip0+4-offset_2, iend) + 4; - { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */ + size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4; + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ip0 += rLength; - ip1 = ip0 + 1; - ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, 0 /*offCode*/, rLength-MINMATCH); + ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); anchor = ip0; continue; /* faster when present (confirmed on gcc-8) ... (?) */ - } - } - } - - /* save reps for next block */ - rep[0] = offset_1 ? offset_1 : offsetSaved; - rep[1] = offset_2 ? 
offset_2 : offsetSaved; + } } } - /* Return the last literals size */ - return (size_t)(iend - anchor); + goto _start; } +#define ZSTD_GEN_FAST_FN(dictMode, mml, cmov) \ + static size_t ZSTD_compressBlock_fast_##dictMode##_##mml##_##cmov( \ + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ + return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mml, cmov); \ + } + +ZSTD_GEN_FAST_FN(noDict, 4, 1) +ZSTD_GEN_FAST_FN(noDict, 5, 1) +ZSTD_GEN_FAST_FN(noDict, 6, 1) +ZSTD_GEN_FAST_FN(noDict, 7, 1) + +ZSTD_GEN_FAST_FN(noDict, 4, 0) +ZSTD_GEN_FAST_FN(noDict, 5, 0) +ZSTD_GEN_FAST_FN(noDict, 6, 0) +ZSTD_GEN_FAST_FN(noDict, 7, 0) size_t ZSTD_compressBlock_fast( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - U32 const mls = ms->cParams.minMatch; + U32 const mml = ms->cParams.minMatch; + /* use cmov when "candidate in range" branch is likely unpredictable */ + int const useCmov = ms->cParams.windowLog < 19; assert(ms->dictMatchState == NULL); - switch(mls) - { - default: /* includes case 3 */ - case 4 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4); - case 5 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5); - case 6 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6); - case 7 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7); + if (useCmov) { + switch(mml) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize); + case 5 : + return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize); + case 6 : + return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize); + case 7 : + return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize); 
+ } + } else { + /* use a branch instead */ + switch(mml) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize); + case 5 : + return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize); + case 6 : + return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize); + case 7 : + return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize); + } } } FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_compressBlock_fast_dictMatchState_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, U32 const mls) + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) { const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; @@ -204,16 +492,16 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( U32 const stepSize = cParams->targetLength + !(cParams->targetLength); const BYTE* const base = ms->window.base; const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; + const BYTE* ip0 = istart; + const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ const BYTE* anchor = istart; const U32 prefixStartIndex = ms->window.dictLimit; const BYTE* const prefixStart = base + prefixStartIndex; const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - HASH_READ_SIZE; U32 offset_1=rep[0], offset_2=rep[1]; - U32 offsetSaved = 0; - const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_MatchState_t* const dms = ms->dictMatchState; const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; const U32* const dictHashTable = dms->hashTable; const U32 dictStartIndex = dms->window.dictLimit; @@ -221,127 +509,183 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const BYTE* const dictStart = dictBase + 
dictStartIndex; const BYTE* const dictEnd = dms->window.nextSrc; const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); - const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); - const U32 dictHLog = dictCParams->hashLog; + const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); + const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; /* if a dictionary is still attached, it necessarily means that * it is within window size. So we just check it. */ const U32 maxDistance = 1U << cParams->windowLog; - const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); assert(endIndex - prefixStartIndex <= maxDistance); (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ - /* ensure there will be no no underflow + (void)hasStep; /* not currently specialized on whether it's accelerated */ + + /* ensure there will be no underflow * when translating a dict index into a local index */ assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + if (ms->prefetchCDictTables) { + size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); + PREFETCH_AREA(dictHashTable, hashTableBytes); + } + /* init */ DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); - ip += (dictAndPrefixLength == 0); + ip0 += (dictAndPrefixLength == 0); /* dictMatchState repCode checks don't currently handle repCode == 0 * disabling. 
*/ assert(offset_1 <= dictAndPrefixLength); assert(offset_2 <= dictAndPrefixLength); - /* Main Search Loop */ - while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + /* Outer search loop */ + assert(stepSize >= 1); + while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ size_t mLength; - size_t const h = ZSTD_hashPtr(ip, hlog, mls); - U32 const current = (U32)(ip-base); - U32 const matchIndex = hashTable[h]; - const BYTE* match = base + matchIndex; - const U32 repIndex = current + 1 - offset_1; - const BYTE* repMatch = (repIndex < prefixStartIndex) ? - dictBase + (repIndex - dictIndexDelta) : - base + repIndex; - hashTable[h] = current; /* update hash table */ - - if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ - && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { - const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; - mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; - ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); - } else if ( (matchIndex <= prefixStartIndex) ) { - size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); - U32 const dictMatchIndex = dictHashTable[dictHash]; - const BYTE* dictMatch = dictBase + dictMatchIndex; - if (dictMatchIndex <= dictStartIndex || - MEM_read32(dictMatch) != MEM_read32(ip)) { - assert(stepSize >= 1); - ip += ((ip-anchor) >> kSearchStrength) + stepSize; - continue; - } else { - /* found a dict match */ - U32 const offset = (U32)(current-dictMatchIndex-dictIndexDelta); - mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; - while (((ip>anchor) & (dictMatch>dictStart)) - && (ip[-1] == dictMatch[-1])) { - ip--; dictMatch--; mLength++; + size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); + + size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); + U32 
dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; + int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); + + U32 matchIndex = hashTable[hash0]; + U32 curr = (U32)(ip0 - base); + size_t step = stepSize; + const size_t kStepIncr = 1 << kSearchStrength; + const BYTE* nextStep = ip0 + kStepIncr; + + /* Inner search loop */ + while (1) { + const BYTE* match = base + matchIndex; + const U32 repIndex = curr + 1 - offset_1; + const BYTE* repMatch = (repIndex < prefixStartIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); + size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); + hashTable[hash0] = curr; /* update hash table */ + + if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { + const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; + ip0++; + ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + break; + } + + if (dictTagsMatch) { + /* Found a possible dict match */ + const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatch = dictBase + dictMatchIndex; + if (dictMatchIndex > dictStartIndex && + MEM_read32(dictMatch) == MEM_read32(ip0)) { + /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ + if (matchIndex <= prefixStartIndex) { + U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); + mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; + while (((ip0 > anchor) & (dictMatch > dictStart)) + && (ip0[-1] == dictMatch[-1])) { + ip0--; + dictMatch--; + mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t) 
(ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + break; + } + } + } + + if (ZSTD_match4Found_cmov(ip0, match, matchIndex, prefixStartIndex)) { + /* found a regular match of size >= 4 */ + U32 const offset = (U32) (ip0 - match); + mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; + while (((ip0 > anchor) & (match > prefixStart)) + && (ip0[-1] == match[-1])) { + ip0--; + match--; + mLength++; } /* catch up */ offset_2 = offset_1; offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + break; } - } else if (MEM_read32(match) != MEM_read32(ip)) { - /* it's not a match, and we're not going to check the dictionary */ - assert(stepSize >= 1); - ip += ((ip-anchor) >> kSearchStrength) + stepSize; - continue; - } else { - /* found a regular match */ - U32 const offset = (U32)(ip-match); - mLength = ZSTD_count(ip+4, match+4, iend) + 4; - while (((ip>anchor) & (match>prefixStart)) - && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ - offset_2 = offset_1; - offset_1 = offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); - } + + /* Prepare for next iteration */ + dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; + dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); + matchIndex = hashTable[hash1]; + + if (ip1 >= nextStep) { + step++; + nextStep += kStepIncr; + } + ip0 = ip1; + ip1 = ip1 + step; + if (ip1 > ilimit) goto _cleanup; + + curr = (U32)(ip0 - base); + hash0 = hash1; + } /* end inner search loop */ /* match found */ - ip += mLength; - anchor = ip; + assert(mLength); + ip0 += mLength; + anchor = ip0; - if (ip <= ilimit) { + if (ip0 <= ilimit) { /* Fill Table */ - assert(base+current+2 > istart); /* check base overflow */ - 
hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2; /* here because current+2 could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); + assert(base+curr+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); /* check immediate repcode */ - while (ip <= ilimit) { - U32 const current2 = (U32)(ip-base); + while (ip0 <= ilimit) { + U32 const current2 = (U32)(ip0-base); U32 const repIndex2 = current2 - offset_2; const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase - dictIndexDelta + repIndex2 : base + repIndex2; - if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) - && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + if ( (ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) + && (MEM_read32(repMatch2) == MEM_read32(ip0))) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); - hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; - ip += repLength2; - anchor = ip; + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; + ip0 += repLength2; + anchor = ip0; continue; } break; } } + + /* Prepare for next iteration */ + assert(ip0 == anchor); + ip1 = ip0 + stepSize; } +_cleanup: /* save reps for next block */ - rep[0] = offset_1 ? offset_1 : offsetSaved; - rep[1] = offset_2 ? 
offset_2 : offsetSaved; + rep[0] = offset_1; + rep[1] = offset_2; /* Return the last literals size */ return (size_t)(iend - anchor); } + +ZSTD_GEN_FAST_FN(dictMatchState, 4, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 5, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 6, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 7, 0) + size_t ZSTD_compressBlock_fast_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { U32 const mls = ms->cParams.minMatch; @@ -350,30 +694,31 @@ size_t ZSTD_compressBlock_fast_dictMatchState( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4); + return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5); + return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6); + return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7); + return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize); } } -static size_t ZSTD_compressBlock_fast_extDict_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, U32 const mls) +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_fast_extDict_generic( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) { const ZSTD_compressionParameters* const cParams = &ms->cParams; U32* const hashTable = ms->hashTable; U32 const hlog = cParams->hashLog; /* support stepSize of 0 */ - U32 
const stepSize = cParams->targetLength + !(cParams->targetLength); + size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; const BYTE* const base = ms->window.base; const BYTE* const dictBase = ms->window.dictBase; const BYTE* const istart = (const BYTE*)src; - const BYTE* ip = istart; const BYTE* anchor = istart; const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); @@ -386,99 +731,258 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( const BYTE* const iend = istart + srcSize; const BYTE* const ilimit = iend - 8; U32 offset_1=rep[0], offset_2=rep[1]; + U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const BYTE* ip0 = istart; + const BYTE* ip1; + const BYTE* ip2; + const BYTE* ip3; + U32 current0; + + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ + U32 idx; /* match idx for ip0 */ + const BYTE* idxBase; /* base pointer for idx */ + + U32 offcode; + const BYTE* match0; + size_t mLength; + const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */ + + size_t step; + const BYTE* nextStep; + const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ - DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic"); + DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1); /* switch to "regular" variant if extDict is invalidated due to maxDistance */ if (prefixStartIndex == dictStartIndex) - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, mls); - - /* Search Loop */ - while (ip < ilimit) { /* < instead of <=, because (ip+1) */ - const size_t h = ZSTD_hashPtr(ip, hlog, mls); - const U32 matchIndex = hashTable[h]; - const BYTE* const matchBase = matchIndex < prefixStartIndex ? 
dictBase : base; - const BYTE* match = matchBase + matchIndex; - const U32 current = (U32)(ip-base); - const U32 repIndex = current + 1 - offset_1; - const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; - const BYTE* const repMatch = repBase + repIndex; - hashTable[h] = current; /* update hash table */ - assert(offset_1 <= current +1); /* check repIndex */ - - if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex)) - && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { - const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; - size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; - ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, rLength-MINMATCH); - ip += rLength; - anchor = ip; - } else { - if ( (matchIndex < dictStartIndex) || - (MEM_read32(match) != MEM_read32(ip)) ) { - assert(stepSize >= 1); - ip += ((ip-anchor) >> kSearchStrength) + stepSize; - continue; + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + + { U32 const curr = (U32)(ip0 - base); + U32 const maxRep = curr - dictStartIndex; + if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; + if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* start each op */ +_start: /* Requires: ip0 */ + + step = stepSize; + nextStep = ip0 + kStepIncr; + + /* calculate positions, ip0 - anchor == 0, so we skip step calc */ + ip1 = ip0 + 1; + ip2 = ip0 + step; + ip3 = ip2 + 1; + + if (ip3 >= ilimit) { + goto _cleanup; + } + + hash0 = ZSTD_hashPtr(ip0, hlog, mls); + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + + idx = hashTable[hash0]; + idxBase = idx < prefixStartIndex ? dictBase : base; + + do { + { /* load repcode match for ip[2] */ + U32 const current2 = (U32)(ip2 - base); + U32 const repIndex = current2 - offset_1; + const BYTE* const repBase = repIndex < prefixStartIndex ? 
dictBase : base; + U32 rval; + if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ + & (offset_1 > 0) ) { + rval = MEM_read32(repBase + repIndex); + } else { + rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ } - { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; - const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; - U32 const offset = current - matchIndex; - size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; - while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ - offset_2 = offset_1; offset_1 = offset; /* update offset history */ - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); - ip += mLength; - anchor = ip; + + /* write back hash table entry */ + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + + /* check repcode at ip[2] */ + if (MEM_read32(ip2) == rval) { + ip0 = ip2; + match0 = repBase + repIndex; + matchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + assert((match0 != prefixStart) & (match0 != dictStart)); + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; + offcode = REPCODE1_TO_OFFBASE; + mLength += 4; + goto _match; } } - if (ip <= ilimit) { - /* Fill Table */ - hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2; - hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); - /* check immediate repcode */ - while (ip <= ilimit) { - U32 const current2 = (U32)(ip-base); - U32 const repIndex2 = current2 - offset_2; - const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; - if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex)) /* intentional overflow */ - && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { - const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; - { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, 0 /*offcode*/, repLength2-MINMATCH); - hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; - ip += repLength2; - anchor = ip; - continue; - } - break; - } } } + { /* load match for ip[0] */ + U32 const mval = idx >= dictStartIndex ? + MEM_read32(idxBase + idx) : + MEM_read32(ip0) ^ 1; /* guaranteed not to match */ + + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ + goto _offset; + } } + + /* lookup ip[1] */ + idx = hashTable[hash1]; + idxBase = idx < prefixStartIndex ? dictBase : base; + + /* hash ip[2] */ + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 = ip1; + ip1 = ip2; + ip2 = ip3; + + /* write back hash table entry */ + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + + { /* load match for ip[0] */ + U32 const mval = idx >= dictStartIndex ? + MEM_read32(idxBase + idx) : + MEM_read32(ip0) ^ 1; /* guaranteed not to match */ + + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ + goto _offset; + } } + + /* lookup ip[1] */ + idx = hashTable[hash1]; + idxBase = idx < prefixStartIndex ? dictBase : base; + + /* hash ip[2] */ + hash0 = hash1; + hash1 = ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 = ip1; + ip1 = ip2; + ip2 = ip0 + step; + ip3 = ip1 + step; + + /* calculate step */ + if (ip2 >= nextStep) { + step++; + PREFETCH_L1(ip1 + 64); + PREFETCH_L1(ip1 + 128); + nextStep += kStepIncr; + } + } while (ip3 < ilimit); + +_cleanup: + /* Note that there are probably still a couple positions we could search. + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. 
*/ + + /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), + * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ + offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; /* save reps for next block */ - rep[0] = offset_1; - rep[1] = offset_2; + rep[0] = offset_1 ? offset_1 : offsetSaved1; + rep[1] = offset_2 ? offset_2 : offsetSaved2; /* Return the last literals size */ return (size_t)(iend - anchor); + +_offset: /* Requires: ip0, idx, idxBase */ + + /* Compute the offset code. */ + { U32 const offset = current0 - idx; + const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; + matchEnd = idx < prefixStartIndex ? dictEnd : iend; + match0 = idxBase + idx; + offset_2 = offset_1; + offset_1 = offset; + offcode = OFFSET_TO_OFFBASE(offset); + mLength = 4; + + /* Count the backwards match length. */ + while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { + ip0--; + match0--; + mLength++; + } } + +_match: /* Requires: ip0, match0, offcode, matchEnd */ + + /* Count the forward length. */ + assert(matchEnd != 0); + mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); + + ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); + + ip0 += mLength; + anchor = ip0; + + /* write next hash table entry */ + if (ip1 < ip0) { + hashTable[hash1] = (U32)(ip1 - base); + } + + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+current0+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + while (ip0 <= ilimit) { + U32 const repIndex2 = (U32)(ip0-base) - offset_2; + const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? 
dictBase + repIndex2 : base + repIndex2; + if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) & (offset_2 > 0)) + && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += repLength2; + anchor = ip0; + continue; + } + break; + } } + + goto _start; } +ZSTD_GEN_FAST_FN(extDict, 4, 0) +ZSTD_GEN_FAST_FN(extDict, 5, 0) +ZSTD_GEN_FAST_FN(extDict, 6, 0) +ZSTD_GEN_FAST_FN(extDict, 7, 0) size_t ZSTD_compressBlock_fast_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { U32 const mls = ms->cParams.minMatch; + assert(ms->dictMatchState == NULL); switch(mls) { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); + return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); + return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); + return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); + return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize); } } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_fast.h 
b/vendor/github.com/DataDog/zstd/zstd_fast.h index b74a88c..e6cefaf 100644 --- a/vendor/github.com/DataDog/zstd/zstd_fast.h +++ b/vendor/github.com/DataDog/zstd/zstd_fast.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,27 +12,22 @@ #ifndef ZSTD_FAST_H #define ZSTD_FAST_H -#if defined (__cplusplus) -extern "C" { -#endif - #include "mem.h" /* U32 */ #include "zstd_compress_internal.h" -void ZSTD_fillHashTable(ZSTD_matchState_t* ms, - void const* end, ZSTD_dictTableLoadMethod_e dtlm); +void ZSTD_fillHashTable(ZSTD_MatchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm, + ZSTD_tableFillPurpose_e tfp); size_t ZSTD_compressBlock_fast( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_fast_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_fast_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -#if defined (__cplusplus) -} -#endif - #endif /* ZSTD_FAST_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_internal.h b/vendor/github.com/DataDog/zstd/zstd_internal.h index dcdcbdb..b10d69a 100644 --- a/vendor/github.com/DataDog/zstd/zstd_internal.h +++ b/vendor/github.com/DataDog/zstd/zstd_internal.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. 
* All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -20,6 +21,7 @@ * Dependencies ***************************************/ #include "compiler.h" +#include "cpu.h" #include "mem.h" #include "debug.h" /* assert, DEBUGLOG, RAWLOG, g_debuglevel */ #include "error_private.h" @@ -27,15 +29,15 @@ #include "zstd.h" #define FSE_STATIC_LINKING_ONLY #include "fse.h" -#define HUF_STATIC_LINKING_ONLY #include "huf.h" #ifndef XXH_STATIC_LINKING_ONLY # define XXH_STATIC_LINKING_ONLY /* XXH64_state_t */ #endif #include "xxhash.h" /* XXH_reset, update, digest */ - -#if defined (__cplusplus) -extern "C" { +#ifndef ZSTD_NO_TRACE +# include "zstd_trace.h" +#else +# define ZSTD_TRACE 0 #endif /* ---- static assert (debug) --- */ @@ -52,50 +54,7 @@ extern "C" { #undef MAX #define MIN(a,b) ((a)<(b) ? (a) : (b)) #define MAX(a,b) ((a)>(b) ? (a) : (b)) - -/** - * Return the specified error if the condition evaluates to true. - * - * In debug modes, prints additional information. - * In order to do that (particularly, printing the conditional that failed), - * this can't just wrap RETURN_ERROR(). - */ -#define RETURN_ERROR_IF(cond, err, ...) \ - if (cond) { \ - RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", __FILE__, __LINE__, ZSTD_QUOTE(cond), ZSTD_QUOTE(ERROR(err))); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return ERROR(err); \ - } - -/** - * Unconditionally return the specified error. - * - * In debug modes, prints additional information. - */ -#define RETURN_ERROR(err, ...) \ - do { \ - RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", __FILE__, __LINE__, ZSTD_QUOTE(ERROR(err))); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return ERROR(err); \ - } while(0); - -/** - * If the provided expression evaluates to an error code, returns that error code. - * - * In debug modes, prints additional information. - */ -#define FORWARD_IF_ERROR(err, ...) 
\ - do { \ - size_t const err_code = (err); \ - if (ERR_isError(err_code)) { \ - RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", __FILE__, __LINE__, ZSTD_QUOTE(err), ERR_getErrorName(err_code)); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return err_code; \ - } \ - } while(0); +#define BOUNDED(min,val,max) (MAX(min,MIN(val,max))) /*-************************************* @@ -104,8 +63,7 @@ extern "C" { #define ZSTD_OPT_NUM (1<<12) #define ZSTD_REP_NUM 3 /* number of repcodes */ -#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) -static const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 }; +static UNUSED_ATTR const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 }; #define KB *(1 <<10) #define MB *(1 <<20) @@ -119,26 +77,29 @@ static const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 }; #define BIT0 1 #define ZSTD_WINDOWLOG_ABSOLUTEMIN 10 -static const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 }; -static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 }; +static UNUSED_ATTR const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 }; +static UNUSED_ATTR const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 }; #define ZSTD_FRAMEIDSIZE 4 /* magic number size */ #define ZSTD_BLOCKHEADERSIZE 3 /* C standard doesn't allow `static const` variable to be init using another `static const` variable */ -static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE; +static UNUSED_ATTR const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE; typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; +#define ZSTD_FRAMECHECKSUMSIZE 4 + #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ -#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ +#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */ +#define MIN_LITERALS_FOR_4_STREAMS 6 -#define HufLog 12 -typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; +typedef enum { set_basic, set_rle, 
set_compressed, set_repeat } SymbolEncodingType_e; #define LONGNBSEQ 0x7F00 #define MINMATCH 3 #define Litbits 8 +#define LitHufLog 11 #define MaxLit ((1<= 8 || (ovtype == ZSTD_no_overlap && diff <= -WILDCOPY_VECLEN)); - if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) { /* Handle short offset copies. */ do { - COPY8(op, ip) + COPY8(op, ip); } while (op < oend); } else { assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); - /* Separate out the first two COPY16() calls because the copy length is + /* Separate out the first COPY16() call because the copy length is * almost certain to be short, so the branches have different - * probabilities. - * On gcc-9 unrolling once is +1.6%, twice is +2%, thrice is +1.8%. - * On clang-8 unrolling once is +1.4%, twice is +3.3%, thrice is +3%. + * probabilities. Since it is almost certain to be short, only do + * one COPY16() in the first call. Then, do two calls per loop since + * at that point it is more likely to have a high trip count. */ - COPY16(op, ip); - COPY16(op, ip); - if (op >= oend) return; + ZSTD_copy16(op, ip); + if (16 >= length) return; + op += 16; + ip += 16; do { COPY16(op, ip); COPY16(op, ip); @@ -247,29 +246,35 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e } } +MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + size_t const length = MIN(dstCapacity, srcSize); + if (length > 0) { + ZSTD_memcpy(dst, src, length); + } + return length; +} + +/* define "workspace is too large" as this number of times larger than needed */ +#define ZSTD_WORKSPACETOOLARGE_FACTOR 3 + +/* when workspace is continuously too large + * during at least this number of times, + * context's memory usage is considered wasteful, + * because it's sized to handle a worst case scenario which rarely happens. 
+ * In which case, resize it down to free some memory */ +#define ZSTD_WORKSPACETOOLARGE_MAXDURATION 128 + +/* Controls whether the input/output buffer is buffered or stable. */ +typedef enum { + ZSTD_bm_buffered = 0, /* Buffer the input/output */ + ZSTD_bm_stable = 1 /* ZSTD_inBuffer/ZSTD_outBuffer is stable */ +} ZSTD_bufferMode_e; + /*-******************************************* * Private declarations *********************************************/ -typedef struct seqDef_s { - U32 offset; - U16 litLength; - U16 matchLength; -} seqDef; - -typedef struct { - seqDef* sequencesStart; - seqDef* sequences; - BYTE* litStart; - BYTE* lit; - BYTE* llCode; - BYTE* mlCode; - BYTE* ofCode; - size_t maxNbSeq; - size_t maxNbLit; - U32 longLengthID; /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */ - U32 longLengthPos; -} seqStore_t; /** * Contains the compressed frame size and an upper-bound for the decompressed frame size. @@ -278,45 +283,11 @@ typedef struct { * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` */ typedef struct { + size_t nbBlocks; size_t compressedSize; unsigned long long decompressedBound; } ZSTD_frameSizeInfo; /* decompress & legacy */ -const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ -void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ - -/* custom memory allocation functions */ -void* ZSTD_malloc(size_t size, ZSTD_customMem customMem); -void* ZSTD_calloc(size_t size, ZSTD_customMem customMem); -void ZSTD_free(void* ptr, ZSTD_customMem customMem); - - -MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ -{ - assert(val != 0); - { -# if defined(_MSC_VER) /* Visual */ - unsigned long r=0; - _BitScanReverse(&r, val); - return (unsigned)r; -# elif defined(__GNUC__) && (__GNUC__ >= 3) /* GCC Intrinsic */ - return __builtin_clz (val) ^ 31; -# elif defined(__ICCARM__) /* IAR Intrinsic */ - return 
31 - __CLZ(val); -# else /* Software version */ - static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; - U32 v = val; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; -# endif - } -} - - /* ZSTD_invalidateRepCodes() : * ensures next compression will not use repcodes from previous block. * Note : only works with regular variant; @@ -332,19 +303,25 @@ typedef struct { /*! ZSTD_getcBlockSize() : * Provides the size of compressed block from block header `src` */ -/* Used by: decompress, fullbench (does not get its definition from here) */ +/* Used by: decompress, fullbench */ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr); /*! ZSTD_decodeSeqHeaders() : * decode sequence header from src */ -/* Used by: decompress, fullbench (does not get its definition from here) */ +/* Used by: zstd_decompress_block, fullbench */ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, const void* src, size_t srcSize); - -#if defined (__cplusplus) +/** + * @returns true iff the CPU supports dynamic BMI2 dispatch. + */ +MEM_STATIC int ZSTD_cpuSupportsBmi2(void) +{ + ZSTD_cpuid_t cpuid = ZSTD_cpuid(); + return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid); } -#endif #endif /* ZSTD_CCOMMON_H_MODULE */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_lazy.c b/vendor/github.com/DataDog/zstd/zstd_lazy.c index 9ad7e03..7d39201 100644 --- a/vendor/github.com/DataDog/zstd/zstd_lazy.c +++ b/vendor/github.com/DataDog/zstd/zstd_lazy.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -10,14 +11,23 @@ #include "zstd_compress_internal.h" #include "zstd_lazy.h" +#include "bits.h" /* ZSTD_countTrailingZeros64 */ + +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + +#define kLazySkippingStep 8 /*-************************************* * Binary Tree search ***************************************/ -static void -ZSTD_updateDUBT(ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_updateDUBT(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend, U32 mls) { @@ -58,11 +68,12 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, /** ZSTD_insertDUBT1() : * sort one already inserted but unsorted position - * assumption : current >= btlow == (current - btmask) + * assumption : curr >= btlow == (curr - btmask) * doesn't fail */ -static void -ZSTD_insertDUBT1(ZSTD_matchState_t* ms, - U32 current, const BYTE* inputEnd, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_insertDUBT1(const ZSTD_MatchState_t* ms, + U32 curr, const BYTE* inputEnd, U32 nbCompares, U32 btLow, const ZSTD_dictMode_e dictMode) { @@ -74,41 +85,41 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms, const BYTE* const base = ms->window.base; const BYTE* const dictBase = ms->window.dictBase; const U32 dictLimit = ms->window.dictLimit; - const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current; - const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit; + const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr; + const BYTE* const iend = (curr>=dictLimit) ? 
inputEnd : dictBase + dictLimit; const BYTE* const dictEnd = dictBase + dictLimit; const BYTE* const prefixStart = base + dictLimit; const BYTE* match; - U32* smallerPtr = bt + 2*(current&btMask); + U32* smallerPtr = bt + 2*(curr&btMask); U32* largerPtr = smallerPtr + 1; U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */ U32 dummy32; /* to be nullified at the end */ U32 const windowValid = ms->window.lowLimit; U32 const maxDistance = 1U << cParams->windowLog; - U32 const windowLow = (current - windowValid > maxDistance) ? current - maxDistance : windowValid; + U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid; DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)", - current, dictLimit, windowLow); - assert(current >= btLow); + curr, dictLimit, windowLow); + assert(curr >= btLow); assert(ip < iend); /* condition for ZSTD_count */ - while (nbCompares-- && (matchIndex > windowLow)) { + for (; nbCompares && (matchIndex > windowLow); --nbCompares) { U32* const nextPtr = bt + 2*(matchIndex & btMask); size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ - assert(matchIndex < current); + assert(matchIndex < curr); /* note : all candidates are now supposed sorted, * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */ if ( (dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit) /* both in current segment*/ - || (current < dictLimit) /* both in extDict */) { + || (curr < dictLimit) /* both in extDict */) { const BYTE* const mBase = ( (dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) ? 
base : dictBase; assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */ - || (current < dictLimit) ); + || (curr < dictLimit) ); match = mBase + matchIndex; matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); } else { @@ -119,7 +130,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms, } DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ", - current, matchIndex, (U32)matchLength); + curr, matchIndex, (U32)matchLength); if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */ @@ -149,9 +160,10 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms, } -static size_t -ZSTD_DUBT_findBetterDictMatch ( - ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_DUBT_findBetterDictMatch ( + const ZSTD_MatchState_t* ms, const BYTE* const ip, const BYTE* const iend, size_t* offsetPtr, size_t bestLength, @@ -159,7 +171,7 @@ ZSTD_DUBT_findBetterDictMatch ( U32 const mls, const ZSTD_dictMode_e dictMode) { - const ZSTD_matchState_t * const dms = ms->dictMatchState; + const ZSTD_MatchState_t * const dms = ms->dictMatchState; const ZSTD_compressionParameters* const dmsCParams = &dms->cParams; const U32 * const dictHashTable = dms->hashTable; U32 const hashLog = dmsCParams->hashLog; @@ -168,7 +180,7 @@ ZSTD_DUBT_findBetterDictMatch ( const BYTE* const base = ms->window.base; const BYTE* const prefixStart = base + ms->window.dictLimit; - U32 const current = (U32)(ip-base); + U32 const curr = (U32)(ip-base); const BYTE* const dictBase = dms->window.base; const BYTE* const dictEnd = dms->window.nextSrc; U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base); @@ -185,7 +197,7 @@ ZSTD_DUBT_findBetterDictMatch ( (void)dictMode; assert(dictMode == ZSTD_dictMatchState); - while (nbCompares-- && (dictMatchIndex > dictLowLimit)) { + for (; nbCompares && 
(dictMatchIndex > dictLowLimit); --nbCompares) { U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask); size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ const BYTE* match = dictBase + dictMatchIndex; @@ -195,10 +207,10 @@ ZSTD_DUBT_findBetterDictMatch ( if (matchLength > bestLength) { U32 matchIndex = dictMatchIndex + dictIndexDelta; - if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", - current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex); - bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex; + curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); + bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); } if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ break; /* drop, to guarantee consistency (miss a little bit of compression) */ @@ -218,19 +230,20 @@ ZSTD_DUBT_findBetterDictMatch ( } if (bestLength >= MINMATCH) { - U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; + U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", - current, (U32)bestLength, (U32)*offsetPtr, mIndex); + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); } return bestLength; } -static size_t -ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, +static 
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_DUBT_findBestMatch(ZSTD_MatchState_t* ms, const BYTE* const ip, const BYTE* const iend, - size_t* offsetPtr, + size_t* offBasePtr, U32 const mls, const ZSTD_dictMode_e dictMode) { @@ -241,13 +254,13 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, U32 matchIndex = hashTable[h]; const BYTE* const base = ms->window.base; - U32 const current = (U32)(ip-base); - U32 const windowLow = ZSTD_getLowestMatchIndex(ms, current, cParams->windowLog); + U32 const curr = (U32)(ip-base); + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog); U32* const bt = ms->chainTable; U32 const btLog = cParams->chainLog - 1; U32 const btMask = (1 << btLog) - 1; - U32 const btLow = (btMask >= current) ? 0 : current - btMask; + U32 const btLow = (btMask >= curr) ? 0 : curr - btMask; U32 const unsortLimit = MAX(btLow, windowLow); U32* nextCandidate = bt + 2*(matchIndex&btMask); @@ -256,8 +269,9 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, U32 nbCandidates = nbCompares; U32 previousCandidate = 0; - DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current); + DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr); assert(ip <= iend-8); /* required for h calculation */ + assert(dictMode != ZSTD_dedicatedDictSearch); /* reach end of unsorted candidates list */ while ( (matchIndex > unsortLimit) @@ -299,16 +313,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, const U32 dictLimit = ms->window.dictLimit; const BYTE* const dictEnd = dictBase + dictLimit; const BYTE* const prefixStart = base + dictLimit; - U32* smallerPtr = bt + 2*(current&btMask); - U32* largerPtr = bt + 2*(current&btMask) + 1; - U32 matchEndIdx = current + 8 + 1; + U32* smallerPtr = bt + 2*(curr&btMask); + U32* largerPtr = bt + 2*(curr&btMask) + 1; + U32 matchEndIdx = curr + 8 + 1; U32 dummy32; /* to be nullified at the end */ size_t bestLength = 0; matchIndex = hashTable[h]; - hashTable[h] = current; /* Update Hash Table */ + hashTable[h] = curr; /* Update Hash 
Table */ - while (nbCompares-- && (matchIndex > windowLow)) { + for (; nbCompares && (matchIndex > windowLow); --nbCompares) { U32* const nextPtr = bt + 2*(matchIndex & btMask); size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ const BYTE* match; @@ -326,8 +340,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, if (matchLength > bestLength) { if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; - if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) - bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) + bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ if (dictMode == ZSTD_dictMatchState) { nbCompares = 0; /* in addition to avoiding checking any @@ -356,19 +370,20 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, *smallerPtr = *largerPtr = 0; + assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. 
*/ if (dictMode == ZSTD_dictMatchState && nbCompares) { bestLength = ZSTD_DUBT_findBetterDictMatch( ms, ip, iend, - offsetPtr, bestLength, nbCompares, + offBasePtr, bestLength, nbCompares, mls, dictMode); } - assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */ + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ if (bestLength >= MINMATCH) { - U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; + U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", - current, (U32)bestLength, (U32)*offsetPtr, mIndex); + curr, (U32)bestLength, (U32)*offBasePtr, mIndex); } return bestLength; } @@ -376,69 +391,236 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, /** ZSTD_BtFindBestMatch() : Tree updater, providing best match */ -FORCE_INLINE_TEMPLATE size_t -ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_BtFindBestMatch( ZSTD_MatchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, - size_t* offsetPtr, + size_t* offBasePtr, const U32 mls /* template */, const ZSTD_dictMode_e dictMode) { DEBUGLOG(7, "ZSTD_BtFindBestMatch"); if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ ZSTD_updateDUBT(ms, ip, iLimit, mls); - return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); + return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); } +/*********************************** +* Dedicated dict search +***********************************/ -static size_t -ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) +void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip) { - switch(ms->cParams.minMatch) + const BYTE* 
const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32* const hashTable = ms->hashTable; + U32* const chainTable = ms->chainTable; + U32 const chainSize = 1 << ms->cParams.chainLog; + U32 idx = ms->nextToUpdate; + U32 const minChain = chainSize < target - idx ? target - chainSize : idx; + U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG; + U32 const cacheSize = bucketSize - 1; + U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize; + U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts; + + /* We know the hashtable is oversized by a factor of `bucketSize`. + * We are going to temporarily pretend `bucketSize == 1`, keeping only a + * single entry. We will use the rest of the space to construct a temporary + * chaintable. + */ + U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; + U32* const tmpHashTable = hashTable; + U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog); + U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog; + U32 const tmpMinChain = tmpChainSize < target ? 
target - tmpChainSize : idx; + U32 hashIdx; + + assert(ms->cParams.chainLog <= 24); + assert(ms->cParams.hashLog > ms->cParams.chainLog); + assert(idx != 0); + assert(tmpMinChain <= minChain); + + /* fill conventional hash table and conventional chain table */ + for ( ; idx < target; idx++) { + U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch); + if (idx >= tmpMinChain) { + tmpChainTable[idx - tmpMinChain] = hashTable[h]; + } + tmpHashTable[h] = idx; + } + + /* sort chains into ddss chain table */ { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); + U32 chainPos = 0; + for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) { + U32 count; + U32 countBeyondMinChain = 0; + U32 i = tmpHashTable[hashIdx]; + for (count = 0; i >= tmpMinChain && count < cacheSize; count++) { + /* skip through the chain to the first position that won't be + * in the hash cache bucket */ + if (i < minChain) { + countBeyondMinChain++; + } + i = tmpChainTable[i - tmpMinChain]; + } + if (count == cacheSize) { + for (count = 0; count < chainLimit;) { + if (i < minChain) { + if (!i || ++countBeyondMinChain > cacheSize) { + /* only allow pulling `cacheSize` number of entries + * into the cache or chainTable beyond `minChain`, + * to replace the entries pulled out of the + * chainTable into the cache. This lets us reach + * back further without increasing the total number + * of entries in the chainTable, guaranteeing the + * DDSS chain table will fit into the space + * allocated for the regular one. 
*/ + break; + } + } + chainTable[chainPos++] = i; + count++; + if (i < tmpMinChain) { + break; + } + i = tmpChainTable[i - tmpMinChain]; + } + } else { + count = 0; + } + if (count) { + tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count; + } else { + tmpHashTable[hashIdx] = 0; + } + } + assert(chainPos <= chainSize); /* I believe this is guaranteed... */ + } + + /* move chain pointers into the last entry of each hash bucket */ + for (hashIdx = (1 << hashLog); hashIdx; ) { + U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG; + U32 const chainPackedPointer = tmpHashTable[hashIdx]; + U32 i; + for (i = 0; i < cacheSize; i++) { + hashTable[bucketIdx + i] = 0; + } + hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer; } + + /* fill the buckets of the hash table */ + for (idx = ms->nextToUpdate; idx < target; idx++) { + U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch) + << ZSTD_LAZY_DDSS_BUCKET_LOG; + U32 i; + /* Shift hash cache down 1. */ + for (i = cacheSize - 1; i; i--) + hashTable[h + i] = hashTable[h + i - 1]; + hashTable[h] = idx; + } + + ms->nextToUpdate = target; } +/* Returns the longest match length found in the dedicated dict search structure. + * If none are longer than the argument ml, then ml will be returned. + */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts, + const ZSTD_MatchState_t* const dms, + const BYTE* const ip, const BYTE* const iLimit, + const BYTE* const prefixStart, const U32 curr, + const U32 dictLimit, const size_t ddsIdx) { + const U32 ddsLowestIndex = dms->window.dictLimit; + const BYTE* const ddsBase = dms->window.base; + const BYTE* const ddsEnd = dms->window.nextSrc; + const U32 ddsSize = (U32)(ddsEnd - ddsBase); + const U32 ddsIndexDelta = dictLimit - ddsSize; + const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); + const U32 bucketLimit = nbAttempts < bucketSize - 1 ? 
nbAttempts : bucketSize - 1; + U32 ddsAttempt; + U32 matchIndex; + + for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { + PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); + } -static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); + U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; + U32 const chainIndex = chainPackedPointer >> 8; + + PREFETCH_L1(&dms->chainTable[chainIndex]); } -} + for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) { + size_t currentMl=0; + const BYTE* match; + matchIndex = dms->hashTable[ddsIdx + ddsAttempt]; + match = ddsBase + matchIndex; + + if (!matchIndex) { + return ml; + } + + /* guaranteed by table construction */ + (void)ddsLowestIndex; + assert(matchIndex >= ddsLowestIndex); + assert(match+4 <= ddsEnd); + if (MEM_read32(match) == MEM_read32(ip)) { + /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; + } + } + } -static size_t ZSTD_BtFindBestMatch_extDict_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, 
ZSTD_extDict); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); + U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; + U32 chainIndex = chainPackedPointer >> 8; + U32 const chainLength = chainPackedPointer & 0xFF; + U32 const chainAttempts = nbAttempts - ddsAttempt; + U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts; + U32 chainAttempt; + + for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) { + PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]); + } + + for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) { + size_t currentMl=0; + const BYTE* match; + matchIndex = dms->chainTable[chainIndex]; + match = ddsBase + matchIndex; + + /* guaranteed by table construction */ + assert(matchIndex >= ddsLowestIndex); + assert(match+4 <= ddsEnd); + if (MEM_read32(match) == MEM_read32(ip)) { + /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } } + return ml; } - /* ********************************* * Hash Chain ***********************************/ @@ -446,10 +628,12 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS ( /* Update chains up to ip (excluded) Assumption : always within prefix (i.e. 
not within extDict) */ -static U32 ZSTD_insertAndFindFirstIndex_internal( - ZSTD_matchState_t* ms, +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_MatchState_t* ms, const ZSTD_compressionParameters* const cParams, - const BYTE* ip, U32 const mls) + const BYTE* ip, U32 const mls, U32 const lazySkipping) { U32* const hashTable = ms->hashTable; const U32 hashLog = cParams->hashLog; @@ -464,22 +648,25 @@ static U32 ZSTD_insertAndFindFirstIndex_internal( NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; hashTable[h] = idx; idx++; + /* Stop inserting every position when in the lazy skipping mode. */ + if (lazySkipping) + break; } ms->nextToUpdate = target; return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; } -U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { +U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip) { const ZSTD_compressionParameters* const cParams = &ms->cParams; - return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); } - /* inlining is important to hardwire a hot branch (template emulation) */ FORCE_INLINE_TEMPLATE -size_t ZSTD_HcFindBestMatch_generic ( - ZSTD_matchState_t* ms, +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_HcFindBestMatch( + ZSTD_MatchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, size_t* offsetPtr, const U32 mls, const ZSTD_dictMode_e dictMode) @@ -493,25 +680,39 @@ size_t ZSTD_HcFindBestMatch_generic ( const U32 dictLimit = ms->window.dictLimit; const BYTE* const prefixStart = base + dictLimit; const BYTE* const dictEnd = dictBase + dictLimit; - const U32 current = (U32)(ip-base); + const U32 curr = (U32)(ip-base); const U32 maxDistance = 1U << cParams->windowLog; const U32 lowestValid = ms->window.lowLimit; - const U32 withinMaxDistance = (current - lowestValid > maxDistance) ? 
current - maxDistance : lowestValid; + const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; const U32 isDictionary = (ms->loadedDictEnd != 0); const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance; - const U32 minChain = current > chainSize ? current - chainSize : 0; + const U32 minChain = curr > chainSize ? curr - chainSize : 0; U32 nbAttempts = 1U << cParams->searchLog; size_t ml=4-1; + const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch + ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0; + const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch + ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0; + + U32 matchIndex; + + if (dictMode == ZSTD_dedicatedDictSearch) { + const U32* entry = &dms->hashTable[ddsIdx]; + PREFETCH_L1(entry); + } + /* HC4 match finder */ - U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); + matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); - for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) { + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { size_t currentMl=0; if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { const BYTE* const match = base + matchIndex; assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ - if (match[ml] == ip[ml]) /* potentially better */ + /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ + if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ currentMl = ZSTD_count(ip, match, iLimit); } else { const BYTE* const match = dictBase + matchIndex; @@ -523,7 +724,7 @@ size_t ZSTD_HcFindBestMatch_generic ( /* save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = current - matchIndex + ZSTD_REP_MOVE; + *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); if (ip+currentMl == 
iLimit) break; /* best possible, avoids read overflow on next attempt */ } @@ -531,8 +732,11 @@ size_t ZSTD_HcFindBestMatch_generic ( matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask); } - if (dictMode == ZSTD_dictMatchState) { - const ZSTD_matchState_t* const dms = ms->dictMatchState; + assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ + if (dictMode == ZSTD_dedicatedDictSearch) { + ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms, + ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); + } else if (dictMode == ZSTD_dictMatchState) { const U32* const dmsChainTable = dms->chainTable; const U32 dmsChainSize = (1 << dms->cParams.chainLog); const U32 dmsChainMask = dmsChainSize - 1; @@ -545,7 +749,7 @@ size_t ZSTD_HcFindBestMatch_generic ( matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)]; - for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) { + for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) { size_t currentMl=0; const BYTE* const match = dmsBase + matchIndex; assert(match+4 <= dmsEnd); @@ -555,11 +759,13 @@ size_t ZSTD_HcFindBestMatch_generic ( /* save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = current - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE; + assert(curr > matchIndex + dmsIndexDelta); + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } if (matchIndex <= dmsMinChain) break; + matchIndex = dmsChainTable[matchIndex & dmsChainMask]; } } @@ -567,63 +773,749 @@ size_t ZSTD_HcFindBestMatch_generic ( return ml; } +/* ********************************* +* (SIMD) Row-based matchfinder +***********************************/ +/* Constants for row-based hash */ +#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) +#define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for 
all configurations */ + +#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1) + +typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */ + +/* ZSTD_VecMask_next(): + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { + return ZSTD_countTrailingZeros64(val); +} + +/* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" + * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ +FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { + U32 next = (*tagRow-1) & rowMask; + next += (next == 0) ? rowMask : 0; /* skip first position */ + *tagRow = (BYTE)next; + return next; +} + +/* ZSTD_isAligned(): + * Checks that a pointer is aligned to "align" bytes which must be a power of 2. + */ +MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + assert((align & (align - 1)) == 0); + return (((size_t)ptr) & (align - 1)) == 0; +} -FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) +/* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. 
+ */ +FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); + /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */ + } + PREFETCH_L1(tagTable + relRow); + if (rowLog == 6) { + PREFETCH_L1(tagTable + relRow + 32); + } + assert(rowLog == 4 || rowLog == 5 || rowLog == 6); + assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */ + assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */ +} + +/* ZSTD_row_fillHashCache(): + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. + */ +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_row_fillHashCache(ZSTD_MatchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); + U32 const* const hashTable = ms->hashTable; + BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 
0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { + U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; } + + DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1], + ms->hashCache[2], ms->hashCache[3], ms->hashCache[4], + ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]); } +/* ZSTD_row_nextCachedHash(): + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, + BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, + U32 const rowLog, U32 const mls, + U64 const hashSalt) +{ + U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; + cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash; + return hash; + } +} -static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) +/* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. 
+ */ +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_row_update_internalImpl(ZSTD_MatchState_t* ms, + U32 updateStartIdx, U32 const updateEndIdx, + U32 const mls, U32 const rowLog, + U32 const rowMask, U32 const useCache) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); + U32* const hashTable = ms->hashTable; + BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { + U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) + : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; + BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + + assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); + tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } +} + +/* ZSTD_row_update_internal(): + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. 
+ */ +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_row_update_internal(ZSTD_MatchState_t* ms, const BYTE* ip, + U32 const mls, U32 const rowLog, + U32 const rowMask, U32 const useCache) +{ + U32 idx = ms->nextToUpdate; + const BYTE* const base = ms->window.base; + const U32 target = (U32)(ip - base); + const U32 kSkipThreshold = 384; + const U32 kMaxMatchStartPositionsToUpdate = 96; + const U32 kMaxMatchEndPositionsToUpdate = 32; + + if (useCache) { + /* Only skip positions when using hash cache, i.e. + * if we are loading a dict, don't skip anything. + * If we decide to skip, then we only update a set number + * of positions at the beginning and end of the match. + */ + if (UNLIKELY(target - idx > kSkipThreshold)) { + U32 const bound = idx + kMaxMatchStartPositionsToUpdate; + ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache); + idx = target - kMaxMatchEndPositionsToUpdate; + ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1); + } } + assert(target >= idx); + ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache); + ms->nextToUpdate = target; } +/* ZSTD_row_update(): + * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary + * processing. + */ +void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip) { + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + const U32 rowMask = (1u << rowLog) - 1; + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); +} -FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) +/* Returns the mask width of bits group of which will be set to 1. 
Given not all + * architectures have easy movemask instruction, this helps to iterate over + * groups of bits easier and faster. + */ +FORCE_INLINE_TEMPLATE U32 +ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); + (void)rowEntries; +#if defined(ZSTD_ARCH_ARM_NEON) + /* NEON path only works for little endian */ + if (!MEM_isLittleEndian()) { + return 1; + } + if (rowEntries == 16) { + return 4; + } + if (rowEntries == 32) { + return 2; + } + if (rowEntries == 64) { + return 1; + } +#endif + return 1; +} + +#if defined(ZSTD_ARCH_X86_SSE2) +FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head) +{ + const __m128i comparisonMask = _mm_set1_epi8((char)tag); + int matches[4] = {0}; + int i; + assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4); + for (i=0; i> chunkSize; + do { + size_t chunk = MEM_readST(&src[i]); + chunk ^= splatChar; + chunk = (((chunk | x80) - x01) | chunk) & x80; + matches <<= chunkSize; + matches |= (chunk * extractMagic) >> shiftAmount; + i -= chunkSize; + } while (i >= 0); + } else { /* big endian: reverse bits during extraction */ + const size_t msb = xFF ^ (xFF >> 1); + const size_t extractMagic = (msb / 0x1FF) | msb; + do { + size_t chunk = MEM_readST(&src[i]); + chunk ^= splatChar; + chunk = (((chunk | x80) - x01) | chunk) & x80; + matches <<= chunkSize; + matches |= ((chunk >> 7) * extractMagic) >> shiftAmount; + i -= chunkSize; + } while (i >= 0); + } + matches = ~matches; + if (rowEntries == 16) 
{ + return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { + return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { + return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } +#endif +} + +/* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: + * - Generate a hash for current input position and split it into a one byte of tag and `rowHashLog` bits of index. + * - The hash is salted by a value that changes on every context reset, so when the same table is used + * we will avoid collisions that would otherwise slow us down by introducing phantom matches. + * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines + * which row to insert into. + * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can + * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes + * per row). + * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. + * - Insert the tag into the equivalent row and position in the tagTable. 
+ */ +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_RowFindBestMatch( + ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode, + const U32 rowLog) +{ + U32* const hashTable = ms->hashTable; + BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const U32 curr = (U32)(ip-base); + const U32 maxDistance = 1U << cParams->windowLog; + const U32 lowestValid = ms->window.lowLimit; + const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; + const U32 isDictionary = (ms->loadedDictEnd != 0); + const U32 lowLimit = isDictionary ? 
lowestValid : withinMaxDistance; + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ + const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); + const U64 hashSalt = ms->hashSalt; + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; + U32 hash; + + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_MatchState_t* const dms = ms->dictMatchState; + + /* Initialize the following variables to satisfy static analyzer */ + size_t ddsIdx = 0; + U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */ + U32 dmsTag = 0; + U32* dmsRow = NULL; + BYTE* dmsTagRow = NULL; + + if (dictMode == ZSTD_dedicatedDictSearch) { + const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; + { /* Prefetch DDS hashtable entry */ + ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG; + PREFETCH_L1(&dms->hashTable[ddsIdx]); + } + ddsExtraAttempts = cParams->searchLog > rowLog ? 
1U << (cParams->searchLog - rowLog) : 0; + } + + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; + BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; + dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow); + dmsRow = dmsHashTable + dmsRelRow; + ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog); + } + + /* Update the hashTable and tagTable up to (but not including) ip */ + if (!ms->lazySkipping) { + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); + hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); + } else { + /* Stop inserting every position when in the lazy skipping mode. + * The hash cache is also not kept up to date in this mode. + */ + hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + ms->nextToUpdate = curr; + } + ms->hashSaltEntropy += hash; /* collect salt entropy */ + + { /* Get the hash for ip, compute the appropriate row */ + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); + U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; + ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ + for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { + U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; + if(matchPos == 0) continue; + assert(numMatches < rowEntries); + if 
(matchIndex < lowLimit) + break; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + PREFETCH_L1(base + matchIndex); + } else { + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; + --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + + /* Return the longest match */ + for (; currMatch < numMatches; ++currMatch) { + U32 const matchIndex = matchBuffer[currMatch]; + size_t currentMl=0; + assert(matchIndex < curr); + assert(matchIndex >= lowLimit); + + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ + /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ + if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; + assert(match+4 <= dictEnd); + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; + } + + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } + } + + assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. 
*/ + if (dictMode == ZSTD_dedicatedDictSearch) { + ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms, + ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); + } else if (dictMode == ZSTD_dictMatchState) { + /* TODO: Measure and potentially add prefetching to DMS */ + const U32 dmsLowestIndex = dms->window.dictLimit; + const BYTE* const dmsBase = dms->window.base; + const BYTE* const dmsEnd = dms->window.nextSrc; + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + + { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; + ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + + for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { + U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; + if(matchPos == 0) continue; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; + --nbAttempts; + } + + /* Return the longest match */ + for (; currMatch < numMatches; ++currMatch) { + U32 const matchIndex = matchBuffer[currMatch]; + size_t currentMl=0; + assert(matchIndex >= dmsLowestIndex); + assert(matchIndex < curr); + + { const BYTE* const match = dmsBase + matchIndex; + assert(match+4 <= dmsEnd); + if (MEM_read32(match) == MEM_read32(ip)) + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; + } + + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); + *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } + } } + return ml; } +/** + * Generate search functions templated on (dictMode, mls, rowLog). + * These functions are outlined for code size & compilation time. 
+ * ZSTD_searchMax() dispatches to the correct implementation function. + * + * TODO: The start of the search function involves loading and calculating a + * bunch of constants from the ZSTD_MatchState_t. These computations could be + * done in an initialization function, and saved somewhere in the match state. + * Then we could pass a pointer to the saved state instead of the match state, + * and avoid duplicate computations. + * + * TODO: Move the match re-winding into searchMax. This improves compression + * ratio, and unlocks further simplifications with the next TODO. + * + * TODO: Try moving the repcode search into searchMax. After the re-winding + * and repcode search are in searchMax, there is no more logic in the match + * finder loop that requires knowledge about the dictMode. So we should be + * able to avoid force inlining it, and we can join the extDict loop with + * the single segment loop. It should go in searchMax instead of its own + * function to avoid having multiple virtual function calls per search. 
+ */ + +#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls +#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls +#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog + +#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE + +#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \ + ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offBasePtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \ + } \ + +#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \ + ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ + } \ + +#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \ + ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \ + return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \ + } \ + +#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \ + X(dictMode, mls, 4) \ + X(dictMode, mls, 5) \ + X(dictMode, mls, 6) + +#define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6) + +#define ZSTD_FOR_EACH_MLS(X, dictMode) \ + X(dictMode, 4) \ + X(dictMode, 5) \ + X(dictMode, 6) + +#define ZSTD_FOR_EACH_DICT_MODE(X, ...) 
\ + X(__VA_ARGS__, noDict) \ + X(__VA_ARGS__, extDict) \ + X(__VA_ARGS__, dictMatchState) \ + X(__VA_ARGS__, dedicatedDictSearch) + +/* Generate row search fns for each combination of (dictMode, mls, rowLog) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN) +/* Generate binary Tree search fns for each combination of (dictMode, mls) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN) +/* Generate hash chain search fns for each combination of (dictMode, mls) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN) + +typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e; + +#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \ + case mls: \ + return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); +#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \ + case mls: \ + return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); +#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \ + case rowLog: \ + return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr); + +#define ZSTD_SWITCH_MLS(X, dictMode) \ + switch (mls) { \ + ZSTD_FOR_EACH_MLS(X, dictMode) \ + } + +#define ZSTD_SWITCH_ROWLOG(dictMode, mls) \ + case mls: \ + switch (rowLog) { \ + ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \ + } \ + ZSTD_UNREACHABLE; \ + break; + +#define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \ + switch (searchMethod) { \ + case search_hashChain: \ + ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \ + break; \ + case search_binaryTree: \ + ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \ + break; \ + case search_rowHash: \ + ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \ + break; \ + } \ + ZSTD_UNREACHABLE; + +/** + * Searches for the longest match at @p ip. + * Dispatches to the correct implementation function based on the + * (searchMethod, dictMode, mls, rowLog). 
We use switch statements + * here instead of using an indirect function call through a function + * pointer because after Spectre and Meltdown mitigations, indirect + * function calls can be very costly, especially in the kernel. + * + * NOTE: dictMode and searchMethod should be templated, so those switch + * statements should be optimized out. Only the mls & rowLog switches + * should be left. + * + * @param ms The match state. + * @param ip The position to search at. + * @param iend The end of the input data. + * @param[out] offsetPtr Stores the match offset into this pointer. + * @param mls The minimum search length, in the range [4, 6]. + * @param rowLog The row log (if applicable), in the range [4, 6]. + * @param searchMethod The search method to use (templated). + * @param dictMode The dictMode (templated). + * + * @returns The length of the longest match found, or < mls if no match is found. + * If a match is found its offset is stored in @p offsetPtr. + */ +FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( + ZSTD_MatchState_t* ms, + const BYTE* ip, + const BYTE* iend, + size_t* offsetPtr, + U32 const mls, + U32 const rowLog, + searchMethod_e const searchMethod, + ZSTD_dictMode_e const dictMode) +{ + if (dictMode == ZSTD_noDict) { + ZSTD_SWITCH_SEARCH_METHOD(noDict) + } else if (dictMode == ZSTD_extDict) { + ZSTD_SWITCH_SEARCH_METHOD(extDict) + } else if (dictMode == ZSTD_dictMatchState) { + ZSTD_SWITCH_SEARCH_METHOD(dictMatchState) + } else if (dictMode == ZSTD_dedicatedDictSearch) { + ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch) + } + ZSTD_UNREACHABLE; + return 0; +} + /* ******************************* * Common parser - lazy strategy *********************************/ -typedef enum { search_hashChain, search_binaryTree } searchMethod_e; -FORCE_INLINE_TEMPLATE size_t -ZSTD_compressBlock_lazy_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_compressBlock_lazy_generic( + 
ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize, const searchMethod_e searchMethod, const U32 depth, @@ -633,63 +1525,73 @@ ZSTD_compressBlock_lazy_generic( const BYTE* ip = istart; const BYTE* anchor = istart; const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - 8; + const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; const BYTE* const base = ms->window.base; const U32 prefixLowestIndex = ms->window.dictLimit; const BYTE* const prefixLowest = base + prefixLowestIndex; + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); - typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); - searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ? - (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS - : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) : - (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_selectMLS - : ZSTD_HcFindBestMatch_selectMLS); - U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; - - const ZSTD_matchState_t* const dms = ms->dictMatchState; - const U32 dictLowestIndex = dictMode == ZSTD_dictMatchState ? - dms->window.dictLimit : 0; - const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ? - dms->window.base : NULL; - const BYTE* const dictLowest = dictMode == ZSTD_dictMatchState ? - dictBase + dictLowestIndex : NULL; - const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ? - dms->window.nextSrc : NULL; - const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ? 
+ U32 offset_1 = rep[0], offset_2 = rep[1]; + U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; + const int isDxS = isDMS || isDDS; + const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0; + const BYTE* const dictBase = isDxS ? dms->window.base : NULL; + const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL; + const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL; + const U32 dictIndexDelta = isDxS ? prefixLowestIndex - (U32)(dictEnd - dictBase) : 0; - const U32 dictAndPrefixLength = (U32)(ip - prefixLowest + dictEnd - dictLowest); + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest)); - /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod); ip += (dictAndPrefixLength == 0); if (dictMode == ZSTD_noDict) { - U32 const maxRep = (U32)(ip - prefixLowest); - if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; - if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; + if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; } - if (dictMode == ZSTD_dictMatchState) { + if (isDxS) { /* dictMatchState repCode checks don't currently handle repCode == 0 * disabling. 
*/ assert(offset_1 <= dictAndPrefixLength); assert(offset_2 <= dictAndPrefixLength); } + /* Reset the lazy skipping state */ + ms->lazySkipping = 0; + + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + /* Match Loop */ +#if defined(__GNUC__) && defined(__x86_64__) + /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the + * code alignment is perturbed. To fix the instability align the loop on 32-bytes. + */ + __asm__(".p2align 5"); +#endif while (ip < ilimit) { size_t matchLength=0; - size_t offset=0; + size_t offBase = REPCODE1_TO_OFFBASE; const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); /* check repCode */ - if (dictMode == ZSTD_dictMatchState) { + if (isDxS) { const U32 repIndex = (U32)(ip - base) + 1 - offset_1; - const BYTE* repMatch = (dictMode == ZSTD_dictMatchState + const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch) && repIndex < prefixLowestIndex) ? dictBase + (repIndex - dictIndexDelta) : base + repIndex; - if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; @@ -703,132 +1605,148 @@ ZSTD_compressBlock_lazy_generic( } /* first search (depth 0) */ - { size_t offsetFound = 999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); + { size_t offbaseFound = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); if (ml2 > matchLength) - matchLength = ml2, start = ip, offset=offsetFound; + matchLength = ml2, start = ip, offBase = offbaseFound; } if (matchLength < 4) { - ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ + size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; + ip += step; + /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. + * In this mode we stop inserting every position into our tables, and only insert + * positions that we search, which is one in step positions. + * The exact cutoff is flexible, I've just chosen a number that is reasonably high, + * so we minimize the compression ratio loss in "normal" scenarios. This mode gets + * triggered once we've gone 2KB without finding any matches. 
+ */ + ms->lazySkipping = step > kLazySkippingStep; continue; } /* let's try to find a better solution */ if (depth>=1) while (ip0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; int const gain2 = (int)(mlRep * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offset = 0, start = ip; + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; } - if (dictMode == ZSTD_dictMatchState) { + if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; const BYTE* repMatch = repIndex < prefixLowestIndex ? dictBase + (repIndex - dictIndexDelta) : base + repIndex; - if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) && (MEM_read32(repMatch) == MEM_read32(ip)) ) { const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offset = 0, start = ip; + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; } } - { size_t offset2=999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); + { size_t ofbCandidate=999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; + matchLength = ml2, offBase = ofbCandidate, start = ip; continue; /* search a better one */ } } /* let's find an even better one */ if ((depth==2) && (ip0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; int const gain2 = (int)(mlRep * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offset = 0, start = ip; + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; } - if (dictMode == ZSTD_dictMatchState) { + if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; const BYTE* repMatch = repIndex < prefixLowestIndex ? 
dictBase + (repIndex - dictIndexDelta) : base + repIndex; - if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) && (MEM_read32(repMatch) == MEM_read32(ip)) ) { const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offset = 0, start = ip; + matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; } } - { size_t offset2=999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); + { size_t ofbCandidate=999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; + matchLength = ml2, offBase = ofbCandidate, start = ip; continue; } } } break; /* nothing found : store previous solution */ } /* NOTE: - * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior. - * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which - * overflows the pointer, which is undefined behavior. + * Pay attention that `start[-value]` can lead to strange undefined behavior + * notably if `value` is unsigned, resulting in a large positive `-value`. 
*/ /* catch up */ - if (offset) { + if (OFFBASE_IS_OFFSET(offBase)) { if (dictMode == ZSTD_noDict) { - while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest)) - && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */ + while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) + && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ { start--; matchLength++; } } - if (dictMode == ZSTD_dictMatchState) { - U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); + if (isDxS) { + U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ } - offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); + offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); } /* store sequence */ _storeSequence: - { size_t const litLength = start - anchor; - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH); + { size_t const litLength = (size_t)(start - anchor); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); anchor = ip = start + matchLength; } + if (ms->lazySkipping) { + /* We've found a match, disable lazy skipping mode, and refill the hash cache. 
*/ + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + ms->lazySkipping = 0; + } /* check immediate repcode */ - if (dictMode == ZSTD_dictMatchState) { + if (isDxS) { while (ip <= ilimit) { U32 const current2 = (U32)(ip-base); U32 const repIndex = current2 - offset_2; - const BYTE* repMatch = dictMode == ZSTD_dictMatchState - && repIndex < prefixLowestIndex ? + const BYTE* repMatch = repIndex < prefixLowestIndex ? dictBase - dictIndexDelta + repIndex : base + repIndex; - if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */) + if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) && (MEM_read32(repMatch) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); ip += matchLength; anchor = ip; continue; @@ -842,82 +1760,183 @@ ZSTD_compressBlock_lazy_generic( && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { /* store sequence */ matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); ip += matchLength; anchor = ip; continue; /* faster when present ... (?) */ } } } - /* Save reps for next block */ - rep[0] = offset_1 ? offset_1 : savedOffset; - rep[1] = offset_2 ? 
offset_2 : savedOffset; + /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), + * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ + offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved1; + rep[1] = offset_2 ? offset_2 : offsetSaved2; /* Return the last literals size */ return (size_t)(iend - anchor); } +#endif /* build exclusions */ -size_t ZSTD_compressBlock_btlazy2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_greedy( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); } -size_t ZSTD_compressBlock_lazy2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_greedy_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return 
ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); } +size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); +} +#endif + +#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_lazy( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); } -size_t ZSTD_compressBlock_greedy( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); } -size_t ZSTD_compressBlock_btlazy2_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 
search_hashChain, 1, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_lazy_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); +} +#endif + +#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy2( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); } size_t ZSTD_compressBlock_lazy2_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); } -size_t ZSTD_compressBlock_lazy_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 
search_hashChain, 2, ZSTD_dedicatedDictSearch); } -size_t ZSTD_compressBlock_greedy_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy2_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); } +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); +} +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); +} +#endif + +#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btlazy2( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); +} +#endif + +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t 
ZSTD_compressBlock_lazy_extDict_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize, const searchMethod_e searchMethod, const U32 depth) @@ -926,37 +1945,50 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* ip = istart; const BYTE* anchor = istart; const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - 8; + const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; const BYTE* const base = ms->window.base; const U32 dictLimit = ms->window.dictLimit; - const U32 lowestIndex = ms->window.lowLimit; const BYTE* const prefixStart = base + dictLimit; const BYTE* const dictBase = ms->window.dictBase; const BYTE* const dictEnd = dictBase + dictLimit; - const BYTE* const dictStart = dictBase + lowestIndex; - - typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); - searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS; + const BYTE* const dictStart = dictBase + ms->window.lowLimit; + const U32 windowLog = ms->cParams.windowLog; + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); U32 offset_1 = rep[0], offset_2 = rep[1]; + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + + /* Reset the lazy skipping state */ + ms->lazySkipping = 0; + /* init */ ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } /* Match Loop */ +#if defined(__GNUC__) && defined(__x86_64__) + /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the + * code alignment is perturbed. To fix the instability align the loop on 32-bytes. 
+ */ + __asm__(".p2align 5"); +#endif while (ip < ilimit) { size_t matchLength=0; - size_t offset=0; + size_t offBase = REPCODE1_TO_OFFBASE; const BYTE* start=ip+1; - U32 current = (U32)(ip-base); + U32 curr = (U32)(ip-base); /* check repCode */ - { const U32 repIndex = (U32)(current+1 - offset_1); + { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog); + const U32 repIndex = (U32)(curr+1 - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */ if (MEM_read32(ip+1) == MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; @@ -965,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( } } /* first search (depth 0) */ - { size_t offsetFound = 999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); + { size_t ofbCandidate = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); if (ml2 > matchLength) - matchLength = ml2, start = ip, offset=offsetFound; + matchLength = ml2, start = ip, offBase = ofbCandidate; } - if (matchLength < 4) { - ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ + if (matchLength < 4) { + size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); + ip += step + 1; /* jump faster over incompressible sections */ + /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. + * In this mode we stop inserting every position into our tables, and only insert + * positions that we search, which is one in step positions. 
+ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, + * so we minimize the compression ratio loss in "normal" scenarios. This mode gets + * triggered once we've gone 2KB without finding any matches. + */ + ms->lazySkipping = step > kLazySkippingStep; continue; } @@ -980,93 +2021,107 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( if (depth>=1) while (ip= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); if ((repLength >= 4) && (gain2 > gain1)) - matchLength = repLength, offset = 0, start = ip; + matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; } } /* search match, depth 1 */ - { size_t offset2=999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); + { size_t ofbCandidate = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; + matchLength = ml2, offBase = ofbCandidate, start = ip; continue; /* search a better one */ } } /* let's find an even better one */ if 
((depth==2) && (ip= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); if ((repLength >= 4) && (gain2 > gain1)) - matchLength = repLength, offset = 0, start = ip; + matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; } } /* search match, depth 2 */ - { size_t offset2=999999999; - size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); + { size_t ofbCandidate = 999999999; + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; + matchLength = ml2, offBase = ofbCandidate, start = ip; continue; } } } break; /* nothing found : store previous solution */ } /* catch up */ - if (offset) { - U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); + if (OFFBASE_IS_OFFSET(offBase)) { + U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; const BYTE* const mStart = (matchIndex < dictLimit) ? 
dictStart : prefixStart; while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ - offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); + offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); } /* store sequence */ _storeSequence: - { size_t const litLength = start - anchor; - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH); + { size_t const litLength = (size_t)(start - anchor); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); anchor = ip = start + matchLength; } + if (ms->lazySkipping) { + /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + ms->lazySkipping = 0; + } /* check immediate repcode */ while (ip <= ilimit) { - const U32 repIndex = (U32)((ip-base) - offset_2); + const U32 repCurrent = (U32)(ip-base); + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog); + const U32 repIndex = repCurrent - offset_2; const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); ip += matchLength; anchor = ip; continue; /* faster when present ... (?) */ @@ -1081,35 +2136,67 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( /* Return the last literals size */ return (size_t)(iend - anchor); } +#endif /* build exclusions */ - +#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_greedy_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); } +size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); +} +#endif + +#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_lazy_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); } +size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); +} +#endif + +#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR 
size_t ZSTD_compressBlock_lazy2_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); } +size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); +} +#endif + +#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_btlazy2_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) { return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); } +#endif + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_lazy.h b/vendor/github.com/DataDog/zstd/zstd_lazy.h index bb17630..579c940 100644 --- a/vendor/github.com/DataDog/zstd/zstd_lazy.h +++ b/vendor/github.com/DataDog/zstd/zstd_lazy.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,57 +12,185 @@ #ifndef ZSTD_LAZY_H #define ZSTD_LAZY_H -#if defined (__cplusplus) -extern "C" { -#endif - #include "zstd_compress_internal.h" -U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); +/** + * Dedicated Dictionary Search Structure bucket log. In the + * ZSTD_dedicatedDictSearch mode, the hashTable has + * 2 ** ZSTD_LAZY_DDSS_BUCKET_LOG entries in each bucket, rather than just + * one. 
+ */ +#define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + +#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + +#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) +U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip); +void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip); + +void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip); void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ +#endif -size_t ZSTD_compressBlock_btlazy2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_greedy( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t 
ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_btlazy2_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy +#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row +#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState +#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row +#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch +#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row +#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict +#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row +#else +#define ZSTD_COMPRESSBLOCK_GREEDY NULL +#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL +#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL +#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL +#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL +#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL +#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL +#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL +#endif + +#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_lazy2_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 
rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_lazy_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_greedy_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); - -size_t ZSTD_compressBlock_greedy_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_lazy_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy +#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row +#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState +#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row +#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch +#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row +#define 
ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict +#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row +#else +#define ZSTD_COMPRESSBLOCK_LAZY NULL +#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL +#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL +#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL +#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL +#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL +#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL +#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL +#endif + +#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_lazy2( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_lazy2_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2 +#define 
ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row +#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState +#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row +#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch +#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row +#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict +#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row +#else +#define ZSTD_COMPRESSBLOCK_LAZY2 NULL +#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL +#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL +#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL +#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL +#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL +#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL +#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL +#endif + +#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btlazy2( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_btlazy2_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -#if defined (__cplusplus) -} +#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2 +#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState +#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict +#else +#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL +#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL +#define 
ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL #endif #endif /* ZSTD_LAZY_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_ldm.c b/vendor/github.com/DataDog/zstd/zstd_ldm.c index c3312ad..d83232d 100644 --- a/vendor/github.com/DataDog/zstd/zstd_ldm.c +++ b/vendor/github.com/DataDog/zstd/zstd_ldm.c @@ -1,46 +1,168 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. */ #include "zstd_ldm.h" #include "debug.h" +#include "xxhash.h" #include "zstd_fast.h" /* ZSTD_fillHashTable() */ #include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */ +#include "zstd_ldm_geartab.h" -#define LDM_BUCKET_SIZE_LOG 3 +#define LDM_BUCKET_SIZE_LOG 4 #define LDM_MIN_MATCH_LENGTH 64 #define LDM_HASH_RLOG 7 -#define LDM_HASH_CHAR_OFFSET 10 + +typedef struct { + U64 rolling; + U64 stopMask; +} ldmRollingHashState_t; + +/** ZSTD_ldm_gear_init(): + * + * Initializes the rolling hash state such that it will honor the + * settings in params. */ +static void ZSTD_ldm_gear_init(ldmRollingHashState_t* state, ldmParams_t const* params) +{ + unsigned maxBitsInMask = MIN(params->minMatchLength, 64); + unsigned hashRateLog = params->hashRateLog; + + state->rolling = ~(U32)0; + + /* The choice of the splitting criterion is subject to two conditions: + * 1. it has to trigger on average every 2^(hashRateLog) bytes; + * 2. ideally, it has to depend on a window of minMatchLength bytes. + * + * In the gear hash algorithm, bit n depends on the last n bytes; + * so in order to obtain a good quality splitting criterion it is + * preferable to use bits with high weight. 
+ * + * To match condition 1 we use a mask with hashRateLog bits set + * and, because of the previous remark, we make sure these bits + * have the highest possible weight while still respecting + * condition 2. + */ + if (hashRateLog > 0 && hashRateLog <= maxBitsInMask) { + state->stopMask = (((U64)1 << hashRateLog) - 1) << (maxBitsInMask - hashRateLog); + } else { + /* In this degenerate case we simply honor the hash rate. */ + state->stopMask = ((U64)1 << hashRateLog) - 1; + } +} + +/** ZSTD_ldm_gear_reset() + * Feeds [data, data + minMatchLength) into the hash without registering any + * splits. This effectively resets the hash state. This is used when skipping + * over data, either at the beginning of a block, or skipping sections. + */ +static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state, + BYTE const* data, size_t minMatchLength) +{ + U64 hash = state->rolling; + size_t n = 0; + +#define GEAR_ITER_ONCE() do { \ + hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ + n += 1; \ + } while (0) + while (n + 3 < minMatchLength) { + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + } + while (n < minMatchLength) { + GEAR_ITER_ONCE(); + } +#undef GEAR_ITER_ONCE +} + +/** ZSTD_ldm_gear_feed(): + * + * Registers in the splits array all the split points found in the first + * size bytes following the data pointer. This function terminates when + * either all the data has been processed or LDM_BATCH_SIZE splits are + * present in the splits array. + * + * Precondition: The splits array must not be full. + * Returns: The number of bytes processed. 
*/ +static size_t ZSTD_ldm_gear_feed(ldmRollingHashState_t* state, + BYTE const* data, size_t size, + size_t* splits, unsigned* numSplits) +{ + size_t n; + U64 hash, mask; + + hash = state->rolling; + mask = state->stopMask; + n = 0; + +#define GEAR_ITER_ONCE() do { \ + hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ + n += 1; \ + if (UNLIKELY((hash & mask) == 0)) { \ + splits[*numSplits] = n; \ + *numSplits += 1; \ + if (*numSplits == LDM_BATCH_SIZE) \ + goto done; \ + } \ + } while (0) + + while (n + 3 < size) { + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + } + while (n < size) { + GEAR_ITER_ONCE(); + } + +#undef GEAR_ITER_ONCE + +done: + state->rolling = hash; + return n; +} void ZSTD_ldm_adjustParameters(ldmParams_t* params, - ZSTD_compressionParameters const* cParams) + const ZSTD_compressionParameters* cParams) { params->windowLog = cParams->windowLog; ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX); DEBUGLOG(4, "ZSTD_ldm_adjustParameters"); - if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG; - if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH; - if (cParams->strategy >= ZSTD_btopt) { - /* Get out of the way of the optimal parser */ - U32 const minMatch = MAX(cParams->targetLength, params->minMatchLength); - assert(minMatch >= ZSTD_LDM_MINMATCH_MIN); - assert(minMatch <= ZSTD_LDM_MINMATCH_MAX); - params->minMatchLength = minMatch; + if (params->hashRateLog == 0) { + if (params->hashLog > 0) { + /* if params->hashLog is set, derive hashRateLog from it */ + assert(params->hashLog <= ZSTD_HASHLOG_MAX); + if (params->windowLog > params->hashLog) { + params->hashRateLog = params->windowLog - params->hashLog; + } + } else { + assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9); + /* mapping from [fast, rate7] to [btultra2, rate4] */ + params->hashRateLog = 7 - (cParams->strategy/3); + } } if (params->hashLog == 0) { - params->hashLog = 
MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG); - assert(params->hashLog <= ZSTD_HASHLOG_MAX); + params->hashLog = BOUNDED(ZSTD_HASHLOG_MIN, params->windowLog - params->hashRateLog, ZSTD_HASHLOG_MAX); } - if (params->hashRateLog == 0) { - params->hashRateLog = params->windowLog < params->hashLog - ? 0 - : params->windowLog - params->hashLog; + if (params->minMatchLength == 0) { + params->minMatchLength = LDM_MIN_MATCH_LENGTH; + if (cParams->strategy >= ZSTD_btultra) + params->minMatchLength /= 2; + } + if (params->bucketSizeLog==0) { + assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9); + params->bucketSizeLog = BOUNDED(LDM_BUCKET_SIZE_LOG, (U32)cParams->strategy, ZSTD_LDM_BUCKETSIZELOG_MAX); } params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog); } @@ -52,95 +174,34 @@ size_t ZSTD_ldm_getTableSize(ldmParams_t params) size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog); size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize) + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t)); - return params.enableLdm ? totalSize : 0; + return params.enableLdm == ZSTD_ps_enable ? totalSize : 0; } size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) { - return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0; -} - -/** ZSTD_ldm_getSmallHash() : - * numBits should be <= 32 - * If numBits==0, returns 0. - * @return : the most significant numBits of value. */ -static U32 ZSTD_ldm_getSmallHash(U64 value, U32 numBits) -{ - assert(numBits <= 32); - return numBits == 0 ? 
0 : (U32)(value >> (64 - numBits)); -} - -/** ZSTD_ldm_getChecksum() : - * numBitsToDiscard should be <= 32 - * @return : the next most significant 32 bits after numBitsToDiscard */ -static U32 ZSTD_ldm_getChecksum(U64 hash, U32 numBitsToDiscard) -{ - assert(numBitsToDiscard <= 32); - return (hash >> (64 - 32 - numBitsToDiscard)) & 0xFFFFFFFF; -} - -/** ZSTD_ldm_getTag() ; - * Given the hash, returns the most significant numTagBits bits - * after (32 + hbits) bits. - * - * If there are not enough bits remaining, return the last - * numTagBits bits. */ -static U32 ZSTD_ldm_getTag(U64 hash, U32 hbits, U32 numTagBits) -{ - assert(numTagBits < 32 && hbits <= 32); - if (32 - hbits < numTagBits) { - return hash & (((U32)1 << numTagBits) - 1); - } else { - return (hash >> (32 - hbits - numTagBits)) & (((U32)1 << numTagBits) - 1); - } + return params.enableLdm == ZSTD_ps_enable ? (maxChunkSize / params.minMatchLength) : 0; } /** ZSTD_ldm_getBucket() : * Returns a pointer to the start of the bucket associated with hash. 
*/ static ldmEntry_t* ZSTD_ldm_getBucket( - ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams) + const ldmState_t* ldmState, size_t hash, U32 const bucketSizeLog) { - return ldmState->hashTable + (hash << ldmParams.bucketSizeLog); + return ldmState->hashTable + (hash << bucketSizeLog); } /** ZSTD_ldm_insertEntry() : * Insert the entry with corresponding hash into the hash table */ static void ZSTD_ldm_insertEntry(ldmState_t* ldmState, size_t const hash, const ldmEntry_t entry, - ldmParams_t const ldmParams) + U32 const bucketSizeLog) { - BYTE* const bucketOffsets = ldmState->bucketOffsets; - *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + bucketOffsets[hash]) = entry; - bucketOffsets[hash]++; - bucketOffsets[hash] &= ((U32)1 << ldmParams.bucketSizeLog) - 1; -} + BYTE* const pOffset = ldmState->bucketOffsets + hash; + unsigned const offset = *pOffset; + + *(ZSTD_ldm_getBucket(ldmState, hash, bucketSizeLog) + offset) = entry; + *pOffset = (BYTE)((offset + 1) & ((1u << bucketSizeLog) - 1)); -/** ZSTD_ldm_makeEntryAndInsertByTag() : - * - * Gets the small hash, checksum, and tag from the rollingHash. - * - * If the tag matches (1 << ldmParams.hashRateLog)-1, then - * creates an ldmEntry from the offset, and inserts it into the hash table. - * - * hBits is the length of the small hash, which is the most significant hBits - * of rollingHash. The checksum is the next 32 most significant bits, followed - * by ldmParams.hashRateLog bits that make up the tag. 
*/ -static void ZSTD_ldm_makeEntryAndInsertByTag(ldmState_t* ldmState, - U64 const rollingHash, - U32 const hBits, - U32 const offset, - ldmParams_t const ldmParams) -{ - U32 const tag = ZSTD_ldm_getTag(rollingHash, hBits, ldmParams.hashRateLog); - U32 const tagMask = ((U32)1 << ldmParams.hashRateLog) - 1; - if (tag == tagMask) { - U32 const hash = ZSTD_ldm_getSmallHash(rollingHash, hBits); - U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits); - ldmEntry_t entry; - entry.offset = offset; - entry.checksum = checksum; - ZSTD_ldm_insertEntry(ldmState, hash, entry, ldmParams); - } } /** ZSTD_ldm_countBackwardsMatch() : @@ -149,10 +210,10 @@ static void ZSTD_ldm_makeEntryAndInsertByTag(ldmState_t* ldmState, * We count only bytes where pMatch >= pBase and pIn >= pAnchor. */ static size_t ZSTD_ldm_countBackwardsMatch( const BYTE* pIn, const BYTE* pAnchor, - const BYTE* pMatch, const BYTE* pBase) + const BYTE* pMatch, const BYTE* pMatchBase) { size_t matchLength = 0; - while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { + while (pIn > pAnchor && pMatch > pMatchBase && pIn[-1] == pMatch[-1]) { pIn--; pMatch--; matchLength++; @@ -160,6 +221,27 @@ static size_t ZSTD_ldm_countBackwardsMatch( return matchLength; } +/** ZSTD_ldm_countBackwardsMatch_2segments() : + * Returns the number of bytes that match backwards from pMatch, + * even with the backwards match spanning 2 different segments. 
+ * + * On reaching `pMatchBase`, start counting from mEnd */ +static size_t ZSTD_ldm_countBackwardsMatch_2segments( + const BYTE* pIn, const BYTE* pAnchor, + const BYTE* pMatch, const BYTE* pMatchBase, + const BYTE* pExtDictStart, const BYTE* pExtDictEnd) +{ + size_t matchLength = ZSTD_ldm_countBackwardsMatch(pIn, pAnchor, pMatch, pMatchBase); + if (pMatch - matchLength != pMatchBase || pMatchBase == pExtDictStart) { + /* If backwards match is entirely in the extDict or prefix, immediately return */ + return matchLength; + } + DEBUGLOG(7, "ZSTD_ldm_countBackwardsMatch_2segments: found 2-parts backwards match (length in prefix==%zu)", matchLength); + matchLength += ZSTD_ldm_countBackwardsMatch(pIn - matchLength, pAnchor, pExtDictEnd, pExtDictStart); + DEBUGLOG(7, "final backwards match length = %zu", matchLength); + return matchLength; +} + /** ZSTD_ldm_fillFastTables() : * * Fills the relevant tables for the ZSTD_fast and ZSTD_dfast strategies. @@ -167,7 +249,7 @@ static size_t ZSTD_ldm_countBackwardsMatch( * * The tables for the other strategies are filled within their * block compressors. */ -static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, +static size_t ZSTD_ldm_fillFastTables(ZSTD_MatchState_t* ms, void const* end) { const BYTE* const iend = (const BYTE*)end; @@ -175,11 +257,15 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, switch(ms->cParams.strategy) { case ZSTD_fast: - ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); + ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); break; case ZSTD_dfast: - ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); +#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); +#else + assert(0); /* shouldn't be called: cparams should've been adjusted. 
*/ +#endif break; case ZSTD_greedy: @@ -197,30 +283,44 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, return 0; } -/** ZSTD_ldm_fillLdmHashTable() : - * - * Fills hashTable from (lastHashed + 1) to iend (non-inclusive). - * lastHash is the rolling hash that corresponds to lastHashed. - * - * Returns the rolling hash corresponding to position iend-1. */ -static U64 ZSTD_ldm_fillLdmHashTable(ldmState_t* state, - U64 lastHash, const BYTE* lastHashed, - const BYTE* iend, const BYTE* base, - U32 hBits, ldmParams_t const ldmParams) +void ZSTD_ldm_fillHashTable( + ldmState_t* ldmState, const BYTE* ip, + const BYTE* iend, ldmParams_t const* params) { - U64 rollingHash = lastHash; - const BYTE* cur = lastHashed + 1; - - while (cur < iend) { - rollingHash = ZSTD_rollingHash_rotate(rollingHash, cur[-1], - cur[ldmParams.minMatchLength-1], - state->hashPower); - ZSTD_ldm_makeEntryAndInsertByTag(state, - rollingHash, hBits, - (U32)(cur - base), ldmParams); - ++cur; + U32 const minMatchLength = params->minMatchLength; + U32 const bucketSizeLog = params->bucketSizeLog; + U32 const hBits = params->hashLog - bucketSizeLog; + BYTE const* const base = ldmState->window.base; + BYTE const* const istart = ip; + ldmRollingHashState_t hashState; + size_t* const splits = ldmState->splitIndices; + unsigned numSplits; + + DEBUGLOG(5, "ZSTD_ldm_fillHashTable"); + + ZSTD_ldm_gear_init(&hashState, params); + while (ip < iend) { + size_t hashed; + unsigned n; + + numSplits = 0; + hashed = ZSTD_ldm_gear_feed(&hashState, ip, (size_t)(iend - ip), splits, &numSplits); + + for (n = 0; n < numSplits; n++) { + if (ip + splits[n] >= istart + minMatchLength) { + BYTE const* const split = ip + splits[n] - minMatchLength; + U64 const xxhash = XXH64(split, minMatchLength, 0); + U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1)); + ldmEntry_t entry; + + entry.offset = (U32)(split - base); + entry.checksum = (U32)(xxhash >> 32); + ZSTD_ldm_insertEntry(ldmState, hash, entry, 
params->bucketSizeLog); + } + } + + ip += hashed; } - return rollingHash; } @@ -229,27 +329,26 @@ static U64 ZSTD_ldm_fillLdmHashTable(ldmState_t* state, * Sets cctx->nextToUpdate to a position corresponding closer to anchor * if it is far way * (after a long match, only update tables a limited amount). */ -static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) +static void ZSTD_ldm_limitTableUpdate(ZSTD_MatchState_t* ms, const BYTE* anchor) { - U32 const current = (U32)(anchor - ms->window.base); - if (current > ms->nextToUpdate + 1024) { + U32 const curr = (U32)(anchor - ms->window.base); + if (curr > ms->nextToUpdate + 1024) { ms->nextToUpdate = - current - MIN(512, current - ms->nextToUpdate - 1024); + curr - MIN(512, curr - ms->nextToUpdate - 1024); } } -static size_t ZSTD_ldm_generateSequences_internal( - ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t ZSTD_ldm_generateSequences_internal( + ldmState_t* ldmState, RawSeqStore_t* rawSeqStore, ldmParams_t const* params, void const* src, size_t srcSize) { /* LDM parameters */ int const extDict = ZSTD_window_hasExtDict(ldmState->window); U32 const minMatchLength = params->minMatchLength; - U64 const hashPower = ldmState->hashPower; + U32 const entsPerBucket = 1U << params->bucketSizeLog; U32 const hBits = params->hashLog - params->bucketSizeLog; - U32 const ldmBucketSize = 1U << params->bucketSizeLog; - U32 const hashRateLog = params->hashRateLog; - U32 const ldmTagMask = (1U << params->hashRateLog) - 1; /* Prefix and extDict parameters */ U32 const dictLimit = ldmState->window.dictLimit; U32 const lowestIndex = extDict ? 
ldmState->window.lowLimit : dictLimit; @@ -261,45 +360,69 @@ static size_t ZSTD_ldm_generateSequences_internal( /* Input bounds */ BYTE const* const istart = (BYTE const*)src; BYTE const* const iend = istart + srcSize; - BYTE const* const ilimit = iend - MAX(minMatchLength, HASH_READ_SIZE); + BYTE const* const ilimit = iend - HASH_READ_SIZE; /* Input positions */ BYTE const* anchor = istart; BYTE const* ip = istart; - /* Rolling hash */ - BYTE const* lastHashed = NULL; - U64 rollingHash = 0; - - while (ip <= ilimit) { - size_t mLength; - U32 const current = (U32)(ip - base); - size_t forwardMatchLength = 0, backwardMatchLength = 0; - ldmEntry_t* bestEntry = NULL; - if (ip != istart) { - rollingHash = ZSTD_rollingHash_rotate(rollingHash, lastHashed[0], - lastHashed[minMatchLength], - hashPower); - } else { - rollingHash = ZSTD_rollingHash_compute(ip, minMatchLength); + /* Rolling hash state */ + ldmRollingHashState_t hashState; + /* Arrays for staged-processing */ + size_t* const splits = ldmState->splitIndices; + ldmMatchCandidate_t* const candidates = ldmState->matchCandidates; + unsigned numSplits; + + if (srcSize < minMatchLength) + return iend - anchor; + + /* Initialize the rolling hash state with the first minMatchLength bytes */ + ZSTD_ldm_gear_init(&hashState, params); + ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength); + ip += minMatchLength; + + while (ip < ilimit) { + size_t hashed; + unsigned n; + + numSplits = 0; + hashed = ZSTD_ldm_gear_feed(&hashState, ip, ilimit - ip, + splits, &numSplits); + + for (n = 0; n < numSplits; n++) { + BYTE const* const split = ip + splits[n] - minMatchLength; + U64 const xxhash = XXH64(split, minMatchLength, 0); + U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1)); + + candidates[n].split = split; + candidates[n].hash = hash; + candidates[n].checksum = (U32)(xxhash >> 32); + candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, params->bucketSizeLog); + PREFETCH_L1(candidates[n].bucket); } - lastHashed = ip; 
- /* Do not insert and do not look for a match */ - if (ZSTD_ldm_getTag(rollingHash, hBits, hashRateLog) != ldmTagMask) { - ip++; - continue; - } + for (n = 0; n < numSplits; n++) { + size_t forwardMatchLength = 0, backwardMatchLength = 0, + bestMatchLength = 0, mLength; + U32 offset; + BYTE const* const split = candidates[n].split; + U32 const checksum = candidates[n].checksum; + U32 const hash = candidates[n].hash; + ldmEntry_t* const bucket = candidates[n].bucket; + ldmEntry_t const* cur; + ldmEntry_t const* bestEntry = NULL; + ldmEntry_t newEntry; + + newEntry.offset = (U32)(split - base); + newEntry.checksum = checksum; + + /* If a split point would generate a sequence overlapping with + * the previous one, we merely register it in the hash table and + * move on */ + if (split < anchor) { + ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + continue; + } - /* Get the best entry and compute the match lengths */ - { - ldmEntry_t* const bucket = - ZSTD_ldm_getBucket(ldmState, - ZSTD_ldm_getSmallHash(rollingHash, hBits), - *params); - ldmEntry_t* cur; - size_t bestMatchLength = 0; - U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits); - - for (cur = bucket; cur < bucket + ldmBucketSize; ++cur) { + for (cur = bucket; cur < bucket + entsPerBucket; cur++) { size_t curForwardMatchLength, curBackwardMatchLength, curTotalMatchLength; if (cur->checksum != checksum || cur->offset <= lowestIndex) { @@ -313,30 +436,23 @@ static size_t ZSTD_ldm_generateSequences_internal( cur->offset < dictLimit ? dictEnd : iend; BYTE const* const lowMatchPtr = cur->offset < dictLimit ? 
dictStart : lowPrefixPtr; - - curForwardMatchLength = ZSTD_count_2segments( - ip, pMatch, iend, - matchEnd, lowPrefixPtr); + curForwardMatchLength = + ZSTD_count_2segments(split, pMatch, iend, matchEnd, lowPrefixPtr); if (curForwardMatchLength < minMatchLength) { continue; } - curBackwardMatchLength = - ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch, - lowMatchPtr); - curTotalMatchLength = curForwardMatchLength + - curBackwardMatchLength; + curBackwardMatchLength = ZSTD_ldm_countBackwardsMatch_2segments( + split, anchor, pMatch, lowMatchPtr, dictStart, dictEnd); } else { /* !extDict */ BYTE const* const pMatch = base + cur->offset; - curForwardMatchLength = ZSTD_count(ip, pMatch, iend); + curForwardMatchLength = ZSTD_count(split, pMatch, iend); if (curForwardMatchLength < minMatchLength) { continue; } curBackwardMatchLength = - ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch, - lowPrefixPtr); - curTotalMatchLength = curForwardMatchLength + - curBackwardMatchLength; + ZSTD_ldm_countBackwardsMatch(split, anchor, pMatch, lowPrefixPtr); } + curTotalMatchLength = curForwardMatchLength + curBackwardMatchLength; if (curTotalMatchLength > bestMatchLength) { bestMatchLength = curTotalMatchLength; @@ -345,57 +461,54 @@ static size_t ZSTD_ldm_generateSequences_internal( bestEntry = cur; } } - } - - /* No match found -- continue searching */ - if (bestEntry == NULL) { - ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, - hBits, current, - *params); - ip++; - continue; - } - /* Match found */ - mLength = forwardMatchLength + backwardMatchLength; - ip -= backwardMatchLength; + /* No match found -- insert an entry into the hash table + * and process the next candidate match */ + if (bestEntry == NULL) { + ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + continue; + } - { - /* Store the sequence: - * ip = current - backwardMatchLength - * The match is at (bestEntry->offset - backwardMatchLength) - */ - U32 const matchIndex = bestEntry->offset; - U32 
const offset = current - matchIndex; - rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size; - - /* Out of sequence storage */ - if (rawSeqStore->size == rawSeqStore->capacity) - return ERROR(dstSize_tooSmall); - seq->litLength = (U32)(ip - anchor); - seq->matchLength = (U32)mLength; - seq->offset = offset; - rawSeqStore->size++; - } + /* Match found */ + offset = (U32)(split - base) - bestEntry->offset; + mLength = forwardMatchLength + backwardMatchLength; + { + rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size; + + /* Out of sequence storage */ + if (rawSeqStore->size == rawSeqStore->capacity) + return ERROR(dstSize_tooSmall); + seq->litLength = (U32)(split - backwardMatchLength - anchor); + seq->matchLength = (U32)mLength; + seq->offset = offset; + rawSeqStore->size++; + } - /* Insert the current entry into the hash table */ - ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, hBits, - (U32)(lastHashed - base), - *params); + /* Insert the current entry into the hash table --- it must be + * done after the previous block to avoid clobbering bestEntry */ + ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); - assert(ip + backwardMatchLength == lastHashed); + anchor = split + forwardMatchLength; - /* Fill the hash table from lastHashed+1 to ip+mLength*/ - /* Heuristic: don't need to fill the entire table at end of block */ - if (ip + mLength <= ilimit) { - rollingHash = ZSTD_ldm_fillLdmHashTable( - ldmState, rollingHash, lastHashed, - ip + mLength, base, hBits, *params); - lastHashed = ip + mLength - 1; + /* If we find a match that ends after the data that we've hashed + * then we have a repeating, overlapping, pattern. E.g. all zeros. + * If one repetition of the pattern matches our `stopMask` then all + * repetitions will. We don't need to insert them all into out table, + * only the first one. So skip over overlapping matches. 
+ * This is a major speed boost (20x) for compressing a single byte + * repeated, when that byte ends up in the table. + */ + if (anchor > ip + hashed) { + ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, minMatchLength); + /* Continue the outer loop at anchor (ip + hashed == anchor). */ + ip = anchor - hashed; + break; + } } - ip += mLength; - anchor = ip; + + ip += hashed; } + return iend - anchor; } @@ -412,7 +525,7 @@ static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size, } size_t ZSTD_ldm_generateSequences( - ldmState_t* ldmState, rawSeqStore_t* sequences, + ldmState_t* ldmState, RawSeqStore_t* sequences, ldmParams_t const* params, void const* src, size_t srcSize) { U32 const maxDist = 1U << params->windowLog; @@ -444,11 +557,13 @@ size_t ZSTD_ldm_generateSequences( assert(chunkStart < iend); /* 1. Perform overflow correction if necessary. */ - if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)) { + if (ZSTD_window_needOverflowCorrection(ldmState->window, 0, maxDist, ldmState->loadedDictEnd, chunkStart, chunkEnd)) { U32 const ldmHSize = 1U << params->hashLog; U32 const correction = ZSTD_window_correctOverflow( &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart); ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction); + /* invalidate dictionaries on overflow correction */ + ldmState->loadedDictEnd = 0; } /* 2. We enforce the maximum offset allowed. * @@ -456,9 +571,15 @@ size_t ZSTD_ldm_generateSequences( * the window through early invalidation. * TODO: * Test the chunk size. * * Try invalidation after the sequence generation and test the - * the offset against maxDist directly. + * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may + * be split into two sequences. 
This condition holds when using + * ZSTD_window_enforceMaxDist(), but if we move to checking offsets + * against maxDist directly, we'll have to carefully handle that case. */ - ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, NULL, NULL); + ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, &ldmState->loadedDictEnd, NULL); /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */ newLeftoverSize = ZSTD_ldm_generateSequences_internal( ldmState, sequences, params, chunkStart, chunkSize); @@ -480,7 +601,9 @@ size_t ZSTD_ldm_generateSequences( return 0; } -void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) { +void +ZSTD_ldm_skipSequences(RawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) +{ while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos; if (srcSize <= seq->litLength) { @@ -515,7 +638,7 @@ void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 cons * Returns the current sequence to handle, or if the rest of the block should * be literals, it returns a sequence with offset == 0. 
*/ -static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, +static rawSeq maybeSplitSequence(RawSeqStore_t* rawSeqStore, U32 const remaining, U32 const minMatch) { rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos]; @@ -539,14 +662,32 @@ static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, return sequence; } -size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +void ZSTD_ldm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes) { + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { + rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos]; + if (currPos >= currSeq.litLength + currSeq.matchLength) { + currPos -= currSeq.litLength + currSeq.matchLength; + rawSeqStore->pos++; + } else { + rawSeqStore->posInSequence = currPos; + break; + } + } + if (currPos == 0 || rawSeqStore->pos == rawSeqStore->size) { + rawSeqStore->posInSequence = 0; + } +} + +size_t ZSTD_ldm_blockCompress(RawSeqStore_t* rawSeqStore, + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_ParamSwitch_e useRowMatchFinder, void const* src, size_t srcSize) { const ZSTD_compressionParameters* const cParams = &ms->cParams; unsigned const minMatch = cParams->minMatch; - ZSTD_blockCompressor const blockCompressor = - ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dictMode(ms)); + ZSTD_BlockCompressor_f const blockCompressor = + ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms)); /* Input bounds */ BYTE const* const istart = (BYTE const*)src; BYTE const* const iend = istart + srcSize; @@ -554,27 +695,35 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, BYTE const* ip = istart; DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize); + /* If using opt parser, use LDMs only as candidates rather than always accepting them */ + if (cParams->strategy >= 
ZSTD_btopt) { + size_t lastLLSize; + ms->ldmSeqStore = rawSeqStore; + lastLLSize = blockCompressor(ms, seqStore, rep, src, srcSize); + ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore, srcSize); + return lastLLSize; + } + assert(rawSeqStore->pos <= rawSeqStore->size); assert(rawSeqStore->size <= rawSeqStore->capacity); - /* Loop through each sequence and apply the block compressor to the lits */ + /* Loop through each sequence and apply the block compressor to the literals */ while (rawSeqStore->pos < rawSeqStore->size && ip < iend) { /* maybeSplitSequence updates rawSeqStore->pos */ rawSeq const sequence = maybeSplitSequence(rawSeqStore, (U32)(iend - ip), minMatch); - int i; /* End signal */ if (sequence.offset == 0) break; - assert(sequence.offset <= (1U << cParams->windowLog)); assert(ip + sequence.litLength + sequence.matchLength <= iend); /* Fill tables for block compressor */ ZSTD_ldm_limitTableUpdate(ms, ip); ZSTD_ldm_fillFastTables(ms, ip); /* Run the block compressor */ - DEBUGLOG(5, "calling block compressor on segment of size %u", sequence.litLength); + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); { + int i; size_t const newLitLength = blockCompressor(ms, seqStore, rep, ip, sequence.litLength); ip += sequence.litLength; @@ -584,8 +733,8 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, rep[0] = sequence.offset; /* Store the sequence */ ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, - sequence.offset + ZSTD_REP_MOVE, - sequence.matchLength - MINMATCH); + OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); ip += sequence.matchLength; } } @@ -595,3 +744,5 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, /* Compress the last literals */ return blockCompressor(ms, seqStore, rep, ip, iend - ip); } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_ldm.h b/vendor/github.com/DataDog/zstd/zstd_ldm.h index a478461..9e1ae70 
100644 --- a/vendor/github.com/DataDog/zstd/zstd_ldm.h +++ b/vendor/github.com/DataDog/zstd/zstd_ldm.h @@ -1,19 +1,17 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. */ #ifndef ZSTD_LDM_H #define ZSTD_LDM_H -#if defined (__cplusplus) -extern "C" { -#endif - #include "zstd_compress_internal.h" /* ldmParams_t, U32 */ #include "zstd.h" /* ZSTD_CCtx, size_t */ @@ -23,6 +21,10 @@ extern "C" { #define ZSTD_LDM_DEFAULT_WINDOW_LOG ZSTD_WINDOWLOG_LIMIT_DEFAULT +void ZSTD_ldm_fillHashTable( + ldmState_t* state, const BYTE* ip, + const BYTE* iend, ldmParams_t const* params); + /** * ZSTD_ldm_generateSequences(): * @@ -38,7 +40,7 @@ extern "C" { * sequences. */ size_t ZSTD_ldm_generateSequences( - ldmState_t* ldms, rawSeqStore_t* sequences, + ldmState_t* ldms, RawSeqStore_t* sequences, ldmParams_t const* params, void const* src, size_t srcSize); /** @@ -59,8 +61,9 @@ size_t ZSTD_ldm_generateSequences( * two. We handle that case correctly, and update `rawSeqStore` appropriately. * NOTE: This function does not return any errors. */ -size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_ldm_blockCompress(RawSeqStore_t* rawSeqStore, + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_ParamSwitch_e useRowMatchFinder, void const* src, size_t srcSize); /** @@ -68,11 +71,17 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, * * Skip past `srcSize` bytes worth of sequences in `rawSeqStore`. * Avoids emitting matches less than `minMatch` bytes. 
- * Must be called for data with is not passed to ZSTD_ldm_blockCompress(). + * Must be called for data that is not passed to ZSTD_ldm_blockCompress(). */ -void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, +void ZSTD_ldm_skipSequences(RawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch); +/* ZSTD_ldm_skipRawSeqStoreBytes(): + * Moves forward in rawSeqStore by nbBytes, updating fields 'pos' and 'posInSequence'. + * Not to be used in conjunction with ZSTD_ldm_skipSequences(). + * Must be called for data with is not passed to ZSTD_ldm_blockCompress(). + */ +void ZSTD_ldm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes); /** ZSTD_ldm_getTableSize() : * Estimate the space needed for long distance matching tables or 0 if LDM is @@ -98,8 +107,6 @@ size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize); void ZSTD_ldm_adjustParameters(ldmParams_t* params, ZSTD_compressionParameters const* cParams); -#if defined (__cplusplus) -} -#endif - #endif /* ZSTD_FAST_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_ldm_geartab.h b/vendor/github.com/DataDog/zstd/zstd_ldm_geartab.h new file mode 100644 index 0000000..5d799e7 --- /dev/null +++ b/vendor/github.com/DataDog/zstd/zstd_ldm_geartab.h @@ -0,0 +1,109 @@ +#ifndef USE_EXTERNAL_ZSTD +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef ZSTD_LDM_GEARTAB_H +#define ZSTD_LDM_GEARTAB_H + +#include "compiler.h" /* UNUSED_ATTR */ +#include "mem.h" /* U64 */ + +static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] = { + 0xf5b8f72c5f77775c, 0x84935f266b7ac412, 0xb647ada9ca730ccc, + 0xb065bb4b114fb1de, 0x34584e7e8c3a9fd0, 0x4e97e17c6ae26b05, + 0x3a03d743bc99a604, 0xcecd042422c4044f, 0x76de76c58524259e, + 0x9c8528f65badeaca, 0x86563706e2097529, 0x2902475fa375d889, + 0xafb32a9739a5ebe6, 0xce2714da3883e639, 0x21eaf821722e69e, + 0x37b628620b628, 0x49a8d455d88caf5, 0x8556d711e6958140, + 0x4f7ae74fc605c1f, 0x829f0c3468bd3a20, 0x4ffdc885c625179e, + 0x8473de048a3daf1b, 0x51008822b05646b2, 0x69d75d12b2d1cc5f, + 0x8c9d4a19159154bc, 0xc3cc10f4abbd4003, 0xd06ddc1cecb97391, + 0xbe48e6e7ed80302e, 0x3481db31cee03547, 0xacc3f67cdaa1d210, + 0x65cb771d8c7f96cc, 0x8eb27177055723dd, 0xc789950d44cd94be, + 0x934feadc3700b12b, 0x5e485f11edbdf182, 0x1e2e2a46fd64767a, + 0x2969ca71d82efa7c, 0x9d46e9935ebbba2e, 0xe056b67e05e6822b, + 0x94d73f55739d03a0, 0xcd7010bdb69b5a03, 0x455ef9fcd79b82f4, + 0x869cb54a8749c161, 0x38d1a4fa6185d225, 0xb475166f94bbe9bb, + 0xa4143548720959f1, 0x7aed4780ba6b26ba, 0xd0ce264439e02312, + 0x84366d746078d508, 0xa8ce973c72ed17be, 0x21c323a29a430b01, + 0x9962d617e3af80ee, 0xab0ce91d9c8cf75b, 0x530e8ee6d19a4dbc, + 0x2ef68c0cf53f5d72, 0xc03a681640a85506, 0x496e4e9f9c310967, + 0x78580472b59b14a0, 0x273824c23b388577, 0x66bf923ad45cb553, + 0x47ae1a5a2492ba86, 0x35e304569e229659, 0x4765182a46870b6f, + 0x6cbab625e9099412, 0xddac9a2e598522c1, 0x7172086e666624f2, + 0xdf5003ca503b7837, 0x88c0c1db78563d09, 0x58d51865acfc289d, + 0x177671aec65224f1, 0xfb79d8a241e967d7, 0x2be1e101cad9a49a, + 0x6625682f6e29186b, 0x399553457ac06e50, 0x35dffb4c23abb74, + 0x429db2591f54aade, 0xc52802a8037d1009, 0x6acb27381f0b25f3, + 0xf45e2551ee4f823b, 0x8b0ea2d99580c2f7, 0x3bed519cbcb4e1e1, + 0xff452823dbb010a, 0x9d42ed614f3dd267, 0x5b9313c06257c57b, + 0xa114b8008b5e1442, 0xc1fe311c11c13d4b, 0x66e8763ea34c5568, + 
0x8b982af1c262f05d, 0xee8876faaa75fbb7, 0x8a62a4d0d172bb2a, + 0xc13d94a3b7449a97, 0x6dbbba9dc15d037c, 0xc786101f1d92e0f1, + 0xd78681a907a0b79b, 0xf61aaf2962c9abb9, 0x2cfd16fcd3cb7ad9, + 0x868c5b6744624d21, 0x25e650899c74ddd7, 0xba042af4a7c37463, + 0x4eb1a539465a3eca, 0xbe09dbf03b05d5ca, 0x774e5a362b5472ba, + 0x47a1221229d183cd, 0x504b0ca18ef5a2df, 0xdffbdfbde2456eb9, + 0x46cd2b2fbee34634, 0xf2aef8fe819d98c3, 0x357f5276d4599d61, + 0x24a5483879c453e3, 0x88026889192b4b9, 0x28da96671782dbec, + 0x4ef37c40588e9aaa, 0x8837b90651bc9fb3, 0xc164f741d3f0e5d6, + 0xbc135a0a704b70ba, 0x69cd868f7622ada, 0xbc37ba89e0b9c0ab, + 0x47c14a01323552f6, 0x4f00794bacee98bb, 0x7107de7d637a69d5, + 0x88af793bb6f2255e, 0xf3c6466b8799b598, 0xc288c616aa7f3b59, + 0x81ca63cf42fca3fd, 0x88d85ace36a2674b, 0xd056bd3792389e7, + 0xe55c396c4e9dd32d, 0xbefb504571e6c0a6, 0x96ab32115e91e8cc, + 0xbf8acb18de8f38d1, 0x66dae58801672606, 0x833b6017872317fb, + 0xb87c16f2d1c92864, 0xdb766a74e58b669c, 0x89659f85c61417be, + 0xc8daad856011ea0c, 0x76a4b565b6fe7eae, 0xa469d085f6237312, + 0xaaf0365683a3e96c, 0x4dbb746f8424f7b8, 0x638755af4e4acc1, + 0x3d7807f5bde64486, 0x17be6d8f5bbb7639, 0x903f0cd44dc35dc, + 0x67b672eafdf1196c, 0xa676ff93ed4c82f1, 0x521d1004c5053d9d, + 0x37ba9ad09ccc9202, 0x84e54d297aacfb51, 0xa0b4b776a143445, + 0x820d471e20b348e, 0x1874383cb83d46dc, 0x97edeec7a1efe11c, + 0xb330e50b1bdc42aa, 0x1dd91955ce70e032, 0xa514cdb88f2939d5, + 0x2791233fd90db9d3, 0x7b670a4cc50f7a9b, 0x77c07d2a05c6dfa5, + 0xe3778b6646d0a6fa, 0xb39c8eda47b56749, 0x933ed448addbef28, + 0xaf846af6ab7d0bf4, 0xe5af208eb666e49, 0x5e6622f73534cd6a, + 0x297daeca42ef5b6e, 0x862daef3d35539a6, 0xe68722498f8e1ea9, + 0x981c53093dc0d572, 0xfa09b0bfbf86fbf5, 0x30b1e96166219f15, + 0x70e7d466bdc4fb83, 0x5a66736e35f2a8e9, 0xcddb59d2b7c1baef, + 0xd6c7d247d26d8996, 0xea4e39eac8de1ba3, 0x539c8bb19fa3aff2, + 0x9f90e4c5fd508d8, 0xa34e5956fbaf3385, 0x2e2f8e151d3ef375, + 0x173691e9b83faec1, 0xb85a8d56bf016379, 0x8382381267408ae3, + 0xb90f901bbdc0096d, 
0x7c6ad32933bcec65, 0x76bb5e2f2c8ad595, + 0x390f851a6cf46d28, 0xc3e6064da1c2da72, 0xc52a0c101cfa5389, + 0xd78eaf84a3fbc530, 0x3781b9e2288b997e, 0x73c2f6dea83d05c4, + 0x4228e364c5b5ed7, 0x9d7a3edf0da43911, 0x8edcfeda24686756, + 0x5e7667a7b7a9b3a1, 0x4c4f389fa143791d, 0xb08bc1023da7cddc, + 0x7ab4be3ae529b1cc, 0x754e6132dbe74ff9, 0x71635442a839df45, + 0x2f6fb1643fbe52de, 0x961e0a42cf7a8177, 0xf3b45d83d89ef2ea, + 0xee3de4cf4a6e3e9b, 0xcd6848542c3295e7, 0xe4cee1664c78662f, + 0x9947548b474c68c4, 0x25d73777a5ed8b0b, 0xc915b1d636b7fc, + 0x21c2ba75d9b0d2da, 0x5f6b5dcf608a64a1, 0xdcf333255ff9570c, + 0x633b922418ced4ee, 0xc136dde0b004b34a, 0x58cc83b05d4b2f5a, + 0x5eb424dda28e42d2, 0x62df47369739cd98, 0xb4e0b42485e4ce17, + 0x16e1f0c1f9a8d1e7, 0x8ec3916707560ebf, 0x62ba6e2df2cc9db3, + 0xcbf9f4ff77d83a16, 0x78d9d7d07d2bbcc4, 0xef554ce1e02c41f4, + 0x8d7581127eccf94d, 0xa9b53336cb3c8a05, 0x38c42c0bf45c4f91, + 0x640893cdf4488863, 0x80ec34bc575ea568, 0x39f324f5b48eaa40, + 0xe9d9ed1f8eff527f, 0x9224fc058cc5a214, 0xbaba00b04cfe7741, + 0x309a9f120fcf52af, 0xa558f3ec65626212, 0x424bec8b7adabe2f, + 0x41622513a6aea433, 0xb88da2d5324ca798, 0xd287733b245528a4, + 0x9a44697e6d68aec3, 0x7b1093be2f49bb28, 0x50bbec632e3d8aad, + 0x6cd90723e1ea8283, 0x897b9e7431b02bf3, 0x219efdcb338a7047, + 0x3b0311f0a27c0656, 0xdb17bf91c0db96e7, 0x8cd4fd6b4e85a5b2, + 0xfab071054ba6409d, 0x40d6fe831fa9dfd9, 0xaf358debad7d791e, + 0xeb8d0e25a65e3e58, 0xbbcbd3df14e08580, 0xcf751f27ecdab2b, + 0x2b4da14f2613d8f4 +}; + +#endif /* ZSTD_LDM_GEARTAB_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_legacy.h b/vendor/github.com/DataDog/zstd/zstd_legacy.h index 0dbd3c7..3bea26b 100644 --- a/vendor/github.com/DataDog/zstd/zstd_legacy.h +++ b/vendor/github.com/DataDog/zstd/zstd_legacy.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -124,6 +125,20 @@ MEM_STATIC size_t ZSTD_decompressLegacy( const void* dict,size_t dictSize) { U32 const version = ZSTD_isLegacy(src, compressedSize); + char x; + /* Avoid passing NULL to legacy decoding. */ + if (dst == NULL) { + assert(dstCapacity == 0); + dst = &x; + } + if (src == NULL) { + assert(compressedSize == 0); + src = &x; + } + if (dict == NULL) { + assert(dictSize == 0); + dict = &x; + } (void)dst; (void)dstCapacity; (void)dict; (void)dictSize; /* unused when ZSTD_LEGACY_SUPPORT >= 8 */ switch(version) { @@ -242,6 +257,13 @@ MEM_STATIC ZSTD_frameSizeInfo ZSTD_findFrameSizeInfoLegacy(const void *src, size frameSizeInfo.compressedSize = ERROR(srcSize_wrong); frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR; } + /* In all cases, decompressedBound == nbBlocks * ZSTD_BLOCKSIZE_MAX. + * So we can compute nbBlocks without having to change every function. + */ + if (frameSizeInfo.decompressedBound != ZSTD_CONTENTSIZE_ERROR) { + assert((frameSizeInfo.decompressedBound & (ZSTD_BLOCKSIZE_MAX - 1)) == 0); + frameSizeInfo.nbBlocks = (size_t)(frameSizeInfo.decompressedBound / ZSTD_BLOCKSIZE_MAX); + } return frameSizeInfo; } @@ -280,6 +302,12 @@ MEM_STATIC size_t ZSTD_freeLegacyStreamContext(void* legacyContext, U32 version) MEM_STATIC size_t ZSTD_initLegacyStream(void** legacyContext, U32 prevVersion, U32 newVersion, const void* dict, size_t dictSize) { + char x; + /* Avoid passing NULL to legacy decoding. 
*/ + if (dict == NULL) { + assert(dictSize == 0); + dict = &x; + } DEBUGLOG(5, "ZSTD_initLegacyStream for v0.%u", newVersion); if (prevVersion != newVersion) ZSTD_freeLegacyStreamContext(*legacyContext, prevVersion); switch(newVersion) @@ -339,6 +367,16 @@ MEM_STATIC size_t ZSTD_initLegacyStream(void** legacyContext, U32 prevVersion, U MEM_STATIC size_t ZSTD_decompressLegacyStream(void* legacyContext, U32 version, ZSTD_outBuffer* output, ZSTD_inBuffer* input) { + static char x; + /* Avoid passing NULL to legacy decoding. */ + if (output->dst == NULL) { + assert(output->size == 0); + output->dst = &x; + } + if (input->src == NULL) { + assert(input->size == 0); + input->src = &x; + } DEBUGLOG(5, "ZSTD_decompressLegacyStream for v0.%u", version); switch(version) { @@ -413,3 +451,5 @@ MEM_STATIC size_t ZSTD_decompressLegacyStream(void* legacyContext, U32 version, #endif #endif /* ZSTD_LEGACY_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_opt.c b/vendor/github.com/DataDog/zstd/zstd_opt.c index 2e50fca..35eac21 100644 --- a/vendor/github.com/DataDog/zstd/zstd_opt.c +++ b/vendor/github.com/DataDog/zstd/zstd_opt.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Przemyslaw Skibinski, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -12,42 +13,52 @@ #include "hist.h" #include "zstd_opt.h" +#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ -#define ZSTD_FREQ_DIV 4 /* log factor when using previous stats to init next stats */ #define ZSTD_MAX_PRICE (1<<30) -#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ +#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ /*-************************************* * Price functions for optimal parser ***************************************/ -#if 0 /* approximation at bit level */ +#if 0 /* approximation at bit level (for tests) */ # define BITCOST_ACCURACY 0 # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) -# define WEIGHT(stat) ((void)opt, ZSTD_bitWeight(stat)) -#elif 0 /* fractional bit accuracy */ +# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) +#elif 0 /* fractional bit accuracy (for tests) */ # define BITCOST_ACCURACY 8 # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) -# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) +# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) #else /* opt==approx, ultra==accurate */ # define BITCOST_ACCURACY 8 # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) -# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) +# define WEIGHT(stat,opt) ((opt) ? 
ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) #endif +/* ZSTD_bitWeight() : + * provide estimated "cost" of a stat in full bits only */ MEM_STATIC U32 ZSTD_bitWeight(U32 stat) { return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); } +/* ZSTD_fracWeight() : + * provide fractional-bit "cost" of a stat, + * using linear interpolation approximation */ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) { U32 const stat = rawStat + 1; U32 const hb = ZSTD_highbit32(stat); U32 const BWeight = hb * BITCOST_MULTIPLIER; + /* Fweight was meant for "Fractional weight" + * but it's effectively a value between 1 and 2 + * using fixed point arithmetic */ U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; U32 const weight = BWeight + FWeight; assert(hb + BITCOST_ACCURACY < 31); @@ -58,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) /* debugging function, * @return price in bytes as fractional value * for debug messages only */ -MEM_STATIC double ZSTD_fCost(U32 price) +MEM_STATIC double ZSTD_fCost(int price) { return (double)price / (BITCOST_MULTIPLIER*8); } @@ -66,7 +77,7 @@ MEM_STATIC double ZSTD_fCost(U32 price) static int ZSTD_compressedLiterals(optState_t const* const optPtr) { - return optPtr->literalCompressionMode != ZSTD_lcm_uncompressed; + return optPtr->literalCompressionMode != ZSTD_ps_disable; } static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel) @@ -79,25 +90,52 @@ static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel) } -/* ZSTD_downscaleStat() : - * reduce all elements in table by a factor 2^(ZSTD_FREQ_DIV+malus) - * return the resulting sum of elements */ -static U32 ZSTD_downscaleStat(unsigned* table, U32 lastEltIndex, int malus) +static U32 sum_u32(const unsigned table[], size_t nbElts) +{ + size_t n; + U32 total = 0; + for (n=0; n 0 && ZSTD_FREQ_DIV+malus < 31); + DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", + (unsigned)lastEltIndex+1, (unsigned)shift ); + assert(shift < 30); for (s=0; s> (ZSTD_FREQ_DIV+malus)); - sum += 
table[s]; + unsigned const base = base1 ? 1 : (table[s]>0); + unsigned const newStat = base + (table[s] >> shift); + sum += newStat; + table[s] = newStat; } return sum; } +/* ZSTD_scaleStats() : + * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ +static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) +{ + U32 const prevsum = sum_u32(table, lastEltIndex+1); + U32 const factor = prevsum >> logTarget; + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; + return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); +} + /* ZSTD_rescaleFreqs() : * if first block (detected by optPtr->litLengthSum == 0) : init statistics * take hints from dictionary if there is one - * or init from zero, using src for literals stats, or flat 1 for match symbols + * and init from zero if there is none, + * using src for literals stats, and baseline stats for sequence symbols * otherwise downscale existing stats, to be used as seed for next block. 
*/ static void @@ -109,24 +147,28 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); optPtr->priceType = zop_dynamic; - if (optPtr->litLengthSum == 0) { /* first block : init */ - if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ - DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); + if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ + + /* heuristic: use pre-defined stats for too small inputs */ + if (srcSize <= ZSTD_PREDEF_THRESHOLD) { + DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); optPtr->priceType = zop_predef; } assert(optPtr->symbolCosts != NULL); if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { - /* huffman table presumed generated by dictionary */ + + /* huffman stats covering the full value set : table presumed generated by dictionary */ optPtr->priceType = zop_dynamic; if (compressedLiterals) { + /* generate literals statistics from huffman table */ unsigned lit; assert(optPtr->litFreq != NULL); optPtr->litSum = 0; for (lit=0; lit<=MaxLit; lit++) { U32 const scaleLog = 11; /* scale to 2K */ - U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->huf.CTable, lit); + U32 const bitCost = HUF_getNbBitsFromCTable(optPtr->symbolCosts->huf.CTable, lit); assert(bitCost <= scaleLog); optPtr->litFreq[lit] = bitCost ? 
1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; optPtr->litSum += optPtr->litFreq[lit]; @@ -168,20 +210,26 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, optPtr->offCodeSum += optPtr->offCodeFreq[of]; } } - } else { /* not a dictionary */ + } else { /* first block, no dictionary */ assert(optPtr->litFreq != NULL); if (compressedLiterals) { + /* base initial cost of literals on direct frequency within src */ unsigned lit = MaxLit; HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ - optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1); + optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); } - { unsigned ll; - for (ll=0; ll<=MaxLL; ll++) - optPtr->litLengthFreq[ll] = 1; + { unsigned const baseLLfreqs[MaxLL+1] = { + 4, 2, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1 + }; + ZSTD_memcpy(optPtr->litLengthFreq, baseLLfreqs, sizeof(baseLLfreqs)); + optPtr->litLengthSum = sum_u32(baseLLfreqs, MaxLL+1); } - optPtr->litLengthSum = MaxLL+1; { unsigned ml; for (ml=0; ml<=MaxML; ml++) @@ -189,21 +237,25 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, } optPtr->matchLengthSum = MaxML+1; - { unsigned of; - for (of=0; of<=MaxOff; of++) - optPtr->offCodeFreq[of] = 1; + { unsigned const baseOFCfreqs[MaxOff+1] = { + 6, 2, 1, 1, 2, 3, 4, 4, + 4, 3, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1 + }; + ZSTD_memcpy(optPtr->offCodeFreq, baseOFCfreqs, sizeof(baseOFCfreqs)); + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); } - optPtr->offCodeSum = MaxOff+1; } - } else { /* new block : re-use previous statistics, scaled down */ + } else { /* new block : scale down accumulated statistics */ if (compressedLiterals) - optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1); - optPtr->litLengthSum = ZSTD_downscaleStat(optPtr->litLengthFreq, MaxLL, 0); - optPtr->matchLengthSum = 
ZSTD_downscaleStat(optPtr->matchLengthFreq, MaxML, 0); - optPtr->offCodeSum = ZSTD_downscaleStat(optPtr->offCodeFreq, MaxOff, 0); + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); + optPtr->litLengthSum = ZSTD_scaleStats(optPtr->litLengthFreq, MaxLL, 11); + optPtr->matchLengthSum = ZSTD_scaleStats(optPtr->matchLengthFreq, MaxML, 11); + optPtr->offCodeSum = ZSTD_scaleStats(optPtr->offCodeFreq, MaxOff, 11); } ZSTD_setBasePrices(optPtr, optLevel); @@ -216,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, const optState_t* const optPtr, int optLevel) { + DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength); if (litLength == 0) return 0; if (!ZSTD_compressedLiterals(optPtr)) @@ -225,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ /* dynamic statistics */ - { U32 price = litLength * optPtr->litSumBasePrice; + { U32 price = optPtr->litSumBasePrice * litLength; + U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; U32 u; + assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); for (u=0; u < litLength; u++) { - assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ - price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); + U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); + if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; + price -= litPrice; } return price; } @@ -239,7 +295,17 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, * cost of literalLength symbol */ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel) { - if (optPtr->priceType == zop_predef) return WEIGHT(litLength, optLevel); + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return 
WEIGHT(litLength, optLevel); + + /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX + * because it isn't representable in the zstd format. + * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. + * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); /* dynamic statistics */ { U32 const llCode = ZSTD_LLcode(litLength); @@ -249,57 +315,26 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP } } -/* ZSTD_litLengthContribution() : - * @return ( cost(litlength) - cost(0) ) - * this value can then be added to rawLiteralsCost() - * to provide a cost which is directly comparable to a match ending at same position */ -static int ZSTD_litLengthContribution(U32 const litLength, const optState_t* const optPtr, int optLevel) -{ - if (optPtr->priceType >= zop_predef) return (int)WEIGHT(litLength, optLevel); - - /* dynamic statistics */ - { U32 const llCode = ZSTD_LLcode(litLength); - int const contribution = (int)(LL_bits[llCode] * BITCOST_MULTIPLIER) - + (int)WEIGHT(optPtr->litLengthFreq[0], optLevel) /* note: log2litLengthSum cancel out */ - - (int)WEIGHT(optPtr->litLengthFreq[llCode], optLevel); -#if 1 - return contribution; -#else - return MAX(0, contribution); /* sometimes better, sometimes not ... 
*/ -#endif - } -} - -/* ZSTD_literalsContribution() : - * creates a fake cost for the literals part of a sequence - * which can be compared to the ending cost of a match - * should a new match start at this position */ -static int ZSTD_literalsContribution(const BYTE* const literals, U32 const litLength, - const optState_t* const optPtr, - int optLevel) -{ - int const contribution = (int)ZSTD_rawLiteralsCost(literals, litLength, optPtr, optLevel) - + ZSTD_litLengthContribution(litLength, optPtr, optLevel); - return contribution; -} - /* ZSTD_getMatchPrice() : - * Provides the cost of the match part (offset + matchLength) of a sequence + * Provides the cost of the match part (offset + matchLength) of a sequence. * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. - * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */ + * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ FORCE_INLINE_TEMPLATE U32 -ZSTD_getMatchPrice(U32 const offset, +ZSTD_getMatchPrice(U32 const offBase, U32 const matchLength, const optState_t* const optPtr, int const optLevel) { U32 price; - U32 const offCode = ZSTD_highbit32(offset+1); + U32 const offCode = ZSTD_highbit32(offBase); U32 const mlBase = matchLength - MINMATCH; assert(matchLength >= MINMATCH); - if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ - return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); + if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ + return WEIGHT(mlBase, optLevel) + + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ /* dynamic statistics */ price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); @@ -318,10 +353,10 @@ 
ZSTD_getMatchPrice(U32 const offset, } /* ZSTD_updateStats() : - * assumption : literals + litLengtn <= iend */ + * assumption : literals + litLength <= iend */ static void ZSTD_updateStats(optState_t* const optPtr, U32 litLength, const BYTE* literals, - U32 offsetCode, U32 matchLength) + U32 offBase, U32 matchLength) { /* literals */ if (ZSTD_compressedLiterals(optPtr)) { @@ -337,8 +372,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, optPtr->litLengthSum++; } - /* match offset code (0-2=>repCode; 3+=>offset+2) */ - { U32 const offCode = ZSTD_highbit32(offsetCode+1); + /* offset code : follows storeSeq() numeric representation */ + { U32 const offCode = ZSTD_highbit32(offBase); assert(offCode <= MaxOff); optPtr->offCodeFreq[offCode]++; optPtr->offCodeSum++; @@ -372,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) /* Update hashTable3 up to ip (excluded) Assumption : always within prefix (i.e. not within extDict) */ -static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, - U32* nextToUpdate3, - const BYTE* const ip) +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_MatchState_t* ms, + U32* nextToUpdate3, + const BYTE* const ip) { U32* const hashTable3 = ms->hashTable3; U32 const hashLog3 = ms->hashLog3; @@ -398,11 +435,15 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, * Binary Tree search ***************************************/ /** ZSTD_insertBt1() : add one or multiple positions to tree. - * ip : assumed <= iend-8 . + * @param ip assumed <= iend-8 . 
+ * @param target The target of ZSTD_updateTree_internal() - we are filling to this position * @return : nb of positions added */ -static U32 ZSTD_insertBt1( - ZSTD_matchState_t* ms, +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_insertBt1( + const ZSTD_MatchState_t* ms, const BYTE* const ip, const BYTE* const iend, + U32 const target, U32 const mls, const int extDict) { const ZSTD_compressionParameters* const cParams = &ms->cParams; @@ -420,32 +461,36 @@ static U32 ZSTD_insertBt1( const BYTE* const dictEnd = dictBase + dictLimit; const BYTE* const prefixStart = base + dictLimit; const BYTE* match; - const U32 current = (U32)(ip-base); - const U32 btLow = btMask >= current ? 0 : current - btMask; - U32* smallerPtr = bt + 2*(current&btMask); + const U32 curr = (U32)(ip-base); + const U32 btLow = btMask >= curr ? 0 : curr - btMask; + U32* smallerPtr = bt + 2*(curr&btMask); U32* largerPtr = smallerPtr + 1; U32 dummy32; /* to be nullified at the end */ - U32 const windowLow = ms->window.lowLimit; - U32 matchEndIdx = current+8+1; + /* windowLow is based on target because + * we only need positions that will be in the window at the end of the tree update. 
+ */ + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, target, cParams->windowLog); + U32 matchEndIdx = curr+8+1; size_t bestLength = 8; U32 nbCompares = 1U << cParams->searchLog; #ifdef ZSTD_C_PREDICT - U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0); - U32 predictedLarge = *(bt + 2*((current-1)&btMask) + 1); + U32 predictedSmall = *(bt + 2*((curr-1)&btMask) + 0); + U32 predictedLarge = *(bt + 2*((curr-1)&btMask) + 1); predictedSmall += (predictedSmall>0); predictedLarge += (predictedLarge>0); #endif /* ZSTD_C_PREDICT */ - DEBUGLOG(8, "ZSTD_insertBt1 (%u)", current); + DEBUGLOG(8, "ZSTD_insertBt1 (%u)", curr); + assert(curr <= target); assert(ip <= iend-8); /* required for h calculation */ - hashTable[h] = current; /* Update Hash Table */ + hashTable[h] = curr; /* Update Hash Table */ assert(windowLow > 0); - while (nbCompares-- && (matchIndex >= windowLow)) { + for (; nbCompares && (matchIndex >= windowLow); --nbCompares) { U32* const nextPtr = bt + 2*(matchIndex & btMask); size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ - assert(matchIndex < current); + assert(matchIndex < curr); #ifdef ZSTD_C_PREDICT /* note : can create issues when hlog small <= 11 */ const U32* predictPtr = bt + 2*((matchIndex-1) & btMask); /* written this way, as bt is a roll buffer */ @@ -508,25 +553,26 @@ static U32 ZSTD_insertBt1( *smallerPtr = *largerPtr = 0; { U32 positions = 0; if (bestLength > 384) positions = MIN(192, (U32)(bestLength - 384)); /* speed optimization */ - assert(matchEndIdx > current + 8); - return MAX(positions, matchEndIdx - (current + 8)); + assert(matchEndIdx > curr + 8); + return MAX(positions, matchEndIdx - (curr + 8)); } } FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR void ZSTD_updateTree_internal( - ZSTD_matchState_t* ms, + ZSTD_MatchState_t* ms, const BYTE* const ip, const BYTE* const iend, const U32 mls, const ZSTD_dictMode_e dictMode) { const BYTE* const base = 
ms->window.base; U32 const target = (U32)(ip - base); U32 idx = ms->nextToUpdate; - DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", idx, target, dictMode); while(idx < target) { - U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, mls, dictMode == ZSTD_extDict); + U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, target, mls, dictMode == ZSTD_extDict); assert(idx < (U32)(idx + forward)); idx += forward; } @@ -535,25 +581,28 @@ void ZSTD_updateTree_internal( ms->nextToUpdate = target; } -void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { +void ZSTD_updateTree(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend) { ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); } FORCE_INLINE_TEMPLATE -U32 ZSTD_insertBtAndGetAllMatches ( - ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ - ZSTD_matchState_t* ms, - U32* nextToUpdate3, - const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, - const U32 rep[ZSTD_REP_NUM], - U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ - const U32 lengthToBeat, - U32 const mls /* template */) +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 +ZSTD_insertBtAndGetAllMatches ( + ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ + ZSTD_MatchState_t* ms, + U32* nextToUpdate3, + const BYTE* const ip, const BYTE* const iLimit, + const ZSTD_dictMode_e dictMode, + const U32 rep[ZSTD_REP_NUM], + const U32 ll0, /* tells if associated literal length is 0 or not. 
This value must be 0 or 1 */ + const U32 lengthToBeat, + const U32 mls /* template */) { const ZSTD_compressionParameters* const cParams = &ms->cParams; U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); const BYTE* const base = ms->window.base; - U32 const current = (U32)(ip-base); + U32 const curr = (U32)(ip-base); U32 const hashLog = cParams->hashLog; U32 const minMatch = (mls==3) ? 3 : 4; U32* const hashTable = ms->hashTable; @@ -567,17 +616,17 @@ U32 ZSTD_insertBtAndGetAllMatches ( U32 const dictLimit = ms->window.dictLimit; const BYTE* const dictEnd = dictBase + dictLimit; const BYTE* const prefixStart = base + dictLimit; - U32 const btLow = (btMask >= current) ? 0 : current - btMask; - U32 const windowLow = ZSTD_getLowestMatchIndex(ms, current, cParams->windowLog); + U32 const btLow = (btMask >= curr) ? 0 : curr - btMask; + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog); U32 const matchLow = windowLow ? windowLow : 1; - U32* smallerPtr = bt + 2*(current&btMask); - U32* largerPtr = bt + 2*(current&btMask) + 1; - U32 matchEndIdx = current+8+1; /* farthest referenced position of any match => detects repetitive patterns */ + U32* smallerPtr = bt + 2*(curr&btMask); + U32* largerPtr = bt + 2*(curr&btMask) + 1; + U32 matchEndIdx = curr+8+1; /* farthest referenced position of any match => detects repetitive patterns */ U32 dummy32; /* to be nullified at the end */ U32 mnum = 0; U32 nbCompares = 1U << cParams->searchLog; - const ZSTD_matchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; + const ZSTD_MatchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; const ZSTD_compressionParameters* const dmsCParams = dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL; const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? 
dms->window.base : NULL; @@ -591,7 +640,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( U32 const dmsBtLow = dictMode == ZSTD_dictMatchState && dmsBtMask < dmsHighLimit - dmsLowLimit ? dmsHighLimit - dmsBtMask : dmsLowLimit; size_t bestLength = lengthToBeat-1; - DEBUGLOG(8, "ZSTD_insertBtAndGetAllMatches: current=%u", current); + DEBUGLOG(8, "ZSTD_insertBtAndGetAllMatches: current=%u", curr); /* check repCode */ assert(ll0 <= 1); /* necessarily 1 or 0 */ @@ -599,27 +648,30 @@ U32 ZSTD_insertBtAndGetAllMatches ( U32 repCode; for (repCode = ll0; repCode < lastR; repCode++) { U32 const repOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; - U32 const repIndex = current - repOffset; + U32 const repIndex = curr - repOffset; U32 repLen = 0; - assert(current >= dictLimit); - if (repOffset-1 /* intentional overflow, discards 0 and -1 */ < current-dictLimit) { /* equivalent to `current > repIndex >= dictLimit` */ - if (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repOffset, minMatch)) { + assert(curr >= dictLimit); + if (repOffset-1 /* intentional overflow, discards 0 and -1 */ < curr-dictLimit) { /* equivalent to `curr > repIndex >= dictLimit` */ + /* We must validate the repcode offset because when we're using a dictionary the + * valid offset range shrinks when the dictionary goes out of bounds. + */ + if ((repIndex >= windowLow) & (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repOffset, minMatch))) { repLen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repOffset, iLimit) + minMatch; } - } else { /* repIndex < dictLimit || repIndex >= current */ + } else { /* repIndex < dictLimit || repIndex >= curr */ const BYTE* const repMatch = dictMode == ZSTD_dictMatchState ? 
dmsBase + repIndex - dmsIndexDelta : dictBase + repIndex; - assert(current >= windowLow); + assert(curr >= windowLow); if ( dictMode == ZSTD_extDict - && ( ((repOffset-1) /*intentional overflow*/ < current - windowLow) /* equivalent to `current > repIndex >= windowLow` */ - & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */) + && ( ((repOffset-1) /*intentional overflow*/ < curr - windowLow) /* equivalent to `curr > repIndex >= windowLow` */ + & (ZSTD_index_overlap_check(dictLimit, repIndex)) ) && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch; } if (dictMode == ZSTD_dictMatchState - && ( ((repOffset-1) /*intentional overflow*/ < current - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `current > repIndex >= dmsLowLimit` */ - & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */ + && ( ((repOffset-1) /*intentional overflow*/ < curr - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `curr > repIndex >= dmsLowLimit` */ + & (ZSTD_index_overlap_check(dictLimit, repIndex)) ) && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch; } } @@ -628,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", repCode, ll0, repOffset, repLen); bestLength = repLen; - matches[mnum].off = repCode - ll0; + matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ matches[mnum].len = (U32)repLen; mnum++; if ( (repLen > sufficient_len) @@ -640,7 +692,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( if ((mls == 3) /*static*/ && (bestLength < mls)) { U32 const matchIndex3 = 
ZSTD_insertAndFindFirstIndexHash3(ms, nextToUpdate3, ip); if ((matchIndex3 >= matchLow) - & (current - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) { + & (curr - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) { size_t mlen; if ((dictMode == ZSTD_noDict) /*static*/ || (dictMode == ZSTD_dictMatchState) /*static*/ || (matchIndex3 >= dictLimit)) { const BYTE* const match = base + matchIndex3; @@ -655,26 +707,26 @@ U32 ZSTD_insertBtAndGetAllMatches ( DEBUGLOG(8, "found small match with hlog3, of length %u", (U32)mlen); bestLength = mlen; - assert(current > matchIndex3); + assert(curr > matchIndex3); assert(mnum==0); /* no prior solution */ - matches[0].off = (current - matchIndex3) + ZSTD_REP_MOVE; + matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); matches[0].len = (U32)mlen; mnum = 1; if ( (mlen > sufficient_len) | (ip+mlen == iLimit) ) { /* best possible length */ - ms->nextToUpdate = current+1; /* skip insertion */ + ms->nextToUpdate = curr+1; /* skip insertion */ return 1; } } } /* no dictMatchState lookup: dicts don't have a populated HC3 table */ - } + } /* if (mls == 3) */ - hashTable[h] = current; /* Update Hash Table */ + hashTable[h] = curr; /* Update Hash Table */ - while (nbCompares-- && (matchIndex >= matchLow)) { + for (; nbCompares && (matchIndex >= matchLow); --nbCompares) { U32* const nextPtr = bt + 2*(matchIndex & btMask); const BYTE* match; size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ - assert(current > matchIndex); + assert(curr > matchIndex); if ((dictMode == ZSTD_noDict) || (dictMode == ZSTD_dictMatchState) || (matchIndex+matchLength >= dictLimit)) { assert(matchIndex+matchLength >= dictLimit); /* ensure the condition is correct when !extDict */ @@ -690,21 +742,20 @@ U32 ZSTD_insertBtAndGetAllMatches ( } if (matchLength > bestLength) { - DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", - 
(U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE); + DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", + (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); assert(matchEndIdx > matchIndex); if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; bestLength = matchLength; - matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE; + matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); matches[mnum].len = (U32)matchLength; mnum++; if ( (matchLength > ZSTD_OPT_NUM) | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */ break; /* drop, to preserve bt consistency (miss a little bit of compression) */ - } - } + } } if (match[matchLength] < ip[matchLength]) { /* match smaller than current */ @@ -723,12 +774,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( *smallerPtr = *largerPtr = 0; + assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. 
*/ if (dictMode == ZSTD_dictMatchState && nbCompares) { size_t const dmsH = ZSTD_hashPtr(ip, dmsHashLog, mls); U32 dictMatchIndex = dms->hashTable[dmsH]; const U32* const dmsBt = dms->chainTable; commonLengthSmaller = commonLengthLarger = 0; - while (nbCompares-- && (dictMatchIndex > dmsLowLimit)) { + for (; nbCompares && (dictMatchIndex > dmsLowLimit); --nbCompares) { const U32* const nextPtr = dmsBt + 2*(dictMatchIndex & dmsBtMask); size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ const BYTE* match = dmsBase + dictMatchIndex; @@ -738,19 +790,18 @@ U32 ZSTD_insertBtAndGetAllMatches ( if (matchLength > bestLength) { matchIndex = dictMatchIndex + dmsIndexDelta; - DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", - (U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE); + DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", + (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; bestLength = matchLength; - matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE; + matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); matches[mnum].len = (U32)matchLength; mnum++; if ( (matchLength > ZSTD_OPT_NUM) | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { break; /* drop, to guarantee consistency (miss a little bit of compression) */ - } - } + } } if (dictMatchIndex <= dmsBtLow) { break; } /* beyond tree size, stop the search */ if (match[matchLength] < ip[matchLength]) { @@ -760,76 +811,246 @@ U32 ZSTD_insertBtAndGetAllMatches ( /* match is larger than current */ commonLengthLarger = matchLength; dictMatchIndex = nextPtr[0]; - } - } - } + } } } /* if (dictMode == ZSTD_dictMatchState) */ - assert(matchEndIdx > current+8); + assert(matchEndIdx > curr+8); ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive 
patterns */ return mnum; } +typedef U32 (*ZSTD_getAllMatchesFn)( + ZSTD_match_t*, + ZSTD_MatchState_t*, + U32*, + const BYTE*, + const BYTE*, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, + U32 const lengthToBeat); -FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches ( - ZSTD_match_t* matches, /* store result (match found, increasing size) in this table */ - ZSTD_matchState_t* ms, - U32* nextToUpdate3, - const BYTE* ip, const BYTE* const iHighLimit, const ZSTD_dictMode_e dictMode, - const U32 rep[ZSTD_REP_NUM], - U32 const ll0, - U32 const lengthToBeat) +FORCE_INLINE_TEMPLATE +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, + ZSTD_MatchState_t* ms, + U32* nextToUpdate3, + const BYTE* ip, + const BYTE* const iHighLimit, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, + U32 const lengthToBeat, + const ZSTD_dictMode_e dictMode, + const U32 mls) { - const ZSTD_compressionParameters* const cParams = &ms->cParams; - U32 const matchLengthSearch = cParams->minMatch; - DEBUGLOG(8, "ZSTD_BtGetAllMatches"); - if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ - ZSTD_updateTree_internal(ms, ip, iHighLimit, matchLengthSearch, dictMode); - switch(matchLengthSearch) - { - case 3 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 3); - default : - case 4 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 4); - case 5 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 5); - case 7 : - case 6 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 6); - } + assert(BOUNDED(3, ms->cParams.minMatch, 6) == mls); + DEBUGLOG(8, "ZSTD_BtGetAllMatches(dictMode=%d, mls=%u)", (int)dictMode, mls); + if (ip < ms->window.base + ms->nextToUpdate) + return 0; /* skipped area */ 
+ ZSTD_updateTree_internal(ms, ip, iHighLimit, mls, dictMode); + return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, mls); } +#define ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls) ZSTD_btGetAllMatches_##dictMode##_##mls + +#define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \ + static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \ + ZSTD_match_t* matches, \ + ZSTD_MatchState_t* ms, \ + U32* nextToUpdate3, \ + const BYTE* ip, \ + const BYTE* const iHighLimit, \ + const U32 rep[ZSTD_REP_NUM], \ + U32 const ll0, \ + U32 const lengthToBeat) \ + { \ + return ZSTD_btGetAllMatches_internal( \ + matches, ms, nextToUpdate3, ip, iHighLimit, \ + rep, ll0, lengthToBeat, ZSTD_##dictMode, mls); \ + } -/*-******************************* -* Optimal parser -*********************************/ -typedef struct repcodes_s { - U32 rep[3]; -} repcodes_t; +#define GEN_ZSTD_BT_GET_ALL_MATCHES(dictMode) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 3) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 4) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 5) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 6) + +GEN_ZSTD_BT_GET_ALL_MATCHES(noDict) +GEN_ZSTD_BT_GET_ALL_MATCHES(extDict) +GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState) + +#define ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMode) \ + { \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 3), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 4), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 5), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 6) \ + } -static repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0) +static ZSTD_getAllMatchesFn +ZSTD_selectBtGetAllMatches(ZSTD_MatchState_t const* ms, ZSTD_dictMode_e const dictMode) +{ + ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = { + ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict), + ZSTD_BT_GET_ALL_MATCHES_ARRAY(extDict), + ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMatchState) + }; + U32 const mls = BOUNDED(3, ms->cParams.minMatch, 6); + assert((U32)dictMode < 3); + assert(mls - 3 
< 4); + return getAllMatchesFns[(int)dictMode][mls - 3]; +} + +/************************* +* LDM helper functions * +*************************/ + +/* Struct containing info needed to make decision about ldm inclusion */ +typedef struct { + RawSeqStore_t seqStore; /* External match candidates store for this block */ + U32 startPosInBlock; /* Start position of the current match candidate */ + U32 endPosInBlock; /* End position of the current match candidate */ + U32 offset; /* Offset of the match candidate */ +} ZSTD_optLdm_t; + +/* ZSTD_optLdm_skipRawSeqStoreBytes(): + * Moves forward in @rawSeqStore by @nbBytes, + * which will update the fields 'pos' and 'posInSequence'. + */ +static void ZSTD_optLdm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes) { - repcodes_t newReps; - if (offset >= ZSTD_REP_NUM) { /* full offset */ - newReps.rep[2] = rep[1]; - newReps.rep[1] = rep[0]; - newReps.rep[0] = offset - ZSTD_REP_MOVE; - } else { /* repcode */ - U32 const repCode = offset + ll0; - if (repCode > 0) { /* note : if repCode==0, no change */ - U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; - newReps.rep[2] = (repCode >= 2) ? rep[1] : rep[2]; - newReps.rep[1] = rep[0]; - newReps.rep[0] = currentOffset; - } else { /* repCode == 0 */ - memcpy(&newReps, rep, sizeof(newReps)); + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { + rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos]; + if (currPos >= currSeq.litLength + currSeq.matchLength) { + currPos -= currSeq.litLength + currSeq.matchLength; + rawSeqStore->pos++; + } else { + rawSeqStore->posInSequence = currPos; + break; } } - return newReps; + if (currPos == 0 || rawSeqStore->pos == rawSeqStore->size) { + rawSeqStore->posInSequence = 0; + } } +/* ZSTD_opt_getNextMatchAndUpdateSeqStore(): + * Calculates the beginning and end of the next match in the current block. 
+ * Updates 'pos' and 'posInSequence' of the ldmSeqStore. + */ +static void +ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock, + U32 blockBytesRemaining) +{ + rawSeq currSeq; + U32 currBlockEndPos; + U32 literalsBytesRemaining; + U32 matchBytesRemaining; + + /* Setting match end position to MAX to ensure we never use an LDM during this block */ + if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) { + optLdm->startPosInBlock = UINT_MAX; + optLdm->endPosInBlock = UINT_MAX; + return; + } + /* Calculate appropriate bytes left in matchLength and litLength + * after adjusting based on ldmSeqStore->posInSequence */ + currSeq = optLdm->seqStore.seq[optLdm->seqStore.pos]; + assert(optLdm->seqStore.posInSequence <= currSeq.litLength + currSeq.matchLength); + currBlockEndPos = currPosInBlock + blockBytesRemaining; + literalsBytesRemaining = (optLdm->seqStore.posInSequence < currSeq.litLength) ? + currSeq.litLength - (U32)optLdm->seqStore.posInSequence : + 0; + matchBytesRemaining = (literalsBytesRemaining == 0) ? + currSeq.matchLength - ((U32)optLdm->seqStore.posInSequence - currSeq.litLength) : + currSeq.matchLength; + + /* If there are more literal bytes than bytes remaining in block, no ldm is possible */ + if (literalsBytesRemaining >= blockBytesRemaining) { + optLdm->startPosInBlock = UINT_MAX; + optLdm->endPosInBlock = UINT_MAX; + ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, blockBytesRemaining); + return; + } + + /* Matches may be < minMatch by this process. 
In that case, we will reject them + when we are deciding whether or not to add the ldm */ + optLdm->startPosInBlock = currPosInBlock + literalsBytesRemaining; + optLdm->endPosInBlock = optLdm->startPosInBlock + matchBytesRemaining; + optLdm->offset = currSeq.offset; + + if (optLdm->endPosInBlock > currBlockEndPos) { + /* Match ends after the block ends, we can't use the whole match */ + optLdm->endPosInBlock = currBlockEndPos; + ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, currBlockEndPos - currPosInBlock); + } else { + /* Consume nb of bytes equal to size of sequence left */ + ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, literalsBytesRemaining + matchBytesRemaining); + } +} -static U32 ZSTD_totalLen(ZSTD_optimal_t sol) +/* ZSTD_optLdm_maybeAddMatch(): + * Adds a match if it's long enough, + * based on it's 'matchStartPosInBlock' and 'matchEndPosInBlock', + * into 'matches'. Maintains the correct ordering of 'matches'. + */ +static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + const ZSTD_optLdm_t* optLdm, U32 currPosInBlock, + U32 minMatch) { - return sol.litlen + sol.mlen; + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; + /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ + if (currPosInBlock < optLdm->startPosInBlock + || currPosInBlock >= optLdm->endPosInBlock + || candidateMatchLength < minMatch) { + return; + } + + if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { + U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); + DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", + candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = 
candidateMatchLength; + matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } } +/* ZSTD_optLdm_processMatchCandidate(): + * Wrapper function to update ldm seq store and call ldm functions as necessary. + */ +static void +ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + ZSTD_match_t* matches, U32* nbMatches, + U32 currPosInBlock, U32 remainingBytes, + U32 minMatch) +{ + if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) { + return; + } + + if (currPosInBlock >= optLdm->endPosInBlock) { + if (currPosInBlock > optLdm->endPosInBlock) { + /* The position at which ZSTD_optLdm_processMatchCandidate() is called is not necessarily + * at the end of a match from the ldm seq store, and will often be some bytes + * over beyond matchEndPosInBlock. As such, we need to correct for these "overshoots" + */ + U32 const posOvershoot = currPosInBlock - optLdm->endPosInBlock; + ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, posOvershoot); + } + ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes); + } + ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock, minMatch); +} + + +/*-******************************* +* Optimal parser +*********************************/ + #if 0 /* debug */ static void @@ -839,7 +1060,7 @@ listStats(const U32* table, int lastEltID) int enb; for (enb=0; enb < nbElts; enb++) { (void)table; - //RAWLOG(2, "%3i:%3i, ", enb, table[enb]); + /* RAWLOG(2, "%3i:%3i, ", enb, table[enb]); */ RAWLOG(2, "%4i,", table[enb]); } RAWLOG(2, " \n"); @@ -847,9 +1068,15 @@ listStats(const U32* table, int lastEltID) #endif -FORCE_INLINE_TEMPLATE size_t -ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, - seqStore_t* seqStore, +#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) +#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) +#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1)) + +FORCE_INLINE_TEMPLATE 
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +size_t +ZSTD_compressBlock_opt_generic(ZSTD_MatchState_t* ms, + SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize, const int optLevel, @@ -865,13 +1092,22 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, const BYTE* const prefixStart = base + ms->window.dictLimit; const ZSTD_compressionParameters* const cParams = &ms->cParams; + ZSTD_getAllMatchesFn getAllMatches = ZSTD_selectBtGetAllMatches(ms, dictMode); + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); U32 const minMatch = (cParams->minMatch == 3) ? 3 : 4; U32 nextToUpdate3 = ms->nextToUpdate; ZSTD_optimal_t* const opt = optStatePtr->priceTable; ZSTD_match_t* const matches = optStatePtr->matchTable; - ZSTD_optimal_t lastSequence; + ZSTD_optimal_t lastStretch; + ZSTD_optLdm_t optLdm; + + ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t)); + + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); /* init */ DEBUGLOG(5, "ZSTD_compressBlock_opt_generic: current=%u, prefix=%u, nextToUpdate=%u", @@ -887,88 +1123,144 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, /* find first match */ { U32 const litlen = (U32)(ip - anchor); U32 const ll0 = !litlen; - U32 const nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, ip, iend, dictMode, rep, ll0, minMatch); - if (!nbMatches) { ip++; continue; } + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, + (U32)(ip-istart), (U32)(iend-ip), + minMatch); + if (!nbMatches) { + DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); + ip++; + continue; + } + + /* Match found: let's store this solution, and eventually find more candidates. 
+ * During this forward pass, @opt is used to store stretches, + * defined as "a match followed by N literals". + * Note how this is different from a Sequence, which is "N literals followed by a match". + * Storing stretches allows us to store different match predecessors + * for each literal position part of a literals run. */ /* initialize opt[0] */ - { U32 i ; for (i=0; i immediate encoding */ { U32 const maxML = matches[nbMatches-1].len; - U32 const maxOffset = matches[nbMatches-1].off; - DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", - nbMatches, maxML, maxOffset, (U32)(ip-prefixStart)); + U32 const maxOffBase = matches[nbMatches-1].off; + DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", + nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); if (maxML > sufficient_len) { - lastSequence.litlen = litlen; - lastSequence.mlen = maxML; - lastSequence.off = maxOffset; - DEBUGLOG(6, "large match (%u>%u), immediate encoding", + lastStretch.litlen = 0; + lastStretch.mlen = maxML; + lastStretch.off = maxOffBase; + DEBUGLOG(6, "large match (%u>%u) => immediate encoding", maxML, sufficient_len); cur = 0; - last_pos = ZSTD_totalLen(lastSequence); + last_pos = maxML; goto _shortestPath; } } /* set prices for first matches starting position == 0 */ - { U32 const literalsPrice = opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); - U32 pos; + assert(opt[0].price >= 0); + { U32 pos; U32 matchNb; for (pos = 1; pos < minMatch; pos++) { - opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ + opt[pos].price = ZSTD_MAX_PRICE; + opt[pos].mlen = 0; + opt[pos].litlen = litlen + pos; } for (matchNb = 0; matchNb < nbMatches; matchNb++) { - U32 const offset = matches[matchNb].off; + U32 const offBase = matches[matchNb].off; U32 const end = matches[matchNb].len; - repcodes_t const repHistory = ZSTD_updateRep(rep, offset, ll0); for ( ; 
pos <= end ; pos++ ) { - U32 const matchPrice = ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel); - U32 const sequencePrice = literalsPrice + matchPrice; + int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); + int const sequencePrice = opt[0].price + matchPrice; DEBUGLOG(7, "rPos:%u => set initial price : %.2f", pos, ZSTD_fCost(sequencePrice)); opt[pos].mlen = pos; - opt[pos].off = offset; - opt[pos].litlen = litlen; - opt[pos].price = sequencePrice; - ZSTD_STATIC_ASSERT(sizeof(opt[pos].rep) == sizeof(repHistory)); - memcpy(opt[pos].rep, &repHistory, sizeof(repHistory)); - } } + opt[pos].off = offBase; + opt[pos].litlen = 0; /* end of match */ + opt[pos].price = sequencePrice + LL_PRICE(0); + } + } last_pos = pos-1; + opt[pos].price = ZSTD_MAX_PRICE; } } /* check further positions */ for (cur = 1; cur <= last_pos; cur++) { const BYTE* const inr = ip + cur; - assert(cur < ZSTD_OPT_NUM); - DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) + assert(cur <= ZSTD_OPT_NUM); + DEBUGLOG(7, "cPos:%i==rPos:%u", (int)(inr-istart), cur); /* Fix current position with one literal if cheaper */ - { U32 const litlen = (opt[cur-1].mlen == 0) ? 
opt[cur-1].litlen + 1 : 1; + { U32 const litlen = opt[cur-1].litlen + 1; int const price = opt[cur-1].price - + ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) - + ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) - - ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); + + LIT_PRICE(ip+cur-1) + + LL_INCPRICE(litlen); assert(price < 1000000000); /* overflow check */ if (price <= opt[cur].price) { - DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", - inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + ZSTD_optimal_t const prevMatch = opt[cur]; + DEBUGLOG(7, "cPos:%i==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", + (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); - opt[cur].mlen = 0; - opt[cur].off = 0; + opt[cur] = opt[cur-1]; opt[cur].litlen = litlen; opt[cur].price = price; - memcpy(opt[cur].rep, opt[cur-1].rep, sizeof(opt[cur].rep)); + if ( (optLevel >= 1) /* additional check only for higher modes */ + && (prevMatch.litlen == 0) /* replace a match */ + && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */ + && LIKELY(ip + cur < iend) + ) { + /* check next position, in case it would be cheaper */ + int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1); + int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1); + DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f", + cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals)); + if ( (with1literal < withMoreLiterals) + && (with1literal < opt[cur+1].price) ) { + /* update offset history - before it disappears */ + U32 const prev = cur - prevMatch.mlen; + Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); + assert(cur >= prevMatch.mlen); + DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", + 
ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), + newReps.rep[0], newReps.rep[1], newReps.rep[2] ); + opt[cur+1] = prevMatch; /* mlen & offbase */ + ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(Repcodes_t)); + opt[cur+1].litlen = 1; + opt[cur+1].price = with1literal; + if (last_pos < cur+1) last_pos = cur+1; + } + } } else { - DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", - inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), - opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); + DEBUGLOG(7, "cPos:%i==rPos:%u : literal would cost more (%.2f>%.2f)", + (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); } } + /* Offset history is not updated during match comparison. + * Do it here, now that the match is selected and confirmed. + */ + ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(Repcodes_t)); + assert(cur >= opt[cur].mlen); + if (opt[cur].litlen == 0) { + /* just finished a match => alter offset history */ + U32 const prev = cur - opt[cur].mlen; + Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); + ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(Repcodes_t)); + } + /* last match must start at a minimum distance of 8 from oend */ if (inr > ilimit) continue; @@ -976,105 +1268,156 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, if ( (optLevel==0) /*static_test*/ && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { - DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); + DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1); continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ } - { U32 const ll0 = (opt[cur].mlen != 0); - U32 const litlen = (opt[cur].mlen == 0) ? 
opt[cur].litlen : 0; - U32 const previousPrice = opt[cur].price; - U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); - U32 const nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, inr, iend, dictMode, opt[cur].rep, ll0, minMatch); + assert(opt[cur].price >= 0); + { U32 const ll0 = (opt[cur].litlen == 0); + int const previousPrice = opt[cur].price; + int const basePrice = previousPrice + LL_PRICE(0); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); U32 matchNb; + + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, + (U32)(inr-istart), (U32)(iend-inr), + minMatch); + if (!nbMatches) { DEBUGLOG(7, "rPos:%u : no match found", cur); continue; } - { U32 const maxML = matches[nbMatches-1].len; - DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", - inr-istart, cur, nbMatches, maxML); - - if ( (maxML > sufficient_len) - || (cur + maxML >= ZSTD_OPT_NUM) ) { - lastSequence.mlen = maxML; - lastSequence.off = matches[nbMatches-1].off; - lastSequence.litlen = litlen; - cur -= (opt[cur].mlen==0) ? 
opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ - last_pos = cur + ZSTD_totalLen(lastSequence); - if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ + { U32 const longestML = matches[nbMatches-1].len; + DEBUGLOG(7, "cPos:%i==rPos:%u, found %u matches, of longest ML=%u", + (int)(inr-istart), cur, nbMatches, longestML); + + if ( (longestML > sufficient_len) + || (cur + longestML >= ZSTD_OPT_NUM) + || (ip + cur + longestML >= iend) ) { + lastStretch.mlen = longestML; + lastStretch.off = matches[nbMatches-1].off; + lastStretch.litlen = 0; + last_pos = cur + longestML; goto _shortestPath; } } /* set prices using matches found at position == cur */ for (matchNb = 0; matchNb < nbMatches; matchNb++) { U32 const offset = matches[matchNb].off; - repcodes_t const repHistory = ZSTD_updateRep(opt[cur].rep, offset, ll0); U32 const lastML = matches[matchNb].len; U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; U32 mlen; - DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", - matchNb, matches[matchNb].off, lastML, litlen); + DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", + matchNb, matches[matchNb].off, lastML, opt[cur].litlen); for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ U32 const pos = cur + mlen; - int const price = basePrice + ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); if ((pos > last_pos) || (price < opt[pos].price)) { DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); - while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ + while (last_pos < pos) { + /* fill empty positions, for future comparisons */ + last_pos++; + opt[last_pos].price = ZSTD_MAX_PRICE; + opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */ + } opt[pos].mlen = mlen; opt[pos].off = offset; - opt[pos].litlen = litlen; + opt[pos].litlen = 0; opt[pos].price = price; - ZSTD_STATIC_ASSERT(sizeof(opt[pos].rep) == sizeof(repHistory)); - memcpy(opt[pos].rep, &repHistory, sizeof(repHistory)); } else { DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ } } } } + opt[last_pos+1].price = ZSTD_MAX_PRICE; } /* for (cur = 1; cur <= last_pos; cur++) */ - lastSequence = opt[last_pos]; - cur = last_pos > ZSTD_totalLen(lastSequence) ? 
last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ - assert(cur < ZSTD_OPT_NUM); /* control overflow*/ + lastStretch = opt[last_pos]; + assert(cur >= lastStretch.mlen); + cur = last_pos - lastStretch.mlen; _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ assert(opt[0].mlen == 0); + assert(last_pos >= lastStretch.mlen); + assert(cur == last_pos - lastStretch.mlen); + + if (lastStretch.mlen==0) { + /* no solution : all matches have been converted into literals */ + assert(lastStretch.litlen == (ip - anchor) + last_pos); + ip += last_pos; + continue; + } + assert(lastStretch.off > 0); + + /* Update offset history */ + if (lastStretch.litlen == 0) { + /* finishing on a match : update offset history */ + Repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); + ZSTD_memcpy(rep, &reps, sizeof(Repcodes_t)); + } else { + ZSTD_memcpy(rep, lastStretch.rep, sizeof(Repcodes_t)); + assert(cur >= lastStretch.litlen); + cur -= lastStretch.litlen; + } - { U32 const storeEnd = cur + 1; + /* Let's write the shortest path solution. + * It is stored in @opt in reverse order, + * starting from @storeEnd (==cur+2), + * effectively partially @opt overwriting. 
+ * Content is changed too: + * - So far, @opt stored stretches, aka a match followed by literals + * - Now, it will store sequences, aka literals followed by a match + */ + { U32 const storeEnd = cur + 2; U32 storeStart = storeEnd; - U32 seqPos = cur; + U32 stretchPos = cur; DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", last_pos, cur); (void)last_pos; - assert(storeEnd < ZSTD_OPT_NUM); - DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", - storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); - opt[storeEnd] = lastSequence; - while (seqPos > 0) { - U32 const backDist = ZSTD_totalLen(opt[seqPos]); + assert(storeEnd < ZSTD_OPT_SIZE); + DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", + storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off); + if (lastStretch.litlen > 0) { + /* last "sequence" is unfinished: just a bunch of literals */ + opt[storeEnd].litlen = lastStretch.litlen; + opt[storeEnd].mlen = 0; + storeStart = storeEnd-1; + opt[storeStart] = lastStretch; + } { + opt[storeEnd] = lastStretch; /* note: litlen will be fixed */ + storeStart = storeEnd; + } + while (1) { + ZSTD_optimal_t nextStretch = opt[stretchPos]; + opt[storeStart].litlen = nextStretch.litlen; + DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)", + opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off); + if (nextStretch.mlen == 0) { + /* reaching beginning of segment */ + break; + } storeStart--; - DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", - seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); - opt[storeStart] = opt[seqPos]; - seqPos = (seqPos > backDist) ? 
seqPos - backDist : 0; + opt[storeStart] = nextStretch; /* note: litlen will be fixed */ + assert(nextStretch.litlen + nextStretch.mlen <= stretchPos); + stretchPos -= nextStretch.litlen + nextStretch.mlen; } /* save sequences */ - DEBUGLOG(6, "sending selected sequences into seqStore") + DEBUGLOG(6, "sending selected sequences into seqStore"); { U32 storePos; for (storePos=storeStart; storePos <= storeEnd; storePos++) { U32 const llen = opt[storePos].litlen; U32 const mlen = opt[storePos].mlen; - U32 const offCode = opt[storePos].off; + U32 const offBase = opt[storePos].off; U32 const advance = llen + mlen; - DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", - anchor - istart, (unsigned)llen, (unsigned)mlen); + DEBUGLOG(6, "considering seq starting at %i, llen=%u, mlen=%u", + (int)(anchor - istart), (unsigned)llen, (unsigned)mlen); if (mlen==0) { /* only literals => must be last "sequence", actually starting a new stream of sequences */ assert(storePos == storeEnd); /* must be last sequence */ @@ -1082,81 +1425,70 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, continue; /* will finish */ } - /* repcodes update : like ZSTD_updateRep(), but update in place */ - if (offCode >= ZSTD_REP_NUM) { /* full offset */ - rep[2] = rep[1]; - rep[1] = rep[0]; - rep[0] = offCode - ZSTD_REP_MOVE; - } else { /* repcode */ - U32 const repCode = offCode + (llen==0); - if (repCode) { /* note : if repCode==0, no change */ - U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? 
(rep[0] - 1) : rep[repCode]; - if (repCode >= 2) rep[2] = rep[1]; - rep[1] = rep[0]; - rep[0] = currentOffset; - } } - assert(anchor + llen <= iend); - ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); - ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen-MINMATCH); + ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); + ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); anchor += advance; ip = anchor; } } + DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]); + + /* update all costs */ ZSTD_setBasePrices(optStatePtr, optLevel); } - } /* while (ip < ilimit) */ /* Return the last literals size */ return (size_t)(iend - anchor); } +#endif /* build exclusions */ +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR +static size_t ZSTD_compressBlock_opt0( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); +} +#endif + +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR +static size_t ZSTD_compressBlock_opt2( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); +} +#endif +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_btopt( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { DEBUGLOG(5, "ZSTD_compressBlock_btopt"); - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_noDict); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); } +#endif -/* used in 2-pass strategy */ -static U32 ZSTD_upscaleStat(unsigned* table, U32 lastEltIndex, int bonus) -{ - 
U32 s, sum=0; - assert(ZSTD_FREQ_DIV+bonus >= 0); - for (s=0; slitSum = ZSTD_upscaleStat(optPtr->litFreq, MaxLit, 0); - optPtr->litLengthSum = ZSTD_upscaleStat(optPtr->litLengthFreq, MaxLL, 0); - optPtr->matchLengthSum = ZSTD_upscaleStat(optPtr->matchLengthFreq, MaxML, 0); - optPtr->offCodeSum = ZSTD_upscaleStat(optPtr->offCodeFreq, MaxOff, 0); -} +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR /* ZSTD_initStats_ultra(): * make a first compression pass, just to seed stats with more accurate starting values. * only works on first block, with no dictionary and no ldm. - * this function cannot error, hence its contract must be respected. + * this function cannot error out, its narrow contract must be respected. */ -static void -ZSTD_initStats_ultra(ZSTD_matchState_t* ms, - seqStore_t* seqStore, - U32 rep[ZSTD_REP_NUM], - const void* src, size_t srcSize) +static +ZSTD_ALLOW_POINTER_OVERFLOW_ATTR +void ZSTD_initStats_ultra(ZSTD_MatchState_t* ms, + SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) { U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ - memcpy(tmpRep, rep, sizeof(tmpRep)); + ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); DEBUGLOG(4, "ZSTD_initStats_ultra (srcSize=%zu)", srcSize); assert(ms->opt.litLengthSum == 0); /* first block */ @@ -1164,38 +1496,36 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, assert(ms->window.dictLimit == ms->window.lowLimit); /* no dictionary */ assert(ms->window.dictLimit - ms->nextToUpdate <= 1); /* no prefix (note: intentional overflow, defined as 2-complement) */ - ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); /* generate stats into ms->opt*/ + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ - /* invalidate first scan from history */ + /* invalidate first scan from history, only keep entropy stats */ ZSTD_resetSeqStore(seqStore); ms->window.base -= srcSize; ms->window.dictLimit += 
(U32)srcSize; ms->window.lowLimit = ms->window.dictLimit; ms->nextToUpdate = ms->window.dictLimit; - /* re-inforce weight of collected statistics */ - ZSTD_upscaleStats(&ms->opt); } size_t ZSTD_compressBlock_btultra( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize); - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); } size_t ZSTD_compressBlock_btultra2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - U32 const current = (U32)((const BYTE*)src - ms->window.base); + U32 const curr = (U32)((const BYTE*)src - ms->window.base); DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); - /* 2-pass strategy: + /* 2-passes strategy: * this strategy makes a first pass over first block to collect statistics - * and seed next round's statistics with it. - * After 1st pass, function forgets everything, and starts a new block. + * in order to seed next round's statistics with it. + * After 1st pass, function forgets history, and starts a new block. * Consequently, this can only work if no data has been previously loaded in tables, * aka, no dictionary, no prefix, no ldm preprocessing. 
* The compression ratio gain is generally small (~0.5% on first block), @@ -1204,43 +1534,50 @@ size_t ZSTD_compressBlock_btultra2( if ( (ms->opt.litLengthSum==0) /* first block */ && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ - && (current == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ - && (srcSize > ZSTD_PREDEF_THRESHOLD) + && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ + && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ ) { ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); } - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); } +#endif +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_btopt_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); } -size_t ZSTD_compressBlock_btultra_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); } +#endif -size_t ZSTD_compressBlock_btopt_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR +size_t 
ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_extDict); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); } size_t ZSTD_compressBlock_btultra_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_extDict); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); } +#endif /* note : no btultra2 variant for extDict nor dictMatchState, * because btultra2 is not meant to work with dictionaries * and is only specific for the first block (no prefix) */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_opt.h b/vendor/github.com/DataDog/zstd/zstd_opt.h index 094f747..6ecc6e1 100644 --- a/vendor/github.com/DataDog/zstd/zstd_opt.h +++ b/vendor/github.com/DataDog/zstd/zstd_opt.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -11,46 +12,64 @@ #ifndef ZSTD_OPT_H #define ZSTD_OPT_H -#if defined (__cplusplus) -extern "C" { -#endif - #include "zstd_compress_internal.h" +#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ + || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) /* used in ZSTD_loadDictionaryContent() */ -void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); +void ZSTD_updateTree(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend); +#endif +#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR size_t ZSTD_compressBlock_btopt( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_btultra( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -size_t ZSTD_compressBlock_btultra2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt +#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState +#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict +#else +#define ZSTD_COMPRESSBLOCK_BTOPT NULL +#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL +#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL +#endif -size_t ZSTD_compressBlock_btopt_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR +size_t ZSTD_compressBlock_btultra( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 
rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_btultra_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); - -size_t ZSTD_compressBlock_btopt_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); size_t ZSTD_compressBlock_btultra_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); /* note : no btultra2 variant for extDict nor dictMatchState, * because btultra2 is not meant to work with dictionaries * and is only specific for the first block (no prefix) */ +size_t ZSTD_compressBlock_btultra2( + ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); -#if defined (__cplusplus) -} +#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra +#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState +#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict +#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2 +#else +#define ZSTD_COMPRESSBLOCK_BTULTRA NULL +#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL +#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL +#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL #endif #endif /* ZSTD_OPT_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_preSplit.c b/vendor/github.com/DataDog/zstd/zstd_preSplit.c new file mode 100644 index 0000000..0b4b1ee --- /dev/null +++ b/vendor/github.com/DataDog/zstd/zstd_preSplit.c @@ -0,0 +1,241 @@ +#ifndef USE_EXTERNAL_ZSTD +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include "compiler.h" /* ZSTD_ALIGNOF */ +#include "mem.h" /* S64 */ +#include "zstd_deps.h" /* ZSTD_memset */ +#include "zstd_internal.h" /* ZSTD_STATIC_ASSERT */ +#include "hist.h" /* HIST_add */ +#include "zstd_preSplit.h" + + +#define BLOCKSIZE_MIN 3500 +#define THRESHOLD_PENALTY_RATE 16 +#define THRESHOLD_BASE (THRESHOLD_PENALTY_RATE - 2) +#define THRESHOLD_PENALTY 3 + +#define HASHLENGTH 2 +#define HASHLOG_MAX 10 +#define HASHTABLESIZE (1 << HASHLOG_MAX) +#define HASHMASK (HASHTABLESIZE - 1) +#define KNUTH 0x9e3779b9 + +/* for hashLog > 8, hash 2 bytes. + * for hashLog == 8, just take the byte, no hashing. + * The speed of this method relies on compile-time constant propagation */ +FORCE_INLINE_TEMPLATE unsigned hash2(const void *p, unsigned hashLog) +{ + assert(hashLog >= 8); + if (hashLog == 8) return (U32)((const BYTE*)p)[0]; + assert(hashLog <= HASHLOG_MAX); + return (U32)(MEM_read16(p)) * KNUTH >> (32 - hashLog); +} + + +typedef struct { + unsigned events[HASHTABLESIZE]; + size_t nbEvents; +} Fingerprint; +typedef struct { + Fingerprint pastEvents; + Fingerprint newEvents; +} FPStats; + +static void initStats(FPStats* fpstats) +{ + ZSTD_memset(fpstats, 0, sizeof(FPStats)); +} + +FORCE_INLINE_TEMPLATE void +addEvents_generic(Fingerprint* fp, const void* src, size_t srcSize, size_t samplingRate, unsigned hashLog) +{ + const char* p = (const char*)src; + size_t limit = srcSize - HASHLENGTH + 1; + size_t n; + assert(srcSize >= HASHLENGTH); + for (n = 0; n < limit; n+=samplingRate) { + fp->events[hash2(p+n, hashLog)]++; + } + fp->nbEvents += limit/samplingRate; +} + +FORCE_INLINE_TEMPLATE void +recordFingerprint_generic(Fingerprint* fp, const void* src, 
size_t srcSize, size_t samplingRate, unsigned hashLog) +{ + ZSTD_memset(fp, 0, sizeof(unsigned) * ((size_t)1 << hashLog)); + fp->nbEvents = 0; + addEvents_generic(fp, src, srcSize, samplingRate, hashLog); +} + +typedef void (*RecordEvents_f)(Fingerprint* fp, const void* src, size_t srcSize); + +#define FP_RECORD(_rate) ZSTD_recordFingerprint_##_rate + +#define ZSTD_GEN_RECORD_FINGERPRINT(_rate, _hSize) \ + static void FP_RECORD(_rate)(Fingerprint* fp, const void* src, size_t srcSize) \ + { \ + recordFingerprint_generic(fp, src, srcSize, _rate, _hSize); \ + } + +ZSTD_GEN_RECORD_FINGERPRINT(1, 10) +ZSTD_GEN_RECORD_FINGERPRINT(5, 10) +ZSTD_GEN_RECORD_FINGERPRINT(11, 9) +ZSTD_GEN_RECORD_FINGERPRINT(43, 8) + + +static U64 abs64(S64 s64) { return (U64)((s64 < 0) ? -s64 : s64); } + +static U64 fpDistance(const Fingerprint* fp1, const Fingerprint* fp2, unsigned hashLog) +{ + U64 distance = 0; + size_t n; + assert(hashLog <= HASHLOG_MAX); + for (n = 0; n < ((size_t)1 << hashLog); n++) { + distance += + abs64((S64)fp1->events[n] * (S64)fp2->nbEvents - (S64)fp2->events[n] * (S64)fp1->nbEvents); + } + return distance; +} + +/* Compare newEvents with pastEvents + * return 1 when considered "too different" + */ +static int compareFingerprints(const Fingerprint* ref, + const Fingerprint* newfp, + int penalty, + unsigned hashLog) +{ + assert(ref->nbEvents > 0); + assert(newfp->nbEvents > 0); + { U64 p50 = (U64)ref->nbEvents * (U64)newfp->nbEvents; + U64 deviation = fpDistance(ref, newfp, hashLog); + U64 threshold = p50 * (U64)(THRESHOLD_BASE + penalty) / THRESHOLD_PENALTY_RATE; + return deviation >= threshold; + } +} + +static void mergeEvents(Fingerprint* acc, const Fingerprint* newfp) +{ + size_t n; + for (n = 0; n < HASHTABLESIZE; n++) { + acc->events[n] += newfp->events[n]; + } + acc->nbEvents += newfp->nbEvents; +} + +static void flushEvents(FPStats* fpstats) +{ + size_t n; + for (n = 0; n < HASHTABLESIZE; n++) { + fpstats->pastEvents.events[n] = fpstats->newEvents.events[n]; 
+ } + fpstats->pastEvents.nbEvents = fpstats->newEvents.nbEvents; + ZSTD_memset(&fpstats->newEvents, 0, sizeof(fpstats->newEvents)); +} + +static void removeEvents(Fingerprint* acc, const Fingerprint* slice) +{ + size_t n; + for (n = 0; n < HASHTABLESIZE; n++) { + assert(acc->events[n] >= slice->events[n]); + acc->events[n] -= slice->events[n]; + } + acc->nbEvents -= slice->nbEvents; +} + +#define CHUNKSIZE (8 << 10) +static size_t ZSTD_splitBlock_byChunks(const void* blockStart, size_t blockSize, + int level, + void* workspace, size_t wkspSize) +{ + static const RecordEvents_f records_fs[] = { + FP_RECORD(43), FP_RECORD(11), FP_RECORD(5), FP_RECORD(1) + }; + static const unsigned hashParams[] = { 8, 9, 10, 10 }; + const RecordEvents_f record_f = (assert(0<=level && level<=3), records_fs[level]); + FPStats* const fpstats = (FPStats*)workspace; + const char* p = (const char*)blockStart; + int penalty = THRESHOLD_PENALTY; + size_t pos = 0; + assert(blockSize == (128 << 10)); + assert(workspace != NULL); + assert((size_t)workspace % ZSTD_ALIGNOF(FPStats) == 0); + ZSTD_STATIC_ASSERT(ZSTD_SLIPBLOCK_WORKSPACESIZE >= sizeof(FPStats)); + assert(wkspSize >= sizeof(FPStats)); (void)wkspSize; + + initStats(fpstats); + record_f(&fpstats->pastEvents, p, CHUNKSIZE); + for (pos = CHUNKSIZE; pos <= blockSize - CHUNKSIZE; pos += CHUNKSIZE) { + record_f(&fpstats->newEvents, p + pos, CHUNKSIZE); + if (compareFingerprints(&fpstats->pastEvents, &fpstats->newEvents, penalty, hashParams[level])) { + return pos; + } else { + mergeEvents(&fpstats->pastEvents, &fpstats->newEvents); + if (penalty > 0) penalty--; + } + } + assert(pos == blockSize); + return blockSize; + (void)flushEvents; (void)removeEvents; +} + +/* ZSTD_splitBlock_fromBorders(): very fast strategy : + * compare fingerprint from beginning and end of the block, + * derive from their difference if it's preferable to split in the middle, + * repeat the process a second time, for finer grained decision. 
+ * 3 times did not brought improvements, so I stopped at 2. + * Benefits are good enough for a cheap heuristic. + * More accurate splitting saves more, but speed impact is also more perceptible. + * For better accuracy, use more elaborate variant *_byChunks. + */ +static size_t ZSTD_splitBlock_fromBorders(const void* blockStart, size_t blockSize, + void* workspace, size_t wkspSize) +{ +#define SEGMENT_SIZE 512 + FPStats* const fpstats = (FPStats*)workspace; + Fingerprint* middleEvents = (Fingerprint*)(void*)((char*)workspace + 512 * sizeof(unsigned)); + assert(blockSize == (128 << 10)); + assert(workspace != NULL); + assert((size_t)workspace % ZSTD_ALIGNOF(FPStats) == 0); + ZSTD_STATIC_ASSERT(ZSTD_SLIPBLOCK_WORKSPACESIZE >= sizeof(FPStats)); + assert(wkspSize >= sizeof(FPStats)); (void)wkspSize; + + initStats(fpstats); + HIST_add(fpstats->pastEvents.events, blockStart, SEGMENT_SIZE); + HIST_add(fpstats->newEvents.events, (const char*)blockStart + blockSize - SEGMENT_SIZE, SEGMENT_SIZE); + fpstats->pastEvents.nbEvents = fpstats->newEvents.nbEvents = SEGMENT_SIZE; + if (!compareFingerprints(&fpstats->pastEvents, &fpstats->newEvents, 0, 8)) + return blockSize; + + HIST_add(middleEvents->events, (const char*)blockStart + blockSize/2 - SEGMENT_SIZE/2, SEGMENT_SIZE); + middleEvents->nbEvents = SEGMENT_SIZE; + { U64 const distFromBegin = fpDistance(&fpstats->pastEvents, middleEvents, 8); + U64 const distFromEnd = fpDistance(&fpstats->newEvents, middleEvents, 8); + U64 const minDistance = SEGMENT_SIZE * SEGMENT_SIZE / 3; + if (abs64((S64)distFromBegin - (S64)distFromEnd) < minDistance) + return 64 KB; + return (distFromBegin > distFromEnd) ? 
32 KB : 96 KB; + } +} + +size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize, + int level, + void* workspace, size_t wkspSize) +{ + DEBUGLOG(6, "ZSTD_splitBlock (level=%i)", level); + assert(0<=level && level<=4); + if (level == 0) + return ZSTD_splitBlock_fromBorders(blockStart, blockSize, workspace, wkspSize); + /* level >= 1*/ + return ZSTD_splitBlock_byChunks(blockStart, blockSize, level-1, workspace, wkspSize); +} + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_preSplit.h b/vendor/github.com/DataDog/zstd/zstd_preSplit.h new file mode 100644 index 0000000..b687916 --- /dev/null +++ b/vendor/github.com/DataDog/zstd/zstd_preSplit.h @@ -0,0 +1,36 @@ +#ifndef USE_EXTERNAL_ZSTD +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_PRESPLIT_H +#define ZSTD_PRESPLIT_H + +#include /* size_t */ + +#define ZSTD_SLIPBLOCK_WORKSPACESIZE 8208 + +/* ZSTD_splitBlock(): + * @level must be a value between 0 and 4. + * higher levels spend more energy to detect block boundaries. + * @workspace must be aligned for size_t. + * @wkspSize must be at least >= ZSTD_SLIPBLOCK_WORKSPACESIZE + * note: + * For the time being, this function only accepts full 128 KB blocks. + * Therefore, @blockSize must be == 128 KB. + * While this could be extended to smaller sizes in the future, + * it is not yet clear if this would be useful. TBD. 
+ */ +size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize, + int level, + void* workspace, size_t wkspSize); + +#endif /* ZSTD_PRESPLIT_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_stream.go b/vendor/github.com/DataDog/zstd/zstd_stream.go index 2df078c..714ecfe 100644 --- a/vendor/github.com/DataDog/zstd/zstd_stream.go +++ b/vendor/github.com/DataDog/zstd/zstd_stream.go @@ -1,8 +1,6 @@ package zstd /* -#define ZSTD_STATIC_LINKING_ONLY -#include "stdint.h" // for uintptr_t #include "zstd.h" typedef struct compressStream2_result_s { @@ -11,9 +9,10 @@ typedef struct compressStream2_result_s { size_t bytes_written; } compressStream2_result; -static void ZSTD_compressStream2_wrapper(compressStream2_result* result, ZSTD_CCtx* ctx, uintptr_t dst, size_t maxDstSize, const uintptr_t src, size_t srcSize) { - ZSTD_outBuffer outBuffer = { (void*)dst, maxDstSize, 0 }; - ZSTD_inBuffer inBuffer = { (void*)src, srcSize, 0 }; +static void ZSTD_compressStream2_wrapper(compressStream2_result* result, ZSTD_CCtx* ctx, + void* dst, size_t maxDstSize, const void* src, size_t srcSize) { + ZSTD_outBuffer outBuffer = { dst, maxDstSize, 0 }; + ZSTD_inBuffer inBuffer = { src, srcSize, 0 }; size_t retCode = ZSTD_compressStream2(ctx, &outBuffer, &inBuffer, ZSTD_e_continue); result->return_code = retCode; @@ -21,9 +20,21 @@ static void ZSTD_compressStream2_wrapper(compressStream2_result* result, ZSTD_CC result->bytes_written = outBuffer.pos; } -static void ZSTD_compressStream2_finish(compressStream2_result* result, ZSTD_CCtx* ctx, uintptr_t dst, size_t maxDstSize, const uintptr_t src, size_t srcSize) { - ZSTD_outBuffer outBuffer = { (void*)dst, maxDstSize, 0 }; - ZSTD_inBuffer inBuffer = { (void*)src, srcSize, 0 }; +static void ZSTD_compressStream2_flush(compressStream2_result* result, ZSTD_CCtx* ctx, + void* dst, size_t maxDstSize, const void* src, size_t srcSize) { + ZSTD_outBuffer outBuffer = { dst, maxDstSize, 0 }; + ZSTD_inBuffer 
inBuffer = { src, srcSize, 0 }; + size_t retCode = ZSTD_compressStream2(ctx, &outBuffer, &inBuffer, ZSTD_e_flush); + + result->return_code = retCode; + result->bytes_consumed = inBuffer.pos; + result->bytes_written = outBuffer.pos; +} + +static void ZSTD_compressStream2_finish(compressStream2_result* result, ZSTD_CCtx* ctx, + void* dst, size_t maxDstSize, const void* src, size_t srcSize) { + ZSTD_outBuffer outBuffer = { dst, maxDstSize, 0 }; + ZSTD_inBuffer inBuffer = { src, srcSize, 0 }; size_t retCode = ZSTD_compressStream2(ctx, &outBuffer, &inBuffer, ZSTD_e_end); result->return_code = retCode; @@ -38,9 +49,10 @@ typedef struct decompressStream2_result_s { size_t bytes_written; } decompressStream2_result; -static void ZSTD_decompressStream_wrapper(decompressStream2_result* result, ZSTD_DCtx* ctx, uintptr_t dst, size_t maxDstSize, const uintptr_t src, size_t srcSize) { - ZSTD_outBuffer outBuffer = { (void*)dst, maxDstSize, 0 }; - ZSTD_inBuffer inBuffer = { (void*)src, srcSize, 0 }; +static void ZSTD_decompressStream_wrapper(decompressStream2_result* result, ZSTD_DCtx* ctx, + void* dst, size_t maxDstSize, const void* src, size_t srcSize) { + ZSTD_outBuffer outBuffer = { dst, maxDstSize, 0 }; + ZSTD_inBuffer inBuffer = { src, srcSize, 0 }; size_t retCode = ZSTD_decompressStream(ctx, &outBuffer, &inBuffer); result->return_code = retCode; @@ -60,6 +72,7 @@ import ( var errShortRead = errors.New("short read") var errReaderClosed = errors.New("Reader is closed") +var ErrNoParallelSupport = errors.New("No parallel support") // Writer is an io.WriteCloser that zstd-compresses its input. 
type Writer struct { @@ -67,7 +80,6 @@ type Writer struct { ctx *C.ZSTD_CCtx dict []byte - srcBuffer []byte dstBuffer []byte firstError error underlyingWriter io.Writer @@ -125,7 +137,6 @@ func NewWriterLevelDict(w io.Writer, level int, dict []byte) *Writer { CompressionLevel: level, ctx: ctx, dict: dict, - srcBuffer: make([]byte, 0), dstBuffer: make([]byte, CompressBound(1024)), firstError: err, underlyingWriter: w, @@ -141,66 +152,96 @@ func (w *Writer) Write(p []byte) (int, error) { if len(p) == 0 { return 0, nil } + total := len(p) // Check if dstBuffer is enough w.dstBuffer = w.dstBuffer[0:cap(w.dstBuffer)] if len(w.dstBuffer) < CompressBound(len(p)) { w.dstBuffer = make([]byte, CompressBound(len(p))) } - // Do not do an extra memcopy if zstd ingest all input data - srcData := p - fastPath := len(w.srcBuffer) == 0 - if !fastPath { - w.srcBuffer = append(w.srcBuffer, p...) - srcData = w.srcBuffer - } - - srcPtr := C.uintptr_t(uintptr(0)) // Do not point anywhere, if src is empty - if len(srcData) > 0 { - srcPtr = C.uintptr_t(uintptr(unsafe.Pointer(&srcData[0]))) - } - - C.ZSTD_compressStream2_wrapper( - w.resultBuffer, - w.ctx, - C.uintptr_t(uintptr(unsafe.Pointer(&w.dstBuffer[0]))), - C.size_t(len(w.dstBuffer)), - srcPtr, - C.size_t(len(srcData)), - ) - runtime.KeepAlive(p) // Ensure p is kept until here so pointer doesn't disappear during C call - ret := int(w.resultBuffer.return_code) - if err := getError(ret); err != nil { - return 0, err - } + dstoff := 0 + consumed := 0 + for len(p) > 0 { + C.ZSTD_compressStream2_wrapper( + w.resultBuffer, + w.ctx, + unsafe.Pointer(&w.dstBuffer[dstoff]), + C.size_t(len(w.dstBuffer[dstoff:])), + unsafe.Pointer(&p[0]), + C.size_t(len(p)), + ) + ret := int(w.resultBuffer.return_code) + if err := getError(ret); err != nil { + // The stream is dead after this. 
+ w.firstError = err + return 0, err + } + p = p[w.resultBuffer.bytes_consumed:] + dstoff += int(w.resultBuffer.bytes_written) + consumed += int(w.resultBuffer.bytes_consumed) + if len(p) > 0 && dstoff == len(w.dstBuffer) { + // We have bytes remaining to compress and our output buffer + // filled up. This shouldn't happen since we calculated it + // in advance using CompressBound, but we need to handle it + // in case there was some miscalculation, or the internal + // stream buffer contained enough data from previous writes + // to overflow dstBuffer. (it's not clear from the docs + // whether this is possible) + // + // Allocate space for whatever we haven't compressed yet. + newbuf := make([]byte, len(w.dstBuffer)+CompressBound(total-consumed)) + copy(newbuf, w.dstBuffer) + w.dstBuffer = newbuf - consumed := int(w.resultBuffer.bytes_consumed) - if !fastPath { - w.srcBuffer = w.srcBuffer[consumed:] - } else { - remaining := len(p) - consumed - if remaining > 0 { - // We still have some non-consumed data, copy remaining data to srcBuffer - // Try to not reallocate w.srcBuffer if we already have enough space - if cap(w.srcBuffer) >= remaining { - w.srcBuffer = w.srcBuffer[0:remaining] - } else { - w.srcBuffer = make([]byte, remaining) - } - copy(w.srcBuffer, p[consumed:]) } } - written := int(w.resultBuffer.bytes_written) // Write to underlying buffer - _, err := w.underlyingWriter.Write(w.dstBuffer[:written]) + _, err := w.underlyingWriter.Write(w.dstBuffer[:dstoff]) // Same behaviour as zlib, we can't know how much data we wrote, only // if there was an error if err != nil { return 0, err } - return len(p), err + return total, err +} + +// Flush writes any unwritten data to the underlying io.Writer. 
+func (w *Writer) Flush() error { + if w.firstError != nil { + return w.firstError + } + + ret := 1 // So we loop at least once + for ret > 0 { + C.ZSTD_compressStream2_flush( + w.resultBuffer, + w.ctx, + unsafe.Pointer(&w.dstBuffer[0]), + C.size_t(len(w.dstBuffer)), + unsafe.Pointer(uintptr(0)), + C.size_t(0), + ) + ret = int(w.resultBuffer.return_code) + if err := getError(ret); err != nil { + return err + } + written := int(w.resultBuffer.bytes_written) + _, err := w.underlyingWriter.Write(w.dstBuffer[:written]) + if err != nil { + return err + } + + if ret > 0 { // We have a hint if we need to resize the dstBuffer + w.dstBuffer = w.dstBuffer[:cap(w.dstBuffer)] + if len(w.dstBuffer) < ret { + w.dstBuffer = make([]byte, ret) + } + } + } + + return nil } // Close closes the Writer, flushing any unwritten data to the underlying @@ -212,26 +253,24 @@ func (w *Writer) Close() error { ret := 1 // So we loop at least once for ret > 0 { - srcPtr := C.uintptr_t(uintptr(0)) // Do not point anywhere, if src is empty - if len(w.srcBuffer) > 0 { - srcPtr = C.uintptr_t(uintptr(unsafe.Pointer(&w.srcBuffer[0]))) - } - C.ZSTD_compressStream2_finish( w.resultBuffer, w.ctx, - C.uintptr_t(uintptr(unsafe.Pointer(&w.dstBuffer[0]))), + unsafe.Pointer(&w.dstBuffer[0]), C.size_t(len(w.dstBuffer)), - srcPtr, - C.size_t(len(w.srcBuffer)), + unsafe.Pointer(uintptr(0)), + C.size_t(0), ) ret = int(w.resultBuffer.return_code) if err := getError(ret); err != nil { return err } - w.srcBuffer = w.srcBuffer[w.resultBuffer.bytes_consumed:] written := int(w.resultBuffer.bytes_written) - w.underlyingWriter.Write(w.dstBuffer[:written]) + _, err := w.underlyingWriter.Write(w.dstBuffer[:written]) + if err != nil { + C.ZSTD_freeCStream(w.ctx) + return err + } if ret > 0 { // We have a hint if we need to resize the dstBuffer w.dstBuffer = w.dstBuffer[:cap(w.dstBuffer)] @@ -244,6 +283,28 @@ func (w *Writer) Close() error { return getError(int(C.ZSTD_freeCStream(w.ctx))) } +// Set the number of workers to 
run the compression in parallel using multiple threads +// If > 1, the Write() call will become asynchronous. This means data will be buffered until processed. +// If you call Write() too fast, you might incur a memory buffer up to as large as your input. +// Consider calling Flush() periodically if you need to compress a very large file that would not fit all in memory. +// By default only one worker is used. +func (w *Writer) SetNbWorkers(n int) error { + if w.firstError != nil { + return w.firstError + } + if err := getError(int(C.ZSTD_CCtx_setParameter(w.ctx, C.ZSTD_c_nbWorkers, C.int(n)))); err != nil { + w.firstError = err + // First error case, a shared library is used, and the library was compiled without parallel support + if err.Error() == "Unsupported parameter" { + return ErrNoParallelSupport + } else { + // This could happen if a very large number is passed in, and possibly zstd refuses to create as many threads, or the OS fails to do so + return err + } + } + return nil +} + +// cSize is the recommended size of reader.compressionBuffer. This func and +// invocation allow for a one-time check for validity. 
var cSize = func() int { @@ -363,50 +424,87 @@ func (r *reader) Read(p []byte) (int, error) { return 0, r.firstError } - // If we already have enough bytes, return - if r.decompSize-r.decompOff >= len(p) { - copy(p, r.decompressionBuffer[r.decompOff:]) - r.decompOff += len(p) - return len(p), nil + if len(p) == 0 { + return 0, nil } - copy(p, r.decompressionBuffer[r.decompOff:r.decompSize]) - got := r.decompSize - r.decompOff - r.decompSize = 0 - r.decompOff = 0 - - for got < len(p) { - // Populate src - src := r.compressionBuffer - reader := r.underlyingReader - n, err := TryReadFull(reader, src[r.compressionLeft:]) - if err != nil && err != errShortRead { // Handle underlying reader errors first - return 0, fmt.Errorf("failed to read from underlying reader: %s", err) - } else if n == 0 && r.compressionLeft == 0 { - return got, io.EOF + // If we already have some uncompressed bytes, return without blocking + if r.decompSize > r.decompOff { + if r.decompSize-r.decompOff > len(p) { + copy(p, r.decompressionBuffer[r.decompOff:]) + r.decompOff += len(p) + return len(p), nil + } + // From https://golang.org/pkg/io/#Reader + // > Read conventionally returns what is available instead of waiting for more. + copy(p, r.decompressionBuffer[r.decompOff:r.decompSize]) + got := r.decompSize - r.decompOff + r.decompOff = r.decompSize + return got, nil + } + + // Repeatedly read from the underlying reader until we get + // at least one zstd block, so that we don't block if the + // other end has flushed a block. + for { + // - If the last decompression didn't entirely fill the decompression buffer, + // zstd flushed all it could, and needs new data. In that case, do 1 Read. + // - If the last decompression did entirely fill the decompression buffer, + // it might have needed more room to decompress the input. In that case, + // don't do any unnecessary Read that might block. 
+ needsData := r.decompSize < len(r.decompressionBuffer) + + var src []byte + if !needsData { + src = r.compressionBuffer[:r.compressionLeft] + } else { + src = r.compressionBuffer + var n int + var err error + // Read until data arrives or an error occurs. + for n == 0 && err == nil { + n, err = r.underlyingReader.Read(src[r.compressionLeft:]) + } + if err != nil && err != io.EOF { // Handle underlying reader errors first + return 0, fmt.Errorf("failed to read from underlying reader: %w", err) + } + if n == 0 { + // Ideally, we'd return with ErrUnexpectedEOF in all cases where the stream was unexpectedly EOF'd + // during a block or frame, i.e. when there are incomplete, pending compression data. + // However, it's hard to detect those cases with zstd. Namely, there is no way to know the size of + // the current buffered compression data in the zstd stream internal buffers. + // Best effort: throw ErrUnexpectedEOF if we still have some pending buffered compression data that + // zstd doesn't want to accept. + // If we don't have any buffered compression data but zstd still has some in its internal buffers, + // we will return with EOF instead. 
+ if r.compressionLeft > 0 { + return 0, io.ErrUnexpectedEOF + } + return 0, io.EOF + } + src = src[:r.compressionLeft+n] } - src = src[:r.compressionLeft+n] // C code - srcPtr := C.uintptr_t(uintptr(0)) // Do not point anywhere, if src is empty + var srcPtr *byte // Do not point anywhere, if src is empty if len(src) > 0 { - srcPtr = C.uintptr_t(uintptr(unsafe.Pointer(&src[0]))) + srcPtr = &src[0] } C.ZSTD_decompressStream_wrapper( r.resultBuffer, r.ctx, - C.uintptr_t(uintptr(unsafe.Pointer(&r.decompressionBuffer[0]))), + unsafe.Pointer(&r.decompressionBuffer[0]), C.size_t(len(r.decompressionBuffer)), - srcPtr, + unsafe.Pointer(srcPtr), C.size_t(len(src)), ) retCode := int(r.resultBuffer.return_code) - // Keep src here eventhough we reuse later, the code might be deleted at some point + // Keep src here even though we reuse later, the code might be deleted at some point runtime.KeepAlive(src) - if err = getError(retCode); err != nil { - return 0, fmt.Errorf("failed to decompress: %s", err) + if err := getError(retCode); err != nil { + return 0, fmt.Errorf("failed to decompress: %w", err) } // Put everything in buffer @@ -415,10 +513,9 @@ func (r *reader) Read(p []byte) (int, error) { left := src[bytesConsumed:] copy(r.compressionBuffer, left) } - r.compressionLeft = len(src) - int(bytesConsumed) + r.compressionLeft = len(src) - bytesConsumed r.decompSize = int(r.resultBuffer.bytes_written) - r.decompOff = copy(p[got:], r.decompressionBuffer[:r.decompSize]) - got += r.decompOff + r.decompOff = copy(p, r.decompressionBuffer[:r.decompSize]) // Resize buffers nsize := retCode // Hint for next src buffer size @@ -430,25 +527,9 @@ func (r *reader) Read(p []byte) (int, error) { nsize = r.compressionLeft } r.compressionBuffer = resize(r.compressionBuffer, nsize) - } - return got, nil -} -// TryReadFull reads buffer just as ReadFull does -// Here we expect that buffer may end and we do not return ErrUnexpectedEOF as ReadAtLeast does. 
-// We return errShortRead instead to distinguish short reads and failures. -// We cannot use ReadFull/ReadAtLeast because it masks Reader errors, such as network failures -// and causes panic instead of error. -func TryReadFull(r io.Reader, buf []byte) (n int, err error) { - for n < len(buf) && err == nil { - var nn int - nn, err = r.Read(buf[n:]) - n += nn - } - if n == len(buf) && err == io.EOF { - err = nil // EOF at the end is somewhat expected - } else if err == io.EOF { - err = errShortRead + if r.decompOff > 0 { + return r.decompOff, nil + } } - return } diff --git a/vendor/github.com/DataDog/zstd/zstd_trace.h b/vendor/github.com/DataDog/zstd/zstd_trace.h new file mode 100644 index 0000000..96be633 --- /dev/null +++ b/vendor/github.com/DataDog/zstd/zstd_trace.h @@ -0,0 +1,159 @@ +#ifndef USE_EXTERNAL_ZSTD +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_TRACE_H +#define ZSTD_TRACE_H + +#include + +/* weak symbol support + * For now, enable conservatively: + * - Only GNUC + * - Only ELF + * - Only x86-64, i386, aarch64 and risc-v. + * Also, explicitly disable on platforms known not to work so they aren't + * forgotten in the future. 
+ */ +#if !defined(ZSTD_HAVE_WEAK_SYMBOLS) && \ + defined(__GNUC__) && defined(__ELF__) && \ + (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86) || defined(__aarch64__) || defined(__riscv)) && \ + !defined(__APPLE__) && !defined(_WIN32) && !defined(__MINGW32__) && \ + !defined(__CYGWIN__) && !defined(_AIX) +# define ZSTD_HAVE_WEAK_SYMBOLS 1 +#else +# define ZSTD_HAVE_WEAK_SYMBOLS 0 +#endif +#if ZSTD_HAVE_WEAK_SYMBOLS +# define ZSTD_WEAK_ATTR __attribute__((__weak__)) +#else +# define ZSTD_WEAK_ATTR +#endif + +/* Only enable tracing when weak symbols are available. */ +#ifndef ZSTD_TRACE +# define ZSTD_TRACE ZSTD_HAVE_WEAK_SYMBOLS +#endif + +#if ZSTD_TRACE + +struct ZSTD_CCtx_s; +struct ZSTD_DCtx_s; +struct ZSTD_CCtx_params_s; + +typedef struct { + /** + * ZSTD_VERSION_NUMBER + * + * This is guaranteed to be the first member of ZSTD_trace. + * Otherwise, this struct is not stable between versions. If + * the version number does not match your expectation, you + * should not interpret the rest of the struct. + */ + unsigned version; + /** + * Non-zero if streaming (de)compression is used. + */ + int streaming; + /** + * The dictionary ID. + */ + unsigned dictionaryID; + /** + * Is the dictionary cold? + * Only set on decompression. + */ + int dictionaryIsCold; + /** + * The dictionary size or zero if no dictionary. + */ + size_t dictionarySize; + /** + * The uncompressed size of the data. + */ + size_t uncompressedSize; + /** + * The compressed size of the data. + */ + size_t compressedSize; + /** + * The fully resolved CCtx parameters (NULL on decompression). + */ + struct ZSTD_CCtx_params_s const* params; + /** + * The ZSTD_CCtx pointer (NULL on decompression). + */ + struct ZSTD_CCtx_s const* cctx; + /** + * The ZSTD_DCtx pointer (NULL on compression). + */ + struct ZSTD_DCtx_s const* dctx; +} ZSTD_Trace; + +/** + * A tracing context. It must be 0 when tracing is disabled. 
+ * Otherwise, any non-zero value returned by a tracing begin() + * function is presented to any subsequent calls to end(). + * + * Any non-zero value is treated as tracing is enabled and not + * interpreted by the library. + * + * Two possible uses are: + * * A timestamp for when the begin() function was called. + * * A unique key identifying the (de)compression, like the + * address of the [dc]ctx pointer if you need to track + * more information than just a timestamp. + */ +typedef unsigned long long ZSTD_TraceCtx; + +/** + * Trace the beginning of a compression call. + * @param cctx The dctx pointer for the compression. + * It can be used as a key to map begin() to end(). + * @returns Non-zero if tracing is enabled. The return value is + * passed to ZSTD_trace_compress_end(). + */ +ZSTD_WEAK_ATTR ZSTD_TraceCtx ZSTD_trace_compress_begin( + struct ZSTD_CCtx_s const* cctx); + +/** + * Trace the end of a compression call. + * @param ctx The return value of ZSTD_trace_compress_begin(). + * @param trace The zstd tracing info. + */ +ZSTD_WEAK_ATTR void ZSTD_trace_compress_end( + ZSTD_TraceCtx ctx, + ZSTD_Trace const* trace); + +/** + * Trace the beginning of a decompression call. + * @param dctx The dctx pointer for the decompression. + * It can be used as a key to map begin() to end(). + * @returns Non-zero if tracing is enabled. The return value is + * passed to ZSTD_trace_compress_end(). + */ +ZSTD_WEAK_ATTR ZSTD_TraceCtx ZSTD_trace_decompress_begin( + struct ZSTD_DCtx_s const* dctx); + +/** + * Trace the end of a decompression call. + * @param ctx The return value of ZSTD_trace_decompress_begin(). + * @param trace The zstd tracing info. 
+ */ +ZSTD_WEAK_ATTR void ZSTD_trace_decompress_end( + ZSTD_TraceCtx ctx, + ZSTD_Trace const* trace); + +#endif /* ZSTD_TRACE */ + +#endif /* ZSTD_TRACE_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_v01.c b/vendor/github.com/DataDog/zstd/zstd_v01.c index 8112527..05b3eb6 100644 --- a/vendor/github.com/DataDog/zstd/zstd_v01.c +++ b/vendor/github.com/DataDog/zstd/zstd_v01.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -14,6 +15,7 @@ ******************************************/ #include /* size_t, ptrdiff_t */ #include "zstd_v01.h" +#include "compiler.h" #include "error_private.h" @@ -190,28 +192,6 @@ typedef signed long long S64; /**************************************************************** * Memory I/O *****************************************************************/ -/* FSE_FORCE_MEMORY_ACCESS - * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. - * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. - * The below switch allow to select different access method for improved performance. - * Method 0 (default) : use `memcpy()`. Safe and portable. - * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). - * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. - * Method 2 : direct access. This method is portable but violate C standard. - * It can generate buggy code on targets generating assembly depending on alignment. - * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) - * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. 
- * Prefer these methods in priority order (0 > 1 > 2) - */ -#ifndef FSE_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ -# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) -# define FSE_FORCE_MEMORY_ACCESS 2 -# elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \ - (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) -# define FSE_FORCE_MEMORY_ACCESS 1 -# endif -#endif - static unsigned FSE_32bits(void) { @@ -224,24 +204,6 @@ static unsigned FSE_isLittleEndian(void) return one.c[0]; } -#if defined(FSE_FORCE_MEMORY_ACCESS) && (FSE_FORCE_MEMORY_ACCESS==2) - -static U16 FSE_read16(const void* memPtr) { return *(const U16*) memPtr; } -static U32 FSE_read32(const void* memPtr) { return *(const U32*) memPtr; } -static U64 FSE_read64(const void* memPtr) { return *(const U64*) memPtr; } - -#elif defined(FSE_FORCE_MEMORY_ACCESS) && (FSE_FORCE_MEMORY_ACCESS==1) - -/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ -/* currently only defined for gcc and icc */ -typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign; - -static U16 FSE_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } -static U32 FSE_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } -static U64 FSE_read64(const void* ptr) { return ((const unalign*)ptr)->u64; } - -#else - static U16 FSE_read16(const void* memPtr) { U16 val; memcpy(&val, memPtr, sizeof(val)); return val; @@ -257,8 +219,6 @@ static U64 FSE_read64(const void* memPtr) U64 val; memcpy(&val, memPtr, sizeof(val)); return val; } -#endif // FSE_FORCE_MEMORY_ACCESS - static U16 FSE_readLE16(const void* memPtr) { if (FSE_isLittleEndian()) @@ -343,8 +303,7 @@ 
FORCE_INLINE unsigned FSE_highbit32 (U32 val) { # if defined(_MSC_VER) /* Visual */ unsigned long r; - _BitScanReverse ( &r, val ); - return (unsigned) r; + return _BitScanReverse(&r, val) ? (unsigned)r : 0; # elif defined(__GNUC__) && (GCC_VERSION >= 304) /* GCC Intrinsic */ return __builtin_clz (val) ^ 31; # else /* Software version */ @@ -1078,7 +1037,7 @@ static size_t HUF_decompress_usingDTable( /* -3% slower when non static */ BYTE* const ostart = (BYTE*) dst; BYTE* op = ostart; BYTE* const omax = op + maxDstSize; - BYTE* const olimit = omax-15; + BYTE* const olimit = maxDstSize < 15 ? op : omax-15; const void* ptr = DTable; const HUF_DElt* const dt = (const HUF_DElt*)(ptr)+1; @@ -1092,7 +1051,7 @@ static size_t HUF_decompress_usingDTable( /* -3% slower when non static */ const size_t length1 = FSE_readLE16(jumpTable); const size_t length2 = FSE_readLE16(jumpTable+1); const size_t length3 = FSE_readLE16(jumpTable+2); - const size_t length4 = cSrcSize - 6 - length1 - length2 - length3; // check coherency !! + const size_t length4 = cSrcSize - 6 - length1 - length2 - length3; /* check coherency !! */ const char* const start1 = (const char*)(cSrc) + 6; const char* const start2 = start1 + length1; const char* const start3 = start2 + length2; @@ -1150,11 +1109,11 @@ static size_t HUF_decompress_usingDTable( /* -3% slower when non static */ /* tail */ { - // bitTail = bitD1; // *much* slower : -20% !??! + /* bitTail = bitD1; */ /* *much* slower : -20% !??! 
*/ FSE_DStream_t bitTail; bitTail.ptr = bitD1.ptr; bitTail.bitsConsumed = bitD1.bitsConsumed; - bitTail.bitContainer = bitD1.bitContainer; // required in case of FSE_DStream_endOfBuffer + bitTail.bitContainer = bitD1.bitContainer; /* required in case of FSE_DStream_endOfBuffer */ bitTail.start = start1; for ( ; (FSE_reloadDStream(&bitTail) < FSE_DStream_completed) && (op= 199901L /* C99 */ -# include +# if defined(_AIX) +# include +# else +# include /* intptr_t */ +# endif typedef uint8_t BYTE; typedef uint16_t U16; typedef int16_t S16; @@ -1421,7 +1384,7 @@ typedef struct { BYTE* matchLength; BYTE* dumpsStart; BYTE* dumps; -} seqStore_t; +} SeqStore_t; typedef struct ZSTD_Cctx_s @@ -1429,7 +1392,7 @@ typedef struct ZSTD_Cctx_s const BYTE* base; U32 current; U32 nextUpdate; - seqStore_t seqStore; + SeqStore_t seqStore; #ifdef __AVX2__ __m256i hashTable[HASH_TABLESIZE>>3]; #else @@ -1483,7 +1446,9 @@ static size_t ZSTDv01_getcBlockSize(const void* src, size_t srcSize, blockProper static size_t ZSTD_copyUncompressedBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize) { if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall); - memcpy(dst, src, srcSize); + if (srcSize > 0) { + memcpy(dst, src, srcSize); + } return srcSize; } @@ -1502,7 +1467,7 @@ static size_t ZSTD_decompressLiterals(void* ctx, if (srcSize <= 3) return ERROR(corruption_detected); litSize = ip[1] + (ip[0]<<8); - litSize += ((ip[-3] >> 3) & 7) << 16; // mmmmh.... + litSize += ((ip[-3] >> 3) & 7) << 16; /* mmmmh.... 
*/ op = oend - litSize; (void)ctx; @@ -1541,7 +1506,9 @@ static size_t ZSTDv01_decodeLiteralsBlock(void* ctx, size_t rleSize = litbp.origSize; if (rleSize>maxDstSize) return ERROR(dstSize_tooSmall); if (!srcSize) return ERROR(srcSize_wrong); - memset(oend - rleSize, *ip, rleSize); + if (rleSize > 0) { + memset(oend - rleSize, *ip, rleSize); + } *litStart = oend - rleSize; *litSize = rleSize; ip++; @@ -1755,20 +1722,26 @@ static size_t ZSTD_execSequence(BYTE* op, static const int dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4}; /* added */ static const int dec64table[] = {8, 8, 8, 7, 8, 9,10,11}; /* subtracted */ const BYTE* const ostart = op; + BYTE* const oLitEnd = op + sequence.litLength; const size_t litLength = sequence.litLength; BYTE* const endMatch = op + litLength + sequence.matchLength; /* risk : address space overflow (32-bits) */ const BYTE* const litEnd = *litPtr + litLength; - /* check */ + /* checks */ + size_t const seqLength = sequence.litLength + sequence.matchLength; + + if (seqLength > (size_t)(oend - op)) return ERROR(dstSize_tooSmall); + if (sequence.litLength > (size_t)(litLimit - *litPtr)) return ERROR(corruption_detected); + /* Now we know there are no overflow in literal nor match lengths, can use pointer checks */ + if (sequence.offset > (U32)(oLitEnd - base)) return ERROR(corruption_detected); + if (endMatch > oend) return ERROR(dstSize_tooSmall); /* overwrite beyond dst buffer */ - if (litEnd > litLimit) return ERROR(corruption_detected); - if (sequence.matchLength > (size_t)(*litPtr-op)) return ERROR(dstSize_tooSmall); /* overwrite literal segment */ + if (litEnd > litLimit) return ERROR(corruption_detected); /* overRead beyond lit buffer */ + if (sequence.matchLength > (size_t)(*litPtr-op)) return ERROR(dstSize_tooSmall); /* overwrite literal segment */ /* copy Literals */ - if (((size_t)(*litPtr - op) < 8) || ((size_t)(oend-litEnd) < 8) || (op+litLength > oend-8)) - memmove(op, *litPtr, litLength); /* overwrite risk */ - else - 
ZSTD_wildcopy(op, *litPtr, litLength); + ZSTD_memmove(op, *litPtr, sequence.litLength); /* note : v0.1 seems to allow scenarios where output or input are close to end of buffer */ + op += litLength; *litPtr = litEnd; /* update for next sequence */ @@ -1901,8 +1874,10 @@ static size_t ZSTD_decompressSequences( { size_t lastLLSize = litEnd - litPtr; if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall); - if (op != litPtr) memmove(op, litPtr, lastLLSize); - op += lastLLSize; + if (lastLLSize > 0) { + if (op != litPtr) memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } } } @@ -2145,8 +2120,11 @@ size_t ZSTDv01_decompressContinue(ZSTDv01_Dctx* dctx, void* dst, size_t maxDstSi } ctx->phase = 1; ctx->expected = ZSTD_blockHeaderSize; + if (ZSTDv01_isError(rSize)) return rSize; ctx->previousDstEnd = (void*)( ((char*)dst) + rSize); return rSize; } } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_v01.h b/vendor/github.com/DataDog/zstd/zstd_v01.h index 245f9dd..29c7143 100644 --- a/vendor/github.com/DataDog/zstd/zstd_v01.h +++ b/vendor/github.com/DataDog/zstd/zstd_v01.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -92,3 +93,5 @@ size_t ZSTDv01_decompressContinue(ZSTDv01_Dctx* dctx, void* dst, size_t maxDstSi #endif #endif /* ZSTD_V01_H_28739879432 */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_v02.c b/vendor/github.com/DataDog/zstd/zstd_v02.c index c878379..348f042 100644 --- a/vendor/github.com/DataDog/zstd/zstd_v02.c +++ b/vendor/github.com/DataDog/zstd/zstd_v02.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -11,6 +12,7 @@ #include /* size_t, ptrdiff_t */ #include "zstd_v02.h" +#include "compiler.h" #include "error_private.h" @@ -28,7 +30,7 @@ low-level memory access routines Copyright (C) 2013-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -71,25 +73,15 @@ extern "C" { #include /* memcpy */ -/****************************************** -* Compiler-specific -******************************************/ -#if defined(__GNUC__) -# define MEM_STATIC static __attribute__((unused)) -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# define MEM_STATIC static inline -#elif defined(_MSC_VER) -# define MEM_STATIC static __inline -#else -# define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ -#endif - - /**************************************************************** * Basic Types *****************************************************************/ #if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# include +# if defined(_AIX) +# include +# else +# include /* intptr_t */ +# endif typedef uint8_t BYTE; typedef uint16_t U16; typedef int16_t S16; @@ -111,27 +103,6 @@ extern "C" { /**************************************************************** * Memory I/O *****************************************************************/ -/* MEM_FORCE_MEMORY_ACCESS - * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. - * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. 
- * The below switch allow to select different access method for improved performance. - * Method 0 (default) : use `memcpy()`. Safe and portable. - * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). - * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. - * Method 2 : direct access. This method is portable but violate C standard. - * It can generate buggy code on targets generating assembly depending on alignment. - * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) - * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. - * Prefer these methods in priority order (0 > 1 > 2) - */ -#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ -# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) -# define MEM_FORCE_MEMORY_ACCESS 2 -# elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \ - (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) -# define MEM_FORCE_MEMORY_ACCESS 1 -# endif -#endif MEM_STATIC unsigned MEM_32bits(void) { return sizeof(void*)==4; } MEM_STATIC unsigned MEM_64bits(void) { return sizeof(void*)==8; } @@ -142,33 +113,6 @@ MEM_STATIC unsigned MEM_isLittleEndian(void) return one.c[0]; } -#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2) - -/* violates C standard on structure alignment. 
-Only use if no other choice to achieve best performance on target platform */ -MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; } -MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; } -MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; } - -MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } - -#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1) - -/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ -/* currently only defined for gcc and icc */ -typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign; - -MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } -MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } -MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; } - -MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } - -#else - -/* default method, safe and standard. - can sometimes prove slower */ - MEM_STATIC U16 MEM_read16(const void* memPtr) { U16 val; memcpy(&val, memPtr, sizeof(val)); return val; @@ -189,9 +133,6 @@ MEM_STATIC void MEM_write16(void* memPtr, U16 value) memcpy(memPtr, &value, sizeof(value)); } -#endif // MEM_FORCE_MEMORY_ACCESS - - MEM_STATIC U16 MEM_readLE16(const void* memPtr) { if (MEM_isLittleEndian()) @@ -268,7 +209,7 @@ MEM_STATIC size_t MEM_readLEST(const void* memPtr) header file (to include) Copyright (C) 2013-2015, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -349,9 +290,8 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); MEM_STATIC unsigned BIT_highbit32 (U32 val) { # if defined(_MSC_VER) /* Visual */ - unsigned long r=0; - _BitScanReverse ( &r, val ); - return (unsigned) r; + unsigned long r; + return _BitScanReverse(&r, val) ? (unsigned)r : 0; # elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */ return __builtin_clz (val) ^ 31; # else /* Software version */ @@ -433,7 +373,7 @@ MEM_STATIC size_t BIT_lookBits(BIT_DStream_t* bitD, U32 nbBits) } /*! BIT_lookBitsFast : -* unsafe version; only works only if nbBits >= 1 */ +* unsafe version; only works if nbBits >= 1 */ MEM_STATIC size_t BIT_lookBitsFast(BIT_DStream_t* bitD, U32 nbBits) { const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1; @@ -453,7 +393,7 @@ MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits) } /*!BIT_readBitsFast : -* unsafe version; only works only if nbBits >= 1 */ +* unsafe version; only works if nbBits >= 1 */ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits) { size_t value = BIT_lookBitsFast(bitD, nbBits); @@ -510,7 +450,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream) Error codes and messages Copyright (C) 2013-2015, Yann Collet - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -609,7 +549,7 @@ typedef unsigned FSE_DTable; /* don't allocate that. 
It's just a way to be mor header file for static linking (only) Copyright (C) 2013-2015, Yann Collet - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -753,7 +693,7 @@ MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) header file for static linking (only) Copyright (C) 2013-2015, Yann Collet - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -822,7 +762,7 @@ static size_t HUF_decompress4X6 (void* dst, size_t dstSize, const void* cSrc, si Header File Copyright (C) 2014-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -882,7 +822,7 @@ typedef struct ZSTD_CCtx_s ZSTD_CCtx; /* incomplete type */ Header File for static linking only Copyright (C) 2014-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -923,7 +863,7 @@ extern "C" { * Streaming functions ***************************************/ -typedef struct ZSTD_DCtx_s ZSTD_DCtx; +typedef struct ZSTDv02_Dctx_s ZSTD_DCtx; /* Use above functions alternatively. 
@@ -946,7 +886,7 @@ typedef struct ZSTD_DCtx_s ZSTD_DCtx; FSE : Finite State Entropy coder Copyright (C) 2013-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1450,7 +1390,7 @@ static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, siz Huff0 : Huffman coder, part of New Generation Entropy library Copyright (C) 2013-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2609,7 +2549,7 @@ static size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_ zstd - standard compression library Copyright (C) 2014-2015, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2783,7 +2723,7 @@ typedef struct { BYTE* matchLength; BYTE* dumpsStart; BYTE* dumps; -} seqStore_t; +} SeqStore_t; /* ************************************* @@ -2798,7 +2738,7 @@ static unsigned ZSTD_isError(size_t code) { return ERR_isError(code); } /* ************************************************************* * Decompression section ***************************************************************/ -struct ZSTD_DCtx_s +struct ZSTDv02_Dctx_s { U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)]; U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)]; @@ -2836,7 +2776,9 @@ static size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockPropertie static size_t ZSTD_copyUncompressedBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize) { if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall); - memcpy(dst, src, srcSize); + if (srcSize > 0) { + memcpy(dst, src, srcSize); + } return srcSize; } @@ -3112,12 +3054,19 @@ static size_t ZSTD_execSequence(BYTE* op, const BYTE* const litEnd = *litPtr + sequence.litLength; /* checks */ - if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of 8 from oend */ + size_t const seqLength = sequence.litLength + sequence.matchLength; + + if (seqLength > (size_t)(oend - op)) return ERROR(dstSize_tooSmall); + if (sequence.litLength > (size_t)(litLimit - *litPtr)) return ERROR(corruption_detected); + /* Now we know there are no overflow in literal nor match lengths, can use the pointer check */ + if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall); + if (sequence.offset > (U32)(oLitEnd - base)) return ERROR(corruption_detected); + if (oMatchEnd > oend) return ERROR(dstSize_tooSmall); /* overwrite 
beyond dst buffer */ if (litEnd > litLimit) return ERROR(corruption_detected); /* overRead beyond lit buffer */ /* copy Literals */ - ZSTD_wildcopy(op, *litPtr, sequence.litLength); /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */ + ZSTD_wildcopy(op, *litPtr, (ptrdiff_t)sequence.litLength); /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */ op = oLitEnd; *litPtr = litEnd; /* update for next sequence */ @@ -3229,8 +3178,10 @@ static size_t ZSTD_decompressSequences( size_t lastLLSize = litEnd - litPtr; if (litPtr > litEnd) return ERROR(corruption_detected); if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall); - if (op != litPtr) memmove(op, litPtr, lastLLSize); - op += lastLLSize; + if (lastLLSize > 0) { + if (op != litPtr) memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } } } @@ -3468,6 +3419,7 @@ static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSi } ctx->phase = 1; ctx->expected = ZSTD_blockHeaderSize; + if (ZSTD_isError(rSize)) return rSize; ctx->previousDstEnd = (void*)( ((char*)dst) + rSize); return rSize; } @@ -3512,3 +3464,5 @@ size_t ZSTDv02_decompressContinue(ZSTDv02_Dctx* dctx, void* dst, size_t maxDstSi { return ZSTD_decompressContinue((ZSTD_DCtx*)dctx, dst, maxDstSize, src, srcSize); } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_v02.h b/vendor/github.com/DataDog/zstd/zstd_v02.h index 9d7d8d9..71d79ca 100644 --- a/vendor/github.com/DataDog/zstd/zstd_v02.h +++ b/vendor/github.com/DataDog/zstd/zstd_v02.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -91,3 +92,5 @@ size_t ZSTDv02_decompressContinue(ZSTDv02_Dctx* dctx, void* dst, size_t maxDstSi #endif #endif /* ZSTD_V02_H_4174539423 */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_v03.c b/vendor/github.com/DataDog/zstd/zstd_v03.c index 162bd63..c26f037 100644 --- a/vendor/github.com/DataDog/zstd/zstd_v03.c +++ b/vendor/github.com/DataDog/zstd/zstd_v03.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,6 +12,7 @@ #include /* size_t, ptrdiff_t */ #include "zstd_v03.h" +#include "compiler.h" #include "error_private.h" @@ -29,7 +31,7 @@ low-level memory access routines Copyright (C) 2013-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -72,25 +74,15 @@ extern "C" { #include /* memcpy */ -/****************************************** -* Compiler-specific -******************************************/ -#if defined(__GNUC__) -# define MEM_STATIC static __attribute__((unused)) -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# define MEM_STATIC static inline -#elif defined(_MSC_VER) -# define MEM_STATIC static __inline -#else -# define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ -#endif - - /**************************************************************** * Basic Types *****************************************************************/ #if 
defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# include +# if defined(_AIX) +# include +# else +# include /* intptr_t */ +# endif typedef uint8_t BYTE; typedef uint16_t U16; typedef int16_t S16; @@ -112,27 +104,6 @@ extern "C" { /**************************************************************** * Memory I/O *****************************************************************/ -/* MEM_FORCE_MEMORY_ACCESS - * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. - * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. - * The below switch allow to select different access method for improved performance. - * Method 0 (default) : use `memcpy()`. Safe and portable. - * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). - * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. - * Method 2 : direct access. This method is portable but violate C standard. - * It can generate buggy code on targets generating assembly depending on alignment. - * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) - * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. 
- * Prefer these methods in priority order (0 > 1 > 2) - */ -#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ -# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) -# define MEM_FORCE_MEMORY_ACCESS 2 -# elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \ - (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) -# define MEM_FORCE_MEMORY_ACCESS 1 -# endif -#endif MEM_STATIC unsigned MEM_32bits(void) { return sizeof(void*)==4; } MEM_STATIC unsigned MEM_64bits(void) { return sizeof(void*)==8; } @@ -143,33 +114,6 @@ MEM_STATIC unsigned MEM_isLittleEndian(void) return one.c[0]; } -#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2) - -/* violates C standard on structure alignment. 
-Only use if no other choice to achieve best performance on target platform */ -MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; } -MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; } -MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; } - -MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } - -#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1) - -/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ -/* currently only defined for gcc and icc */ -typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign; - -MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } -MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } -MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; } - -MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } - -#else - -/* default method, safe and standard. - can sometimes prove slower */ - MEM_STATIC U16 MEM_read16(const void* memPtr) { U16 val; memcpy(&val, memPtr, sizeof(val)); return val; @@ -190,10 +134,6 @@ MEM_STATIC void MEM_write16(void* memPtr, U16 value) memcpy(memPtr, &value, sizeof(value)); } - -#endif // MEM_FORCE_MEMORY_ACCESS - - MEM_STATIC U16 MEM_readLE16(const void* memPtr) { if (MEM_isLittleEndian()) @@ -270,7 +210,7 @@ MEM_STATIC size_t MEM_readLEST(const void* memPtr) header file (to include) Copyright (C) 2013-2015, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -352,9 +292,8 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); MEM_STATIC unsigned BIT_highbit32 (U32 val) { # if defined(_MSC_VER) /* Visual */ - unsigned long r=0; - _BitScanReverse ( &r, val ); - return (unsigned) r; + unsigned long r; + return _BitScanReverse(&r, val) ? (unsigned)r : 0; # elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */ return __builtin_clz (val) ^ 31; # else /* Software version */ @@ -435,7 +374,7 @@ MEM_STATIC size_t BIT_lookBits(BIT_DStream_t* bitD, U32 nbBits) } /*! BIT_lookBitsFast : -* unsafe version; only works only if nbBits >= 1 */ +* unsafe version; only works if nbBits >= 1 */ MEM_STATIC size_t BIT_lookBitsFast(BIT_DStream_t* bitD, U32 nbBits) { const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1; @@ -455,7 +394,7 @@ MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits) } /*!BIT_readBitsFast : -* unsafe version; only works only if nbBits >= 1 */ +* unsafe version; only works if nbBits >= 1 */ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits) { size_t value = BIT_lookBitsFast(bitD, nbBits); @@ -512,7 +451,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream) Error codes and messages Copyright (C) 2013-2015, Yann Collet - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -611,7 +550,7 @@ typedef unsigned FSE_DTable; /* don't allocate that. 
It's just a way to be mor header file for static linking (only) Copyright (C) 2013-2015, Yann Collet - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -755,7 +694,7 @@ MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) header file for static linking (only) Copyright (C) 2013-2015, Yann Collet - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -823,7 +762,7 @@ static size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, si Header File Copyright (C) 2014-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -883,7 +822,7 @@ typedef struct ZSTD_CCtx_s ZSTD_CCtx; /* incomplete type */ Header File for static linking only Copyright (C) 2014-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -924,7 +863,7 @@ extern "C" { * Streaming functions ***************************************/ -typedef struct ZSTD_DCtx_s ZSTD_DCtx; +typedef struct ZSTDv03_Dctx_s ZSTD_DCtx; /* Use above functions alternatively. 
@@ -947,7 +886,7 @@ typedef struct ZSTD_DCtx_s ZSTD_DCtx; FSE : Finite State Entropy coder Copyright (C) 2013-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1451,7 +1390,7 @@ static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, siz Huff0 : Huffman coder, part of New Generation Entropy library Copyright (C) 2013-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2248,7 +2187,7 @@ static size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_ zstd - standard compression library Copyright (C) 2014-2015, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2424,7 +2363,7 @@ typedef struct { BYTE* matchLength; BYTE* dumpsStart; BYTE* dumps; -} seqStore_t; +} SeqStore_t; /* ************************************* @@ -2439,7 +2378,7 @@ static unsigned ZSTD_isError(size_t code) { return ERR_isError(code); } /* ************************************************************* * Decompression section ***************************************************************/ -struct ZSTD_DCtx_s +struct ZSTDv03_Dctx_s { U32 LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)]; U32 OffTable[FSE_DTABLE_SIZE_U32(OffFSELog)]; @@ -2477,7 +2416,9 @@ static size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockPropertie static size_t ZSTD_copyUncompressedBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize) { if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall); - memcpy(dst, src, srcSize); + if (srcSize > 0) { + memcpy(dst, src, srcSize); + } return srcSize; } @@ -2753,18 +2694,24 @@ static size_t ZSTD_execSequence(BYTE* op, const BYTE* const litEnd = *litPtr + sequence.litLength; /* checks */ - if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of 8 from oend */ + size_t const seqLength = sequence.litLength + sequence.matchLength; + + if (seqLength > (size_t)(oend - op)) return ERROR(dstSize_tooSmall); + if (sequence.litLength > (size_t)(litLimit - *litPtr)) return ERROR(corruption_detected); + /* Now we know there are no overflow in literal nor match lengths, can use pointer checks */ + if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall); + if (sequence.offset > (U32)(oLitEnd - base)) return ERROR(corruption_detected); + if (oMatchEnd > oend) return ERROR(dstSize_tooSmall); /* overwrite beyond 
dst buffer */ if (litEnd > litLimit) return ERROR(corruption_detected); /* overRead beyond lit buffer */ /* copy Literals */ - ZSTD_wildcopy(op, *litPtr, sequence.litLength); /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */ + ZSTD_wildcopy(op, *litPtr, (ptrdiff_t)sequence.litLength); /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */ op = oLitEnd; *litPtr = litEnd; /* update for next sequence */ /* copy Match */ - { - const BYTE* match = op - sequence.offset; + { const BYTE* match = op - sequence.offset; /* check */ if (sequence.offset > (size_t)op) return ERROR(corruption_detected); /* address space overflow test (this test seems kept by clang optimizer) */ @@ -2870,8 +2817,10 @@ static size_t ZSTD_decompressSequences( size_t lastLLSize = litEnd - litPtr; if (litPtr > litEnd) return ERROR(corruption_detected); if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall); - if (op != litPtr) memmove(op, litPtr, lastLLSize); - op += lastLLSize; + if (lastLLSize > 0) { + if (op != litPtr) memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } } } @@ -3110,6 +3059,7 @@ static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSi } ctx->phase = 1; ctx->expected = ZSTD_blockHeaderSize; + if (ZSTD_isError(rSize)) return rSize; ctx->previousDstEnd = (void*)( ((char*)dst) + rSize); return rSize; } @@ -3154,3 +3104,5 @@ size_t ZSTDv03_decompressContinue(ZSTDv03_Dctx* dctx, void* dst, size_t maxDstSi { return ZSTD_decompressContinue((ZSTD_DCtx*)dctx, dst, maxDstSize, src, srcSize); } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_v03.h b/vendor/github.com/DataDog/zstd/zstd_v03.h index efd8c2b..ec3d897 100644 --- a/vendor/github.com/DataDog/zstd/zstd_v03.h +++ b/vendor/github.com/DataDog/zstd/zstd_v03.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. 
* All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -91,3 +92,5 @@ size_t ZSTDv03_decompressContinue(ZSTDv03_Dctx* dctx, void* dst, size_t maxDstSi #endif #endif /* ZSTD_V03_H_298734209782 */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_v04.c b/vendor/github.com/DataDog/zstd/zstd_v04.c index 4dec308..c93b711 100644 --- a/vendor/github.com/DataDog/zstd/zstd_v04.c +++ b/vendor/github.com/DataDog/zstd/zstd_v04.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -16,6 +17,7 @@ #include /* memcpy */ #include "zstd_v04.h" +#include "compiler.h" #include "error_private.h" @@ -37,22 +39,17 @@ extern "C" { # include /* _byteswap_ulong */ # include /* _byteswap_* */ #endif -#if defined(__GNUC__) -# define MEM_STATIC static __attribute__((unused)) -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# define MEM_STATIC static inline -#elif defined(_MSC_VER) -# define MEM_STATIC static __inline -#else -# define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ -#endif /**************************************************************** * Basic Types *****************************************************************/ #if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# include +# if defined(_AIX) +# include +# else +# include /* intptr_t */ +# endif typedef uint8_t BYTE; typedef uint16_t U16; typedef int16_t S16; @@ -83,27 +80,6 @@ extern "C" { /**************************************************************** * Memory I/O *****************************************************************/ -/* 
MEM_FORCE_MEMORY_ACCESS - * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. - * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. - * The below switch allow to select different access method for improved performance. - * Method 0 (default) : use `memcpy()`. Safe and portable. - * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). - * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. - * Method 2 : direct access. This method is portable but violate C standard. - * It can generate buggy code on targets generating assembly depending on alignment. - * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) - * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. - * Prefer these methods in priority order (0 > 1 > 2) - */ -#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ -# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) -# define MEM_FORCE_MEMORY_ACCESS 2 -# elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \ - (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) -# define MEM_FORCE_MEMORY_ACCESS 1 -# endif -#endif MEM_STATIC unsigned MEM_32bits(void) { return sizeof(void*)==4; } MEM_STATIC unsigned MEM_64bits(void) { return sizeof(void*)==8; } @@ -114,33 +90,6 @@ MEM_STATIC unsigned MEM_isLittleEndian(void) return one.c[0]; } -#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2) - -/* violates C standard on structure alignment. 
-Only use if no other choice to achieve best performance on target platform */ -MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; } -MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; } -MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; } - -MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } - -#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1) - -/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ -/* currently only defined for gcc and icc */ -typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign; - -MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } -MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } -MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; } - -MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } - -#else - -/* default method, safe and standard. - can sometimes prove slower */ - MEM_STATIC U16 MEM_read16(const void* memPtr) { U16 val; memcpy(&val, memPtr, sizeof(val)); return val; @@ -161,9 +110,6 @@ MEM_STATIC void MEM_write16(void* memPtr, U16 value) memcpy(memPtr, &value, sizeof(value)); } -#endif // MEM_FORCE_MEMORY_ACCESS - - MEM_STATIC U16 MEM_readLE16(const void* memPtr) { if (MEM_isLittleEndian()) @@ -541,7 +487,7 @@ If there is an error, the function will return an error code, which can be teste header file (to include) Copyright (C) 2013-2015, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -623,9 +569,8 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); MEM_STATIC unsigned BIT_highbit32 (U32 val) { # if defined(_MSC_VER) /* Visual */ - unsigned long r=0; - _BitScanReverse ( &r, val ); - return (unsigned) r; + unsigned long r; + return _BitScanReverse(&r, val) ? (unsigned)r : 0; # elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */ return __builtin_clz (val) ^ 31; # else /* Software version */ @@ -700,7 +645,7 @@ MEM_STATIC size_t BIT_lookBits(BIT_DStream_t* bitD, U32 nbBits) } /*! BIT_lookBitsFast : -* unsafe version; only works only if nbBits >= 1 */ +* unsafe version; only works if nbBits >= 1 */ MEM_STATIC size_t BIT_lookBitsFast(BIT_DStream_t* bitD, U32 nbBits) { const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1; @@ -720,7 +665,7 @@ MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits) } /*!BIT_readBitsFast : -* unsafe version; only works only if nbBits >= 1 */ +* unsafe version; only works if nbBits >= 1 */ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits) { size_t value = BIT_lookBitsFast(bitD, nbBits); @@ -781,7 +726,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream) header file for static linking (only) Copyright (C) 2013-2015, Yann Collet - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -930,7 +875,7 @@ MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) FSE : Finite State Entropy coder Copyright (C) 2013-2015, Yann 
Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1436,7 +1381,7 @@ static size_t FSE_decompress(void* dst, size_t maxDstSize, const void* cSrc, siz header file Copyright (C) 2013-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1514,7 +1459,7 @@ static unsigned HUF_isError(size_t code); /* tells if a return value i header file for static linking (only) Copyright (C) 2013-2015, Yann Collet - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1601,7 +1546,7 @@ static size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const Huff0 : Huffman coder, part of New Generation Entropy library Copyright (C) 2013-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2401,7 +2346,7 @@ static size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_ zstd - decompression module fo v0.4 legacy format Copyright (C) 2015-2016, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2603,7 +2548,9 @@ static size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockPropertie static size_t ZSTD_copyRawBlock(void* dst, size_t maxDstSize, const void* src, size_t srcSize) { if (srcSize > maxDstSize) return ERROR(dstSize_tooSmall); - memcpy(dst, src, srcSize); + if (srcSize > 0) { + memcpy(dst, src, srcSize); + } return srcSize; } @@ -2874,13 +2821,19 @@ static size_t ZSTD_execSequence(BYTE* op, const BYTE* const litEnd = *litPtr + sequence.litLength; const BYTE* match = oLitEnd - sequence.offset; - /* check */ - if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of 8 from oend */ + /* checks */ + size_t const seqLength = sequence.litLength + sequence.matchLength; + + if (seqLength > (size_t)(oend - op)) return ERROR(dstSize_tooSmall); + if (sequence.litLength > (size_t)(litLimit - *litPtr)) return ERROR(corruption_detected); + /* Now we know there are no overflow in literal nor match lengths, can use pointer checks */ + if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall); + if (oMatchEnd > oend) return ERROR(dstSize_tooSmall); /* overwrite beyond dst buffer */ - if (litEnd > litLimit) return ERROR(corruption_detected); /* risk read beyond lit buffer */ + if (litEnd > litLimit) return ERROR(corruption_detected); /* overRead beyond lit buffer */ /* copy Literals */ - ZSTD_wildcopy(op, *litPtr, sequence.litLength); /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */ + ZSTD_wildcopy(op, *litPtr, (ptrdiff_t)sequence.litLength); /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */ op = oLitEnd; *litPtr = litEnd; /* update for next sequence */ @@ -3008,8 +2961,10 @@ static size_t 
ZSTD_decompressSequences( size_t lastLLSize = litEnd - litPtr; if (litPtr > litEnd) return ERROR(corruption_detected); if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall); - if (op != litPtr) memcpy(op, litPtr, lastLLSize); - op += lastLLSize; + if (lastLLSize > 0) { + if (op != litPtr) memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } } } @@ -3256,6 +3211,7 @@ static size_t ZSTD_decompressContinue(ZSTD_DCtx* ctx, void* dst, size_t maxDstSi } ctx->stage = ZSTDds_decodeBlockHeader; ctx->expected = ZSTD_blockHeaderSize; + if (ZSTD_isError(rSize)) return rSize; ctx->previousDstEnd = (char*)dst + rSize; return rSize; } @@ -3279,7 +3235,7 @@ static void ZSTD_decompress_insertDictionary(ZSTD_DCtx* ctx, const void* dict, s Buffered version of Zstd compression library Copyright (C) 2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -3407,7 +3363,9 @@ static size_t ZBUFF_decompressWithDictionary(ZBUFF_DCtx* zbc, const void* src, s static size_t ZBUFF_limitCopy(void* dst, size_t maxDstSize, const void* src, size_t srcSize) { size_t length = MIN(maxDstSize, srcSize); - memcpy(dst, src, length); + if (length > 0) { + memcpy(dst, src, length); + } return length; } @@ -3581,8 +3539,8 @@ static size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbc, void* dst, size_t* maxDs unsigned ZBUFFv04_isError(size_t errorCode) { return ERR_isError(errorCode); } const char* ZBUFFv04_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); } -size_t ZBUFFv04_recommendedDInSize() { return BLOCKSIZE + 3; } -size_t ZBUFFv04_recommendedDOutSize() { return BLOCKSIZE; } +size_t ZBUFFv04_recommendedDInSize(void) { return BLOCKSIZE + 3; } +size_t ZBUFFv04_recommendedDOutSize(void) { return BLOCKSIZE; } @@ -3639,3 +3597,5 @@ size_t 
ZBUFFv04_decompressContinue(ZBUFFv04_DCtx* dctx, void* dst, size_t* maxDs ZSTD_DCtx* ZSTDv04_createDCtx(void) { return ZSTD_createDCtx(); } size_t ZSTDv04_freeDCtx(ZSTD_DCtx* dctx) { return ZSTD_freeDCtx(dctx); } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_v04.h b/vendor/github.com/DataDog/zstd/zstd_v04.h index bb5f3b7..b904be0 100644 --- a/vendor/github.com/DataDog/zstd/zstd_v04.h +++ b/vendor/github.com/DataDog/zstd/zstd_v04.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -140,3 +141,5 @@ size_t ZBUFFv04_recommendedDOutSize(void); #endif #endif /* ZSTD_V04_H_91868324769238 */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_v05.c b/vendor/github.com/DataDog/zstd/zstd_v05.c index 570e0ff..4e63cb7 100644 --- a/vendor/github.com/DataDog/zstd/zstd_v05.c +++ b/vendor/github.com/DataDog/zstd/zstd_v05.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -19,7 +20,7 @@ low-level memory access routines Copyright (C) 2013-2015, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -80,7 +81,11 @@ extern "C" { * Basic Types *****************************************************************/ #if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# include +# if defined(_AIX) +# include +# else +# include /* intptr_t */ +# endif typedef uint8_t BYTE; typedef uint16_t U16; typedef int16_t S16; @@ -102,27 +107,6 @@ extern "C" { /*-************************************************************** * Memory I/O *****************************************************************/ -/* MEM_FORCE_MEMORY_ACCESS : - * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. - * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. - * The below switch allow to select different access method for improved performance. - * Method 0 (default) : use `memcpy()`. Safe and portable. - * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). - * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. - * Method 2 : direct access. This method is portable but violate C standard. - * It can generate buggy code on targets depending on alignment. - * In some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) - * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. 
- * Prefer these methods in priority order (0 > 1 > 2) - */ -#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ -# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) -# define MEM_FORCE_MEMORY_ACCESS 2 -# elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \ - (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) -# define MEM_FORCE_MEMORY_ACCESS 1 -# endif -#endif MEM_STATIC unsigned MEM_32bits(void) { return sizeof(void*)==4; } MEM_STATIC unsigned MEM_64bits(void) { return sizeof(void*)==8; } @@ -133,37 +117,6 @@ MEM_STATIC unsigned MEM_isLittleEndian(void) return one.c[0]; } -#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2) - -/* violates C standard, by lying on structure alignment. 
-Only use if no other choice to achieve best performance on target platform */ -MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; } -MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; } -MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; } - -MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } -MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } -MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; } - -#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1) - -/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ -/* currently only defined for gcc and icc */ -typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign; - -MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } -MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } -MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; } - -MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } -MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; } -MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign*)memPtr)->u64 = value; } - -#else - -/* default method, safe and standard. - can sometimes prove slower */ - MEM_STATIC U16 MEM_read16(const void* memPtr) { U16 val; memcpy(&val, memPtr, sizeof(val)); return val; @@ -194,9 +147,6 @@ MEM_STATIC void MEM_write64(void* memPtr, U64 value) memcpy(memPtr, &value, sizeof(value)); } -#endif /* MEM_FORCE_MEMORY_ACCESS */ - - MEM_STATIC U16 MEM_readLE16(const void* memPtr) { if (MEM_isLittleEndian()) @@ -261,7 +211,7 @@ MEM_STATIC size_t MEM_readLEST(const void* memPtr) Header File for static linking only Copyright (C) 2014-2016, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -285,7 +235,7 @@ MEM_STATIC size_t MEM_readLEST(const void* memPtr) OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact the author at : - - zstd homepage : http://www.zstd.net + - zstd homepage : https://facebook.github.io/zstd */ #ifndef ZSTD_STATIC_H #define ZSTD_STATIC_H @@ -397,7 +347,7 @@ size_t ZSTDv05_decompressBlock(ZSTDv05_DCtx* dctx, void* dst, size_t dstCapacity Header File for include Copyright (C) 2014-2016, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -484,7 +434,7 @@ static const size_t ZSTDv05_frameHeaderSize_min = 5; #define FSEv05_ENCODING_DYNAMIC 3 -#define HufLog 12 +#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ #define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ @@ -542,7 +492,7 @@ typedef struct { U32 litLengthSum; U32 litSum; U32 offCodeSum; -} seqStore_t; +} SeqStore_t; @@ -552,7 +502,7 @@ typedef struct { header file Copyright (C) 2013-2015, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -671,7 +621,7 @@ size_t FSEv05_decompress_usingDTable(void* dst, size_t dstCapacity, const void* header file (to include) Copyright (C) 2013-2016, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -752,9 +702,8 @@ MEM_STATIC size_t BITv05_readBitsFast(BITv05_DStream_t* bitD, unsigned nbBits); MEM_STATIC unsigned BITv05_highbit32 (U32 val) { # if defined(_MSC_VER) /* Visual */ - unsigned long r=0; - _BitScanReverse ( &r, val ); - return (unsigned) r; + unsigned long r; + return _BitScanReverse(&r, val) ? (unsigned)r : 0; # elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */ return __builtin_clz (val) ^ 31; # else /* Software version */ @@ -826,7 +775,7 @@ MEM_STATIC size_t BITv05_lookBits(BITv05_DStream_t* bitD, U32 nbBits) } /*! 
BITv05_lookBitsFast : -* unsafe version; only works only if nbBits >= 1 */ +* unsafe version; only works if nbBits >= 1 */ MEM_STATIC size_t BITv05_lookBitsFast(BITv05_DStream_t* bitD, U32 nbBits) { const U32 bitMask = sizeof(bitD->bitContainer)*8 - 1; @@ -846,7 +795,7 @@ MEM_STATIC size_t BITv05_readBits(BITv05_DStream_t* bitD, unsigned nbBits) } /*!BITv05_readBitsFast : -* unsafe version; only works only if nbBits >= 1 */ +* unsafe version; only works if nbBits >= 1 */ MEM_STATIC size_t BITv05_readBitsFast(BITv05_DStream_t* bitD, unsigned nbBits) { size_t value = BITv05_lookBitsFast(bitD, nbBits); @@ -901,7 +850,7 @@ MEM_STATIC unsigned BITv05_endOfDStream(const BITv05_DStream_t* DStream) header file for static linking (only) Copyright (C) 2013-2015, Yann Collet - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1051,7 +1000,7 @@ MEM_STATIC unsigned FSEv05_endOfDState(const FSEv05_DState_t* DStatePtr) FSEv05 : Finite State Entropy coder Copyright (C) 2013-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1537,7 +1486,7 @@ size_t FSEv05_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t header file Copyright (C) 2013-2016, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1610,7 +1559,7 @@ const char* HUFv05_getErrorName(size_t code); /* provides error code string (u header file, for static linking only Copyright (C) 2013-2016, Yann Collet - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1702,7 +1651,7 @@ size_t HUFv05_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void Huff0 : Huffman coder, part of New Generation Entropy library Copyright (C) 2013-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1804,7 +1753,7 @@ static size_t HUFv05_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, if (!srcSize) return ERROR(srcSize_wrong); iSize = ip[0]; - //memset(huffWeight, 0, hwSize); /* is not necessary, even though some analyzer complain ... */ + /* memset(huffWeight, 0, hwSize); */ /* is not necessary, even though some analyzer complain ... 
*/ if (iSize >= 128) { /* special header */ if (iSize >= (242)) { /* RLE */ @@ -1879,7 +1828,7 @@ size_t HUFv05_readDTableX2 (U16* DTable, const void* src, size_t srcSize) HUFv05_DEltX2* const dt = (HUFv05_DEltX2*)dtPtr; HUFv05_STATIC_ASSERT(sizeof(HUFv05_DEltX2) == sizeof(U16)); /* if compilation fails here, assertion is false */ - //memset(huffWeight, 0, sizeof(huffWeight)); /* is not necessary, even though some analyzer complain ... */ + /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ iSize = HUFv05_readStats(huffWeight, HUFv05_MAX_SYMBOL_VALUE + 1, rankVal, &nbSymbols, &tableLog, src, srcSize); if (HUFv05_isError(iSize)) return iSize; @@ -2210,7 +2159,7 @@ size_t HUFv05_readDTableX4 (unsigned* DTable, const void* src, size_t srcSize) HUFv05_STATIC_ASSERT(sizeof(HUFv05_DEltX4) == sizeof(unsigned)); /* if compilation fails here, assertion is false */ if (memLog > HUFv05_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge); - //memset(weightList, 0, sizeof(weightList)); /* is not necessary, even though some analyzer complain ... */ + /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ iSize = HUFv05_readStats(weightList, HUFv05_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize); if (HUFv05_isError(iSize)) return iSize; @@ -2539,15 +2488,15 @@ size_t HUFv05_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cS return decompress[algoNb](dst, dstSize, cSrc, cSrcSize); - //return HUFv05_decompress4X2(dst, dstSize, cSrc, cSrcSize); /* multi-streams single-symbol decoding */ - //return HUFv05_decompress4X4(dst, dstSize, cSrc, cSrcSize); /* multi-streams double-symbols decoding */ - //return HUFv05_decompress4X6(dst, dstSize, cSrc, cSrcSize); /* multi-streams quad-symbols decoding */ + /* return HUFv05_decompress4X2(dst, dstSize, cSrc, cSrcSize); */ /* multi-streams single-symbol decoding */ + /* return HUFv05_decompress4X4(dst, dstSize, cSrc, cSrcSize); */ /* multi-streams double-symbols decoding */ + /* return HUFv05_decompress4X6(dst, dstSize, cSrc, cSrcSize); */ /* multi-streams quad-symbols decoding */ } /* zstd - standard compression library Copyright (C) 2014-2016, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2645,7 +2594,7 @@ struct ZSTDv05_DCtx_s FSEv05_DTable LLTable[FSEv05_DTABLE_SIZE_U32(LLFSEv05Log)]; FSEv05_DTable OffTable[FSEv05_DTABLE_SIZE_U32(OffFSEv05Log)]; FSEv05_DTable MLTable[FSEv05_DTABLE_SIZE_U32(MLFSEv05Log)]; - unsigned hufTableX4[HUFv05_DTABLE_SIZE(HufLog)]; + unsigned hufTableX4[HUFv05_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; const void* previousDstEnd; const void* base; const void* vBase; @@ -2673,7 +2622,7 @@ size_t ZSTDv05_decompressBegin(ZSTDv05_DCtx* dctx) dctx->base = NULL; dctx->vBase = NULL; dctx->dictEnd = NULL; - dctx->hufTableX4[0] = HufLog; + dctx->hufTableX4[0] = ZSTD_HUFFDTABLE_CAPACITY_LOG; dctx->flagStaticTables = 0; return 0; } @@ -2829,7 +2778,7 @@ static size_t ZSTDv05_decodeFrameHeader_Part2(ZSTDv05_DCtx* zc, const void* src, static size_t ZSTDv05_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr) { - const BYTE* const in = (const BYTE* const)src; + const BYTE* const in = (const BYTE*)src; BYTE headerFlags; U32 cSize; @@ -2998,7 +2947,7 @@ static size_t ZSTDv05_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t FSEv05_DTable* DTableLL, FSEv05_DTable* DTableML, FSEv05_DTable* DTableOffb, const void* src, size_t srcSize, U32 flagStaticTable) { - const BYTE* const istart = (const BYTE* const)src; + const BYTE* const istart = (const BYTE*)src; const BYTE* ip = istart; const BYTE* const iend = istart + srcSize; U32 LLtype, Offtype, MLtype; @@ -3234,13 +3183,19 @@ static size_t ZSTDv05_execSequence(BYTE* op, const BYTE* const litEnd = *litPtr + sequence.litLength; const BYTE* match = oLitEnd - sequence.offset; - /* check */ - if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall); /* last match must start at a minimum 
distance of 8 from oend */ + /* checks */ + size_t const seqLength = sequence.litLength + sequence.matchLength; + + if (seqLength > (size_t)(oend - op)) return ERROR(dstSize_tooSmall); + if (sequence.litLength > (size_t)(litLimit - *litPtr)) return ERROR(corruption_detected); + /* Now we know there are no overflow in literal nor match lengths, can use pointer checks */ + if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall); + if (oMatchEnd > oend) return ERROR(dstSize_tooSmall); /* overwrite beyond dst buffer */ - if (litEnd > litLimit) return ERROR(corruption_detected); /* risk read beyond lit buffer */ + if (litEnd > litLimit) return ERROR(corruption_detected); /* overRead beyond lit buffer */ /* copy Literals */ - ZSTDv05_wildcopy(op, *litPtr, sequence.litLength); /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */ + ZSTDv05_wildcopy(op, *litPtr, (ptrdiff_t)sequence.litLength); /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */ op = oLitEnd; *litPtr = litEnd; /* update for next sequence */ @@ -3306,7 +3261,7 @@ static size_t ZSTDv05_decompressSequences( { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* op = ostart; BYTE* const oend = ostart + maxDstSize; size_t errorCode, dumpsLength=0; @@ -3362,8 +3317,10 @@ static size_t ZSTDv05_decompressSequences( size_t lastLLSize = litEnd - litPtr; if (litPtr > litEnd) return ERROR(corruption_detected); /* too many literals already used */ if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall); - memcpy(op, litPtr, lastLLSize); - op += lastLLSize; + if (lastLLSize > 0) { + memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } } return op-ostart; @@ -3417,7 +3374,7 @@ static size_t ZSTDv05_decompress_continueDCtx(ZSTDv05_DCtx* dctx, { const BYTE* ip = (const BYTE*)src; const BYTE* iend = ip + srcSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; 
BYTE* op = ostart; BYTE* const oend = ostart + maxDstSize; size_t remainingSize = srcSize; @@ -3644,6 +3601,7 @@ size_t ZSTDv05_decompressContinue(ZSTDv05_DCtx* dctx, void* dst, size_t maxDstSi } dctx->stage = ZSTDv05ds_decodeBlockHeader; dctx->expected = ZSTDv05_blockHeaderSize; + if (ZSTDv05_isError(rSize)) return rSize; dctx->previousDstEnd = (char*)dst + rSize; return rSize; } @@ -3744,7 +3702,7 @@ size_t ZSTDv05_decompressBegin_usingDict(ZSTDv05_DCtx* dctx, const void* dict, s Buffered version of Zstd compression library Copyright (C) 2015-2016, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -3791,7 +3749,9 @@ static size_t ZBUFFv05_blockHeaderSize = 3; static size_t ZBUFFv05_limitCopy(void* dst, size_t maxDstSize, const void* src, size_t srcSize) { size_t length = MIN(maxDstSize, srcSize); - memcpy(dst, src, length); + if (length > 0) { + memcpy(dst, src, length); + } return length; } @@ -3928,7 +3888,7 @@ size_t ZBUFFv05_decompressContinue(ZBUFFv05_DCtx* zbc, void* dst, size_t* maxDst *maxDstSizePtr = 0; return headerSize - zbc->hPos; } - // zbc->stage = ZBUFFv05ds_decodeHeader; break; /* useless : stage follows */ + /* zbc->stage = ZBUFFv05ds_decodeHeader; break; */ /* useless : stage follows */ } /* fall-through */ case ZBUFFv05ds_decodeHeader: @@ -4001,7 +3961,7 @@ size_t ZBUFFv05_decompressContinue(ZBUFFv05_DCtx* zbc, void* dst, size_t* maxDst if (!decodedSize) { zbc->stage = ZBUFFv05ds_read; break; } /* this was just a header */ zbc->outEnd = zbc->outStart + decodedSize; zbc->stage = ZBUFFv05ds_flush; - // break; /* ZBUFFv05ds_flush follows */ + /* break; */ /* ZBUFFv05ds_flush follows */ } } /* fall-through */ @@ -4044,3 +4004,5 @@ const char* ZBUFFv05_getErrorName(size_t errorCode) { return 
ERR_getErrorName(er size_t ZBUFFv05_recommendedDInSize(void) { return BLOCKSIZE + ZBUFFv05_blockHeaderSize /* block header size*/ ; } size_t ZBUFFv05_recommendedDOutSize(void) { return BLOCKSIZE; } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_v05.h b/vendor/github.com/DataDog/zstd/zstd_v05.h index 4a97985..ebb7831 100644 --- a/vendor/github.com/DataDog/zstd/zstd_v05.h +++ b/vendor/github.com/DataDog/zstd/zstd_v05.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -160,3 +161,5 @@ size_t ZBUFFv05_recommendedDOutSize(void); #endif #endif /* ZSTDv0505_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_v06.c b/vendor/github.com/DataDog/zstd/zstd_v06.c index 2a08e8d..b90cd83 100644 --- a/vendor/github.com/DataDog/zstd/zstd_v06.c +++ b/vendor/github.com/DataDog/zstd/zstd_v06.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -14,6 +15,7 @@ #include /* size_t, ptrdiff_t */ #include /* memcpy */ #include /* malloc, free, qsort */ +#include "compiler.h" #include "error_private.h" @@ -23,7 +25,7 @@ low-level memory access routines Copyright (C) 2013-2015, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -67,22 +69,17 @@ extern "C" { # include /* _byteswap_ulong */ # include /* _byteswap_* */ #endif -#if defined(__GNUC__) -# define MEM_STATIC static __attribute__((unused)) -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# define MEM_STATIC static inline -#elif defined(_MSC_VER) -# define MEM_STATIC static __inline -#else -# define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ -#endif /*-************************************************************** * Basic Types *****************************************************************/ #if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include +# if defined(_AIX) +# include +# else +# include /* intptr_t */ +# endif typedef uint8_t BYTE; typedef uint16_t U16; typedef int16_t S16; @@ -104,27 +101,6 @@ extern "C" { /*-************************************************************** * Memory I/O *****************************************************************/ -/* MEM_FORCE_MEMORY_ACCESS : - * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. - * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. - * The below switch allow to select different access method for improved performance. - * Method 0 (default) : use `memcpy()`. Safe and portable. - * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). - * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. 
- * Method 2 : direct access. This method is portable but violate C standard. - * It can generate buggy code on targets depending on alignment. - * In some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) - * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. - * Prefer these methods in priority order (0 > 1 > 2) - */ -#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ -# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) -# define MEM_FORCE_MEMORY_ACCESS 2 -# elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \ - (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) -# define MEM_FORCE_MEMORY_ACCESS 1 -# endif -#endif MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; } MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; } @@ -135,33 +111,6 @@ MEM_STATIC unsigned MEM_isLittleEndian(void) return one.c[0]; } -#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2) - -/* violates C standard, by lying on structure alignment. 
-Only use if no other choice to achieve best performance on target platform */ -MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; } -MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; } -MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; } - -MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } - -#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1) - -/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ -/* currently only defined for gcc and icc */ -typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign; - -MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } -MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } -MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; } - -MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } - -#else - -/* default method, safe and standard. - can sometimes prove slower */ - MEM_STATIC U16 MEM_read16(const void* memPtr) { U16 val; memcpy(&val, memPtr, sizeof(val)); return val; @@ -182,9 +131,6 @@ MEM_STATIC void MEM_write16(void* memPtr, U16 value) memcpy(memPtr, &value, sizeof(value)); } - -#endif /* MEM_FORCE_MEMORY_ACCESS */ - MEM_STATIC U32 MEM_swap32(U32 in) { #if defined(_MSC_VER) /* Visual Studio */ @@ -280,7 +226,7 @@ MEM_STATIC size_t MEM_readLEST(const void* memPtr) Header File for static linking only Copyright (C) 2014-2016, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -304,7 +250,7 @@ MEM_STATIC size_t MEM_readLEST(const void* memPtr) OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact the author at : - - zstd homepage : http://www.zstd.net + - zstd homepage : https://facebook.github.io/zstd */ #ifndef ZSTDv06_STATIC_H #define ZSTDv06_STATIC_H @@ -411,7 +357,7 @@ ZSTDLIBv06_API size_t ZSTDv06_decompressBlock(ZSTDv06_DCtx* dctx, void* dst, siz Header File for include Copyright (C) 2014-2016, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -478,7 +424,7 @@ typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t; #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ #define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ -#define HufLog 12 +#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 #define IS_HUF 0 #define IS_PCH 1 @@ -607,9 +553,9 @@ typedef struct { U32 cachedLitLength; const BYTE* cachedLiterals; ZSTDv06_stats_t stats; -} seqStore_t; +} SeqStore_t; -void ZSTDv06_seqToCodes(const seqStore_t* seqStorePtr, size_t const nbSeq); +void ZSTDv06_seqToCodes(const SeqStore_t* seqStorePtr, size_t const nbSeq); #endif /* ZSTDv06_CCOMMON_H_MODULE */ @@ -618,7 +564,7 @@ void ZSTDv06_seqToCodes(const seqStore_t* seqStorePtr, size_t const nbSeq); Public Prototypes declaration Copyright (C) 2013-2016, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -766,7 +712,7 @@ If there is an error, the function will return an error code, which can be teste header file (to include) Copyright (C) 2013-2016, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -856,9 +802,8 @@ MEM_STATIC size_t BITv06_readBitsFast(BITv06_DStream_t* bitD, unsigned nbBits); MEM_STATIC unsigned BITv06_highbit32 ( U32 val) { # if defined(_MSC_VER) /* Visual */ - unsigned long r=0; - _BitScanReverse ( &r, val ); - return (unsigned) r; + unsigned long r; + return _BitScanReverse(&r, val) ? (unsigned)r : 0; # elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */ return __builtin_clz (val) ^ 31; # else /* Software version */ @@ -928,7 +873,7 @@ MEM_STATIC size_t BITv06_initDStream(BITv06_DStream_t* bitD, const void* srcBuff } /*! BITv06_lookBitsFast() : -* unsafe version; only works only if nbBits >= 1 */ +* unsafe version; only works if nbBits >= 1 */ MEM_STATIC size_t BITv06_lookBitsFast(const BITv06_DStream_t* bitD, U32 nbBits) { U32 const bitMask = sizeof(bitD->bitContainer)*8 - 1; @@ -948,7 +893,7 @@ MEM_STATIC size_t BITv06_readBits(BITv06_DStream_t* bitD, U32 nbBits) } /*! 
BITv06_readBitsFast() : -* unsafe version; only works only if nbBits >= 1 */ +* unsafe version; only works if nbBits >= 1 */ MEM_STATIC size_t BITv06_readBitsFast(BITv06_DStream_t* bitD, U32 nbBits) { size_t const value = BITv06_lookBitsFast(bitD, nbBits); @@ -1002,7 +947,7 @@ MEM_STATIC unsigned BITv06_endOfDStream(const BITv06_DStream_t* DStream) header file for static linking (only) Copyright (C) 2013-2015, Yann Collet - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1210,7 +1155,7 @@ MEM_STATIC BYTE FSEv06_decodeSymbolFast(FSEv06_DState_t* DStatePtr, BITv06_DStre Common functions of New Generation Entropy library Copyright (C) 2016, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1355,7 +1300,7 @@ size_t FSEv06_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned FSE : Finite State Entropy decoder Copyright (C) 2013-2015, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1679,7 +1624,7 @@ size_t FSEv06_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t header file Copyright (C) 2013-2016, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1749,7 +1694,7 @@ size_t HUFv06_compressBound(size_t size); /**< maximum compressed size */ header file, for static linking only Copyright (C) 2013-2016, Yann Collet - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1862,7 +1807,7 @@ MEM_STATIC size_t HUFv06_readStats(BYTE* huffWeight, size_t hwSize, U32* rankSta if (!srcSize) return ERROR(srcSize_wrong); iSize = ip[0]; - //memset(huffWeight, 0, hwSize); /* is not necessary, even though some analyzer complain ... */ + /* memset(huffWeight, 0, hwSize); */ /* is not necessary, even though some analyzer complain ... */ if (iSize >= 128) { /* special header */ if (iSize >= (242)) { /* RLE */ @@ -1931,7 +1876,7 @@ MEM_STATIC size_t HUFv06_readStats(BYTE* huffWeight, size_t hwSize, U32* rankSta Huffman decoder, part of New Generation Entropy library Copyright (C) 2013-2016, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2014,7 +1959,7 @@ size_t HUFv06_readDTableX2 (U16* DTable, const void* src, size_t srcSize) HUFv06_DEltX2* const dt = (HUFv06_DEltX2*)dtPtr; HUFv06_STATIC_ASSERT(sizeof(HUFv06_DEltX2) == sizeof(U16)); /* if compilation fails here, assertion is false */ - //memset(huffWeight, 0, sizeof(huffWeight)); /* is not necessary, even though some analyzer complain ... */ + /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ iSize = HUFv06_readStats(huffWeight, HUFv06_MAX_SYMBOL_VALUE + 1, rankVal, &nbSymbols, &tableLog, src, srcSize); if (HUFv06_isError(iSize)) return iSize; @@ -2340,7 +2285,7 @@ size_t HUFv06_readDTableX4 (U32* DTable, const void* src, size_t srcSize) HUFv06_STATIC_ASSERT(sizeof(HUFv06_DEltX4) == sizeof(U32)); /* if compilation fails here, assertion is false */ if (memLog > HUFv06_ABSOLUTEMAX_TABLELOG) return ERROR(tableLog_tooLarge); - //memset(weightList, 0, sizeof(weightList)); /* is not necessary, even though some analyzer complain ... */ + /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ iSize = HUFv06_readStats(weightList, HUFv06_MAX_SYMBOL_VALUE + 1, rankStats, &nbSymbols, &tableLog, src, srcSize); if (HUFv06_isError(iSize)) return iSize; @@ -2664,19 +2609,19 @@ size_t HUFv06_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cS { U32 algoNb = 0; if (Dtime[1] < Dtime[0]) algoNb = 1; - // if (Dtime[2] < Dtime[algoNb]) algoNb = 2; /* current speed of HUFv06_decompress4X6 is not good */ + /* if (Dtime[2] < Dtime[algoNb]) algoNb = 2; */ /* current speed of HUFv06_decompress4X6 is not good */ return decompress[algoNb](dst, dstSize, cSrc, cSrcSize); } - //return HUFv06_decompress4X2(dst, dstSize, cSrc, cSrcSize); /* multi-streams single-symbol decoding */ - //return HUFv06_decompress4X4(dst, dstSize, cSrc, cSrcSize); /* multi-streams double-symbols decoding */ - //return HUFv06_decompress4X6(dst, dstSize, cSrc, cSrcSize); /* multi-streams quad-symbols decoding */ + /* return HUFv06_decompress4X2(dst, dstSize, cSrc, cSrcSize); */ /* multi-streams single-symbol decoding */ + /* return HUFv06_decompress4X4(dst, dstSize, cSrc, cSrcSize); */ /* multi-streams double-symbols decoding */ + /* return HUFv06_decompress4X6(dst, dstSize, cSrc, cSrcSize); */ /* multi-streams quad-symbols decoding */ } /* Common functions of Zstd compression library Copyright (C) 2015-2016, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2700,7 +2645,7 @@ size_t HUFv06_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cS OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
You can contact the author at : - - zstd homepage : http://www.zstd.net/ + - zstd homepage : https://facebook.github.io/zstd/ */ @@ -2730,7 +2675,7 @@ const char* ZBUFFv06_getErrorName(size_t errorCode) { return ERR_getErrorName(er zstd - standard compression library Copyright (C) 2014-2016, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2754,7 +2699,7 @@ const char* ZBUFFv06_getErrorName(size_t errorCode) { return ERR_getErrorName(er OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact the author at : - - zstd homepage : http://www.zstd.net + - zstd homepage : https://facebook.github.io/zstd */ /* *************************************************************** @@ -2806,7 +2751,7 @@ struct ZSTDv06_DCtx_s FSEv06_DTable LLTable[FSEv06_DTABLE_SIZE_U32(LLFSELog)]; FSEv06_DTable OffTable[FSEv06_DTABLE_SIZE_U32(OffFSELog)]; FSEv06_DTable MLTable[FSEv06_DTABLE_SIZE_U32(MLFSELog)]; - unsigned hufTableX4[HUFv06_DTABLE_SIZE(HufLog)]; + unsigned hufTableX4[HUFv06_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; const void* previousDstEnd; const void* base; const void* vBase; @@ -2834,7 +2779,7 @@ size_t ZSTDv06_decompressBegin(ZSTDv06_DCtx* dctx) dctx->base = NULL; dctx->vBase = NULL; dctx->dictEnd = NULL; - dctx->hufTableX4[0] = HufLog; + dctx->hufTableX4[0] = ZSTD_HUFFDTABLE_CAPACITY_LOG; dctx->flagRepeatTable = 0; return 0; } @@ -3025,7 +2970,7 @@ typedef struct * Provides the size of compressed block from block header `src` */ static size_t ZSTDv06_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr) { - const BYTE* const in = (const BYTE* const)src; + const BYTE* const in = (const BYTE*)src; U32 cSize; if (srcSize < ZSTDv06_blockHeaderSize) return ERROR(srcSize_wrong); @@ 
-3219,7 +3164,7 @@ static size_t ZSTDv06_decodeSeqHeaders(int* nbSeqPtr, FSEv06_DTable* DTableLL, FSEv06_DTable* DTableML, FSEv06_DTable* DTableOffb, U32 flagRepeatTable, const void* src, size_t srcSize) { - const BYTE* const istart = (const BYTE* const)src; + const BYTE* const istart = (const BYTE*)src; const BYTE* const iend = istart + srcSize; const BYTE* ip = istart; @@ -3370,13 +3315,19 @@ static size_t ZSTDv06_execSequence(BYTE* op, const BYTE* const iLitEnd = *litPtr + sequence.litLength; const BYTE* match = oLitEnd - sequence.offset; - /* check */ - if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of 8 from oend */ + /* checks */ + size_t const seqLength = sequence.litLength + sequence.matchLength; + + if (seqLength > (size_t)(oend - op)) return ERROR(dstSize_tooSmall); + if (sequence.litLength > (size_t)(litLimit - *litPtr)) return ERROR(corruption_detected); + /* Now we know there are no overflow in literal nor match lengths, can use pointer checks */ + if (oLitEnd > oend_8) return ERROR(dstSize_tooSmall); + if (oMatchEnd > oend) return ERROR(dstSize_tooSmall); /* overwrite beyond dst buffer */ - if (iLitEnd > litLimit) return ERROR(corruption_detected); /* over-read beyond lit buffer */ + if (iLitEnd > litLimit) return ERROR(corruption_detected); /* overRead beyond lit buffer */ /* copy Literals */ - ZSTDv06_wildcopy(op, *litPtr, sequence.litLength); /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */ + ZSTDv06_wildcopy(op, *litPtr, (ptrdiff_t)sequence.litLength); /* note : oLitEnd <= oend-8 : no risk of overwrite beyond oend */ op = oLitEnd; *litPtr = iLitEnd; /* update for next sequence */ @@ -3441,7 +3392,7 @@ static size_t ZSTDv06_decompressSequences( { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + maxDstSize; BYTE* op = ostart; const BYTE* litPtr = 
dctx->litPtr; @@ -3501,8 +3452,10 @@ static size_t ZSTDv06_decompressSequences( { size_t const lastLLSize = litEnd - litPtr; if (litPtr > litEnd) return ERROR(corruption_detected); /* too many literals already used */ if (op+lastLLSize > oend) return ERROR(dstSize_tooSmall); - memcpy(op, litPtr, lastLLSize); - op += lastLLSize; + if (lastLLSize > 0) { + memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } } return op-ostart; @@ -3555,7 +3508,7 @@ static size_t ZSTDv06_decompressFrame(ZSTDv06_DCtx* dctx, { const BYTE* ip = (const BYTE*)src; const BYTE* const iend = ip + srcSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* op = ostart; BYTE* const oend = ostart + dstCapacity; size_t remainingSize = srcSize; @@ -3785,6 +3738,7 @@ size_t ZSTDv06_decompressContinue(ZSTDv06_DCtx* dctx, void* dst, size_t dstCapac } dctx->stage = ZSTDds_decodeBlockHeader; dctx->expected = ZSTDv06_blockHeaderSize; + if (ZSTDv06_isError(rSize)) return rSize; dctx->previousDstEnd = (char*)dst + rSize; return rSize; } @@ -3887,7 +3841,7 @@ size_t ZSTDv06_decompressBegin_usingDict(ZSTDv06_DCtx* dctx, const void* dict, s Buffered version of Zstd compression library Copyright (C) 2015-2016, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -3911,7 +3865,7 @@ size_t ZSTDv06_decompressBegin_usingDict(ZSTDv06_DCtx* dctx, const void* dict, s OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
You can contact the author at : - - zstd homepage : http://www.zstd.net/ + - zstd homepage : https://facebook.github.io/zstd/ */ @@ -3966,6 +3920,10 @@ ZBUFFv06_DCtx* ZBUFFv06_createDCtx(void) if (zbd==NULL) return NULL; memset(zbd, 0, sizeof(*zbd)); zbd->zd = ZSTDv06_createDCtx(); + if (zbd->zd==NULL) { + ZBUFFv06_freeDCtx(zbd); /* avoid leaking the context */ + return NULL; + } zbd->stage = ZBUFFds_init; return zbd; } @@ -4000,7 +3958,9 @@ size_t ZBUFFv06_decompressInit(ZBUFFv06_DCtx* zbd) MEM_STATIC size_t ZBUFFv06_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize) { size_t length = MIN(dstCapacity, srcSize); - memcpy(dst, src, length); + if (length > 0) { + memcpy(dst, src, length); + } return length; } @@ -4031,7 +3991,8 @@ size_t ZBUFFv06_decompressContinue(ZBUFFv06_DCtx* zbd, size_t const toLoad = hSize - zbd->lhSize; /* if hSize!=0, hSize > zbd->lhSize */ if (ZSTDv06_isError(hSize)) return hSize; if (toLoad > (size_t)(iend-ip)) { /* not enough input to load full header */ - memcpy(zbd->headerBuffer + zbd->lhSize, ip, iend-ip); + if (ip != NULL) + memcpy(zbd->headerBuffer + zbd->lhSize, ip, iend-ip); zbd->lhSize += iend-ip; *dstCapacityPtr = 0; return (hSize - zbd->lhSize) + ZSTDv06_blockHeaderSize; /* remaining header bytes + next block header */ @@ -4109,7 +4070,7 @@ size_t ZBUFFv06_decompressContinue(ZBUFFv06_DCtx* zbd, if (!decodedSize) { zbd->stage = ZBUFFds_read; break; } /* this was just a header */ zbd->outEnd = zbd->outStart + decodedSize; zbd->stage = ZBUFFds_flush; - // break; /* ZBUFFds_flush follows */ + /* break; */ /* ZBUFFds_flush follows */ } } /* fall-through */ @@ -4148,3 +4109,5 @@ size_t ZBUFFv06_decompressContinue(ZBUFFv06_DCtx* zbd, ***************************************/ size_t ZBUFFv06_recommendedDInSize(void) { return ZSTDv06_BLOCKSIZE_MAX + ZSTDv06_blockHeaderSize /* block header size*/ ; } size_t ZBUFFv06_recommendedDOutSize(void) { return ZSTDv06_BLOCKSIZE_MAX; } + +#endif /* USE_EXTERNAL_ZSTD */ diff 
--git a/vendor/github.com/DataDog/zstd/zstd_v06.h b/vendor/github.com/DataDog/zstd/zstd_v06.h index 0781857..f511b10 100644 --- a/vendor/github.com/DataDog/zstd/zstd_v06.h +++ b/vendor/github.com/DataDog/zstd/zstd_v06.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -170,3 +171,5 @@ ZSTDLIBv06_API size_t ZBUFFv06_recommendedDOutSize(void); #endif #endif /* ZSTDv06_BUFFERED_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_v07.c b/vendor/github.com/DataDog/zstd/zstd_v07.c index a2eeff8..05c94aa 100644 --- a/vendor/github.com/DataDog/zstd/zstd_v07.c +++ b/vendor/github.com/DataDog/zstd/zstd_v07.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -24,6 +25,7 @@ #define HUFv07_STATIC_LINKING_ONLY /* HUFv07_TABLELOG_ABSOLUTEMAX */ #define ZSTDv07_STATIC_LINKING_ONLY +#include "compiler.h" #include "error_private.h" @@ -184,7 +186,7 @@ ZSTDLIBv07_API size_t ZSTDv07_insertBlock(ZSTDv07_DCtx* dctx, const void* blockS low-level memory access routines Copyright (C) 2013-2015, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -227,22 +229,17 @@ extern "C" { # include /* _byteswap_ulong */ # include /* _byteswap_* */ #endif -#if defined(__GNUC__) -# define MEM_STATIC static __attribute__((unused)) -#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# define MEM_STATIC static inline -#elif defined(_MSC_VER) -# define MEM_STATIC static __inline -#else -# define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ -#endif /*-************************************************************** * Basic Types *****************************************************************/ #if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include +# if defined(_AIX) +# include +# else +# include /* intptr_t */ +# endif typedef uint8_t BYTE; typedef uint16_t U16; typedef int16_t S16; @@ -264,27 +261,6 @@ extern "C" { /*-************************************************************** * Memory I/O *****************************************************************/ -/* MEM_FORCE_MEMORY_ACCESS : - * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. - * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. - * The below switch allow to select different access method for improved performance. - * Method 0 (default) : use `memcpy()`. Safe and portable. - * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). - * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. 
- * Method 2 : direct access. This method is portable but violate C standard. - * It can generate buggy code on targets depending on alignment. - * In some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) - * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. - * Prefer these methods in priority order (0 > 1 > 2) - */ -#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ -# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) -# define MEM_FORCE_MEMORY_ACCESS 2 -# elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \ - (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) -# define MEM_FORCE_MEMORY_ACCESS 1 -# endif -#endif MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; } MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; } @@ -295,33 +271,6 @@ MEM_STATIC unsigned MEM_isLittleEndian(void) return one.c[0]; } -#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2) - -/* violates C standard, by lying on structure alignment. 
-Only use if no other choice to achieve best performance on target platform */ -MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; } -MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; } -MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; } - -MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } - -#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1) - -/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ -/* currently only defined for gcc and icc */ -typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign; - -MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } -MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } -MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; } - -MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } - -#else - -/* default method, safe and standard. - can sometimes prove slower */ - MEM_STATIC U16 MEM_read16(const void* memPtr) { U16 val; memcpy(&val, memPtr, sizeof(val)); return val; @@ -342,8 +291,6 @@ MEM_STATIC void MEM_write16(void* memPtr, U16 value) memcpy(memPtr, &value, sizeof(value)); } -#endif /* MEM_FORCE_MEMORY_ACCESS */ - MEM_STATIC U32 MEM_swap32(U32 in) { #if defined(_MSC_VER) /* Visual Studio */ @@ -438,7 +385,7 @@ MEM_STATIC size_t MEM_readLEST(const void* memPtr) header file (to include) Copyright (C) 2013-2016, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -526,9 +473,8 @@ MEM_STATIC size_t BITv07_readBitsFast(BITv07_DStream_t* bitD, unsigned nbBits); MEM_STATIC unsigned BITv07_highbit32 (U32 val) { # if defined(_MSC_VER) /* Visual */ - unsigned long r=0; - _BitScanReverse ( &r, val ); - return (unsigned) r; + unsigned long r; + return _BitScanReverse(&r, val) ? (unsigned)r : 0; # elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */ return __builtin_clz (val) ^ 31; # else /* Software version */ @@ -596,7 +542,7 @@ MEM_STATIC size_t BITv07_initDStream(BITv07_DStream_t* bitD, const void* srcBuff } /*! BITv07_lookBitsFast() : -* unsafe version; only works only if nbBits >= 1 */ +* unsafe version; only works if nbBits >= 1 */ MEM_STATIC size_t BITv07_lookBitsFast(const BITv07_DStream_t* bitD, U32 nbBits) { U32 const bitMask = sizeof(bitD->bitContainer)*8 - 1; @@ -616,7 +562,7 @@ MEM_STATIC size_t BITv07_readBits(BITv07_DStream_t* bitD, U32 nbBits) } /*! BITv07_readBitsFast() : -* unsafe version; only works only if nbBits >= 1 */ +* unsafe version; only works if nbBits >= 1 */ MEM_STATIC size_t BITv07_readBitsFast(BITv07_DStream_t* bitD, U32 nbBits) { size_t const value = BITv07_lookBitsFast(bitD, nbBits); @@ -670,7 +616,7 @@ MEM_STATIC unsigned BITv07_endOfDStream(const BITv07_DStream_t* DStream) Public Prototypes declaration Copyright (C) 2013-2016, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -978,7 +924,7 @@ MEM_STATIC BYTE FSEv07_decodeSymbolFast(FSEv07_DState_t* DStatePtr, BITv07_DStre header file Copyright (C) 2013-2016, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1151,7 +1097,7 @@ size_t HUFv07_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void Common functions of New Generation Entropy library Copyright (C) 2016, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1314,7 +1260,7 @@ size_t HUFv07_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, if (!srcSize) return ERROR(srcSize_wrong); iSize = ip[0]; - //memset(huffWeight, 0, hwSize); /* is not necessary, even though some analyzer complain ... */ + /* memset(huffWeight, 0, hwSize); */ /* is not necessary, even though some analyzer complain ... */ if (iSize >= 128) { /* special header */ if (iSize >= (242)) { /* RLE */ @@ -1375,7 +1321,7 @@ size_t HUFv07_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, FSE : Finite State Entropy decoder Copyright (C) 2013-2015, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1699,7 +1645,7 @@ size_t FSEv07_decompress(void* dst, size_t maxDstSize, const void* cSrc, size_t Huffman decoder, part of New Generation Entropy library Copyright (C) 2013-2016, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -1784,7 +1730,7 @@ size_t HUFv07_readDTableX2 (HUFv07_DTable* DTable, const void* src, size_t srcSi HUFv07_DEltX2* const dt = (HUFv07_DEltX2*)dtPtr; HUFv07_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUFv07_DTable)); - //memset(huffWeight, 0, sizeof(huffWeight)); /* is not necessary, even though some analyzer complain ... */ + /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ iSize = HUFv07_readStats(huffWeight, HUFv07_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize); if (HUFv07_isError(iSize)) return iSize; @@ -2148,7 +2094,7 @@ size_t HUFv07_readDTableX4 (HUFv07_DTable* DTable, const void* src, size_t srcSi HUFv07_STATIC_ASSERT(sizeof(HUFv07_DEltX4) == sizeof(HUFv07_DTable)); /* if compilation fails here, assertion is false */ if (maxTableLog > HUFv07_TABLELOG_ABSOLUTEMAX) return ERROR(tableLog_tooLarge); - //memset(weightList, 0, sizeof(weightList)); /* is not necessary, even though some analyzer complain ... */ + /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ iSize = HUFv07_readStats(weightList, HUFv07_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize); if (HUFv07_isError(iSize)) return iSize; @@ -2530,8 +2476,8 @@ size_t HUFv07_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cS return decompress[algoNb](dst, dstSize, cSrc, cSrcSize); } - //return HUFv07_decompress4X2(dst, dstSize, cSrc, cSrcSize); /* multi-streams single-symbol decoding */ - //return HUFv07_decompress4X4(dst, dstSize, cSrc, cSrcSize); /* multi-streams double-symbols decoding */ + /* return HUFv07_decompress4X2(dst, dstSize, cSrc, cSrcSize); */ /* multi-streams single-symbol decoding */ + /* return HUFv07_decompress4X4(dst, dstSize, cSrc, cSrcSize); */ /* multi-streams double-symbols decoding */ } size_t HUFv07_decompress4X_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) @@ -2577,7 +2523,7 @@ size_t HUFv07_decompress1X_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, Common functions of Zstd compression library Copyright (C) 2015-2016, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2601,7 +2547,7 @@ size_t HUFv07_decompress1X_DCtx (HUFv07_DTable* dctx, void* dst, size_t dstSize, OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact the author at : - - zstd homepage : http://www.zstd.net/ + - zstd homepage : https://facebook.github.io/zstd/ */ @@ -2647,7 +2593,7 @@ static void ZSTDv07_defaultFreeFunction(void* opaque, void* address) Header File for include Copyright (C) 2014-2016, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2717,7 +2663,7 @@ typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t; #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ #define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ -#define HufLog 12 +#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 typedef enum { lbt_huffman, lbt_repeat, lbt_raw, lbt_rle } litBlockType_t; #define LONGNBSEQ 0x7F00 @@ -2842,9 +2788,9 @@ typedef struct { U32 cachedLitLength; const BYTE* cachedLiterals; ZSTDv07_stats_t stats; -} seqStore_t; +} SeqStore_t; -void ZSTDv07_seqToCodes(const seqStore_t* seqStorePtr, size_t const nbSeq); +void ZSTDv07_seqToCodes(const SeqStore_t* seqStorePtr, size_t const nbSeq); /* custom memory allocation functions */ static const ZSTDv07_customMem defaultCustomMem = { ZSTDv07_defaultAllocFunction, ZSTDv07_defaultFreeFunction, NULL }; @@ -2854,7 +2800,7 @@ static const ZSTDv07_customMem defaultCustomMem = { ZSTDv07_defaultAllocFunction zstd - standard compression library Copyright (C) 2014-2016, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -2878,7 +2824,7 @@ static const ZSTDv07_customMem defaultCustomMem = { ZSTDv07_defaultAllocFunction OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
You can contact the author at : - - zstd homepage : http://www.zstd.net + - zstd homepage : https://facebook.github.io/zstd */ /* *************************************************************** @@ -2931,7 +2877,7 @@ struct ZSTDv07_DCtx_s FSEv07_DTable LLTable[FSEv07_DTABLE_SIZE_U32(LLFSELog)]; FSEv07_DTable OffTable[FSEv07_DTABLE_SIZE_U32(OffFSELog)]; FSEv07_DTable MLTable[FSEv07_DTABLE_SIZE_U32(MLFSELog)]; - HUFv07_DTable hufTable[HUFv07_DTABLE_SIZE(HufLog)]; /* can accommodate HUFv07_decompress4X */ + HUFv07_DTable hufTable[HUFv07_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUFv07_decompress4X */ const void* previousDstEnd; const void* base; const void* vBase; @@ -2967,7 +2913,7 @@ size_t ZSTDv07_decompressBegin(ZSTDv07_DCtx* dctx) dctx->base = NULL; dctx->vBase = NULL; dctx->dictEnd = NULL; - dctx->hufTable[0] = (HUFv07_DTable)((HufLog)*0x1000001); + dctx->hufTable[0] = (HUFv07_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); dctx->litEntropy = dctx->fseEntropy = 0; dctx->dictID = 0; { int i; for (i=0; irep[i] = repStartValue[i]; } @@ -3254,7 +3200,7 @@ typedef struct * Provides the size of compressed block from block header `src` */ static size_t ZSTDv07_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr) { - const BYTE* const in = (const BYTE* const)src; + const BYTE* const in = (const BYTE*)src; U32 cSize; if (srcSize < ZSTDv07_blockHeaderSize) return ERROR(srcSize_wrong); @@ -3272,7 +3218,9 @@ static size_t ZSTDv07_getcBlockSize(const void* src, size_t srcSize, blockProper static size_t ZSTDv07_copyRawBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize) { if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall); - memcpy(dst, src, srcSize); + if (srcSize > 0) { + memcpy(dst, src, srcSize); + } return srcSize; } @@ -3447,7 +3395,7 @@ static size_t ZSTDv07_decodeSeqHeaders(int* nbSeqPtr, FSEv07_DTable* DTableLL, FSEv07_DTable* DTableML, FSEv07_DTable* DTableOffb, U32 flagRepeatTable, const void* src, 
size_t srcSize) { - const BYTE* const istart = (const BYTE* const)src; + const BYTE* const istart = (const BYTE*)src; const BYTE* const iend = istart + srcSize; const BYTE* ip = istart; @@ -3597,11 +3545,14 @@ size_t ZSTDv07_execSequence(BYTE* op, const BYTE* match = oLitEnd - sequence.offset; /* check */ - if ((oLitEnd>oend_w) | (oMatchEnd>oend)) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ - if (iLitEnd > litLimit) return ERROR(corruption_detected); /* over-read beyond lit buffer */ + assert(oend >= op); + if (sequence.litLength + WILDCOPY_OVERLENGTH > (size_t)(oend - op)) return ERROR(dstSize_tooSmall); + if (sequenceLength > (size_t)(oend - op)) return ERROR(dstSize_tooSmall); + assert(litLimit >= *litPtr); + if (sequence.litLength > (size_t)(litLimit - *litPtr)) return ERROR(corruption_detected);; /* copy Literals */ - ZSTDv07_wildcopy(op, *litPtr, sequence.litLength); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */ + ZSTDv07_wildcopy(op, *litPtr, (ptrdiff_t)sequence.litLength); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */ op = oLitEnd; *litPtr = iLitEnd; /* update for next sequence */ @@ -3615,7 +3566,7 @@ size_t ZSTDv07_execSequence(BYTE* op, return sequenceLength; } /* span extDict & currentPrefixSegment */ - { size_t const length1 = dictEnd - match; + { size_t const length1 = (size_t)(dictEnd - match); memmove(oLitEnd, match, length1); op = oLitEnd + length1; sequence.matchLength -= length1; @@ -3666,7 +3617,7 @@ static size_t ZSTDv07_decompressSequences( { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + maxDstSize; BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; @@ -3712,10 +3663,12 @@ static size_t ZSTDv07_decompressSequences( /* last literal segment */ { size_t 
const lastLLSize = litEnd - litPtr; - //if (litPtr > litEnd) return ERROR(corruption_detected); /* too many literals already used */ + /* if (litPtr > litEnd) return ERROR(corruption_detected); */ /* too many literals already used */ if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall); - memcpy(op, litPtr, lastLLSize); - op += lastLLSize; + if (lastLLSize > 0) { + memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } } return op-ostart; @@ -3776,7 +3729,9 @@ ZSTDLIBv07_API size_t ZSTDv07_insertBlock(ZSTDv07_DCtx* dctx, const void* blockS static size_t ZSTDv07_generateNxBytes(void* dst, size_t dstCapacity, BYTE byte, size_t length) { if (length > dstCapacity) return ERROR(dstSize_tooSmall); - memset(dst, byte, length); + if (length > 0) { + memset(dst, byte, length); + } return length; } @@ -3789,7 +3744,7 @@ static size_t ZSTDv07_decompressFrame(ZSTDv07_DCtx* dctx, { const BYTE* ip = (const BYTE*)src; const BYTE* const iend = ip + srcSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstCapacity; BYTE* op = ostart; size_t remainingSize = srcSize; @@ -4053,8 +4008,8 @@ size_t ZSTDv07_decompressContinue(ZSTDv07_DCtx* dctx, void* dst, size_t dstCapac } dctx->stage = ZSTDds_decodeBlockHeader; dctx->expected = ZSTDv07_blockHeaderSize; - dctx->previousDstEnd = (char*)dst + rSize; if (ZSTDv07_isError(rSize)) return rSize; + dctx->previousDstEnd = (char*)dst + rSize; if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, dst, rSize); return rSize; } @@ -4247,7 +4202,7 @@ ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDDict(ZSTDv07_DCtx* dctx, Buffered version of Zstd compression library Copyright (C) 2015-2016, Yann Collet. 
- BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + BSD 2-Clause License (https://opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -4271,7 +4226,7 @@ ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDDict(ZSTDv07_DCtx* dctx, OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact the author at : - - zstd homepage : http://www.zstd.net/ + - zstd homepage : https://facebook.github.io/zstd/ */ @@ -4378,7 +4333,9 @@ size_t ZBUFFv07_decompressInit(ZBUFFv07_DCtx* zbd) MEM_STATIC size_t ZBUFFv07_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize) { size_t const length = MIN(dstCapacity, srcSize); - memcpy(dst, src, length); + if (length > 0) { + memcpy(dst, src, length); + } return length; } @@ -4409,7 +4366,8 @@ size_t ZBUFFv07_decompressContinue(ZBUFFv07_DCtx* zbd, if (hSize != 0) { size_t const toLoad = hSize - zbd->lhSize; /* if hSize!=0, hSize > zbd->lhSize */ if (toLoad > (size_t)(iend-ip)) { /* not enough input to load full header */ - memcpy(zbd->headerBuffer + zbd->lhSize, ip, iend-ip); + if (ip != NULL) + memcpy(zbd->headerBuffer + zbd->lhSize, ip, iend-ip); zbd->lhSize += iend-ip; *dstCapacityPtr = 0; return (hSize - zbd->lhSize) + ZSTDv07_blockHeaderSize; /* remaining header bytes + next block header */ @@ -4531,3 +4489,5 @@ size_t ZBUFFv07_decompressContinue(ZBUFFv07_DCtx* zbd, ***************************************/ size_t ZBUFFv07_recommendedDInSize(void) { return ZSTDv07_BLOCKSIZE_ABSOLUTEMAX + ZSTDv07_blockHeaderSize /* block header size*/ ; } size_t ZBUFFv07_recommendedDOutSize(void) { return ZSTDv07_BLOCKSIZE_ABSOLUTEMAX; } + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstd_v07.h b/vendor/github.com/DataDog/zstd/zstd_v07.h index a566c1d..385b10c 100644 --- a/vendor/github.com/DataDog/zstd/zstd_v07.h +++ 
b/vendor/github.com/DataDog/zstd/zstd_v07.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -185,3 +186,5 @@ ZSTDLIBv07_API size_t ZBUFFv07_recommendedDOutSize(void); #endif #endif /* ZSTDv07_H_235446 */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstdmt_compress.c b/vendor/github.com/DataDog/zstd/zstdmt_compress.c index bc3062b..291a838 100644 --- a/vendor/github.com/DataDog/zstd/zstdmt_compress.c +++ b/vendor/github.com/DataDog/zstd/zstdmt_compress.c @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -15,17 +16,13 @@ #endif -/* ====== Constants ====== */ -#define ZSTDMT_OVERLAPLOG_DEFAULT 0 - - /* ====== Dependencies ====== */ -#include /* memcpy, memset */ -#include /* INT_MAX, UINT_MAX */ +#include "allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ +#include "zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset, INT_MAX, UINT_MAX */ #include "mem.h" /* MEM_STATIC */ #include "pool.h" /* threadpool */ #include "threading.h" /* mutex */ -#include "zstd_compress_internal.h" /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */ +#include "zstd_compress_internal.h" /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */ #include "zstd_ldm.h" #include "zstdmt_compress.h" @@ -44,12 +41,13 @@ # include # include -# define DEBUG_PRINTHEX(l,p,n) { \ - unsigned debug_u; \ - for (debug_u=0; debug_u<(n); debug_u++) \ - RAWLOG(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \ - RAWLOG(l, " \n"); \ -} +# define DEBUG_PRINTHEX(l,p,n) \ + do { \ + unsigned debug_u; \ + for (debug_u=0; debug_u<(n); debug_u++) \ + 
RAWLOG(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \ + RAWLOG(l, " \n"); \ + } while (0) static unsigned long long GetCurrentClockTimeMicroseconds(void) { @@ -61,25 +59,28 @@ static unsigned long long GetCurrentClockTimeMicroseconds(void) } } #define MUTEX_WAIT_TIME_DLEVEL 6 -#define ZSTD_PTHREAD_MUTEX_LOCK(mutex) { \ - if (DEBUGLEVEL >= MUTEX_WAIT_TIME_DLEVEL) { \ - unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds(); \ - ZSTD_pthread_mutex_lock(mutex); \ - { unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \ - unsigned long long const elapsedTime = (afterTime-beforeTime); \ - if (elapsedTime > 1000) { /* or whatever threshold you like; I'm using 1 millisecond here */ \ - DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, "Thread took %llu microseconds to acquire mutex %s \n", \ - elapsedTime, #mutex); \ - } } \ - } else { \ - ZSTD_pthread_mutex_lock(mutex); \ - } \ -} +#define ZSTD_PTHREAD_MUTEX_LOCK(mutex) \ + do { \ + if (DEBUGLEVEL >= MUTEX_WAIT_TIME_DLEVEL) { \ + unsigned long long const beforeTime = GetCurrentClockTimeMicroseconds(); \ + ZSTD_pthread_mutex_lock(mutex); \ + { unsigned long long const afterTime = GetCurrentClockTimeMicroseconds(); \ + unsigned long long const elapsedTime = (afterTime-beforeTime); \ + if (elapsedTime > 1000) { \ + /* or whatever threshold you like; I'm using 1 millisecond here */ \ + DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, \ + "Thread took %llu microseconds to acquire mutex %s \n", \ + elapsedTime, #mutex); \ + } } \ + } else { \ + ZSTD_pthread_mutex_lock(mutex); \ + } \ + } while (0) #else # define ZSTD_PTHREAD_MUTEX_LOCK(m) ZSTD_pthread_mutex_lock(m) -# define DEBUG_PRINTHEX(l,p,n) {} +# define DEBUG_PRINTHEX(l,p,n) do { } while (0) #endif @@ -90,9 +91,9 @@ static unsigned long long GetCurrentClockTimeMicroseconds(void) typedef struct buffer_s { void* start; size_t capacity; -} buffer_t; +} Buffer; -static const buffer_t g_nullBuffer = { NULL, 0 }; +static const Buffer g_nullBuffer = { NULL, 0 }; 
typedef struct ZSTDMT_bufferPool_s { ZSTD_pthread_mutex_t poolMutex; @@ -100,17 +101,37 @@ typedef struct ZSTDMT_bufferPool_s { unsigned totalBuffers; unsigned nbBuffers; ZSTD_customMem cMem; - buffer_t bTable[1]; /* variable size */ + Buffer* buffers; } ZSTDMT_bufferPool; -static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned nbWorkers, ZSTD_customMem cMem) +static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool) +{ + DEBUGLOG(3, "ZSTDMT_freeBufferPool (address:%08X)", (U32)(size_t)bufPool); + if (!bufPool) return; /* compatibility with free on NULL */ + if (bufPool->buffers) { + unsigned u; + for (u=0; utotalBuffers; u++) { + DEBUGLOG(4, "free buffer %2u (address:%08X)", u, (U32)(size_t)bufPool->buffers[u].start); + ZSTD_customFree(bufPool->buffers[u].start, bufPool->cMem); + } + ZSTD_customFree(bufPool->buffers, bufPool->cMem); + } + ZSTD_pthread_mutex_destroy(&bufPool->poolMutex); + ZSTD_customFree(bufPool, bufPool->cMem); +} + +static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned maxNbBuffers, ZSTD_customMem cMem) { - unsigned const maxNbBuffers = 2*nbWorkers + 3; - ZSTDMT_bufferPool* const bufPool = (ZSTDMT_bufferPool*)ZSTD_calloc( - sizeof(ZSTDMT_bufferPool) + (maxNbBuffers-1) * sizeof(buffer_t), cMem); + ZSTDMT_bufferPool* const bufPool = + (ZSTDMT_bufferPool*)ZSTD_customCalloc(sizeof(ZSTDMT_bufferPool), cMem); if (bufPool==NULL) return NULL; if (ZSTD_pthread_mutex_init(&bufPool->poolMutex, NULL)) { - ZSTD_free(bufPool, cMem); + ZSTD_customFree(bufPool, cMem); + return NULL; + } + bufPool->buffers = (Buffer*)ZSTD_customCalloc(maxNbBuffers * sizeof(Buffer), cMem); + if (bufPool->buffers==NULL) { + ZSTDMT_freeBufferPool(bufPool); return NULL; } bufPool->bufferSize = 64 KB; @@ -120,32 +141,19 @@ static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned nbWorkers, ZSTD_custo return bufPool; } -static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool) -{ - unsigned u; - DEBUGLOG(3, "ZSTDMT_freeBufferPool (address:%08X)", 
(U32)(size_t)bufPool); - if (!bufPool) return; /* compatibility with free on NULL */ - for (u=0; utotalBuffers; u++) { - DEBUGLOG(4, "free buffer %2u (address:%08X)", u, (U32)(size_t)bufPool->bTable[u].start); - ZSTD_free(bufPool->bTable[u].start, bufPool->cMem); - } - ZSTD_pthread_mutex_destroy(&bufPool->poolMutex); - ZSTD_free(bufPool, bufPool->cMem); -} - /* only works at initialization, not during compression */ static size_t ZSTDMT_sizeof_bufferPool(ZSTDMT_bufferPool* bufPool) { - size_t const poolSize = sizeof(*bufPool) - + (bufPool->totalBuffers - 1) * sizeof(buffer_t); + size_t const poolSize = sizeof(*bufPool); + size_t const arraySize = bufPool->totalBuffers * sizeof(Buffer); unsigned u; size_t totalBufferSize = 0; ZSTD_pthread_mutex_lock(&bufPool->poolMutex); for (u=0; utotalBuffers; u++) - totalBufferSize += bufPool->bTable[u].capacity; + totalBufferSize += bufPool->buffers[u].capacity; ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); - return poolSize + totalBufferSize; + return poolSize + arraySize + totalBufferSize; } /* ZSTDMT_setBufferSize() : @@ -161,9 +169,8 @@ static void ZSTDMT_setBufferSize(ZSTDMT_bufferPool* const bufPool, size_t const } -static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, U32 nbWorkers) +static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, unsigned maxNbBuffers) { - unsigned const maxNbBuffers = 2*nbWorkers + 3; if (srcBufPool==NULL) return NULL; if (srcBufPool->totalBuffers >= maxNbBuffers) /* good enough */ return srcBufPool; @@ -172,7 +179,7 @@ static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, size_t const bSize = srcBufPool->bufferSize; /* forward parameters */ ZSTDMT_bufferPool* newBufPool; ZSTDMT_freeBufferPool(srcBufPool); - newBufPool = ZSTDMT_createBufferPool(nbWorkers, cMem); + newBufPool = ZSTDMT_createBufferPool(maxNbBuffers, cMem); if (newBufPool==NULL) return newBufPool; ZSTDMT_setBufferSize(newBufPool, bSize); return 
newBufPool; @@ -183,15 +190,15 @@ static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, * assumption : bufPool must be valid * @return : a buffer, with start pointer and size * note: allocation may fail, in this case, start==NULL and size==0 */ -static buffer_t ZSTDMT_getBuffer(ZSTDMT_bufferPool* bufPool) +static Buffer ZSTDMT_getBuffer(ZSTDMT_bufferPool* bufPool) { size_t const bSize = bufPool->bufferSize; DEBUGLOG(5, "ZSTDMT_getBuffer: bSize = %u", (U32)bufPool->bufferSize); ZSTD_pthread_mutex_lock(&bufPool->poolMutex); if (bufPool->nbBuffers) { /* try to use an existing buffer */ - buffer_t const buf = bufPool->bTable[--(bufPool->nbBuffers)]; + Buffer const buf = bufPool->buffers[--(bufPool->nbBuffers)]; size_t const availBufferSize = buf.capacity; - bufPool->bTable[bufPool->nbBuffers] = g_nullBuffer; + bufPool->buffers[bufPool->nbBuffers] = g_nullBuffer; if ((availBufferSize >= bSize) & ((availBufferSize>>3) <= bSize)) { /* large enough, but not too much */ DEBUGLOG(5, "ZSTDMT_getBuffer: provide buffer %u of size %u", @@ -201,13 +208,13 @@ static buffer_t ZSTDMT_getBuffer(ZSTDMT_bufferPool* bufPool) } /* size conditions not respected : scratch this buffer, create new one */ DEBUGLOG(5, "ZSTDMT_getBuffer: existing buffer does not meet size conditions => freeing"); - ZSTD_free(buf.start, bufPool->cMem); + ZSTD_customFree(buf.start, bufPool->cMem); } ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); /* create new buffer */ DEBUGLOG(5, "ZSTDMT_getBuffer: create a new buffer"); - { buffer_t buffer; - void* const start = ZSTD_malloc(bSize, bufPool->cMem); + { Buffer buffer; + void* const start = ZSTD_customMalloc(bSize, bufPool->cMem); buffer.start = start; /* note : start can be NULL if malloc fails ! */ buffer.capacity = (start==NULL) ? 0 : bSize; if (start==NULL) { @@ -225,17 +232,17 @@ static buffer_t ZSTDMT_getBuffer(ZSTDMT_bufferPool* bufPool) * @return : a buffer that is at least the buffer pool buffer size. 
* If a reallocation happens, the data in the input buffer is copied. */ -static buffer_t ZSTDMT_resizeBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buffer) +static Buffer ZSTDMT_resizeBuffer(ZSTDMT_bufferPool* bufPool, Buffer buffer) { size_t const bSize = bufPool->bufferSize; if (buffer.capacity < bSize) { - void* const start = ZSTD_malloc(bSize, bufPool->cMem); - buffer_t newBuffer; + void* const start = ZSTD_customMalloc(bSize, bufPool->cMem); + Buffer newBuffer; newBuffer.start = start; newBuffer.capacity = start == NULL ? 0 : bSize; if (start != NULL) { assert(newBuffer.capacity >= buffer.capacity); - memcpy(newBuffer.start, buffer.start, buffer.capacity); + ZSTD_memcpy(newBuffer.start, buffer.start, buffer.capacity); DEBUGLOG(5, "ZSTDMT_resizeBuffer: created buffer of size %u", (U32)bSize); return newBuffer; } @@ -246,28 +253,36 @@ static buffer_t ZSTDMT_resizeBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buffer) #endif /* store buffer for later re-use, up to pool capacity */ -static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buf) +static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, Buffer buf) { DEBUGLOG(5, "ZSTDMT_releaseBuffer"); if (buf.start == NULL) return; /* compatible with release on NULL */ ZSTD_pthread_mutex_lock(&bufPool->poolMutex); if (bufPool->nbBuffers < bufPool->totalBuffers) { - bufPool->bTable[bufPool->nbBuffers++] = buf; /* stored for later use */ + bufPool->buffers[bufPool->nbBuffers++] = buf; /* stored for later use */ DEBUGLOG(5, "ZSTDMT_releaseBuffer: stored buffer of size %u in slot %u", (U32)buf.capacity, (U32)(bufPool->nbBuffers-1)); ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); return; } ZSTD_pthread_mutex_unlock(&bufPool->poolMutex); - /* Reached bufferPool capacity (should not happen) */ + /* Reached bufferPool capacity (note: should not happen) */ DEBUGLOG(5, "ZSTDMT_releaseBuffer: pool capacity reached => freeing "); - ZSTD_free(buf.start, bufPool->cMem); + ZSTD_customFree(buf.start, bufPool->cMem); } 
+/* We need 2 output buffers per worker since each dstBuff must be flushed after it is released. + * The 3 additional buffers are as follows: + * 1 buffer for input loading + * 1 buffer for "next input" when submitting current one + * 1 buffer stuck in queue */ +#define BUF_POOL_MAX_NB_BUFFERS(nbWorkers) (2*(nbWorkers) + 3) -/* ===== Seq Pool Wrapper ====== */ +/* After a worker releases its rawSeqStore, it is immediately ready for reuse. + * So we only need one seq buffer per worker. */ +#define SEQ_POOL_MAX_NB_BUFFERS(nbWorkers) (nbWorkers) -static rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0}; +/* ===== Seq Pool Wrapper ====== */ typedef ZSTDMT_bufferPool ZSTDMT_seqPool; @@ -276,23 +291,23 @@ static size_t ZSTDMT_sizeof_seqPool(ZSTDMT_seqPool* seqPool) return ZSTDMT_sizeof_bufferPool(seqPool); } -static rawSeqStore_t bufferToSeq(buffer_t buffer) +static RawSeqStore_t bufferToSeq(Buffer buffer) { - rawSeqStore_t seq = {NULL, 0, 0, 0}; + RawSeqStore_t seq = kNullRawSeqStore; seq.seq = (rawSeq*)buffer.start; seq.capacity = buffer.capacity / sizeof(rawSeq); return seq; } -static buffer_t seqToBuffer(rawSeqStore_t seq) +static Buffer seqToBuffer(RawSeqStore_t seq) { - buffer_t buffer; + Buffer buffer; buffer.start = seq.seq; buffer.capacity = seq.capacity * sizeof(rawSeq); return buffer; } -static rawSeqStore_t ZSTDMT_getSeq(ZSTDMT_seqPool* seqPool) +static RawSeqStore_t ZSTDMT_getSeq(ZSTDMT_seqPool* seqPool) { if (seqPool->bufferSize == 0) { return kNullRawSeqStore; @@ -301,13 +316,13 @@ static rawSeqStore_t ZSTDMT_getSeq(ZSTDMT_seqPool* seqPool) } #if ZSTD_RESIZE_SEQPOOL -static rawSeqStore_t ZSTDMT_resizeSeq(ZSTDMT_seqPool* seqPool, rawSeqStore_t seq) +static RawSeqStore_t ZSTDMT_resizeSeq(ZSTDMT_seqPool* seqPool, RawSeqStore_t seq) { return bufferToSeq(ZSTDMT_resizeBuffer(seqPool, seqToBuffer(seq))); } #endif -static void ZSTDMT_releaseSeq(ZSTDMT_seqPool* seqPool, rawSeqStore_t seq) +static void ZSTDMT_releaseSeq(ZSTDMT_seqPool* seqPool, RawSeqStore_t seq) { 
ZSTDMT_releaseBuffer(seqPool, seqToBuffer(seq)); } @@ -319,7 +334,7 @@ static void ZSTDMT_setNbSeq(ZSTDMT_seqPool* const seqPool, size_t const nbSeq) static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem) { - ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem); + ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(SEQ_POOL_MAX_NB_BUFFERS(nbWorkers), cMem); if (seqPool == NULL) return NULL; ZSTDMT_setNbSeq(seqPool, 0); return seqPool; @@ -332,7 +347,7 @@ static void ZSTDMT_freeSeqPool(ZSTDMT_seqPool* seqPool) static ZSTDMT_seqPool* ZSTDMT_expandSeqPool(ZSTDMT_seqPool* pool, U32 nbWorkers) { - return ZSTDMT_expandBufferPool(pool, nbWorkers); + return ZSTDMT_expandBufferPool(pool, SEQ_POOL_MAX_NB_BUFFERS(nbWorkers)); } @@ -344,17 +359,21 @@ typedef struct { int totalCCtx; int availCCtx; ZSTD_customMem cMem; - ZSTD_CCtx* cctx[1]; /* variable size */ + ZSTD_CCtx** cctxs; } ZSTDMT_CCtxPool; -/* note : all CCtx borrowed from the pool should be released back to the pool _before_ freeing the pool */ +/* note : all CCtx borrowed from the pool must be reverted back to the pool _before_ freeing the pool */ static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool) { - int cid; - for (cid=0; cidtotalCCtx; cid++) - ZSTD_freeCCtx(pool->cctx[cid]); /* note : compatible with free on NULL */ + if (!pool) return; ZSTD_pthread_mutex_destroy(&pool->poolMutex); - ZSTD_free(pool, pool->cMem); + if (pool->cctxs) { + int cid; + for (cid=0; cidtotalCCtx; cid++) + ZSTD_freeCCtx(pool->cctxs[cid]); /* free compatible with NULL */ + ZSTD_customFree(pool->cctxs, pool->cMem); + } + ZSTD_customFree(pool, pool->cMem); } /* ZSTDMT_createCCtxPool() : @@ -362,19 +381,24 @@ static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool) static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(int nbWorkers, ZSTD_customMem cMem) { - ZSTDMT_CCtxPool* const cctxPool = (ZSTDMT_CCtxPool*) ZSTD_calloc( - sizeof(ZSTDMT_CCtxPool) + (nbWorkers-1)*sizeof(ZSTD_CCtx*), cMem); + 
ZSTDMT_CCtxPool* const cctxPool = + (ZSTDMT_CCtxPool*) ZSTD_customCalloc(sizeof(ZSTDMT_CCtxPool), cMem); assert(nbWorkers > 0); if (!cctxPool) return NULL; if (ZSTD_pthread_mutex_init(&cctxPool->poolMutex, NULL)) { - ZSTD_free(cctxPool, cMem); + ZSTD_customFree(cctxPool, cMem); return NULL; } - cctxPool->cMem = cMem; cctxPool->totalCCtx = nbWorkers; + cctxPool->cctxs = (ZSTD_CCtx**)ZSTD_customCalloc(nbWorkers * sizeof(ZSTD_CCtx*), cMem); + if (!cctxPool->cctxs) { + ZSTDMT_freeCCtxPool(cctxPool); + return NULL; + } + cctxPool->cMem = cMem; + cctxPool->cctxs[0] = ZSTD_createCCtx_advanced(cMem); + if (!cctxPool->cctxs[0]) { ZSTDMT_freeCCtxPool(cctxPool); return NULL; } cctxPool->availCCtx = 1; /* at least one cctx for single-thread mode */ - cctxPool->cctx[0] = ZSTD_createCCtx_advanced(cMem); - if (!cctxPool->cctx[0]) { ZSTDMT_freeCCtxPool(cctxPool); return NULL; } DEBUGLOG(3, "cctxPool created, with %u workers", nbWorkers); return cctxPool; } @@ -396,16 +420,16 @@ static size_t ZSTDMT_sizeof_CCtxPool(ZSTDMT_CCtxPool* cctxPool) { ZSTD_pthread_mutex_lock(&cctxPool->poolMutex); { unsigned const nbWorkers = cctxPool->totalCCtx; - size_t const poolSize = sizeof(*cctxPool) - + (nbWorkers-1) * sizeof(ZSTD_CCtx*); - unsigned u; + size_t const poolSize = sizeof(*cctxPool); + size_t const arraySize = cctxPool->totalCCtx * sizeof(ZSTD_CCtx*); size_t totalCCtxSize = 0; + unsigned u; for (u=0; ucctx[u]); + totalCCtxSize += ZSTD_sizeof_CCtx(cctxPool->cctxs[u]); } ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex); assert(nbWorkers > 0); - return poolSize + totalCCtxSize; + return poolSize + arraySize + totalCCtxSize; } } @@ -415,7 +439,7 @@ static ZSTD_CCtx* ZSTDMT_getCCtx(ZSTDMT_CCtxPool* cctxPool) ZSTD_pthread_mutex_lock(&cctxPool->poolMutex); if (cctxPool->availCCtx) { cctxPool->availCCtx--; - { ZSTD_CCtx* const cctx = cctxPool->cctx[cctxPool->availCCtx]; + { ZSTD_CCtx* const cctx = cctxPool->cctxs[cctxPool->availCCtx]; ZSTD_pthread_mutex_unlock(&cctxPool->poolMutex); return cctx; 
} } @@ -429,7 +453,7 @@ static void ZSTDMT_releaseCCtx(ZSTDMT_CCtxPool* pool, ZSTD_CCtx* cctx) if (cctx==NULL) return; /* compatibility with release on NULL */ ZSTD_pthread_mutex_lock(&pool->poolMutex); if (pool->availCCtx < pool->totalCCtx) - pool->cctx[pool->availCCtx++] = cctx; + pool->cctxs[pool->availCCtx++] = cctx; else { /* pool overflow : should not happen, since totalCCtx==nbWorkers */ DEBUGLOG(4, "CCtx pool overflow : free cctx"); @@ -443,7 +467,7 @@ static void ZSTDMT_releaseCCtx(ZSTDMT_CCtxPool* pool, ZSTD_CCtx* cctx) typedef struct { void const* start; size_t size; -} range_t; +} Range; typedef struct { /* All variables in the struct are protected by mutex. */ @@ -459,63 +483,83 @@ typedef struct { ZSTD_pthread_mutex_t ldmWindowMutex; ZSTD_pthread_cond_t ldmWindowCond; /* Signaled when ldmWindow is updated */ ZSTD_window_t ldmWindow; /* A thread-safe copy of ldmState.window */ -} serialState_t; +} SerialState; -static int ZSTDMT_serialState_reset(serialState_t* serialState, ZSTDMT_seqPool* seqPool, ZSTD_CCtx_params params, size_t jobSize) +static int +ZSTDMT_serialState_reset(SerialState* serialState, + ZSTDMT_seqPool* seqPool, + ZSTD_CCtx_params params, + size_t jobSize, + const void* dict, size_t const dictSize, + ZSTD_dictContentType_e dictContentType) { /* Adjust parameters */ - if (params.ldmParams.enableLdm) { + if (params.ldmParams.enableLdm == ZSTD_ps_enable) { DEBUGLOG(4, "LDM window size = %u KB", (1U << params.cParams.windowLog) >> 10); ZSTD_ldm_adjustParameters(¶ms.ldmParams, ¶ms.cParams); assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog); assert(params.ldmParams.hashRateLog < 32); - serialState->ldmState.hashPower = - ZSTD_rollingHash_primePower(params.ldmParams.minMatchLength); } else { - memset(¶ms.ldmParams, 0, sizeof(params.ldmParams)); + ZSTD_memset(¶ms.ldmParams, 0, sizeof(params.ldmParams)); } serialState->nextJobID = 0; if (params.fParams.checksumFlag) XXH64_reset(&serialState->xxhState, 0); - if 
(params.ldmParams.enableLdm) { + if (params.ldmParams.enableLdm == ZSTD_ps_enable) { ZSTD_customMem cMem = params.customMem; unsigned const hashLog = params.ldmParams.hashLog; size_t const hashSize = ((size_t)1 << hashLog) * sizeof(ldmEntry_t); unsigned const bucketLog = params.ldmParams.hashLog - params.ldmParams.bucketSizeLog; - size_t const bucketSize = (size_t)1 << bucketLog; unsigned const prevBucketLog = serialState->params.ldmParams.hashLog - serialState->params.ldmParams.bucketSizeLog; + size_t const numBuckets = (size_t)1 << bucketLog; /* Size the seq pool tables */ ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, jobSize)); /* Reset the window */ - ZSTD_window_clear(&serialState->ldmState.window); - serialState->ldmWindow = serialState->ldmState.window; + ZSTD_window_init(&serialState->ldmState.window); /* Resize tables and output space if necessary. */ if (serialState->ldmState.hashTable == NULL || serialState->params.ldmParams.hashLog < hashLog) { - ZSTD_free(serialState->ldmState.hashTable, cMem); - serialState->ldmState.hashTable = (ldmEntry_t*)ZSTD_malloc(hashSize, cMem); + ZSTD_customFree(serialState->ldmState.hashTable, cMem); + serialState->ldmState.hashTable = (ldmEntry_t*)ZSTD_customMalloc(hashSize, cMem); } if (serialState->ldmState.bucketOffsets == NULL || prevBucketLog < bucketLog) { - ZSTD_free(serialState->ldmState.bucketOffsets, cMem); - serialState->ldmState.bucketOffsets = (BYTE*)ZSTD_malloc(bucketSize, cMem); + ZSTD_customFree(serialState->ldmState.bucketOffsets, cMem); + serialState->ldmState.bucketOffsets = (BYTE*)ZSTD_customMalloc(numBuckets, cMem); } if (!serialState->ldmState.hashTable || !serialState->ldmState.bucketOffsets) return 1; /* Zero the tables */ - memset(serialState->ldmState.hashTable, 0, hashSize); - memset(serialState->ldmState.bucketOffsets, 0, bucketSize); + ZSTD_memset(serialState->ldmState.hashTable, 0, hashSize); + ZSTD_memset(serialState->ldmState.bucketOffsets, 0, numBuckets); + + /* Update 
window state and fill hash table with dict */ + serialState->ldmState.loadedDictEnd = 0; + if (dictSize > 0) { + if (dictContentType == ZSTD_dct_rawContent) { + BYTE const* const dictEnd = (const BYTE*)dict + dictSize; + ZSTD_window_update(&serialState->ldmState.window, dict, dictSize, /* forceNonContiguous */ 0); + ZSTD_ldm_fillHashTable(&serialState->ldmState, (const BYTE*)dict, dictEnd, ¶ms.ldmParams); + serialState->ldmState.loadedDictEnd = params.forceWindow ? 0 : (U32)(dictEnd - serialState->ldmState.window.base); + } else { + /* don't even load anything */ + } + } + + /* Initialize serialState's copy of ldmWindow. */ + serialState->ldmWindow = serialState->ldmState.window; } + serialState->params = params; serialState->params.jobSize = (U32)jobSize; return 0; } -static int ZSTDMT_serialState_init(serialState_t* serialState) +static int ZSTDMT_serialState_init(SerialState* serialState) { int initError = 0; - memset(serialState, 0, sizeof(*serialState)); + ZSTD_memset(serialState, 0, sizeof(*serialState)); initError |= ZSTD_pthread_mutex_init(&serialState->mutex, NULL); initError |= ZSTD_pthread_cond_init(&serialState->cond, NULL); initError |= ZSTD_pthread_mutex_init(&serialState->ldmWindowMutex, NULL); @@ -523,20 +567,21 @@ static int ZSTDMT_serialState_init(serialState_t* serialState) return initError; } -static void ZSTDMT_serialState_free(serialState_t* serialState) +static void ZSTDMT_serialState_free(SerialState* serialState) { ZSTD_customMem cMem = serialState->params.customMem; ZSTD_pthread_mutex_destroy(&serialState->mutex); ZSTD_pthread_cond_destroy(&serialState->cond); ZSTD_pthread_mutex_destroy(&serialState->ldmWindowMutex); ZSTD_pthread_cond_destroy(&serialState->ldmWindowCond); - ZSTD_free(serialState->ldmState.hashTable, cMem); - ZSTD_free(serialState->ldmState.bucketOffsets, cMem); + ZSTD_customFree(serialState->ldmState.hashTable, cMem); + ZSTD_customFree(serialState->ldmState.bucketOffsets, cMem); } -static void 
ZSTDMT_serialState_update(serialState_t* serialState, - ZSTD_CCtx* jobCCtx, rawSeqStore_t seqStore, - range_t src, unsigned jobID) +static void +ZSTDMT_serialState_genSequences(SerialState* serialState, + RawSeqStore_t* seqStore, + Range src, unsigned jobID) { /* Wait for our turn */ ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex); @@ -547,14 +592,15 @@ static void ZSTDMT_serialState_update(serialState_t* serialState, /* A future job may error and skip our job */ if (serialState->nextJobID == jobID) { /* It is now our turn, do any processing necessary */ - if (serialState->params.ldmParams.enableLdm) { + if (serialState->params.ldmParams.enableLdm == ZSTD_ps_enable) { size_t error; - assert(seqStore.seq != NULL && seqStore.pos == 0 && - seqStore.size == 0 && seqStore.capacity > 0); + DEBUGLOG(6, "ZSTDMT_serialState_genSequences: LDM update"); + assert(seqStore->seq != NULL && seqStore->pos == 0 && + seqStore->size == 0 && seqStore->capacity > 0); assert(src.size <= serialState->params.jobSize); - ZSTD_window_update(&serialState->ldmState.window, src.start, src.size); + ZSTD_window_update(&serialState->ldmState.window, src.start, src.size, /* forceNonContiguous */ 0); error = ZSTD_ldm_generateSequences( - &serialState->ldmState, &seqStore, + &serialState->ldmState, seqStore, &serialState->params.ldmParams, src.start, src.size); /* We provide a large enough buffer to never fail. 
*/ assert(!ZSTD_isError(error)); (void)error; @@ -573,17 +619,22 @@ static void ZSTDMT_serialState_update(serialState_t* serialState, serialState->nextJobID++; ZSTD_pthread_cond_broadcast(&serialState->cond); ZSTD_pthread_mutex_unlock(&serialState->mutex); +} - if (seqStore.size > 0) { - size_t const err = ZSTD_referenceExternalSequences( - jobCCtx, seqStore.seq, seqStore.size); - assert(serialState->params.ldmParams.enableLdm); - assert(!ZSTD_isError(err)); - (void)err; +static void +ZSTDMT_serialState_applySequences(const SerialState* serialState, /* just for an assert() check */ + ZSTD_CCtx* jobCCtx, + const RawSeqStore_t* seqStore) +{ + if (seqStore->size > 0) { + DEBUGLOG(5, "ZSTDMT_serialState_applySequences: uploading %u external sequences", (unsigned)seqStore->size); + assert(serialState->params.ldmParams.enableLdm == ZSTD_ps_enable); (void)serialState; + assert(jobCCtx); + ZSTD_referenceExternalSequences(jobCCtx, seqStore->seq, seqStore->size); } } -static void ZSTDMT_serialState_ensureFinished(serialState_t* serialState, +static void ZSTDMT_serialState_ensureFinished(SerialState* serialState, unsigned jobID, size_t cSize) { ZSTD_PTHREAD_MUTEX_LOCK(&serialState->mutex); @@ -607,36 +658,37 @@ static void ZSTDMT_serialState_ensureFinished(serialState_t* serialState, /* ===== Worker thread ===== */ /* ------------------------------------------ */ -static const range_t kNullRange = { NULL, 0 }; +static const Range kNullRange = { NULL, 0 }; typedef struct { - size_t consumed; /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx */ - size_t cSize; /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx, then set0 by mtctx */ - ZSTD_pthread_mutex_t job_mutex; /* Thread-safe - used by mtctx and worker */ - ZSTD_pthread_cond_t job_cond; /* Thread-safe - used by mtctx and worker */ - ZSTDMT_CCtxPool* cctxPool; /* Thread-safe - used by mtctx and (all) workers */ - ZSTDMT_bufferPool* bufPool; /* Thread-safe - used by mtctx and (all) 
workers */ - ZSTDMT_seqPool* seqPool; /* Thread-safe - used by mtctx and (all) workers */ - serialState_t* serial; /* Thread-safe - used by mtctx and (all) workers */ - buffer_t dstBuff; /* set by worker (or mtctx), then read by worker & mtctx, then modified by mtctx => no barrier */ - range_t prefix; /* set by mtctx, then read by worker & mtctx => no barrier */ - range_t src; /* set by mtctx, then read by worker & mtctx => no barrier */ - unsigned jobID; /* set by mtctx, then read by worker => no barrier */ - unsigned firstJob; /* set by mtctx, then read by worker => no barrier */ - unsigned lastJob; /* set by mtctx, then read by worker => no barrier */ - ZSTD_CCtx_params params; /* set by mtctx, then read by worker => no barrier */ - const ZSTD_CDict* cdict; /* set by mtctx, then read by worker => no barrier */ - unsigned long long fullFrameSize; /* set by mtctx, then read by worker => no barrier */ - size_t dstFlushed; /* used only by mtctx */ - unsigned frameChecksumNeeded; /* used only by mtctx */ + size_t consumed; /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx */ + size_t cSize; /* SHARED - set0 by mtctx, then modified by worker AND read by mtctx, then set0 by mtctx */ + ZSTD_pthread_mutex_t job_mutex; /* Thread-safe - used by mtctx and worker */ + ZSTD_pthread_cond_t job_cond; /* Thread-safe - used by mtctx and worker */ + ZSTDMT_CCtxPool* cctxPool; /* Thread-safe - used by mtctx and (all) workers */ + ZSTDMT_bufferPool* bufPool; /* Thread-safe - used by mtctx and (all) workers */ + ZSTDMT_seqPool* seqPool; /* Thread-safe - used by mtctx and (all) workers */ + SerialState* serial; /* Thread-safe - used by mtctx and (all) workers */ + Buffer dstBuff; /* set by worker (or mtctx), then read by worker & mtctx, then modified by mtctx => no barrier */ + Range prefix; /* set by mtctx, then read by worker & mtctx => no barrier */ + Range src; /* set by mtctx, then read by worker & mtctx => no barrier */ + unsigned jobID; /* set by mtctx, then 
read by worker => no barrier */ + unsigned firstJob; /* set by mtctx, then read by worker => no barrier */ + unsigned lastJob; /* set by mtctx, then read by worker => no barrier */ + ZSTD_CCtx_params params; /* set by mtctx, then read by worker => no barrier */ + const ZSTD_CDict* cdict; /* set by mtctx, then read by worker => no barrier */ + unsigned long long fullFrameSize; /* set by mtctx, then read by worker => no barrier */ + size_t dstFlushed; /* used only by mtctx */ + unsigned frameChecksumNeeded; /* used only by mtctx */ } ZSTDMT_jobDescription; -#define JOB_ERROR(e) { \ - ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex); \ - job->cSize = e; \ - ZSTD_pthread_mutex_unlock(&job->job_mutex); \ - goto _endJob; \ -} +#define JOB_ERROR(e) \ + do { \ + ZSTD_PTHREAD_MUTEX_LOCK(&job->job_mutex); \ + job->cSize = e; \ + ZSTD_pthread_mutex_unlock(&job->job_mutex); \ + goto _endJob; \ + } while (0) /* ZSTDMT_compressionJob() is a POOL_function type */ static void ZSTDMT_compressionJob(void* jobDescription) @@ -644,10 +696,11 @@ static void ZSTDMT_compressionJob(void* jobDescription) ZSTDMT_jobDescription* const job = (ZSTDMT_jobDescription*)jobDescription; ZSTD_CCtx_params jobParams = job->params; /* do not modify job->params ! 
copy it, modify the copy */ ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(job->cctxPool); - rawSeqStore_t rawSeqStore = ZSTDMT_getSeq(job->seqPool); - buffer_t dstBuff = job->dstBuff; + RawSeqStore_t rawSeqStore = ZSTDMT_getSeq(job->seqPool); + Buffer dstBuff = job->dstBuff; size_t lastCBlockSize = 0; + DEBUGLOG(5, "ZSTDMT_compressionJob: job %u", job->jobID); /* resources */ if (cctx==NULL) JOB_ERROR(ERROR(memory_allocation)); if (dstBuff.start == NULL) { /* streaming job : doesn't provide a dstBuffer */ @@ -655,7 +708,7 @@ static void ZSTDMT_compressionJob(void* jobDescription) if (dstBuff.start==NULL) JOB_ERROR(ERROR(memory_allocation)); job->dstBuff = dstBuff; /* this value can be read in ZSTDMT_flush, when it copies the whole job */ } - if (jobParams.ldmParams.enableLdm && rawSeqStore.seq == NULL) + if (jobParams.ldmParams.enableLdm == ZSTD_ps_enable && rawSeqStore.seq == NULL) JOB_ERROR(ERROR(memory_allocation)); /* Don't compute the checksum for chunks, since we compute it externally, @@ -663,38 +716,49 @@ static void ZSTDMT_compressionJob(void* jobDescription) */ if (job->jobID != 0) jobParams.fParams.checksumFlag = 0; /* Don't run LDM for the chunks, since we handle it externally */ - jobParams.ldmParams.enableLdm = 0; + jobParams.ldmParams.enableLdm = ZSTD_ps_disable; + /* Correct nbWorkers to 0. */ + jobParams.nbWorkers = 0; /* init */ + + /* Perform serial step as early as possible */ + ZSTDMT_serialState_genSequences(job->serial, &rawSeqStore, job->src, job->jobID); + if (job->cdict) { size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, job->cdict, &jobParams, job->fullFrameSize); assert(job->firstJob); /* only allowed for first job */ if (ZSTD_isError(initError)) JOB_ERROR(initError); - } else { /* srcStart points at reloaded section */ + } else { U64 const pledgedSrcSize = job->firstJob ? 
job->fullFrameSize : job->src.size; { size_t const forceWindowError = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_forceMaxWindow, !job->firstJob); if (ZSTD_isError(forceWindowError)) JOB_ERROR(forceWindowError); } + if (!job->firstJob) { + size_t const err = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_deterministicRefPrefix, 0); + if (ZSTD_isError(err)) JOB_ERROR(err); + } + DEBUGLOG(6, "ZSTDMT_compressionJob: job %u: loading prefix of size %zu", job->jobID, job->prefix.size); { size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, - job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, /* load dictionary in "content-only" mode (no header analysis) */ + job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, ZSTD_dtlm_fast, NULL, /*cdict*/ &jobParams, pledgedSrcSize); if (ZSTD_isError(initError)) JOB_ERROR(initError); } } - /* Perform serial step as early as possible, but after CCtx initialization */ - ZSTDMT_serialState_update(job->serial, cctx, rawSeqStore, job->src, job->jobID); + /* External Sequences can only be applied after CCtx initialization */ + ZSTDMT_serialState_applySequences(job->serial, cctx, &rawSeqStore); if (!job->firstJob) { /* flush and overwrite frame header when it's not first job */ - size_t const hSize = ZSTD_compressContinue(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0); + size_t const hSize = ZSTD_compressContinue_public(cctx, dstBuff.start, dstBuff.capacity, job->src.start, 0); if (ZSTD_isError(hSize)) JOB_ERROR(hSize); DEBUGLOG(5, "ZSTDMT_compressionJob: flush and overwrite %u bytes of frame header (not first job)", (U32)hSize); ZSTD_invalidateRepCodes(cctx); } - /* compress */ + /* compress the entire job by smaller chunks, for better granularity */ { size_t const chunkSize = 4*ZSTD_BLOCKSIZE_MAX; int const nbChunks = (int)((job->src.size + (chunkSize-1)) / chunkSize); const BYTE* ip = (const BYTE*) job->src.start; @@ -706,7 +770,7 @@ static void ZSTDMT_compressionJob(void* jobDescription) 
DEBUGLOG(5, "ZSTDMT_compressionJob: compress %u bytes in %i blocks", (U32)job->src.size, nbChunks); assert(job->cSize == 0); for (chunkNb = 1; chunkNb < nbChunks; chunkNb++) { - size_t const cSize = ZSTD_compressContinue(cctx, op, oend-op, ip, chunkSize); + size_t const cSize = ZSTD_compressContinue_public(cctx, op, oend-op, ip, chunkSize); if (ZSTD_isError(cSize)) JOB_ERROR(cSize); ip += chunkSize; op += cSize; assert(op < oend); @@ -726,11 +790,18 @@ static void ZSTDMT_compressionJob(void* jobDescription) size_t const lastBlockSize1 = job->src.size & (chunkSize-1); size_t const lastBlockSize = ((lastBlockSize1==0) & (job->src.size>=chunkSize)) ? chunkSize : lastBlockSize1; size_t const cSize = (job->lastJob) ? - ZSTD_compressEnd (cctx, op, oend-op, ip, lastBlockSize) : - ZSTD_compressContinue(cctx, op, oend-op, ip, lastBlockSize); + ZSTD_compressEnd_public(cctx, op, oend-op, ip, lastBlockSize) : + ZSTD_compressContinue_public(cctx, op, oend-op, ip, lastBlockSize); if (ZSTD_isError(cSize)) JOB_ERROR(cSize); lastCBlockSize = cSize; } } + if (!job->firstJob) { + /* Double check that we don't have an ext-dict, because then our + * repcode invalidation doesn't work. + */ + assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window)); + } + ZSTD_CCtx_trace(cctx, 0); _endJob: ZSTDMT_serialState_ensureFinished(job->serial, job->jobID, job->cSize); @@ -755,10 +826,10 @@ static void ZSTDMT_compressionJob(void* jobDescription) /* ------------------------------------------ */ typedef struct { - range_t prefix; /* read-only non-owned prefix buffer */ - buffer_t buffer; + Range prefix; /* read-only non-owned prefix buffer */ + Buffer buffer; size_t filled; -} inBuff_t; +} InBuff_t; typedef struct { BYTE* buffer; /* The round input buffer. All jobs get references @@ -772,17 +843,26 @@ typedef struct { * the inBuff is sent to the worker thread. * pos <= capacity. 
*/ -} roundBuff_t; +} RoundBuff_t; -static const roundBuff_t kNullRoundBuff = {NULL, 0, 0}; +static const RoundBuff_t kNullRoundBuff = {NULL, 0, 0}; #define RSYNC_LENGTH 32 +/* Don't create chunks smaller than the zstd block size. + * This stops us from regressing compression ratio too much, + * and ensures our output fits in ZSTD_compressBound(). + * + * If this is shrunk < ZSTD_BLOCKSIZELOG_MIN then + * ZSTD_COMPRESSBOUND() will need to be updated. + */ +#define RSYNC_MIN_BLOCK_LOG ZSTD_BLOCKSIZELOG_MAX +#define RSYNC_MIN_BLOCK_SIZE (1< one job is already prepared, but pool has shortage of workers. Don't create a new job. */ - inBuff_t inBuff; - roundBuff_t roundBuff; - serialState_t serial; - rsyncState_t rsync; - unsigned singleBlockingThread; + InBuff_t inBuff; + RoundBuff_t roundBuff; + SerialState serial; + RSyncState_t rsync; unsigned jobIDMask; unsigned doneJobID; unsigned nextJobID; @@ -810,6 +889,7 @@ struct ZSTDMT_CCtx_s { ZSTD_customMem cMem; ZSTD_CDict* cdictLocal; const ZSTD_CDict* cdict; + unsigned providedFactory: 1; }; static void ZSTDMT_freeJobsTable(ZSTDMT_jobDescription* jobTable, U32 nbJobs, ZSTD_customMem cMem) @@ -820,7 +900,7 @@ static void ZSTDMT_freeJobsTable(ZSTDMT_jobDescription* jobTable, U32 nbJobs, ZS ZSTD_pthread_mutex_destroy(&jobTable[jobNb].job_mutex); ZSTD_pthread_cond_destroy(&jobTable[jobNb].job_cond); } - ZSTD_free(jobTable, cMem); + ZSTD_customFree(jobTable, cMem); } /* ZSTDMT_allocJobsTable() @@ -832,7 +912,7 @@ static ZSTDMT_jobDescription* ZSTDMT_createJobsTable(U32* nbJobsPtr, ZSTD_custom U32 const nbJobs = 1 << nbJobsLog2; U32 jobNb; ZSTDMT_jobDescription* const jobTable = (ZSTDMT_jobDescription*) - ZSTD_calloc(nbJobs * sizeof(ZSTDMT_jobDescription), cMem); + ZSTD_customCalloc(nbJobs * sizeof(ZSTDMT_jobDescription), cMem); int initError = 0; if (jobTable==NULL) return NULL; *nbJobsPtr = nbJobs; @@ -863,12 +943,12 @@ static size_t ZSTDMT_expandJobsTable (ZSTDMT_CCtx* mtctx, U32 nbWorkers) { /* 
ZSTDMT_CCtxParam_setNbWorkers(): * Internal use only */ -size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers) +static size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers) { return ZSTD_CCtxParams_setParameter(params, ZSTD_c_nbWorkers, (int)nbWorkers); } -MEM_STATIC ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced_internal(unsigned nbWorkers, ZSTD_customMem cMem) +MEM_STATIC ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced_internal(unsigned nbWorkers, ZSTD_customMem cMem, ZSTD_threadPool* pool) { ZSTDMT_CCtx* mtctx; U32 nbJobs = nbWorkers + 2; @@ -881,16 +961,23 @@ MEM_STATIC ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced_internal(unsigned nbWorkers, /* invalid custom allocator */ return NULL; - mtctx = (ZSTDMT_CCtx*) ZSTD_calloc(sizeof(ZSTDMT_CCtx), cMem); + mtctx = (ZSTDMT_CCtx*) ZSTD_customCalloc(sizeof(ZSTDMT_CCtx), cMem); if (!mtctx) return NULL; ZSTDMT_CCtxParam_setNbWorkers(&mtctx->params, nbWorkers); mtctx->cMem = cMem; mtctx->allJobsCompleted = 1; - mtctx->factory = POOL_create_advanced(nbWorkers, 0, cMem); + if (pool != NULL) { + mtctx->factory = pool; + mtctx->providedFactory = 1; + } + else { + mtctx->factory = POOL_create_advanced(nbWorkers, 0, cMem); + mtctx->providedFactory = 0; + } mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, cMem); assert(nbJobs > 0); assert((nbJobs & (nbJobs - 1)) == 0); /* ensure nbJobs is a power of 2 */ mtctx->jobIDMask = nbJobs - 1; - mtctx->bufPool = ZSTDMT_createBufferPool(nbWorkers, cMem); + mtctx->bufPool = ZSTDMT_createBufferPool(BUF_POOL_MAX_NB_BUFFERS(nbWorkers), cMem); mtctx->cctxPool = ZSTDMT_createCCtxPool(nbWorkers, cMem); mtctx->seqPool = ZSTDMT_createSeqPool(nbWorkers, cMem); initError = ZSTDMT_serialState_init(&mtctx->serial); @@ -903,22 +990,18 @@ MEM_STATIC ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced_internal(unsigned nbWorkers, return mtctx; } -ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, ZSTD_customMem cMem) +ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned 
nbWorkers, ZSTD_customMem cMem, ZSTD_threadPool* pool) { #ifdef ZSTD_MULTITHREAD - return ZSTDMT_createCCtx_advanced_internal(nbWorkers, cMem); + return ZSTDMT_createCCtx_advanced_internal(nbWorkers, cMem, pool); #else (void)nbWorkers; (void)cMem; + (void)pool; return NULL; #endif } -ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers) -{ - return ZSTDMT_createCCtx_advanced(nbWorkers, ZSTD_defaultCMem); -} - /* ZSTDMT_releaseAllJobResources() : * note : ensure all workers are killed first ! */ @@ -935,7 +1018,7 @@ static void ZSTDMT_releaseAllJobResources(ZSTDMT_CCtx* mtctx) ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[jobID].dstBuff); /* Clear the job description, but keep the mutex/cond */ - memset(&mtctx->jobs[jobID], 0, sizeof(mtctx->jobs[jobID])); + ZSTD_memset(&mtctx->jobs[jobID], 0, sizeof(mtctx->jobs[jobID])); mtctx->jobs[jobID].job_mutex = mutex; mtctx->jobs[jobID].job_cond = cond; } @@ -962,7 +1045,8 @@ static void ZSTDMT_waitForAllJobsCompleted(ZSTDMT_CCtx* mtctx) size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx) { if (mtctx==NULL) return 0; /* compatible with free on NULL */ - POOL_free(mtctx->factory); /* stop and free worker threads */ + if (!mtctx->providedFactory) + POOL_free(mtctx->factory); /* stop and free worker threads */ ZSTDMT_releaseAllJobResources(mtctx); /* release job resources into pools first */ ZSTDMT_freeJobsTable(mtctx->jobs, mtctx->jobIDMask+1, mtctx->cMem); ZSTDMT_freeBufferPool(mtctx->bufPool); @@ -971,8 +1055,8 @@ size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx) ZSTDMT_serialState_free(&mtctx->serial); ZSTD_freeCDict(mtctx->cdictLocal); if (mtctx->roundBuff.buffer) - ZSTD_free(mtctx->roundBuff.buffer, mtctx->cMem); - ZSTD_free(mtctx, mtctx->cMem); + ZSTD_customFree(mtctx->roundBuff.buffer, mtctx->cMem); + ZSTD_customFree(mtctx, mtctx->cMem); return 0; } @@ -989,73 +1073,14 @@ size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx) + mtctx->roundBuff.capacity; } -/* Internal only */ -size_t -ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, 
- ZSTDMT_parameter parameter, - int value) -{ - DEBUGLOG(4, "ZSTDMT_CCtxParam_setMTCtxParameter"); - switch(parameter) - { - case ZSTDMT_p_jobSize : - DEBUGLOG(4, "ZSTDMT_CCtxParam_setMTCtxParameter : set jobSize to %i", value); - return ZSTD_CCtxParams_setParameter(params, ZSTD_c_jobSize, value); - case ZSTDMT_p_overlapLog : - DEBUGLOG(4, "ZSTDMT_p_overlapLog : %i", value); - return ZSTD_CCtxParams_setParameter(params, ZSTD_c_overlapLog, value); - case ZSTDMT_p_rsyncable : - DEBUGLOG(4, "ZSTD_p_rsyncable : %i", value); - return ZSTD_CCtxParams_setParameter(params, ZSTD_c_rsyncable, value); - default : - return ERROR(parameter_unsupported); - } -} - -size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, int value) -{ - DEBUGLOG(4, "ZSTDMT_setMTCtxParameter"); - return ZSTDMT_CCtxParam_setMTCtxParameter(&mtctx->params, parameter, value); -} - -size_t ZSTDMT_getMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, int* value) -{ - switch (parameter) { - case ZSTDMT_p_jobSize: - return ZSTD_CCtxParams_getParameter(&mtctx->params, ZSTD_c_jobSize, value); - case ZSTDMT_p_overlapLog: - return ZSTD_CCtxParams_getParameter(&mtctx->params, ZSTD_c_overlapLog, value); - case ZSTDMT_p_rsyncable: - return ZSTD_CCtxParams_getParameter(&mtctx->params, ZSTD_c_rsyncable, value); - default: - return ERROR(parameter_unsupported); - } -} - -/* Sets parameters relevant to the compression job, - * initializing others to default values. 
*/ -static ZSTD_CCtx_params ZSTDMT_initJobCCtxParams(const ZSTD_CCtx_params* params) -{ - ZSTD_CCtx_params jobParams = *params; - /* Clear parameters related to multithreading */ - jobParams.forceWindow = 0; - jobParams.nbWorkers = 0; - jobParams.jobSize = 0; - jobParams.overlapLog = 0; - jobParams.rsyncable = 0; - memset(&jobParams.ldmParams, 0, sizeof(ldmParams_t)); - memset(&jobParams.customMem, 0, sizeof(ZSTD_customMem)); - return jobParams; -} - /* ZSTDMT_resize() : * @return : error code if fails, 0 on success */ static size_t ZSTDMT_resize(ZSTDMT_CCtx* mtctx, unsigned nbWorkers) { if (POOL_resize(mtctx->factory, nbWorkers)) return ERROR(memory_allocation); - FORWARD_IF_ERROR( ZSTDMT_expandJobsTable(mtctx, nbWorkers) ); - mtctx->bufPool = ZSTDMT_expandBufferPool(mtctx->bufPool, nbWorkers); + FORWARD_IF_ERROR( ZSTDMT_expandJobsTable(mtctx, nbWorkers) , ""); + mtctx->bufPool = ZSTDMT_expandBufferPool(mtctx->bufPool, BUF_POOL_MAX_NB_BUFFERS(nbWorkers)); if (mtctx->bufPool == NULL) return ERROR(memory_allocation); mtctx->cctxPool = ZSTDMT_expandCCtxPool(mtctx->cctxPool, nbWorkers); if (mtctx->cctxPool == NULL) return ERROR(memory_allocation); @@ -1076,7 +1101,7 @@ void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_p DEBUGLOG(5, "ZSTDMT_updateCParams_whileCompressing (level:%i)", compressionLevel); mtctx->params.compressionLevel = compressionLevel; - { ZSTD_compressionParameters cParams = ZSTD_getCParamsFromCCtxParams(cctxParams, 0, 0); + { ZSTD_compressionParameters cParams = ZSTD_getCParamsFromCCtxParams(cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); cParams.windowLog = saved_wlog; mtctx->params.cParams = cParams; } @@ -1098,7 +1123,7 @@ ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx) { unsigned jobNb; unsigned lastJobNb = mtctx->nextJobID + mtctx->jobReady; assert(mtctx->jobReady <= 1); DEBUGLOG(6, "ZSTDMT_getFrameProgression: jobs: from %u to <%u (jobReady:%u)", - mtctx->doneJobID, lastJobNb, 
mtctx->jobReady) + mtctx->doneJobID, lastJobNb, mtctx->jobReady); for (jobNb = mtctx->doneJobID ; jobNb < lastJobNb ; jobNb++) { unsigned const wJobID = jobNb & mtctx->jobIDMask; ZSTDMT_jobDescription* jobPtr = &mtctx->jobs[wJobID]; @@ -1160,11 +1185,11 @@ size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx) static unsigned ZSTDMT_computeTargetJobLog(const ZSTD_CCtx_params* params) { unsigned jobLog; - if (params->ldmParams.enableLdm) { + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* In Long Range Mode, the windowLog is typically oversized. * In which case, it's preferable to determine the jobSize - * based on chainLog instead. */ - jobLog = MAX(21, params->cParams.chainLog + 4); + * based on cycleLog instead. */ + jobLog = MAX(21, ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy) + 3); } else { jobLog = MAX(20, params->cParams.windowLog + 2); } @@ -1204,7 +1229,7 @@ static size_t ZSTDMT_computeOverlapSize(const ZSTD_CCtx_params* params) int const overlapRLog = 9 - ZSTDMT_overlapLog(params->overlapLog, params->cParams.strategy); int ovLog = (overlapRLog >= 8) ? 0 : (params->cParams.windowLog - overlapRLog); assert(0 <= overlapRLog && overlapRLog <= 8); - if (params->ldmParams.enableLdm) { + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* In Long Range Mode, the windowLog is typically oversized. * In which case, it's preferable to determine the jobSize * based on chainLog instead. @@ -1218,172 +1243,6 @@ static size_t ZSTDMT_computeOverlapSize(const ZSTD_CCtx_params* params) return (ovLog==0) ? 
0 : (size_t)1 << ovLog; } -static unsigned -ZSTDMT_computeNbJobs(const ZSTD_CCtx_params* params, size_t srcSize, unsigned nbWorkers) -{ - assert(nbWorkers>0); - { size_t const jobSizeTarget = (size_t)1 << ZSTDMT_computeTargetJobLog(params); - size_t const jobMaxSize = jobSizeTarget << 2; - size_t const passSizeMax = jobMaxSize * nbWorkers; - unsigned const multiplier = (unsigned)(srcSize / passSizeMax) + 1; - unsigned const nbJobsLarge = multiplier * nbWorkers; - unsigned const nbJobsMax = (unsigned)(srcSize / jobSizeTarget) + 1; - unsigned const nbJobsSmall = MIN(nbJobsMax, nbWorkers); - return (multiplier>1) ? nbJobsLarge : nbJobsSmall; -} } - -/* ZSTDMT_compress_advanced_internal() : - * This is a blocking function : it will only give back control to caller after finishing its compression job. - */ -static size_t ZSTDMT_compress_advanced_internal( - ZSTDMT_CCtx* mtctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - const ZSTD_CDict* cdict, - ZSTD_CCtx_params params) -{ - ZSTD_CCtx_params const jobParams = ZSTDMT_initJobCCtxParams(¶ms); - size_t const overlapSize = ZSTDMT_computeOverlapSize(¶ms); - unsigned const nbJobs = ZSTDMT_computeNbJobs(¶ms, srcSize, params.nbWorkers); - size_t const proposedJobSize = (srcSize + (nbJobs-1)) / nbJobs; - size_t const avgJobSize = (((proposedJobSize-1) & 0x1FFFF) < 0x7FFF) ? proposedJobSize + 0xFFFF : proposedJobSize; /* avoid too small last block */ - const char* const srcStart = (const char*)src; - size_t remainingSrcSize = srcSize; - unsigned const compressWithinDst = (dstCapacity >= ZSTD_compressBound(srcSize)) ? 
nbJobs : (unsigned)(dstCapacity / ZSTD_compressBound(avgJobSize)); /* presumes avgJobSize >= 256 KB, which should be the case */ - size_t frameStartPos = 0, dstBufferPos = 0; - assert(jobParams.nbWorkers == 0); - assert(mtctx->cctxPool->totalCCtx == params.nbWorkers); - - params.jobSize = (U32)avgJobSize; - DEBUGLOG(4, "ZSTDMT_compress_advanced_internal: nbJobs=%2u (rawSize=%u bytes; fixedSize=%u) ", - nbJobs, (U32)proposedJobSize, (U32)avgJobSize); - - if ((nbJobs==1) | (params.nbWorkers<=1)) { /* fallback to single-thread mode : this is a blocking invocation anyway */ - ZSTD_CCtx* const cctx = mtctx->cctxPool->cctx[0]; - DEBUGLOG(4, "ZSTDMT_compress_advanced_internal: fallback to single-thread mode"); - if (cdict) return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, jobParams.fParams); - return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, NULL, 0, &jobParams); - } - - assert(avgJobSize >= 256 KB); /* condition for ZSTD_compressBound(A) + ZSTD_compressBound(B) <= ZSTD_compressBound(A+B), required to compress directly into Dst (no additional buffer) */ - ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(avgJobSize) ); - if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, avgJobSize)) - return ERROR(memory_allocation); - - FORWARD_IF_ERROR( ZSTDMT_expandJobsTable(mtctx, nbJobs) ); /* only expands if necessary */ - - { unsigned u; - for (u=0; ujobs[u].prefix.start = srcStart + frameStartPos - dictSize; - mtctx->jobs[u].prefix.size = dictSize; - mtctx->jobs[u].src.start = srcStart + frameStartPos; - mtctx->jobs[u].src.size = jobSize; assert(jobSize > 0); /* avoid job.src.size == 0 */ - mtctx->jobs[u].consumed = 0; - mtctx->jobs[u].cSize = 0; - mtctx->jobs[u].cdict = (u==0) ? 
cdict : NULL; - mtctx->jobs[u].fullFrameSize = srcSize; - mtctx->jobs[u].params = jobParams; - /* do not calculate checksum within sections, but write it in header for first section */ - mtctx->jobs[u].dstBuff = dstBuffer; - mtctx->jobs[u].cctxPool = mtctx->cctxPool; - mtctx->jobs[u].bufPool = mtctx->bufPool; - mtctx->jobs[u].seqPool = mtctx->seqPool; - mtctx->jobs[u].serial = &mtctx->serial; - mtctx->jobs[u].jobID = u; - mtctx->jobs[u].firstJob = (u==0); - mtctx->jobs[u].lastJob = (u==nbJobs-1); - - DEBUGLOG(5, "ZSTDMT_compress_advanced_internal: posting job %u (%u bytes)", u, (U32)jobSize); - DEBUG_PRINTHEX(6, mtctx->jobs[u].prefix.start, 12); - POOL_add(mtctx->factory, ZSTDMT_compressionJob, &mtctx->jobs[u]); - - frameStartPos += jobSize; - dstBufferPos += dstBufferCapacity; - remainingSrcSize -= jobSize; - } } - - /* collect result */ - { size_t error = 0, dstPos = 0; - unsigned jobID; - for (jobID=0; jobIDjobs[jobID].job_mutex); - while (mtctx->jobs[jobID].consumed < mtctx->jobs[jobID].src.size) { - DEBUGLOG(5, "waiting for jobCompleted signal from job %u", jobID); - ZSTD_pthread_cond_wait(&mtctx->jobs[jobID].job_cond, &mtctx->jobs[jobID].job_mutex); - } - ZSTD_pthread_mutex_unlock(&mtctx->jobs[jobID].job_mutex); - DEBUGLOG(5, "ready to write job %u ", jobID); - - { size_t const cSize = mtctx->jobs[jobID].cSize; - if (ZSTD_isError(cSize)) error = cSize; - if ((!error) && (dstPos + cSize > dstCapacity)) error = ERROR(dstSize_tooSmall); - if (jobID) { /* note : job 0 is written directly at dst, which is correct position */ - if (!error) - memmove((char*)dst + dstPos, mtctx->jobs[jobID].dstBuff.start, cSize); /* may overlap when job compressed within dst */ - if (jobID >= compressWithinDst) { /* job compressed into its own buffer, which must be released */ - DEBUGLOG(5, "releasing buffer %u>=%u", jobID, compressWithinDst); - ZSTDMT_releaseBuffer(mtctx->bufPool, mtctx->jobs[jobID].dstBuff); - } } - mtctx->jobs[jobID].dstBuff = g_nullBuffer; - 
mtctx->jobs[jobID].cSize = 0; - dstPos += cSize ; - } - } /* for (jobID=0; jobIDserial.xxhState); - if (dstPos + 4 > dstCapacity) { - error = ERROR(dstSize_tooSmall); - } else { - DEBUGLOG(4, "writing checksum : %08X \n", checksum); - MEM_writeLE32((char*)dst + dstPos, checksum); - dstPos += 4; - } } - - if (!error) DEBUGLOG(4, "compressed size : %u ", (U32)dstPos); - return error ? error : dstPos; - } -} - -size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - const ZSTD_CDict* cdict, - ZSTD_parameters params, - int overlapLog) -{ - ZSTD_CCtx_params cctxParams = mtctx->params; - cctxParams.cParams = params.cParams; - cctxParams.fParams = params.fParams; - assert(ZSTD_OVERLAPLOG_MIN <= overlapLog && overlapLog <= ZSTD_OVERLAPLOG_MAX); - cctxParams.overlapLog = overlapLog; - return ZSTDMT_compress_advanced_internal(mtctx, - dst, dstCapacity, - src, srcSize, - cdict, cctxParams); -} - - -size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - int compressionLevel) -{ - ZSTD_parameters params = ZSTD_getParams(compressionLevel, srcSize, 0); - int const overlapLog = ZSTDMT_overlapLog_default(params.cParams.strategy); - params.fParams.contentSizeFlag = 1; - return ZSTDMT_compress_advanced(mtctx, dst, dstCapacity, src, srcSize, NULL, params, overlapLog); -} - - /* ====================================== */ /* ======= Streaming API ======= */ /* ====================================== */ @@ -1403,23 +1262,11 @@ size_t ZSTDMT_initCStream_internal( /* init */ if (params.nbWorkers != mtctx->params.nbWorkers) - FORWARD_IF_ERROR( ZSTDMT_resize(mtctx, params.nbWorkers) ); + FORWARD_IF_ERROR( ZSTDMT_resize(mtctx, (unsigned)params.nbWorkers) , ""); if (params.jobSize != 0 && params.jobSize < ZSTDMT_JOBSIZE_MIN) params.jobSize = ZSTDMT_JOBSIZE_MIN; if (params.jobSize > (size_t)ZSTDMT_JOBSIZE_MAX) params.jobSize = (size_t)ZSTDMT_JOBSIZE_MAX; - 
mtctx->singleBlockingThread = (pledgedSrcSize <= ZSTDMT_JOBSIZE_MIN); /* do not trigger multi-threading when srcSize is too small */ - if (mtctx->singleBlockingThread) { - ZSTD_CCtx_params const singleThreadParams = ZSTDMT_initJobCCtxParams(¶ms); - DEBUGLOG(5, "ZSTDMT_initCStream_internal: switch to single blocking thread mode"); - assert(singleThreadParams.nbWorkers == 0); - return ZSTD_initCStream_internal(mtctx->cctxPool->cctx[0], - dict, dictSize, cdict, - &singleThreadParams, pledgedSrcSize); - } - - DEBUGLOG(4, "ZSTDMT_initCStream_internal: %u workers", params.nbWorkers); - if (mtctx->allJobsCompleted == 0) { /* previous compression not correctly finished */ ZSTDMT_waitForAllJobsCompleted(mtctx); ZSTDMT_releaseAllJobResources(mtctx); @@ -1428,15 +1275,14 @@ size_t ZSTDMT_initCStream_internal( mtctx->params = params; mtctx->frameContentSize = pledgedSrcSize; + ZSTD_freeCDict(mtctx->cdictLocal); if (dict) { - ZSTD_freeCDict(mtctx->cdictLocal); mtctx->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, dictContentType, /* note : a loadPrefix becomes an internal CDict */ params.cParams, mtctx->cMem); mtctx->cdict = mtctx->cdictLocal; if (mtctx->cdictLocal == NULL) return ERROR(memory_allocation); } else { - ZSTD_freeCDict(mtctx->cdictLocal); mtctx->cdictLocal = NULL; mtctx->cdict = cdict; } @@ -1451,9 +1297,11 @@ size_t ZSTDMT_initCStream_internal( if (params.rsyncable) { /* Aim for the targetsectionSize as the average job size. */ - U32 const jobSizeMB = (U32)(mtctx->targetSectionSize >> 20); - U32 const rsyncBits = ZSTD_highbit32(jobSizeMB) + 20; - assert(jobSizeMB >= 1); + U32 const jobSizeKB = (U32)(mtctx->targetSectionSize >> 10); + U32 const rsyncBits = (assert(jobSizeKB >= 1), ZSTD_highbit32(jobSizeKB) + 10); + /* We refuse to create jobs < RSYNC_MIN_BLOCK_SIZE bytes, so make sure our + * expected job size is at least 4x larger. 
*/ + assert(rsyncBits >= RSYNC_MIN_BLOCK_LOG + 2); DEBUGLOG(4, "rsyncLog = %u", rsyncBits); mtctx->rsync.hash = 0; mtctx->rsync.hitMask = (1ULL << rsyncBits) - 1; @@ -1465,7 +1313,7 @@ size_t ZSTDMT_initCStream_internal( ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(mtctx->targetSectionSize)); { /* If ldm is enabled we need windowSize space. */ - size_t const windowSize = mtctx->params.ldmParams.enableLdm ? (1U << mtctx->params.cParams.windowLog) : 0; + size_t const windowSize = mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable ? (1U << mtctx->params.cParams.windowLog) : 0; /* Two buffers of slack, plus extra space for the overlap * This is the minimum slack that LDM works with. One extra because * flush might waste up to targetSectionSize-1 bytes. Another extra @@ -1480,8 +1328,8 @@ size_t ZSTDMT_initCStream_internal( size_t const capacity = MAX(windowSize, sectionsSize) + slackSize; if (mtctx->roundBuff.capacity < capacity) { if (mtctx->roundBuff.buffer) - ZSTD_free(mtctx->roundBuff.buffer, mtctx->cMem); - mtctx->roundBuff.buffer = (BYTE*)ZSTD_malloc(capacity, mtctx->cMem); + ZSTD_customFree(mtctx->roundBuff.buffer, mtctx->cMem); + mtctx->roundBuff.buffer = (BYTE*)ZSTD_customMalloc(capacity, mtctx->cMem); if (mtctx->roundBuff.buffer == NULL) { mtctx->roundBuff.capacity = 0; return ERROR(memory_allocation); @@ -1500,56 +1348,33 @@ size_t ZSTDMT_initCStream_internal( mtctx->allJobsCompleted = 0; mtctx->consumed = 0; mtctx->produced = 0; - if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, mtctx->targetSectionSize)) - return ERROR(memory_allocation); - return 0; -} - -size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx, - const void* dict, size_t dictSize, - ZSTD_parameters params, - unsigned long long pledgedSrcSize) -{ - ZSTD_CCtx_params cctxParams = mtctx->params; /* retrieve sticky params */ - DEBUGLOG(4, "ZSTDMT_initCStream_advanced (pledgedSrcSize=%u)", (U32)pledgedSrcSize); - cctxParams.cParams = params.cParams; - 
cctxParams.fParams = params.fParams; - return ZSTDMT_initCStream_internal(mtctx, dict, dictSize, ZSTD_dct_auto, NULL, - cctxParams, pledgedSrcSize); -} -size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* mtctx, - const ZSTD_CDict* cdict, - ZSTD_frameParameters fParams, - unsigned long long pledgedSrcSize) -{ - ZSTD_CCtx_params cctxParams = mtctx->params; - if (cdict==NULL) return ERROR(dictionary_wrong); /* method incompatible with NULL cdict */ - cctxParams.cParams = ZSTD_getCParamsFromCDict(cdict); - cctxParams.fParams = fParams; - return ZSTDMT_initCStream_internal(mtctx, NULL, 0 /*dictSize*/, ZSTD_dct_auto, cdict, - cctxParams, pledgedSrcSize); -} + /* update dictionary */ + ZSTD_freeCDict(mtctx->cdictLocal); + mtctx->cdictLocal = NULL; + mtctx->cdict = NULL; + if (dict) { + if (dictContentType == ZSTD_dct_rawContent) { + mtctx->inBuff.prefix.start = (const BYTE*)dict; + mtctx->inBuff.prefix.size = dictSize; + } else { + /* note : a loadPrefix becomes an internal CDict */ + mtctx->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byRef, dictContentType, + params.cParams, mtctx->cMem); + mtctx->cdict = mtctx->cdictLocal; + if (mtctx->cdictLocal == NULL) return ERROR(memory_allocation); + } + } else { + mtctx->cdict = cdict; + } + if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, mtctx->targetSectionSize, + dict, dictSize, dictContentType)) + return ERROR(memory_allocation); -/* ZSTDMT_resetCStream() : - * pledgedSrcSize can be zero == unknown (for the time being) - * prefer using ZSTD_CONTENTSIZE_UNKNOWN, - * as `0` might mean "empty" in the future */ -size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize) -{ - if (!pledgedSrcSize) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; - return ZSTDMT_initCStream_internal(mtctx, NULL, 0, ZSTD_dct_auto, 0, mtctx->params, - pledgedSrcSize); -} -size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel) { - ZSTD_parameters const params = 
ZSTD_getParams(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0); - ZSTD_CCtx_params cctxParams = mtctx->params; /* retrieve sticky params */ - DEBUGLOG(4, "ZSTDMT_initCStream (cLevel=%i)", compressionLevel); - cctxParams.cParams = params.cParams; - cctxParams.fParams = params.fParams; - return ZSTDMT_initCStream_internal(mtctx, NULL, 0, ZSTD_dct_auto, NULL, cctxParams, ZSTD_CONTENTSIZE_UNKNOWN); + return 0; } @@ -1615,7 +1440,7 @@ static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* mtctx, size_t srcSize, ZS mtctx->roundBuff.pos += srcSize; mtctx->inBuff.buffer = g_nullBuffer; mtctx->inBuff.filled = 0; - /* Set the prefix */ + /* Set the prefix for next job */ if (!endFrame) { size_t const newPrefixSize = MIN(srcSize, mtctx->targetPrefixSize); mtctx->inBuff.prefix.start = src + srcSize - newPrefixSize; @@ -1714,9 +1539,11 @@ static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, u assert(mtctx->doneJobID < mtctx->nextJobID); assert(cSize >= mtctx->jobs[wJobID].dstFlushed); assert(mtctx->jobs[wJobID].dstBuff.start != NULL); - memcpy((char*)output->dst + output->pos, - (const char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].dstFlushed, - toFlush); + if (toFlush > 0) { + ZSTD_memcpy((char*)output->dst + output->pos, + (const char*)mtctx->jobs[wJobID].dstBuff.start + mtctx->jobs[wJobID].dstFlushed, + toFlush); + } output->pos += toFlush; mtctx->jobs[wJobID].dstFlushed += toFlush; /* can write : this value is only used by mtctx */ @@ -1750,12 +1577,17 @@ static size_t ZSTDMT_flushProduced(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, u * If the data of the first job is broken up into two segments, we cover both * sections. 
*/ -static range_t ZSTDMT_getInputDataInUse(ZSTDMT_CCtx* mtctx) +static Range ZSTDMT_getInputDataInUse(ZSTDMT_CCtx* mtctx) { unsigned const firstJobID = mtctx->doneJobID; unsigned const lastJobID = mtctx->nextJobID; unsigned jobID; + /* no need to check during first round */ + size_t roundBuffCapacity = mtctx->roundBuff.capacity; + size_t nbJobs1stRoundMin = roundBuffCapacity / mtctx->targetSectionSize; + if (lastJobID < nbJobs1stRoundMin) return kNullRange; + for (jobID = firstJobID; jobID < lastJobID; ++jobID) { unsigned const wJobID = jobID & mtctx->jobIDMask; size_t consumed; @@ -1765,7 +1597,7 @@ static range_t ZSTDMT_getInputDataInUse(ZSTDMT_CCtx* mtctx) ZSTD_pthread_mutex_unlock(&mtctx->jobs[wJobID].job_mutex); if (consumed < mtctx->jobs[wJobID].src.size) { - range_t range = mtctx->jobs[wJobID].prefix; + Range range = mtctx->jobs[wJobID].prefix; if (range.size == 0) { /* Empty prefix */ range = mtctx->jobs[wJobID].src; @@ -1781,26 +1613,30 @@ static range_t ZSTDMT_getInputDataInUse(ZSTDMT_CCtx* mtctx) /** * Returns non-zero iff buffer and range overlap. 
*/ -static int ZSTDMT_isOverlapped(buffer_t buffer, range_t range) +static int ZSTDMT_isOverlapped(Buffer buffer, Range range) { BYTE const* const bufferStart = (BYTE const*)buffer.start; - BYTE const* const bufferEnd = bufferStart + buffer.capacity; BYTE const* const rangeStart = (BYTE const*)range.start; - BYTE const* const rangeEnd = rangeStart + range.size; if (rangeStart == NULL || bufferStart == NULL) return 0; - /* Empty ranges cannot overlap */ - if (bufferStart == bufferEnd || rangeStart == rangeEnd) - return 0; - return bufferStart < rangeEnd && rangeStart < bufferEnd; + { + BYTE const* const bufferEnd = bufferStart + buffer.capacity; + BYTE const* const rangeEnd = rangeStart + range.size; + + /* Empty ranges cannot overlap */ + if (bufferStart == bufferEnd || rangeStart == rangeEnd) + return 0; + + return bufferStart < rangeEnd && rangeStart < bufferEnd; + } } -static int ZSTDMT_doesOverlapWindow(buffer_t buffer, ZSTD_window_t window) +static int ZSTDMT_doesOverlapWindow(Buffer buffer, ZSTD_window_t window) { - range_t extDict; - range_t prefix; + Range extDict; + Range prefix; DEBUGLOG(5, "ZSTDMT_doesOverlapWindow"); extDict.start = window.dictBase + window.lowLimit; @@ -1819,9 +1655,9 @@ static int ZSTDMT_doesOverlapWindow(buffer_t buffer, ZSTD_window_t window) || ZSTDMT_isOverlapped(buffer, prefix); } -static void ZSTDMT_waitForLdmComplete(ZSTDMT_CCtx* mtctx, buffer_t buffer) +static void ZSTDMT_waitForLdmComplete(ZSTDMT_CCtx* mtctx, Buffer buffer) { - if (mtctx->params.ldmParams.enableLdm) { + if (mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable) { ZSTD_pthread_mutex_t* mutex = &mtctx->serial.ldmWindowMutex; DEBUGLOG(5, "ZSTDMT_waitForLdmComplete"); DEBUGLOG(5, "source [0x%zx, 0x%zx)", @@ -1844,16 +1680,16 @@ static void ZSTDMT_waitForLdmComplete(ZSTDMT_CCtx* mtctx, buffer_t buffer) */ static int ZSTDMT_tryGetInputRange(ZSTDMT_CCtx* mtctx) { - range_t const inUse = ZSTDMT_getInputDataInUse(mtctx); + Range const inUse = 
ZSTDMT_getInputDataInUse(mtctx); size_t const spaceLeft = mtctx->roundBuff.capacity - mtctx->roundBuff.pos; - size_t const target = mtctx->targetSectionSize; - buffer_t buffer; + size_t const spaceNeeded = mtctx->targetSectionSize; + Buffer buffer; DEBUGLOG(5, "ZSTDMT_tryGetInputRange"); assert(mtctx->inBuff.buffer.start == NULL); - assert(mtctx->roundBuff.capacity >= target); + assert(mtctx->roundBuff.capacity >= spaceNeeded); - if (spaceLeft < target) { + if (spaceLeft < spaceNeeded) { /* ZSTD_invalidateRepCodes() doesn't work for extDict variants. * Simply copy the prefix to the beginning in that case. */ @@ -1867,12 +1703,12 @@ static int ZSTDMT_tryGetInputRange(ZSTDMT_CCtx* mtctx) return 0; } ZSTDMT_waitForLdmComplete(mtctx, buffer); - memmove(start, mtctx->inBuff.prefix.start, prefixSize); + ZSTD_memmove(start, mtctx->inBuff.prefix.start, prefixSize); mtctx->inBuff.prefix.start = start; mtctx->roundBuff.pos = prefixSize; } buffer.start = mtctx->roundBuff.buffer + mtctx->roundBuff.pos; - buffer.capacity = target; + buffer.capacity = spaceNeeded; if (ZSTDMT_isOverlapped(buffer, inUse)) { DEBUGLOG(5, "Waiting for buffer..."); @@ -1899,7 +1735,7 @@ static int ZSTDMT_tryGetInputRange(ZSTDMT_CCtx* mtctx) typedef struct { size_t toLoad; /* The number of bytes to load from the input. */ int flush; /* Boolean declaring if we must flush because we found a synchronization point. */ -} syncPoint_t; +} SyncPoint; /** * Searches through the input for a synchronization point. If one is found, we @@ -1907,14 +1743,14 @@ typedef struct { * Otherwise, we will load as many bytes as possible and instruct the caller * to continue as normal. 
*/ -static syncPoint_t +static SyncPoint findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input) { BYTE const* const istart = (BYTE const*)input.src + input.pos; U64 const primePower = mtctx->rsync.primePower; U64 const hitMask = mtctx->rsync.hitMask; - syncPoint_t syncPoint; + SyncPoint syncPoint; U64 hash; BYTE const* prev; size_t pos; @@ -1924,6 +1760,11 @@ findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input) if (!mtctx->params.rsyncable) /* Rsync is disabled. */ return syncPoint; + if (mtctx->inBuff.filled + input.size - input.pos < RSYNC_MIN_BLOCK_SIZE) + /* We don't emit synchronization points if it would produce too small blocks. + * We don't have enough input to find a synchronization point, so don't look. + */ + return syncPoint; if (mtctx->inBuff.filled + syncPoint.toLoad < RSYNC_LENGTH) /* Not enough to compute the hash. * We will miss any synchronization points in this RSYNC_LENGTH byte @@ -1934,23 +1775,41 @@ findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input) */ return syncPoint; /* Initialize the loop variables. */ - if (mtctx->inBuff.filled >= RSYNC_LENGTH) { - /* We have enough bytes buffered to initialize the hash. + if (mtctx->inBuff.filled < RSYNC_MIN_BLOCK_SIZE) { + /* We don't need to scan the first RSYNC_MIN_BLOCK_SIZE positions + * because they can't possibly be a sync point. So we can start + * part way through the input buffer. 
+ */ + pos = RSYNC_MIN_BLOCK_SIZE - mtctx->inBuff.filled; + if (pos >= RSYNC_LENGTH) { + prev = istart + pos - RSYNC_LENGTH; + hash = ZSTD_rollingHash_compute(prev, RSYNC_LENGTH); + } else { + assert(mtctx->inBuff.filled >= RSYNC_LENGTH); + prev = (BYTE const*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled - RSYNC_LENGTH; + hash = ZSTD_rollingHash_compute(prev + pos, (RSYNC_LENGTH - pos)); + hash = ZSTD_rollingHash_append(hash, istart, pos); + } + } else { + /* We have enough bytes buffered to initialize the hash, + * and have processed enough bytes to find a sync point. * Start scanning at the beginning of the input. */ + assert(mtctx->inBuff.filled >= RSYNC_MIN_BLOCK_SIZE); + assert(RSYNC_MIN_BLOCK_SIZE >= RSYNC_LENGTH); pos = 0; prev = (BYTE const*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled - RSYNC_LENGTH; hash = ZSTD_rollingHash_compute(prev, RSYNC_LENGTH); - } else { - /* We don't have enough bytes buffered to initialize the hash, but - * we know we have at least RSYNC_LENGTH bytes total. - * Start scanning after the first RSYNC_LENGTH bytes less the bytes - * already buffered. - */ - pos = RSYNC_LENGTH - mtctx->inBuff.filled; - prev = (BYTE const*)mtctx->inBuff.buffer.start - pos; - hash = ZSTD_rollingHash_compute(mtctx->inBuff.buffer.start, mtctx->inBuff.filled); - hash = ZSTD_rollingHash_append(hash, istart, pos); + if ((hash & hitMask) == hitMask) { + /* We're already at a sync point so don't load any more until + * we're able to flush this sync point. + * This likely happened because the job table was full so we + * couldn't add our job. + */ + syncPoint.toLoad = 0; + syncPoint.flush = 1; + return syncPoint; + } } /* Starting with the hash of the previous RSYNC_LENGTH bytes, roll * through the input. 
If we hit a synchronization point, then cut the @@ -1960,16 +1819,24 @@ findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input) * then a block will be emitted anyways, but this is okay, since if we * are already synchronized we will remain synchronized. */ + assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash); for (; pos < syncPoint.toLoad; ++pos) { BYTE const toRemove = pos < RSYNC_LENGTH ? prev[pos] : istart[pos - RSYNC_LENGTH]; - /* if (pos >= RSYNC_LENGTH) assert(ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash); */ + /* This assert is very expensive, and Debian compiles with asserts enabled. + * So disable it for now. We can get similar coverage by checking it at the + * beginning & end of the loop. + * assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash); + */ hash = ZSTD_rollingHash_rotate(hash, toRemove, istart[pos], primePower); + assert(mtctx->inBuff.filled + pos >= RSYNC_MIN_BLOCK_SIZE); if ((hash & hitMask) == hitMask) { syncPoint.toLoad = pos + 1; syncPoint.flush = 1; + ++pos; /* for assert */ break; } } + assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash); return syncPoint; } @@ -1995,34 +1862,11 @@ size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, assert(output->pos <= output->size); assert(input->pos <= input->size); - if (mtctx->singleBlockingThread) { /* delegate to single-thread (synchronous) */ - return ZSTD_compressStream2(mtctx->cctxPool->cctx[0], output, input, endOp); - } - if ((mtctx->frameEnded) && (endOp==ZSTD_e_continue)) { /* current frame being ended. 
Only flush/end are allowed */ return ERROR(stage_wrong); } - /* single-pass shortcut (note : synchronous-mode) */ - if ( (!mtctx->params.rsyncable) /* rsyncable mode is disabled */ - && (mtctx->nextJobID == 0) /* just started */ - && (mtctx->inBuff.filled == 0) /* nothing buffered */ - && (!mtctx->jobReady) /* no job already created */ - && (endOp == ZSTD_e_end) /* end order */ - && (output->size - output->pos >= ZSTD_compressBound(input->size - input->pos)) ) { /* enough space in dst */ - size_t const cSize = ZSTDMT_compress_advanced_internal(mtctx, - (char*)output->dst + output->pos, output->size - output->pos, - (const char*)input->src + input->pos, input->size - input->pos, - mtctx->cdict, mtctx->params); - if (ZSTD_isError(cSize)) return cSize; - input->pos = input->size; - output->pos += cSize; - mtctx->allJobsCompleted = 1; - mtctx->frameEnded = 1; - return 0; - } - /* fill input buffer */ if ( (!mtctx->jobReady) && (input->size > input->pos) ) { /* support NULL input */ @@ -2038,20 +1882,28 @@ size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, DEBUGLOG(5, "ZSTDMT_tryGetInputRange completed successfully : mtctx->inBuff.buffer.start = %p", mtctx->inBuff.buffer.start); } if (mtctx->inBuff.buffer.start != NULL) { - syncPoint_t const syncPoint = findSynchronizationPoint(mtctx, *input); + SyncPoint const syncPoint = findSynchronizationPoint(mtctx, *input); if (syncPoint.flush && endOp == ZSTD_e_continue) { endOp = ZSTD_e_flush; } assert(mtctx->inBuff.buffer.capacity >= mtctx->targetSectionSize); DEBUGLOG(5, "ZSTDMT_compressStream_generic: adding %u bytes on top of %u to buffer of size %u", (U32)syncPoint.toLoad, (U32)mtctx->inBuff.filled, (U32)mtctx->targetSectionSize); - memcpy((char*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled, (const char*)input->src + input->pos, syncPoint.toLoad); + ZSTD_memcpy((char*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled, (const char*)input->src + input->pos, syncPoint.toLoad); input->pos += syncPoint.toLoad; 
mtctx->inBuff.filled += syncPoint.toLoad; forwardInputProgress = syncPoint.toLoad>0; } - if ((input->pos < input->size) && (endOp == ZSTD_e_end)) - endOp = ZSTD_e_flush; /* can't end now : not all input consumed */ + } + if ((input->pos < input->size) && (endOp == ZSTD_e_end)) { + /* Can't end yet because the input is not fully consumed. + * We are in one of these cases: + * - mtctx->inBuff is NULL & empty: we couldn't get an input buffer so don't create a new job. + * - We filled the input buffer: flush this job but don't end the frame. + * - We hit a synchronization point: flush this job but don't end the frame. + */ + assert(mtctx->inBuff.filled == 0 || mtctx->inBuff.filled == mtctx->targetSectionSize || mtctx->params.rsyncable); + endOp = ZSTD_e_flush; } if ( (mtctx->jobReady) @@ -2060,7 +1912,7 @@ size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, || ((endOp == ZSTD_e_end) && (!mtctx->frameEnded)) ) { /* must finish the frame with a zero-size block */ size_t const jobSize = mtctx->inBuff.filled; assert(mtctx->inBuff.filled <= mtctx->targetSectionSize); - FORWARD_IF_ERROR( ZSTDMT_createCompressionJob(mtctx, jobSize, endOp) ); + FORWARD_IF_ERROR( ZSTDMT_createCompressionJob(mtctx, jobSize, endOp) , ""); } /* check for potential compressed data ready to be flushed */ @@ -2071,46 +1923,4 @@ size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, } } - -size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input) -{ - FORWARD_IF_ERROR( ZSTDMT_compressStream_generic(mtctx, output, input, ZSTD_e_continue) ); - - /* recommended next input size : fill current input buffer */ - return mtctx->targetSectionSize - mtctx->inBuff.filled; /* note : could be zero when input buffer is fully filled and no more availability to create new job */ -} - - -static size_t ZSTDMT_flushStream_internal(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_EndDirective endFrame) -{ - size_t const srcSize = mtctx->inBuff.filled; - DEBUGLOG(5, 
"ZSTDMT_flushStream_internal"); - - if ( mtctx->jobReady /* one job ready for a worker to pick up */ - || (srcSize > 0) /* still some data within input buffer */ - || ((endFrame==ZSTD_e_end) && !mtctx->frameEnded)) { /* need a last 0-size block to end frame */ - DEBUGLOG(5, "ZSTDMT_flushStream_internal : create a new job (%u bytes, end:%u)", - (U32)srcSize, (U32)endFrame); - FORWARD_IF_ERROR( ZSTDMT_createCompressionJob(mtctx, srcSize, endFrame) ); - } - - /* check if there is any data available to flush */ - return ZSTDMT_flushProduced(mtctx, output, 1 /* blockToFlush */, endFrame); -} - - -size_t ZSTDMT_flushStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output) -{ - DEBUGLOG(5, "ZSTDMT_flushStream"); - if (mtctx->singleBlockingThread) - return ZSTD_flushStream(mtctx->cctxPool->cctx[0], output); - return ZSTDMT_flushStream_internal(mtctx, output, ZSTD_e_flush); -} - -size_t ZSTDMT_endStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output) -{ - DEBUGLOG(4, "ZSTDMT_endStream"); - if (mtctx->singleBlockingThread) - return ZSTD_endStream(mtctx->cctxPool->cctx[0], output); - return ZSTDMT_flushStream_internal(mtctx, output, ZSTD_e_end); -} +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/DataDog/zstd/zstdmt_compress.h b/vendor/github.com/DataDog/zstd/zstdmt_compress.h index 12a5260..ab1b11b 100644 --- a/vendor/github.com/DataDog/zstd/zstdmt_compress.h +++ b/vendor/github.com/DataDog/zstd/zstdmt_compress.h @@ -1,5 +1,6 @@ +#ifndef USE_EXTERNAL_ZSTD /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under both the BSD-style license (found in the @@ -11,121 +12,62 @@ #ifndef ZSTDMT_COMPRESS_H #define ZSTDMT_COMPRESS_H - #if defined (__cplusplus) - extern "C" { - #endif - +/* === Dependencies === */ +#include "zstd_deps.h" /* size_t */ +#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters */ +#include "zstd.h" /* ZSTD_inBuffer, ZSTD_outBuffer, ZSTDLIB_API */ /* Note : This is an internal API. * These APIs used to be exposed with ZSTDLIB_API, * because it used to be the only way to invoke MT compression. - * Now, it's recommended to use ZSTD_compress2 and ZSTD_compressStream2() - * instead. - * - * If you depend on these APIs and can't switch, then define - * ZSTD_LEGACY_MULTITHREADED_API when making the dynamic library. - * However, we may completely remove these functions in a future - * release, so please switch soon. + * Now, you must use ZSTD_compress2 and ZSTD_compressStream2() instead. * * This API requires ZSTD_MULTITHREAD to be defined during compilation, * otherwise ZSTDMT_createCCtx*() will fail. */ -#ifdef ZSTD_LEGACY_MULTITHREADED_API -# define ZSTDMT_API ZSTDLIB_API -#else -# define ZSTDMT_API -#endif - -/* === Dependencies === */ -#include /* size_t */ -#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters */ -#include "zstd.h" /* ZSTD_inBuffer, ZSTD_outBuffer, ZSTDLIB_API */ - - /* === Constants === */ -#ifndef ZSTDMT_NBWORKERS_MAX -# define ZSTDMT_NBWORKERS_MAX 200 +#ifndef ZSTDMT_NBWORKERS_MAX /* a different value can be selected at compile time */ +# define ZSTDMT_NBWORKERS_MAX ((sizeof(void*)==4) /*32-bit*/ ? 64 : 256) #endif -#ifndef ZSTDMT_JOBSIZE_MIN -# define ZSTDMT_JOBSIZE_MIN (1 MB) +#ifndef ZSTDMT_JOBSIZE_MIN /* a different value can be selected at compile time */ +# define ZSTDMT_JOBSIZE_MIN (512 KB) #endif #define ZSTDMT_JOBLOG_MAX (MEM_32bits() ? 29 : 30) #define ZSTDMT_JOBSIZE_MAX (MEM_32bits() ? 
(512 MB) : (1024 MB)) +/* ======================================================== + * === Private interface, for use by ZSTD_compress.c === + * === Not exposed in libzstd. Never invoke directly === + * ======================================================== */ + /* === Memory management === */ typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx; /* Requires ZSTD_MULTITHREAD to be defined during compilation, otherwise it will return NULL. */ -ZSTDMT_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers); -/* Requires ZSTD_MULTITHREAD to be defined during compilation, otherwise it will return NULL. */ -ZSTDMT_API ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, - ZSTD_customMem cMem); -ZSTDMT_API size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx); - -ZSTDMT_API size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx); - - -/* === Simple one-pass compression function === */ - -ZSTDMT_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - int compressionLevel); - +ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, + ZSTD_customMem cMem, + ZSTD_threadPool *pool); +size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx); +size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx); /* === Streaming functions === */ -ZSTDMT_API size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel); -ZSTDMT_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize); /**< if srcSize is not known at reset time, use ZSTD_CONTENTSIZE_UNKNOWN. 
Note: for compatibility with older programs, 0 means the same as ZSTD_CONTENTSIZE_UNKNOWN, but it will change in the future to mean "empty" */ - -ZSTDMT_API size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx); -ZSTDMT_API size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input); - -ZSTDMT_API size_t ZSTDMT_flushStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output); /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */ -ZSTDMT_API size_t ZSTDMT_endStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output); /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */ - - -/* === Advanced functions and parameters === */ - -ZSTDMT_API size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - const ZSTD_CDict* cdict, - ZSTD_parameters params, - int overlapLog); - -ZSTDMT_API size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx, - const void* dict, size_t dictSize, /* dict can be released after init, a local copy is preserved within zcs */ - ZSTD_parameters params, - unsigned long long pledgedSrcSize); /* pledgedSrcSize is optional and can be zero == unknown */ - -ZSTDMT_API size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* mtctx, - const ZSTD_CDict* cdict, - ZSTD_frameParameters fparams, - unsigned long long pledgedSrcSize); /* note : zero means empty */ - -/* ZSTDMT_parameter : - * List of parameters that can be set using ZSTDMT_setMTCtxParameter() */ -typedef enum { - ZSTDMT_p_jobSize, /* Each job is compressed in parallel. By default, this value is dynamically determined depending on compression parameters. Can be set explicitly here. */ - ZSTDMT_p_overlapLog, /* Each job may reload a part of previous job to enhance compression ratio; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window. 
This is a "sticky" parameter : its value will be re-used on next compression job */ - ZSTDMT_p_rsyncable /* Enables rsyncable mode. */ -} ZSTDMT_parameter; - -/* ZSTDMT_setMTCtxParameter() : - * allow setting individual parameters, one at a time, among a list of enums defined in ZSTDMT_parameter. - * The function must be called typically after ZSTD_createCCtx() but __before ZSTDMT_init*() !__ - * Parameters not explicitly reset by ZSTDMT_init*() remain the same in consecutive compression sessions. - * @return : 0, or an error code (which can be tested using ZSTD_isError()) */ -ZSTDMT_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, int value); - -/* ZSTDMT_getMTCtxParameter() : - * Query the ZSTDMT_CCtx for a parameter value. - * @return : 0, or an error code (which can be tested using ZSTD_isError()) */ -ZSTDMT_API size_t ZSTDMT_getMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, int* value); +size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx); +/*! ZSTDMT_initCStream_internal() : + * Private use only. Init streaming operation. + * expects params to be valid. + * must receive dict, or cdict, or none, but not both. + * mtctx can be freshly constructed or reused from a prior compression. + * If mtctx is reused, memory allocations from the prior compression may not be freed, + * even if they are not needed for the current compression. + * @return : 0, or an error code */ +size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* mtctx, + const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, unsigned long long pledgedSrcSize); /*! 
ZSTDMT_compressStream_generic() : * Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream() @@ -134,16 +76,10 @@ ZSTDMT_API size_t ZSTDMT_getMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter * 0 if fully flushed * or an error code * note : needs to be init using any ZSTD_initCStream*() variant */ -ZSTDMT_API size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, - ZSTD_outBuffer* output, - ZSTD_inBuffer* input, - ZSTD_EndDirective endOp); - - -/* ======================================================== - * === Private interface, for use by ZSTD_compress.c === - * === Not exposed in libzstd. Never invoke directly === - * ======================================================== */ +size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp); /*! ZSTDMT_toFlushNow() * Tell how many bytes are ready to be flushed immediately. @@ -153,15 +89,6 @@ ZSTDMT_API size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, * therefore flushing is limited by speed of oldest job. */ size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx); -/*! ZSTDMT_CCtxParam_setMTCtxParameter() - * like ZSTDMT_setMTCtxParameter(), but into a ZSTD_CCtx_Params */ -size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, ZSTDMT_parameter parameter, int value); - -/*! ZSTDMT_CCtxParam_setNbWorkers() - * Set nbWorkers, and clamp it. - * Also reset jobSize and overlapLog */ -size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers); - /*! ZSTDMT_updateCParams_whileCompressing() : * Updates only a selected set of compression parameters, to remain compatible with current frame. * New parameters will be applied to next compression job. */ @@ -173,20 +100,6 @@ void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_p */ ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx); - -/*! ZSTDMT_initCStream_internal() : - * Private use only. 
Init streaming operation. - * expects params to be valid. - * must receive dict, or cdict, or none, but not both. - * @return : 0, or an error code */ -size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs, - const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, - const ZSTD_CDict* cdict, - ZSTD_CCtx_params params, unsigned long long pledgedSrcSize); - - -#if defined (__cplusplus) -} -#endif - #endif /* ZSTDMT_COMPRESS_H */ + +#endif /* USE_EXTERNAL_ZSTD */ diff --git a/vendor/github.com/RaduBerinde/axisds/.gitignore b/vendor/github.com/RaduBerinde/axisds/.gitignore new file mode 100644 index 0000000..9941dd0 --- /dev/null +++ b/vendor/github.com/RaduBerinde/axisds/.gitignore @@ -0,0 +1,27 @@ +# If you prefer the allow list template instead of the deny list, see community template: +# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore +# +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, built with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +# Dependency directories (remove the comment below to include it) +# vendor/ + +# Go workspace file +go.work +go.work.sum + +# env file +.env + +.idea/ \ No newline at end of file diff --git a/vendor/github.com/cockroachdb/fifo/LICENSE b/vendor/github.com/RaduBerinde/axisds/LICENSE similarity index 100% rename from vendor/github.com/cockroachdb/fifo/LICENSE rename to vendor/github.com/RaduBerinde/axisds/LICENSE diff --git a/vendor/github.com/RaduBerinde/axisds/README.md b/vendor/github.com/RaduBerinde/axisds/README.md new file mode 100644 index 0000000..192cc83 --- /dev/null +++ b/vendor/github.com/RaduBerinde/axisds/README.md @@ -0,0 +1,11 @@ +# axisds: Axis (1D) data structures + +[![Build Status](https://github.com/RaduBerinde/axisds/actions/workflows/ci.yaml/badge.svg)](https://github.com/RaduBerinde/axisds/actions/workflows/ci.yaml) +[![Go Report 
Card](https://goreportcard.com/badge/github.com/RaduBerinde/axisds)](https://goreportcard.com/report/github.com/RaduBerinde/axisds) +[![GoDoc](https://godoc.org/github.com/RaduBerinde/axisds?status.svg)](https://godoc.org/github.com/RaduBerinde/axisds) + +This project contains data structures for entities ordered across a single +dimension, which can conceptually be embedded into the real number axis. + + * [regiontree](regiontree/README.md): A data structure that partitions a one-dimensional space into contiguous + regions, each associated with a property. diff --git a/vendor/github.com/RaduBerinde/axisds/boundary.go b/vendor/github.com/RaduBerinde/axisds/boundary.go new file mode 100644 index 0000000..0c2f696 --- /dev/null +++ b/vendor/github.com/RaduBerinde/axisds/boundary.go @@ -0,0 +1,98 @@ +// Copyright 2025 Radu Berinde. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package axisds + +// Boundary is the most basic unit used by this library. It represents a +// boundary on a 1D axis. +// +// The data structures in this package operate on half-open intervals +// like [startBoundary, endBoundary). +// +// Endpoint is a Boundary wrapper that creates more fine-grained intervals, +// allowing arbitrary types of intervals (in terms of inclusive vs exclusive +// endpoints). +// +// NOTE: it's equivalent to think of Boundaries as being infinitesimal and not +// corresponding to any valid value. 
In this case, there is no concept of +// inclusive or exclusive intervals. Endpoint can represent infinitesimal +// boundaries that are immediately before or after a value, allowing arbitrary +// types of intervals (with respect to valid values). +type Boundary any + +// CompareFn is a function that compares two boundaries and returns -1, 0, or +1. +type CompareFn[B Boundary] func(x, y B) int + +// Endpoint is a Boundary that extends a simpler boundary type to allow +// representing intervals with inclusive or exclusive end points. +type Endpoint[B Boundary] struct { + B B + // If PlusEpsilon is true, the boundary is considered to be infinitesimally + // after B. When used as an interval ending point, it corresponds to an + // inclusive end bound. When used as an interval starting point, it + // corresponds to an exclusive start bound. + PlusEpsilon bool +} + +// InclusiveOrExclusive is used to specify the type of interval endpoint. +type InclusiveOrExclusive int8 + +const Inclusive InclusiveOrExclusive = 1 +const Exclusive InclusiveOrExclusive = 2 + +// InclusiveIf returns Inclusive if the argument is true and Exclusive +// otherwise. +func InclusiveIf(inclusive bool) InclusiveOrExclusive { + if inclusive { + return Inclusive + } + return Exclusive +} + +func MakeStartEndpoint[B Boundary](startBoundary B, startTyp InclusiveOrExclusive) Endpoint[B] { + return Endpoint[B]{ + B: startBoundary, + PlusEpsilon: startTyp == Exclusive, + } +} + +func MakeEndEndpoint[B Boundary](endBoundary B, endTyp InclusiveOrExclusive) Endpoint[B] { + return Endpoint[B]{ + B: endBoundary, + PlusEpsilon: endTyp == Inclusive, + } +} + +func MakeEndpoints[B Boundary]( + startBoundary B, startTyp InclusiveOrExclusive, endBoundary B, endTyp InclusiveOrExclusive, +) (start, end Endpoint[B]) { + return MakeStartEndpoint(startBoundary, startTyp), MakeEndEndpoint(endBoundary, endTyp) +} + +// EndpointCompareFn returns a CompareFn for Endpoint[B]. 
+func EndpointCompareFn[B Boundary](bCmp CompareFn[B]) CompareFn[Endpoint[B]] { + return func(x, y Endpoint[B]) int { + if c := bCmp(x.B, y.B); c != 0 { + return c + } + switch { + case x.PlusEpsilon == y.PlusEpsilon: + return 0 + case x.PlusEpsilon: + return +1 + default: + return -1 + } + } +} diff --git a/vendor/github.com/RaduBerinde/axisds/formatter.go b/vendor/github.com/RaduBerinde/axisds/formatter.go new file mode 100644 index 0000000..eddfe97 --- /dev/null +++ b/vendor/github.com/RaduBerinde/axisds/formatter.go @@ -0,0 +1,55 @@ +// Copyright 2025 Radu Berinde. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package axisds + +import "fmt" + +// BoundaryFormatter is used to print boundaries. +type BoundaryFormatter[B Boundary] func(b B) string + +// MakeBoundaryFormatter creates a BoundaryFormatter[B] that uses fmt.Sprint(). +func MakeBoundaryFormatter[B Boundary]() BoundaryFormatter[B] { + return func(b B) string { + return fmt.Sprint(b) + } +} + +// IntervalFormatter is used to print intervals. +type IntervalFormatter[B Boundary] func(start, end B) string + +// MakeIntervalFormatter creates an IntervalFormatter[B] which uses the given +// formatter for B. 
+func MakeIntervalFormatter[B Boundary](bFmt BoundaryFormatter[B]) IntervalFormatter[B] { + return func(start, end B) string { + return fmt.Sprintf("[%s, %s)", bFmt(start), bFmt(end)) + } +} + +// MakeEndpointIntervalFormatter creates an IntervalFormatter[Endpoint[B]] which +// uses the given formatter for B. +func MakeEndpointIntervalFormatter[B Boundary]( + bFmt BoundaryFormatter[B], +) IntervalFormatter[Endpoint[B]] { + return func(start, end Endpoint[B]) string { + c1, c2 := '[', ')' + if start.PlusEpsilon { + c1 = '(' + } + if end.PlusEpsilon { + c2 = ']' + } + return fmt.Sprintf("%c%s, %s%c", c1, bFmt(start.B), bFmt(end.B), c2) + } +} diff --git a/vendor/github.com/RaduBerinde/axisds/parser.go b/vendor/github.com/RaduBerinde/axisds/parser.go new file mode 100644 index 0000000..3c8bda2 --- /dev/null +++ b/vendor/github.com/RaduBerinde/axisds/parser.go @@ -0,0 +1,120 @@ +// Copyright 2025 Radu Berinde. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package axisds + +import ( + "fmt" + "regexp" + "runtime/debug" +) + +// Parser is an interface for parsing intervals. +type Parser[B Boundary] interface { + // ParseBoundary is used to parse a "bare" boundary. Used for Endpoint[B]. + ParseBoundary(str string) (b B, err error) + + // ParseInterval parses an interval of the form `boundary1, boundary2` + // from the input and returns any remaining fields in the string. 
+ ParseInterval(input string) (start, end B, remaining string, err error) +} + +// MakeBasicParser creates a Parser[B] that uses Sscanf with `%v` for the +// boundaries. +func MakeBasicParser[B Boundary]() Parser[B] { + return basicParser[B]{} +} + +// MakeEndpointParser creates a Parser[Endpoint[B]]. +func MakeEndpointParser[B Boundary](p Parser[B]) Parser[Endpoint[B]] { + return &endpointParser[B]{p: p} +} + +// MustParseInterval parses a string into an interval; panics on errors. +func MustParseInterval[B Boundary](p Parser[B], input string) (start, end B) { + start, end, rem := MustParseIntervalPrefix(p, input) + if rem != "" { + panic(fmt.Sprintf("extra fields in input: %q", rem)) + } + return start, end +} + +// MustParseIntervalPrefix parses a string into an interval and an optional +// remainder string, panics on errors. +func MustParseIntervalPrefix[B Boundary]( + p Parser[B], input string, +) (start, end B, remaining string) { + start, end, remaining, err := p.ParseInterval(input) + if err != nil { + panic(err) + } + return start, end, remaining +} + +type basicParser[B Boundary] struct{} + +var _ Parser[int] = basicParser[int]{} + +func (p basicParser[B]) ParseBoundary(str string) (b B, err error) { + _, err = fmt.Sscanf(str, "%v", &b) + if err != nil { + return b, fmt.Errorf("malformed boundary %q: %v\n%s", str, err, string(debug.Stack())) + } + return b, nil +} + +func (p basicParser[B]) ParseInterval(input string) (start, end B, remaining string, err error) { + re := regexp.MustCompile(`^\[([^,]+), ([^)]+)\) *(.*)$`) + matches := re.FindStringSubmatch(input) + if matches == nil { + return start, end, "", fmt.Errorf("malformed interval %q", input) + } + start, err = p.ParseBoundary(matches[1]) + if err == nil { + end, err = p.ParseBoundary(matches[2]) + } + if err != nil { + return start, end, "", err + } + return start, end, matches[3], nil +} + +type endpointParser[B Boundary] struct { + p Parser[B] +} + +func (p endpointParser[B]) ParseBoundary(str 
string) (e Endpoint[B], err error) { + return e, fmt.Errorf("not implemented") +} + +func (p endpointParser[B]) ParseInterval( + input string, +) (start, end Endpoint[B], remaining string, err error) { + re := regexp.MustCompile(`^([(\[])([^,]+), ([^)]+)([)\]]) *(.*)$`) + matches := re.FindStringSubmatch(input) + if matches == nil { + return start, end, "", fmt.Errorf("malformed interval %q", input) + } + var b1, b2 B + b1, err = p.p.ParseBoundary(matches[2]) + if err == nil { + b2, err = p.p.ParseBoundary(matches[3]) + } + if err != nil { + return start, end, "", err + } + typ1 := InclusiveIf(matches[1] == "[") + typ2 := InclusiveIf(matches[4] == "]") + return MakeStartEndpoint(b1, typ1), MakeEndEndpoint(b2, typ2), matches[5], nil +} diff --git a/vendor/github.com/RaduBerinde/axisds/regiontree/README.md b/vendor/github.com/RaduBerinde/axisds/regiontree/README.md new file mode 100644 index 0000000..5be638c --- /dev/null +++ b/vendor/github.com/RaduBerinde/axisds/regiontree/README.md @@ -0,0 +1,246 @@ +# Region Tree + +## Overview + +Region Tree is a data structure that partitions a one-dimensional space into +contiguous segments (called *regions*), each associated with a value (called a +*property*). It is useful for representing piecewise-constant data along a line +(such as time intervals, indices, or coordinates) and supports efficient updates +and queries. Adjacent regions with the same property are automatically merged, +which helps keep the representation compact and easy to manage. + +**Key Features:** + +- **One-dimensional interval management:** Represents a set of non-overlapping + intervals (regions) along a 1D axis, each with an associated property/value. + +- **Automatic merging:** If two neighboring regions have equal properties, they + merge into a single region (no duplicate adjacent segments with the same + value). 
+ +- **Efficient updates:** Supports fast range updates (changing the property of + all regions in a given interval) in O(log N + K) time, where N is the number + of regions and K is the number of regions affected by the update. + +- **Efficient queries:** Can enumerate (iterate over) all regions overlapping a + query range, or all regions in the structure, skipping regions with the default + value. + +- **Generic design:** Works with any ordered boundary type (e.g. numbers, + coordinates, etc.) and any property type. You provide a comparison function + for boundaries and an equality function for properties. + +- **Copy-on-write cloning:** Supports cheap cloning of the entire tree. A clone + operation is O(1) (lazy copy), allowing you to fork versions of the region set + without an expensive deep copy. Subsequent modifications on either copy will + not affect the other (they diverge on first write). + +## How It Works + +Region Tree maintains a set of **boundary points** that divide the line into +contiguous regions. Each region covers the half-open interval from one boundary +up to (but not including) the next boundary, and has an associated property +value. By convention, the area up to the first boundary is considered to have +the "zero" property (the default value for the property type). The data +structure aims to store only boundaries where the property changes. + +Internally, the region tree uses a B-tree to store the regions by their start +boundary, enabling logarithmic search and update. However, you do not need to +interact with the B-tree directly - the region tree API provides +high-level methods to update and query the structure. + +## Usage Examples + +Below are simple examples illustrating how to create a region tree, update +regions, and query them. These examples use integer boundaries and integer +properties for simplicity. 
+
+### Creating a Region Tree
+
+To create a region tree, you call the `Make` function with a comparison
+function for the boundary type and an equality function for the property type.
+For basic types like integers, you can typically use a standard comparator and
+equality check:
+
+```go
+import (
+	"cmp"
+	"fmt"
+	"github.com/RaduBerinde/axisds/regiontree"
+)
+
+func main() {
+	// Create a region tree with int boundaries and int properties.
+	// Use a default integer comparison and equality check.
+	rt := regiontree.Make[int, int](
+		cmp.Compare[int], // comparison for boundaries
+		func(a, b int) bool { // equality check for properties
+			return a == b
+		},
+	)
+
+	fmt.Println(rt.IsEmpty()) // Expect "true" as no regions have been set yet.
+}
+```
+
+In the above snippet:
+- `cmp.Compare[int]` is a comparator that orders two `int` boundaries (this
+  could be a simple function returning negative, zero, or positive for a<b, a==b, or a>b).
+- The property equality function `func(a, b int) bool { return a == b }` tells
+  the region tree to treat two properties as equal if their values are exactly
+  equal. This determines when regions can be merged. (For non-numeric properties,
+  you might provide a different function. For example, if `P` is a struct, you may
+  want to compare a specific field for equality.)
+
+Initially, the tree is empty (no non-default regions), so `rt.IsEmpty()` returns
+true.
+
+### Setting (Inserting) a Region
+
+You don't insert regions by specifying explicit intervals; instead you **update
+a range** by providing a function that sets or modifies the property for that
+range. For example, to mark the interval [10, 30) with a property value of `1`:
+
+```go
+// Set the property to 1 for the interval [10, 30).
+rt.Update(10, 30, func(oldValue int) int {
+	return 1 // ignore oldValue, simply set property to 1
+})
+```
+
+After this update, the region [10, 30) has property `1`. Everything outside
+[10, 30) remains at the default property (`0` in this case). 
Internally, the +tree now has boundaries at 10 and 30, defining one explicit region: +- [10, 30) = 1 (and implicitly, [30, ∞) = 0 as default, and [−∞, 10) = 0 as + default, which are not stored since they are default). + +If we update an adjacent interval with the same property, the regions will +merge. For example: + +```go +// Set the property to 1 for [30, 40) as well. +rt.Update(30, 40, func(oldValue int) int { + return 1 +}) +``` + +Now the interval [10, 40) will become one continuous region with property 1. +Initially, a boundary at 30 was present, but after the second update the +properties on both sides of that boundary are equal (both sides became 1), so +the region tree removes the unnecessary boundary at 30. The regions [10, 30) and +[30, 40) coalesce into a single region [10, 40) = 1. + +### Overlapping Updates and Automatic Splitting + +If you update a range that overlaps existing regions with a *different* +property, region tree will split and/or trim regions as needed. For example: + +```go +// Set the property to 2 for [15, 25), overlapping the [10, 40) region. +rt.Update(15, 25, func(oldValue int) int { + return 2 +}) +``` + +Before this update, we had [10, 40) = 1. The update [15, 25) = 2 intersects the +middle of [10, 40). The region tree will handle this by splitting the original +region into pieces and assigning new values accordingly: +- [10, 15) remains with property 1 (the portion before 15, unchanged). +- [15, 25) will have property 2 (the updated range). +- [25, 40) reverts to property 1 (the portion after 25, which was part of the + original region and remains property 1). + +The structure automatically creates boundaries at 15 and 25 during the update. +After this operation, the tree contains three regions: +- [10, 15) = 1 +- [15, 25) = 2 +- [25, 40) = 1 + +Notice that [10, 15) and [25, 40) both have property 1, but they are not +adjacent - there is a region with property 2 in between - so they remain +separate segments in the tree. 
+ +### Removing a Region (Setting Back to Default) + +You can "remove" or clear a segment by updating it back to the default property +(zero value). For instance, continuing from the above state, if we want to clear +the interval [15, 25) (setting it back to 0): + +```go +// Reset the property to 0 (default) for [15, 25), effectively removing that segment. +rt.Update(15, 25, func(oldValue int) int { + return 0 // set back to default +}) +``` + +After this operation, the [15, 25) segment with property 2 is removed. The +regions [10, 15) = 1 and [25, 40) = 1 remain, with [15, 25) now being default +(0). + +Everything from 15 to 25 and outside 10 to 40 is just default (0) and not stored as a region. + +### Enumerating (Querying) Regions + +To retrieve the regions and their properties, you use the **Enumerate** methods. +The `Enumerate(start, end, emitFunc)` method iterates over all regions that +overlap the interval `[start, end)` and calls `emitFunc` for each with the +region's start, end, and property. There is also a convenience method +`EnumerateAll(emitFunc)` to iterate over *all* regions in the tree (with +non-default property). + +Let's enumerate all regions in our example tree: + +```go +rt.EnumerateAll(func(start, end int, prop int) bool { + fmt.Printf("[%d, %d) = %d\n", start, end, prop) + return true // return false if you want to stop early +}) +// Output: +// [10, 15) = 1 +// [25, 40) = 1 +``` + +As expected, it listed the two regions with property 1. Regions with the default +property (0) are omitted from enumeration output. If we want to query a specific +sub-range, say [0, 30), we can do: + +```go +rt.Enumerate(0, 30, func(start, end int, prop int) bool { + fmt.Printf("[%d, %d) = %d\n", start, end, prop) + return true +}) +// Output: +// [10, 15) = 1 +// [25, 30) = 1 +``` + +This prints only the portion of our regions that lies between 0 and 30. In this +case, [25, 40) was partially outside the query range (only [25, 30) falls in +[0,30)). 
+ +**Note on the callback:** The `emitFunc` should return a boolean. Return `true` +to continue enumeration, or `false` to stop early (useful if you only want the +first overlapping region, for example). The enumeration is done in ascending +order of region start boundaries. + +### Cloning the Tree + +If you need to work with a snapshot of the regions and modify it independently, +you can use `Clone()`: + +```go +clone := rt.Clone() +// 'clone' now shares structure with 'rt' but can be modified independently. + +// For example, modify the clone: +clone.Update(0, 100, func(old int) int { return 5 }) + +// The original 'rt' is unchanged; 'clone' now has its own set of regions. +``` + +Cloning is very fast (constant time) and implemented with copy-on-write. This +means initially the clone and original share the same underlying data, but as +soon as you perform an update on one of them, that update will not affect the +other. This is useful for branching scenarios or caching snapshots of the +interval state at a moment in time. \ No newline at end of file diff --git a/vendor/github.com/RaduBerinde/axisds/regiontree/region_tree.go b/vendor/github.com/RaduBerinde/axisds/regiontree/region_tree.go new file mode 100644 index 0000000..469f843 --- /dev/null +++ b/vendor/github.com/RaduBerinde/axisds/regiontree/region_tree.go @@ -0,0 +1,452 @@ +// Copyright 2025 Radu Berinde. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package regiontree
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/RaduBerinde/axisds"
+	"github.com/RaduBerinde/btreemap"
+)
+
+type Boundary = axisds.Boundary
+
+// Property is an arbitrary type that represents a property of a region of a
+// one-dimensional axis.
+type Property any
+
+// PropertyEqualFn is a function used to compare properties of two regions. If
+// it returns true, the two property values can be used interchangeably.
+//
+// Note that it is allowed for the function to "evolve" over time (but not
+// concurrently with a region tree method), with values that were not equal
+// becoming equal (but not the opposite: once two values are equal, they must
+// stay equal forever). For example, the property can be a monotonic expiration
+// time and as we update the current time, expired times become equal to the
+// zero property.
+//
+// A zero property value is any value that is equal to the zero P value.
+type PropertyEqualFn[P Property] func(a, b P) bool
+
+// T is a tree of regions which fragment a one-dimensional space. Regions have
+// boundaries of type B and each region maintains a property P. Neighboring
+// regions with equal properties are automatically merged.
+//
+// T supports lazy (copy-on-write) cloning via Clone().
+type T[B Boundary, P Property] struct {
+	cmp    axisds.CompareFn[B]
+	propEq PropertyEqualFn[P]
+	// Tree maps each region start boundary to its property. The region ends at
+	// the next region's start boundary. The last region has zero property.
+	tree *btreemap.BTreeMap[B, P]
+}
+
+// Make creates a new region tree with the given boundary and property
+// comparison functions.
+func Make[B Boundary, P Property](cmp axisds.CompareFn[B], propEq PropertyEqualFn[P]) T[B, P] {
+	t := T[B, P]{
+		cmp:    cmp,
+		propEq: propEq,
+	}
+	t.tree = btreemap.New[B, P](8, btreemap.CmpFunc[B](cmp))
+	return t
+}
+
+// Update the property for the given range. 
The updateProp function is called +// for all the regions within the range to calculate the new property. +// +// The runtime complexity is O(log N + K) where K is the number of regions we +// are updating. Note that if the ranges we update are mostly non-overlapping, +// this will be O(log N) on average. +func (t *T[B, P]) Update(start, end B, updateProp func(p P) P) { + // Get information about the region before start. + startBoundaryExists, beforeProp := t.startBoundaryInfo(start) + endBoundaryExists, afterProp := t.endBoundaryInfo(end) + + lastProp := beforeProp + var startProp P + var addStartBoundary bool + if !startBoundaryExists { + // See if we need to add the start boundary. + startProp = updateProp(beforeProp) + if !t.propEq(startProp, lastProp) { + // We will add the start boundary with startProp. + addStartBoundary = true + } + lastProp = startProp + } + + type update struct { + start B + prop P + delete bool + } + var updates []update + // Collect all the boundaries in the range that need to be updated or deleted. + t.tree.AscendFunc(btreemap.GE(start), btreemap.LT(end), func(rStart B, rProp P) bool { + prop := updateProp(rProp) + if t.propEq(prop, lastProp) { + // Boundary not necessary; remove it. + updates = append(updates, update{start: rStart, delete: true}) + } else if !t.propEq(prop, rProp) { + updates = append(updates, update{start: rStart, prop: prop, delete: false}) + } + lastProp = prop + return true + }) + + if addStartBoundary { + t.tree.ReplaceOrInsert(start, startProp) + } + + for _, u := range updates { + if u.delete { + t.tree.Delete(u.start) + } else { + t.tree.ReplaceOrInsert(u.start, u.prop) + } + } + + if t.propEq(lastProp, afterProp) { + if endBoundaryExists { + // End boundary can be removed. + t.tree.Delete(end) + } + } else { + if !endBoundaryExists { + // End boundary needs to be added. 
+			t.tree.ReplaceOrInsert(end, afterProp)
+		}
+	}
+}
+
+// startBoundaryInfo checks if the boundary exists and returns the property
+// for the region that contains or ends at the boundary.
+//
+// exists=true:
+//
+// start
+// |
+// v
+// ---|---beforeProp---|---------|---
+//
+// exists=false:
+//
+// start
+// |
+// v
+// ---|---beforeProp---|---
+//
+// If no regions contain start, beforeProp is zero.
+func (t *T[B, P]) startBoundaryInfo(start B) (exists bool, beforeProp P) {
+	t.tree.DescendFunc(btreemap.LE(start), btreemap.Min[B](), func(rStart B, rProp P) bool {
+		if !exists && t.cmp(rStart, start) == 0 {
+			exists = true
+			// Do one more step to get the property before the boundary.
+			return true
+		}
+		beforeProp = rProp
+		return false
+	})
+	return exists, beforeProp
+}
+
+// endBoundaryInfo checks if the boundary exists and returns the property
+// for the region that contains or starts at the boundary.
+//
+// exists=true:
+//
+// end
+// |
+// v
+// ---|-----------|---afterProp---|---
+//
+// exists=false:
+//
+// end
+// |
+// v
+// ---|---afterProp---|---
+//
+// If no regions contain end, afterProp is zero.
+func (t *T[B, P]) endBoundaryInfo(end B) (exists bool, afterProp P) {
+	t.tree.DescendFunc(btreemap.LE(end), btreemap.Min[B](), func(rStart B, rProp P) bool {
+		exists = t.cmp(rStart, end) == 0
+		afterProp = rProp
+		return false
+	})
+	return exists, afterProp
+}
+
+// Enumerate all regions in the range [start, end) with non-zero property.
+//
+// Two consecutive regions can "touch" but not overlap; if they touch, their
+// properties are not equal.
+//
+// Enumerate stops once emit() returns false.
+//
+// Enumerate can be called concurrently with other read-only methods (Enumerate,
+// EnumerateAll, Any). 
+func (t *T[B, P]) Enumerate(start, end B, emit func(start, end B, prop P) bool) { + t.enumerate(start, end, emit, false /* with GC */) +} + +// EnumerateWithGC is a variant of Enumerate which internally deletes +// unnecessary boundaries between regions with properties that have become +// equal. +// +// This variant is only useful to improve performance when the PropertyEqualFn +// can change over time. It cannot be called concurrently with any other +// methods. +func (t *T[B, P]) EnumerateWithGC(start, end B, emit func(start, end B, prop P) bool) { + t.enumerate(start, end, emit, true /* with GC */) +} + +func (t *T[B, P]) enumerate(start, end B, emit func(start, end B, prop P) bool, withGC bool) { + if t.tree.Len() < 2 || t.cmp(start, end) >= 0 { + return + } + var eh enumerateHelper[B, P] + // Handle the case where we don't have a boundary equal to start; we have to + // find the region that contains it. + t.tree.DescendFunc(btreemap.LE(start), btreemap.Min[B](), func(rStart B, rProp P) bool { + if t.cmp(rStart, start) < 0 { + // This is the first addRegion call, so we won't emit anything. + eh.addRegion(start, rProp, t.propEq, nil) + } + return false + }) + var toDelete []B + t.tree.AscendFunc(btreemap.GE(start), btreemap.LT(end), func(rStart B, rProp P) bool { + eh.addRegion(rStart, rProp, t.propEq, emit) + if withGC && eh.canDeleteLastBoundary { + toDelete = append(toDelete, rStart) + } + return !eh.stopEmitting + }) + eh.finish(end, t.propEq, emit) + for _, b := range toDelete { + t.tree.Delete(b) + } +} + +// Any returns true if [start, end) overlaps any region with property that +// satisfies the given function. +// +// Any can be called concurrently with other read-only methods (Enumerate, +// EnumerateAll, Any). 
+func (t *T[B, P]) Any(start, end B, propFn func(prop P) bool) bool { + return t.any(start, end, propFn, false /* withGC */) +} + +// AnyWithGC is a variant of Any which internally deletes unnecessary boundaries +// between regions with properties that have become equal. +// +// This variant is only useful to improve performance when the PropertyEqualFn +// can change over time. It cannot be called concurrently with any other +// methods. +func (t *T[B, P]) AnyWithGC(start, end B, propFn func(prop P) bool) bool { + return t.any(start, end, propFn, true /* withGC */) +} + +// Any returns true if [start, end) overlaps any region with property that +// satisfies the given function. +func (t *T[B, P]) any(start, end B, propFn func(prop P) bool, withGC bool) bool { + if t.cmp(start, end) >= 0 { + return false + } + startBoundaryExists, lastProp := t.startBoundaryInfo(start) + if !startBoundaryExists && propFn(lastProp) { + return true + } + found := false + var toDelete []B + t.tree.AscendFunc(btreemap.GE(start), btreemap.LT(end), func(rStart B, rProp P) bool { + if withGC && t.propEq(rProp, lastProp) { + toDelete = append(toDelete, rStart) + } + lastProp = rProp + if propFn(rProp) { + found = true + return false + } + return true + }) + for _, b := range toDelete { + t.tree.Delete(b) + } + return found +} + +// EnumerateAll emits all regions with non-zero property. +// +// Two consecutive regions can "touch" but not overlap; if they touch, their +// properties are not equal. +// +// EnumerateAll stops once emit() returns false. +// +// Enumerate can be called concurrently with other read-only methods (Enumerate, +// EnumerateAll, Any). +func (t *T[B, P]) EnumerateAll(emit func(start, end B, prop P) bool) { + t.enumerateAll(emit, false /* withGC */) +} + +// EnumerateAllWithGC is a variant of EnumerateAll which internally deletes +// unnecessary boundaries between regions with properties that have become +// equal. 
+//
+// This variant is only useful to improve performance when the PropertyEqualFn
+// can change over time. It cannot be called concurrently with any other
+// methods.
+func (t *T[B, P]) EnumerateAllWithGC(emit func(start, end B, prop P) bool) {
+	t.enumerateAll(emit, true /* withGC */)
+}
+
+func (t *T[B, P]) enumerateAll(emit func(start, end B, prop P) bool, withGC bool) {
+	var eh enumerateHelper[B, P]
+	var toDelete []B
+	t.tree.AscendFunc(btreemap.Min[B](), btreemap.Max[B](), func(rStart B, rProp P) bool {
+		eh.addRegion(rStart, rProp, t.propEq, emit)
+		if withGC && eh.canDeleteLastBoundary {
+			toDelete = append(toDelete, rStart)
+		}
+		return !eh.stopEmitting
+	})
+	for _, b := range toDelete {
+		t.tree.Delete(b)
+	}
+}
+
+type enumerateHelper[B Boundary, P Property] struct {
+	lastBoundary B
+	lastProp     P
+	initialized  bool
+	stopEmitting bool
+	// canDeleteLastBoundary is set by addRegion when the two last regions had
+	// equal properties.
+	canDeleteLastBoundary bool
+}
+
+func (eh *enumerateHelper[B, P]) addRegion(
+	boundary B, prop P, propEq PropertyEqualFn[P], emitFn func(start, end B, prop P) bool,
+) {
+	if !eh.initialized {
+		eh.lastBoundary = boundary
+		eh.lastProp = prop
+		eh.initialized = true
+		return
+	}
+	eh.canDeleteLastBoundary = propEq(eh.lastProp, prop)
+	if eh.canDeleteLastBoundary || eh.stopEmitting {
+		return
+	}
+	var zeroProp P
+	if !propEq(zeroProp, eh.lastProp) && !emitFn(eh.lastBoundary, boundary, eh.lastProp) {
+		eh.stopEmitting = true
+	}
+	eh.lastBoundary = boundary
+	eh.lastProp = prop
+}
+
+func (eh *enumerateHelper[B, P]) finish(
+	end B, propEq PropertyEqualFn[P], emitFn func(start, end B, prop P) bool,
+) {
+	var zeroProp P
+	if eh.initialized && !eh.stopEmitting && !propEq(zeroProp, eh.lastProp) {
+		emitFn(eh.lastBoundary, end, eh.lastProp)
+	}
+}
+
+// IsEmpty returns true if the set contains no non-expired spans. 
+func (t *T[B, P]) IsEmpty() bool { + if t.tree.Len() < 2 { + return true + } + // Check that we have regions with non-zero property. + var toDelete []B + t.tree.AscendFunc(btreemap.Min[B](), btreemap.Max[B](), func(rStart B, rProp P) bool { + var zeroProp P + if t.propEq(rProp, zeroProp) { + toDelete = append(toDelete, rStart) + return true + } + return false + }) + for _, r := range toDelete { + t.tree.Delete(r) + } + return t.tree.Len() < 2 +} + +// InternalLen returns the number of region boundaries stored internally. +func (t *T[B, P]) InternalLen() int { + return t.tree.Len() +} + +// Clone creates a lazy clone of T with the same properties and regions. The new +// tree can be modified independently. +// +// This operation is constant time; it can cause some minor slowdown of future +// updates because of copy-on-write logic. +func (t *T[B, P]) Clone() T[B, P] { + return T[B, P]{ + cmp: t.cmp, + propEq: t.propEq, + tree: t.tree.Clone(), + } +} + +// String formats all regions, one per line. +func (t *T[B, P]) String(iFmt axisds.IntervalFormatter[B]) string { + var b strings.Builder + var eh enumerateHelper[B, P] + t.tree.AscendFunc(btreemap.Min[B](), btreemap.Max[B](), func(rStart B, rProp P) bool { + eh.addRegion(rStart, rProp, t.propEq, func(start, end B, prop P) bool { + fmt.Fprintf(&b, "%s = %v\n", iFmt(start, end), prop) + return true + }) + return true + }) + if b.Len() == 0 { + return "" + } + return b.String() +} + +// CheckInvariants can be used in testing builds to verify internal invariants. 
+func (t *T[B, P]) CheckInvariants() { + var lastBoundary B + var lastProp P + lastBoundarySet := false + t.tree.AscendFunc(btreemap.Min[B](), btreemap.Max[B](), func(rStart B, rProp P) bool { + if lastBoundarySet && t.cmp(lastBoundary, rStart) >= 0 { + panic("region boundaries not increasing") + } + if !t.propEq(rProp, rProp) { + panic("region property is not equal to itself") + } + lastBoundary = rStart + lastBoundarySet = true + lastProp = rProp + return true + }) + + // Last region should have the zero property. + var zeroProp P + if !t.propEq(lastProp, zeroProp) { + panic("last region must always have zero property") + } +} diff --git a/vendor/github.com/cockroachdb/fifo/.gitignore b/vendor/github.com/RaduBerinde/btreemap/.gitignore similarity index 96% rename from vendor/github.com/cockroachdb/fifo/.gitignore rename to vendor/github.com/RaduBerinde/btreemap/.gitignore index 6eb7626..4e05b06 100644 --- a/vendor/github.com/cockroachdb/fifo/.gitignore +++ b/vendor/github.com/RaduBerinde/btreemap/.gitignore @@ -21,5 +21,7 @@ go.work go.work.sum -.idea/ +# env file +.env +.idea/ diff --git a/vendor/github.com/mschoch/smat/LICENSE b/vendor/github.com/RaduBerinde/btreemap/LICENSE similarity index 99% rename from vendor/github.com/mschoch/smat/LICENSE rename to vendor/github.com/RaduBerinde/btreemap/LICENSE index 7a4a3ea..261eeb9 100644 --- a/vendor/github.com/mschoch/smat/LICENSE +++ b/vendor/github.com/RaduBerinde/btreemap/LICENSE @@ -1,4 +1,3 @@ - Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ @@ -199,4 +198,4 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file + limitations under the License. 
diff --git a/vendor/github.com/RaduBerinde/btreemap/README.md b/vendor/github.com/RaduBerinde/btreemap/README.md new file mode 100644 index 0000000..620b9ea --- /dev/null +++ b/vendor/github.com/RaduBerinde/btreemap/README.md @@ -0,0 +1,151 @@ +BTree map based on [google/btree](https://github.com/google/btree) + +[![Build Status](https://github.com/RaduBerinde/btreemap/actions/workflows/ci.yaml/badge.svg)](https://github.com/RaduBerinde/btreemap/actions/workflows/ci.yaml) +[![Go Report Card](https://goreportcard.com/badge/github.com/RaduBerinde/btreemap)](https://goreportcard.com/report/github.com/RaduBerinde/btreemap) +[![GoDoc](https://godoc.org/github.com/RaduBerinde/btreemap?status.svg)](https://godoc.org/github.com/RaduBerinde/btreemap) + +# btreemap + +> An **ordered, in‑memory B‑Tree map for Go** – a generic key‑value store with fast range iteration and copy‑on‑write snapshots. +> +> **Forked from [`github.com/google/btree`](https://github.com/google/btree)** and modernized for Go1.22+ generics and [`iter`](https://pkg.go.dev/iter) sequences. + +--- + +## Features + +* **Ordered map** – keys are kept sorted; all operations are *O(log(n))*. +* **Generic** – works with any key/value types via a user‑supplied compare function. +* **Fast range iteration** – iterate forward or backward between arbitrary bounds without allocation. +* **Copy‑on‑write snapshots** – `Clone` produces a cheap, immutable view that can be read concurrently with the parent. +* **Custom degree / free‑list** – tune memory use vs. CPU by choosing the node degree and enabling node reuse. + +> **Concurrency note** +> *Reads* may run concurrently on the *same* `BTreeMap`. *Writes* (any method that mutates the map) must be serialized. +> `Clone` creates an independent snapshot that may be mutated safely in parallel with the original. + +## Quick start + +```go +package main + +import ( + "cmp" + "fmt" + "github.com/RaduBerinde/btreemap" +) + +func main() { + // A 2‑3‑4 tree mapping int → string. 
+ m := btreemap.New[int,string](2, cmp.Compare[int]) + + // Insert / replace + _, _, replaced := m.ReplaceOrInsert(42, "meaning") + fmt.Println(replaced) // false – key was new + + // Lookup + _, v, ok := m.Get(42) + fmt.Println(ok, v) // true meaning + + // Range iteration: 10 ≤ k < 100 + for k, v := range m.Ascend(btreemap.GE(10), btreemap.LT(100)) { + fmt.Printf("%d → %s\n", k, v) + } + + fmt.Println("len before:", m.Len()) + + // Delete single key + m.Delete(42) + fmt.Println("len after:", m.Len()) +} +``` + +--- + +## API overview + +```go +// Construction +func New[K,V any](degree int, cmp btreemap.CmpFunc[K]) *BTreeMap[K,V] +func NewWithFreeList[K,V any](degree int, cmp CmpFunc[K], fl *FreeList[K,V]) *BTreeMap[K,V] + +// Mutations +ReplaceOrInsert(key, value) (oldKey, oldValue, replaced bool) +Delete(key) (oldKey, oldValue, found bool) +DeleteMin() / DeleteMax() +Clear(addNodesToFreeList bool) + +// Queries +Get(key) (key, value, found bool) +Has(key) bool +Min()/Max() (key, value, found bool) +Len() int + +// Iteration (log(n) to first element, then amortized O(1)) +Ascend(start LowerBound[K], stop UpperBound[K]) iter.Seq2[K,V] +Descend(start UpperBound[K], stop LowerBound[K]) iter.Seq2[K,V] + +// Snapshots +Clone() *BTreeMap[K,V] +``` + +### Bounds helpers + +```go +// Lower bound +btreemap.Min[T]() // unbounded (−∞) +btreemap.GE(key) // ≥ key (inclusive) +btreemap.GT(key) // > key (exclusive) + +// Upper bound +btreemap.Max[T]() // unbounded (+∞) +btreemap.LE(key) // ≤ key (inclusive) +btreemap.LT(key) // < key (exclusive) +``` + +### Example: descending top‑N + +```go +// Print the 5 largest entries. +count := 0 +for k, v := range m.Descend(btreemap.Max[int](), btreemap.Min[int]()) { + fmt.Println(k, v) + count++ + if count == 5 { + break + } +} +``` + +### Example: snapshot for concurrent readers + +```go +snapshot := m.Clone() // cheap, O(1) +go func() { + // Writer goroutine mutates the original map. 
+	for i := 0; i < 1_000; i++ {
+		m.ReplaceOrInsert(i, "val")
+	}
+}()
+
+// Reader goroutine works on an immutable view.
+for k, v := range snapshot.Ascend(btreemap.Min[int](), btreemap.Max[int]()) {
+	fmt.Println(k, v)
+}
+```
+
+---
+
+## Tuning
+
+| Parameter | Effect |
+|-----------|--------|
+| **degree** | Maximum node size = `2*degree-1`. Small degrees use more pointers but less per‑node scan time (good for small maps). Higher degrees shrink tree height and improve cache locality for large maps. Typical values: `2…8`. |
+| **FreeList** | Reuses nodes after deletes/`Clear`, reducing GC pressure in high‑churn workloads. Use `NewWithFreeList` if you need fine control over freelist size or sharing between many maps. |
+
+---
+
+## License
+
+Apache 2.0 – see [LICENSE](LICENSE).
+Original work © Google Inc. 2014‑2024
\ No newline at end of file
diff --git a/vendor/github.com/RaduBerinde/btreemap/btreemap.go b/vendor/github.com/RaduBerinde/btreemap/btreemap.go
new file mode 100644
index 0000000..6bfd57f
--- /dev/null
+++ b/vendor/github.com/RaduBerinde/btreemap/btreemap.go
@@ -0,0 +1,350 @@
+// Copyright 2014-2022 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package btreemap implements an ordered key-value map using an in-memory
+// B-Tree of arbitrary degree.
+//
+// The internal B-Tree code is based on github.com/google/btree.
+package btreemap
+
+import "iter"
+
+// New creates a new map backed by a B-Tree with the given degree.
+// +// New(2), for example, will create a 2-3-4 tree, where each node contains 1 to 3 +// items and 2 to 4 children). +// +// The passed-in CmpFunc determines how objects of type T are ordered. For +// ordered basic types, use cmp.Compare. +func New[K any, V any](degree int, cmp CmpFunc[K]) *BTreeMap[K, V] { + return NewWithFreeList(degree, cmp, NewFreeList[K, V](DefaultFreeListSize)) +} + +// NewWithFreeList creates a new map that uses the given node free list. +func NewWithFreeList[K any, V any](degree int, cmp CmpFunc[K], f *FreeList[K, V]) *BTreeMap[K, V] { + if degree <= 1 { + panic("bad degree") + } + return &BTreeMap[K, V]{ + degree: degree, + cow: ©OnWriteContext[K, V]{freelist: f, cmp: cmp}, + } +} + +// BTreeMap implements an ordered key-value map using an in-memory B-Tree of +// arbitrary degree. It allows easy insertion, removal, and iteration. +// +// Write operations are not safe for concurrent mutation by multiple goroutines, +// but Read operations are. +type BTreeMap[K any, V any] struct { + degree int + length int + root *node[K, V] + cow *copyOnWriteContext[K, V] +} + +// CmpFunc returns: +// - 0 if the two keys are equal; +// - a negative number if a < b; +// - a positive number if a > b. +type CmpFunc[K any] func(a, b K) int + +// Clone clones the btree, lazily. Clone should not be called concurrently, +// but the original tree (t) and the new tree (t2) can be used concurrently +// once the Clone call completes. +// +// The internal tree structure of b is marked read-only and shared between t and +// t2. Writes to both t and t2 use copy-on-write logic, creating new nodes +// whenever one of b's original nodes would have been modified. Read operations +// should have no performance degradation. 
Write operations for both t and t2 +// will initially experience minor slow-downs caused by additional allocs and +// copies due to the aforementioned copy-on-write logic, but should converge to +// the original performance characteristics of the original tree. +func (t *BTreeMap[K, V]) Clone() (t2 *BTreeMap[K, V]) { + // Create two entirely new copy-on-write contexts. + // This operation effectively creates three trees: + // the original, shared nodes (old b.cow) + // the new b.cow nodes + // the new out.cow nodes + cow1, cow2 := *t.cow, *t.cow + out := *t + t.cow = &cow1 + out.cow = &cow2 + return &out +} + +// ReplaceOrInsert adds the given item to the tree. If an item in the tree +// already equals the given one, it is removed from the tree and returned, +// and the second return value is true. Otherwise, (zeroValue, false) +// +// nil cannot be added to the tree (will panic). +func (t *BTreeMap[K, V]) ReplaceOrInsert(key K, value V) (_ K, _ V, replaced bool) { + if t.root == nil { + t.root = t.cow.newNode() + t.root.items = append(t.root.items, kv[K, V]{k: key, v: value}) + t.length++ + return + } else { + t.root = t.root.mutableFor(t.cow) + if len(t.root.items) >= t.maxItems() { + item2, second := t.root.split(t.maxItems() / 2) + oldroot := t.root + t.root = t.cow.newNode() + t.root.items = append(t.root.items, item2) + t.root.children = append(t.root.children, oldroot, second) + } + } + out, outb := t.root.insert(kv[K, V]{k: key, v: value}, t.maxItems()) + if !outb { + t.length++ + } + return out.k, out.v, outb +} + +// Delete removes an item equal to the passed in item from the tree, returning +// it. If no such item exists, returns (zeroValue, false). +func (t *BTreeMap[K, V]) Delete(key K) (K, V, bool) { + return t.deleteItem(key, removeItem) +} + +// DeleteMin removes the smallest item in the tree and returns it. +// If no such item exists, returns (zeroValue, false). 
+func (t *BTreeMap[K, V]) DeleteMin() (K, V, bool) { + var zero K + return t.deleteItem(zero, removeMin) +} + +// DeleteMax removes the largest item in the tree and returns it. +// If no such item exists, returns (zeroValue, false). +func (t *BTreeMap[K, V]) DeleteMax() (K, V, bool) { + var zero K + return t.deleteItem(zero, removeMax) +} + +func (t *BTreeMap[K, V]) deleteItem(key K, typ toRemove) (_ K, _ V, _ bool) { + if t.root == nil || len(t.root.items) == 0 { + return + } + t.root = t.root.mutableFor(t.cow) + out, outb := t.root.remove(key, t.minItems(), typ) + if len(t.root.items) == 0 && len(t.root.children) > 0 { + oldroot := t.root + t.root = t.root.children[0] + t.cow.freeNode(oldroot) + } + if outb { + t.length-- + } + return out.k, out.v, outb +} + +// LowerBound defines an (optional) lower bound for iteration. +type LowerBound[K any] bound[K] + +// Min returns a LowerBound that does not limit the lower bound of the iteration. +func Min[K any]() LowerBound[K] { return LowerBound[K]{kind: boundKindNone} } + +// GE returns an inclusive lower bound. +func GE[K any](key K) LowerBound[K] { return LowerBound[K]{key: key, kind: boundKindInclusive} } + +// GT returns an exclusive lower bound. +func GT[K any](key K) LowerBound[K] { return LowerBound[K]{key: key, kind: boundKindExclusive} } + +// UpperBound defines an (optional) upper bound for iteration. +type UpperBound[K any] bound[K] + +// Max returns an UpperBound that does not limit the upper bound of the iteration. +func Max[K any]() UpperBound[K] { return UpperBound[K]{kind: boundKindNone} } + +// LE returns an inclusive upper bound. +func LE[K any](key K) UpperBound[K] { return UpperBound[K]{key: key, kind: boundKindInclusive} } + +// LT returns an exclusive upper bound. +func LT[K any](key K) UpperBound[K] { return UpperBound[K]{key: key, kind: boundKindExclusive} } + +// AscendFunc calls yield() for all elements between the start and stop bounds, +// in ascending order. 
+func (t *BTreeMap[K, V]) AscendFunc(
+	start LowerBound[K], stop UpperBound[K], yield func(key K, value V) bool,
+) {
+	if t.root != nil {
+		t.root.ascend(start, stop, false, yield)
+	}
+}
+
+// Ascend returns an iterator which yields all elements between the start and
+// stop bounds, in ascending order.
+func (t *BTreeMap[K, V]) Ascend(start LowerBound[K], stop UpperBound[K]) iter.Seq2[K, V] {
+	return func(yield func(key K, value V) bool) {
+		if t.root != nil {
+			t.root.ascend(start, stop, false, yield)
+		}
+	}
+}
+
+// DescendFunc calls yield() for all elements between the start and stop bounds,
+// in descending order.
+func (t *BTreeMap[K, V]) DescendFunc(
+	start UpperBound[K], stop LowerBound[K], yield func(key K, value V) bool,
+) {
+	if t.root != nil {
+		t.root.descend(start, stop, false, yield)
+	}
+}
+
+// Descend returns an iterator which yields all elements between the start and
+// stop bounds, in descending order.
+func (t *BTreeMap[K, V]) Descend(start UpperBound[K], stop LowerBound[K]) iter.Seq2[K, V] {
+	return func(yield func(key K, value V) bool) {
+		if t.root != nil {
+			t.root.descend(start, stop, false, yield)
+		}
+	}
+}
+
+// Get looks for the key in the tree, returning (key, value, true) if found, or
+// (0, 0, false) otherwise.
+func (t *BTreeMap[K, V]) Get(key K) (_ K, _ V, _ bool) {
+	if t.root == nil {
+		return
+	}
+	return t.root.get(key)
+}
+
+// Min returns the smallest key and associated value in the tree, or
+// (0, 0, false) if the tree is empty.
+func (t *BTreeMap[K, V]) Min() (K, V, bool) {
+	return min(t.root)
+}
+
+// Max returns the largest key and associated value in the tree, or
+// (0, 0, false) if the tree is empty.
+func (t *BTreeMap[K, V]) Max() (K, V, bool) {
+	return max(t.root)
+}
+
+// Has returns true if the given key is in the tree.
+func (t *BTreeMap[K, V]) Has(key K) bool {
+	_, _, ok := t.Get(key)
+	return ok
+}
+
+// Len returns the number of items currently in the tree.
+func (t *BTreeMap[K, V]) Len() int { + return t.length +} + +// Clear removes all items from the btree. If addNodesToFreelist is true, +// t's nodes are added to its freelist as part of this call, until the freelist +// is full. Otherwise, the root node is simply dereferenced and the subtree +// left to Go's normal GC processes. +// +// This can be much faster +// than calling Delete on all elements, because that requires finding/removing +// each element in the tree and updating the tree accordingly. It also is +// somewhat faster than creating a new tree to replace the old one, because +// nodes from the old tree are reclaimed into the freelist for use by the new +// one, instead of being lost to the garbage collector. +// +// This call takes: +// +// O(1): when addNodesToFreelist is false, this is a single operation. +// O(1): when the freelist is already full, it breaks out immediately +// O(freelist size): when the freelist is empty and the nodes are all owned +// by this tree, nodes are added to the freelist until full. +// O(tree size): when all nodes are owned by another tree, all nodes are +// iterated over looking for nodes to add to the freelist, and due to +// ownership, none are. +func (t *BTreeMap[K, V]) Clear(addNodesToFreelist bool) { + if t.root != nil && addNodesToFreelist { + t.root.reset(t.cow) + } + t.root, t.length = nil, 0 +} + +// maxItems returns the max number of items to allow per node. +func (t *BTreeMap[K, V]) maxItems() int { + return t.degree*2 - 1 +} + +// minItems returns the min number of items to allow per node (ignored for the +// root node). +func (t *BTreeMap[K, V]) minItems() int { + return t.degree - 1 +} + +// copyOnWriteContext pointers determine node ownership... a tree with a write +// context equivalent to a node's write context is allowed to modify that node. +// A tree whose write context does not match a node's is not allowed to modify +// it, and must create a new, writable copy (IE: it's a Clone). 
+// +// When doing any write operation, we maintain the invariant that the current +// node's context is equal to the context of the tree that requested the write. +// We do this by, before we descend into any node, creating a copy with the +// correct context if the contexts don't match. +// +// Since the node we're currently visiting on any write has the requesting +// tree's context, that node is modifiable in place. Children of that node may +// not share context, but before we descend into them, we'll make a mutable +// copy. +type copyOnWriteContext[K any, V any] struct { + freelist *FreeList[K, V] + cmp CmpFunc[K] +} + +type bound[K any] struct { + kind boundKind + key K +} + +type boundKind uint8 + +const ( + boundKindNone boundKind = iota + boundKindInclusive + boundKindExclusive +) + +func (c *copyOnWriteContext[K, V]) newNode() *node[K, V] { + n := c.freelist.newNode() + n.cow = c + return n +} + +type freeType int + +const ( + ftFreelistFull freeType = iota // node was freed (available for GC, not stored in freelist) + ftStored // node was stored in the freelist for later use + ftNotOwned // node was ignored by COW, since it's owned by another one +) + +// freeNode frees a node within a given COW context, if it's owned by that +// context. It returns what happened to the node (see freeType const +// documentation). +func (c *copyOnWriteContext[K, V]) freeNode(n *node[K, V]) freeType { + if n.cow == c { + // clear to allow GC + n.items.truncate(0) + n.children.truncate(0) + n.cow = nil + if c.freelist.freeNode(n) { + return ftStored + } else { + return ftFreelistFull + } + } else { + return ftNotOwned + } +} diff --git a/vendor/github.com/RaduBerinde/btreemap/free_list.go b/vendor/github.com/RaduBerinde/btreemap/free_list.go new file mode 100644 index 0000000..6f6781c --- /dev/null +++ b/vendor/github.com/RaduBerinde/btreemap/free_list.go @@ -0,0 +1,58 @@ +// Copyright 2014-2022 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package btreemap + +import "sync" + +const DefaultFreeListSize = 32 + +// FreeList represents a free list of btree nodes. By default each +// BTreeMap has its own FreeList, but multiple BTrees can share the same +// FreeList, in particular when they're created with Clone. +// Two Btrees using the same freelist are safe for concurrent write access. +type FreeList[K, V any] struct { + mu sync.Mutex + freelist []*node[K, V] +} + +// NewFreeList creates a new free list. +// size is the maximum size of the returned free list. +func NewFreeList[K any, V any](size int) *FreeList[K, V] { + return &FreeList[K, V]{freelist: make([]*node[K, V], 0, size)} +} + +func (f *FreeList[K, V]) newNode() (n *node[K, V]) { + f.mu.Lock() + index := len(f.freelist) - 1 + if index < 0 { + f.mu.Unlock() + return new(node[K, V]) + } + n = f.freelist[index] + f.freelist[index] = nil + f.freelist = f.freelist[:index] + f.mu.Unlock() + return +} + +func (f *FreeList[K, V]) freeNode(n *node[K, V]) (out bool) { + f.mu.Lock() + if len(f.freelist) < cap(f.freelist) { + f.freelist = append(f.freelist, n) + out = true + } + f.mu.Unlock() + return +} diff --git a/vendor/github.com/RaduBerinde/btreemap/node.go b/vendor/github.com/RaduBerinde/btreemap/node.go new file mode 100644 index 0000000..f33e2f2 --- /dev/null +++ b/vendor/github.com/RaduBerinde/btreemap/node.go @@ -0,0 +1,445 @@ +// Copyright 2014-2022 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package btreemap + +import ( + "fmt" + "io" + "sort" + "strings" +) + +// node is an internal node in a tree. +// +// It must at all times maintain the invariant that either +// - len(children) == 0, len(items) unconstrained +// - len(children) == len(items) + 1 +type node[K any, V any] struct { + items items[kv[K, V]] + children items[*node[K, V]] + cow *copyOnWriteContext[K, V] +} + +type kv[K any, V any] struct { + k K + v V +} + +func (n *node[K, V]) mutableFor(cow *copyOnWriteContext[K, V]) *node[K, V] { + if n.cow == cow { + return n + } + out := cow.newNode() + if cap(out.items) >= len(n.items) { + out.items = out.items[:len(n.items)] + } else { + out.items = make(items[kv[K, V]], len(n.items), cap(n.items)) + } + copy(out.items, n.items) + // Copy children + if cap(out.children) >= len(n.children) { + out.children = out.children[:len(n.children)] + } else { + out.children = make(items[*node[K, V]], len(n.children), cap(n.children)) + } + copy(out.children, n.children) + return out +} + +func (n *node[K, V]) mutableChild(i int) *node[K, V] { + c := n.children[i].mutableFor(n.cow) + n.children[i] = c + return c +} + +// split splits the given node at the given index. The current node shrinks, +// and this function returns the item that existed at that index and a new node +// containing all items/children after it. 
+func (n *node[K, V]) split(i int) (kv[K, V], *node[K, V]) {
+	item := n.items[i]
+	next := n.cow.newNode()
+	next.items = append(next.items, n.items[i+1:]...)
+	n.items.truncate(i)
+	if len(n.children) > 0 {
+		next.children = append(next.children, n.children[i+1:]...)
+		n.children.truncate(i + 1)
+	}
+	return item, next
+}
+
+// maybeSplitChild checks if a child should be split, and if so splits it.
+// Returns whether or not a split occurred.
+func (n *node[K, V]) maybeSplitChild(i, maxItems int) bool {
+	if len(n.children[i].items) < maxItems {
+		return false
+	}
+	first := n.mutableChild(i)
+	item, second := first.split(maxItems / 2)
+	n.items.insertAt(i, item)
+	n.children.insertAt(i+1, second)
+	return true
+}
+
+// insert inserts an item into the subtree rooted at this node, making sure
+// no nodes in the subtree exceed maxItems items. Should an equivalent item be
+// found/replaced by insert, it will be returned.
+func (n *node[K, V]) insert(item kv[K, V], maxItems int) (_ kv[K, V], _ bool) {
+	i, found := findKV(n.items, item.k, n.cow.cmp)
+	if found {
+		out := n.items[i]
+		n.items[i] = item
+		return out, true
+	}
+	if len(n.children) == 0 {
+		n.items.insertAt(i, item)
+		return
+	}
+	if n.maybeSplitChild(i, maxItems) {
+		inTree := n.items[i]
+		switch c := n.cow.cmp(item.k, inTree.k); {
+		case c < 0:
+			// no change, we want first split node
+		case c > 0:
+			i++ // we want second split node
+		default:
+			out := n.items[i]
+			n.items[i] = item
+			return out, true
+		}
+	}
+	return n.mutableChild(i).insert(item, maxItems)
+}
+
+// get finds the given key in the subtree and returns the key and value.
+func (n *node[K, V]) get(key K) (_ K, _ V, _ bool) {
+	i, found := findKV(n.items, key, n.cow.cmp)
+	if found {
+		return n.items[i].k, n.items[i].v, true
+	} else if len(n.children) > 0 {
+		return n.children[i].get(key)
+	}
+	return
+}
+
+// min returns the first item in the subtree.
+func min[K any, V any](n *node[K, V]) (_ K, _ V, ok bool) { + if n == nil { + return + } + for len(n.children) > 0 { + n = n.children[0] + } + if len(n.items) == 0 { + return + } + return n.items[0].k, n.items[0].v, true +} + +// max returns the last item in the subtree. +func max[K any, V any](n *node[K, V]) (_ K, _ V, ok bool) { + if n == nil { + return + } + for len(n.children) > 0 { + n = n.children[len(n.children)-1] + } + if len(n.items) == 0 { + return + } + out := n.items[len(n.items)-1] + return out.k, out.v, true +} + +// toRemove details what item to remove in a node.remove call. +type toRemove int + +const ( + removeItem toRemove = iota // removes the given item + removeMin // removes smallest item in the subtree + removeMax // removes largest item in the subtree +) + +// remove removes an item from the subtree rooted at this node. +func (n *node[K, V]) remove(key K, minItems int, typ toRemove) (_ kv[K, V], _ bool) { + var i int + var found bool + switch typ { + case removeMax: + if len(n.children) == 0 { + return n.items.pop(), true + } + i = len(n.items) + case removeMin: + if len(n.children) == 0 { + return n.items.removeAt(0), true + } + i = 0 + case removeItem: + i, found = findKV(n.items, key, n.cow.cmp) + if len(n.children) == 0 { + if found { + return n.items.removeAt(i), true + } + return + } + default: + panic("invalid type") + } + // If we get to here, we have children. + if len(n.children[i].items) <= minItems { + return n.growChildAndRemove(i, key, minItems, typ) + } + child := n.mutableChild(i) + // Either we had enough items to begin with, or we've done some + // merging/stealing, because we've got enough now and we're ready to return + // stuff. + if found { + // The item exists at index 'i', and the child we've selected can give us a + // predecessor, since if we've gotten here it's got > minItems items in it. 
+ out := n.items[i] + // We use our special-case 'remove' call with typ=maxItem to pull the + // predecessor of item i (the rightmost leaf of our immediate left child) + // and set it into where we pulled the item from. + var zero K + n.items[i], _ = child.remove(zero, minItems, removeMax) + return out, true + } + // Final recursive call. Once we're here, we know that the item isn't in this + // node and that the child is big enough to remove from. + return child.remove(key, minItems, typ) +} + +// growChildAndRemove grows child 'i' to make sure it's possible to remove an +// item from it while keeping it at minItems, then calls remove to actually +// remove it. +// +// Most documentation says we have to do two sets of special casing: +// 1. item is in this node +// 2. item is in child +// +// In both cases, we need to handle the two subcases: +// +// A) node has enough values that it can spare one +// B) node doesn't have enough values +// +// For the latter, we have to check: +// +// a) left sibling has node to spare +// b) right sibling has node to spare +// c) we must merge +// +// To simplify our code here, we handle cases #1 and #2 the same: +// If a node doesn't have enough items, we make sure it does (using a,b,c). +// We then simply redo our remove call, and the second time (regardless of +// whether we're in case 1 or 2), we'll have enough items and can guarantee +// that we hit case A. 
+func (n *node[K, V]) growChildAndRemove(i int, key K, minItems int, typ toRemove) (kv[K, V], bool) {
+	if i > 0 && len(n.children[i-1].items) > minItems {
+		// Steal from left child
+		child := n.mutableChild(i)
+		stealFrom := n.mutableChild(i - 1)
+		stolenItem := stealFrom.items.pop()
+		child.items.insertAt(0, n.items[i-1])
+		n.items[i-1] = stolenItem
+		if len(stealFrom.children) > 0 {
+			child.children.insertAt(0, stealFrom.children.pop())
+		}
+	} else if i < len(n.items) && len(n.children[i+1].items) > minItems {
+		// steal from right child
+		child := n.mutableChild(i)
+		stealFrom := n.mutableChild(i + 1)
+		stolenItem := stealFrom.items.removeAt(0)
+		child.items = append(child.items, n.items[i])
+		n.items[i] = stolenItem
+		if len(stealFrom.children) > 0 {
+			child.children = append(child.children, stealFrom.children.removeAt(0))
+		}
+	} else {
+		if i >= len(n.items) {
+			i--
+		}
+		child := n.mutableChild(i)
+		// merge with right child
+		mergeItem := n.items.removeAt(i)
+		mergeChild := n.children.removeAt(i + 1)
+		child.items = append(child.items, mergeItem)
+		child.items = append(child.items, mergeChild.items...)
+		child.children = append(child.children, mergeChild.children...)
+		n.cow.freeNode(mergeChild)
+	}
+	return n.remove(key, minItems, typ)
+}
+
+// ascend provides a simple method for iterating over elements in the tree, in
+// ascending order.
+func (n *node[K, V]) ascend(
+	start LowerBound[K], stop UpperBound[K], hit bool, iter func(K, V) bool,
+) (bool, bool) {
+	var ok bool
+	var index int
+	if start.kind != boundKindNone {
+		index, _ = findKV(n.items, start.key, n.cow.cmp)
+	}
+	for i := index; i < len(n.items); i++ {
+		if len(n.children) > 0 {
+			if hit, ok = n.children[i].ascend(start, stop, hit, iter); !ok {
+				return hit, false
+			}
+		}
+		if start.kind == boundKindExclusive && !hit && n.cow.cmp(start.key, n.items[i].k) >= 0 {
+			hit = true
+			continue
+		}
+		hit = true
+		if stop.kind != boundKindNone {
+			c := n.cow.cmp(n.items[i].k, stop.key)
+			if c > 0 || (c == 0 && stop.kind == boundKindExclusive) {
+				return hit, false
+			}
+		}
+		if !iter(n.items[i].k, n.items[i].v) {
+			return hit, false
+		}
+	}
+	if len(n.children) > 0 {
+		if hit, ok = n.children[len(n.children)-1].ascend(start, stop, hit, iter); !ok {
+			return hit, false
+		}
+	}
+	return hit, true
+}
+
+// descend provides a simple method for iterating over elements in the tree, in
+// descending order.
+func (n *node[K, V]) descend( + start UpperBound[K], stop LowerBound[K], hit bool, iter func(K, V) bool, +) (bool, bool) { + var ok bool + var index int + if start.kind != boundKindNone { + var found bool + index, found = findKV(n.items, start.key, n.cow.cmp) + if !found { + index = index - 1 + } + } else { + index = len(n.items) - 1 + } + for i := index; i >= 0; i-- { + if start.kind != boundKindNone { + c := n.cow.cmp(start.key, n.items[i].k) + if c < 0 || (c == 0 && (start.kind == boundKindExclusive || hit)) { + continue + } + } + if len(n.children) > 0 { + if hit, ok = n.children[i+1].descend(start, stop, hit, iter); !ok { + return hit, false + } + } + if stop.kind != boundKindNone { + c := n.cow.cmp(n.items[i].k, stop.key) + if c < 0 || (c == 0 && stop.kind == boundKindExclusive) { + return hit, false + } + } + hit = true + if !iter(n.items[i].k, n.items[i].v) { + return hit, false + } + } + if len(n.children) > 0 { + if hit, ok = n.children[0].descend(start, stop, hit, iter); !ok { + return hit, false + } + } + return hit, true +} + +// print is used for testing/debugging purposes. +func (n *node[K, V]) print(w io.Writer, level int) { + fmt.Fprintf(w, "%sNODE:%v\n", strings.Repeat(" ", level), n.items) + for _, c := range n.children { + c.print(w, level+1) + } +} + +// reset returns a subtree to the freelist. It breaks out immediately if the +// freelist is full, since the only benefit of iterating is to fill that +// freelist up. Returns true if parent reset call should continue. +func (n *node[K, V]) reset(c *copyOnWriteContext[K, V]) bool { + for _, child := range n.children { + if !child.reset(c) { + return false + } + } + return c.freeNode(n) != ftFreelistFull +} + +// items stores items in a node. +type items[T any] []T + +// insertAt inserts a value into the given index, pushing all subsequent values +// forward. 
+func (s *items[T]) insertAt(index int, item T) {
+	var zero T
+	*s = append(*s, zero)
+	if index < len(*s) {
+		copy((*s)[index+1:], (*s)[index:])
+	}
+	(*s)[index] = item
+}
+
+// removeAt removes a value at a given index, pulling all subsequent values
+// back.
+func (s *items[T]) removeAt(index int) T {
+	item := (*s)[index]
+	copy((*s)[index:], (*s)[index+1:])
+	var zero T
+	(*s)[len(*s)-1] = zero
+	*s = (*s)[:len(*s)-1]
+	return item
+}
+
+// pop removes and returns the last element in the list.
+func (s *items[T]) pop() (out T) {
+	index := len(*s) - 1
+	out = (*s)[index]
+	var zero T
+	(*s)[index] = zero
+	*s = (*s)[:index]
+	return
+}
+
+// truncate truncates this instance at index so that it contains only the
+// first index items. index must be less than or equal to length.
+func (s *items[T]) truncate(index int) {
+	var toClear items[T]
+	*s, toClear = (*s)[:index], (*s)[index:]
+	var zero T
+	for i := 0; i < len(toClear); i++ {
+		toClear[i] = zero
+	}
+}
+
+// findKV returns the index where the given key should be inserted into this
+// list. 'found' is true if the key already exists in the list at the given
+// index.
+func findKV[K any, V any](s items[kv[K, V]], key K, cmp CmpFunc[K]) (index int, found bool) { + i := sort.Search(len(s), func(i int) bool { + return cmp(key, s[i].k) <= 0 + }) + return i, i < len(s) && cmp(key, s[i].k) == 0 +} diff --git a/vendor/github.com/RoaringBitmap/roaring/.drone.yml b/vendor/github.com/RoaringBitmap/roaring/.drone.yml deleted file mode 100644 index 7936bfe..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/.drone.yml +++ /dev/null @@ -1,19 +0,0 @@ -kind: pipeline -name: default - -workspace: - base: /go - path: src/github.com/RoaringBitmap/roaring - -steps: -- name: test - image: golang - commands: - - go get -t - - go test - - go build -tags appengine - - go test -tags appengine - - GOARCH=386 go build - - GOARCH=386 go test - - GOARCH=arm go build - - GOARCH=arm64 go build diff --git a/vendor/github.com/RoaringBitmap/roaring/.gitignore b/vendor/github.com/RoaringBitmap/roaring/.gitignore deleted file mode 100644 index 851f323..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -*~ -roaring-fuzz.zip -workdir -coverage.out -testdata/all3.classic diff --git a/vendor/github.com/RoaringBitmap/roaring/.gitmodules b/vendor/github.com/RoaringBitmap/roaring/.gitmodules deleted file mode 100644 index e69de29..0000000 diff --git a/vendor/github.com/RoaringBitmap/roaring/AUTHORS b/vendor/github.com/RoaringBitmap/roaring/AUTHORS deleted file mode 100644 index 26ec99d..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/AUTHORS +++ /dev/null @@ -1,11 +0,0 @@ -# This is the official list of roaring authors for copyright purposes. 
- -Todd Gruben (@tgruben), -Daniel Lemire (@lemire), -Elliot Murphy (@statik), -Bob Potter (@bpot), -Tyson Maly (@tvmaly), -Will Glynn (@willglynn), -Brent Pedersen (@brentp) -Maciej Biłas (@maciej), -Joe Nall (@joenall) diff --git a/vendor/github.com/RoaringBitmap/roaring/CONTRIBUTORS b/vendor/github.com/RoaringBitmap/roaring/CONTRIBUTORS deleted file mode 100644 index 1a8da9c..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/CONTRIBUTORS +++ /dev/null @@ -1,18 +0,0 @@ -# This is the official list of roaring contributors - -Todd Gruben (@tgruben), -Daniel Lemire (@lemire), -Elliot Murphy (@statik), -Bob Potter (@bpot), -Tyson Maly (@tvmaly), -Will Glynn (@willglynn), -Brent Pedersen (@brentp), -Jason E. Aten (@glycerine), -Vali Malinoiu (@0x4139), -Forud Ghafouri (@fzerorubigd), -Joe Nall (@joenall), -(@fredim), -Edd Robinson (@e-dard), -Alexander Petrov (@alldroll), -Guy Molinari (@guymolinari), -Ling Jin (@JinLingChristopher) diff --git a/vendor/github.com/RoaringBitmap/roaring/README.md b/vendor/github.com/RoaringBitmap/roaring/README.md deleted file mode 100644 index f6705df..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/README.md +++ /dev/null @@ -1,413 +0,0 @@ -# roaring - -[![GoDoc](https://godoc.org/github.com/RoaringBitmap/roaring?status.svg)](https://godoc.org/github.com/RoaringBitmap/roaring) [![Go Report Card](https://goreportcard.com/badge/RoaringBitmap/roaring)](https://goreportcard.com/report/github.com/RoaringBitmap/roaring) - -![Go-CI](https://github.com/RoaringBitmap/roaring/workflows/Go-CI/badge.svg) -![Go-ARM-CI](https://github.com/RoaringBitmap/roaring/workflows/Go-ARM-CI/badge.svg) -![Go-Windows-CI](https://github.com/RoaringBitmap/roaring/workflows/Go-Windows-CI/badge.svg) -============= - -This is a go version of the Roaring bitmap data structure. 
- -Roaring bitmaps are used by several major systems such as [Apache Lucene][lucene] and derivative systems such as [Solr][solr] and -[Elasticsearch][elasticsearch], [Apache Druid (Incubating)][druid], [LinkedIn Pinot][pinot], [Netflix Atlas][atlas], [Apache Spark][spark], [OpenSearchServer][opensearchserver], [anacrolix/torrent][anacrolix/torrent], [Whoosh][whoosh], [Redpanda](https://github.com/redpanda-data/redpanda), [Pilosa][pilosa], [Microsoft Visual Studio Team Services (VSTS)][vsts], and eBay's [Apache Kylin][kylin]. The YouTube SQL Engine, [Google Procella](https://research.google/pubs/pub48388/), uses Roaring bitmaps for indexing. - -[lucene]: https://lucene.apache.org/ -[solr]: https://lucene.apache.org/solr/ -[elasticsearch]: https://www.elastic.co/products/elasticsearch -[druid]: https://druid.apache.org/ -[spark]: https://spark.apache.org/ -[opensearchserver]: http://www.opensearchserver.com -[anacrolix/torrent]: https://github.com/anacrolix/torrent -[whoosh]: https://bitbucket.org/mchaput/whoosh/wiki/Home -[pilosa]: https://www.pilosa.com/ -[kylin]: http://kylin.apache.org/ -[pinot]: http://github.com/linkedin/pinot/wiki -[vsts]: https://www.visualstudio.com/team-services/ -[atlas]: https://github.com/Netflix/atlas - -Roaring bitmaps are found to work well in many important applications: - -> Use Roaring for bitmap compression whenever possible. 
Do not use other bitmap compression methods ([Wang et al., SIGMOD 2017](http://db.ucsd.edu/wp-content/uploads/2017/03/sidm338-wangA.pdf)) - - -The ``roaring`` Go library is used by -* [anacrolix/torrent] -* [InfluxDB](https://www.influxdata.com) -* [Pilosa](https://www.pilosa.com/) -* [Bleve](http://www.blevesearch.com) -* [Weaviate](https://github.com/weaviate/weaviate) -* [lindb](https://github.com/lindb/lindb) -* [Elasticell](https://github.com/deepfabric/elasticell) -* [SourceGraph](https://github.com/sourcegraph/sourcegraph) -* [M3](https://github.com/m3db/m3) -* [trident](https://github.com/NetApp/trident) -* [Husky](https://www.datadoghq.com/blog/engineering/introducing-husky/) -* [FrostDB](https://github.com/polarsignals/frostdb) - -This library is used in production in several systems, it is part of the [Awesome Go collection](https://awesome-go.com). - - -There are also [Java](https://github.com/RoaringBitmap/RoaringBitmap) and [C/C++](https://github.com/RoaringBitmap/CRoaring) versions. The Java, C, C++ and Go version are binary compatible: e.g, you can save bitmaps -from a Java program and load them back in Go, and vice versa. We have a [format specification](https://github.com/RoaringBitmap/RoaringFormatSpec). - - -This code is licensed under Apache License, Version 2.0 (ASL2.0). - -Copyright 2016-... by the authors. - -When should you use a bitmap? -=================================== - - -Sets are a fundamental abstraction in -software. They can be implemented in various -ways, as hash sets, as trees, and so forth. -In databases and search engines, sets are often an integral -part of indexes. For example, we may need to maintain a set -of all documents or rows (represented by numerical identifier) -that satisfy some property. Besides adding or removing -elements from the set, we need fast functions -to compute the intersection, the union, the difference between sets, and so on. 
- - -To implement a set -of integers, a particularly appealing strategy is the -bitmap (also called bitset or bit vector). Using n bits, -we can represent any set made of the integers from the range -[0,n): the ith bit is set to one if integer i is present in the set. -Commodity processors use words of W=32 or W=64 bits. By combining many such words, we can -support large values of n. Intersections, unions and differences can then be implemented - as bitwise AND, OR and ANDNOT operations. -More complicated set functions can also be implemented as bitwise operations. - -When the bitset approach is applicable, it can be orders of -magnitude faster than other possible implementation of a set (e.g., as a hash set) -while using several times less memory. - -However, a bitset, even a compressed one is not always applicable. For example, if -you have 1000 random-looking integers, then a simple array might be the best representation. -We refer to this case as the "sparse" scenario. - -When should you use compressed bitmaps? -=================================== - -An uncompressed BitSet can use a lot of memory. For example, if you take a BitSet -and set the bit at position 1,000,000 to true and you have just over 100kB. That is over 100kB -to store the position of one bit. This is wasteful even if you do not care about memory: -suppose that you need to compute the intersection between this BitSet and another one -that has a bit at position 1,000,001 to true, then you need to go through all these zeroes, -whether you like it or not. That can become very wasteful. - -This being said, there are definitively cases where attempting to use compressed bitmaps is wasteful. -For example, if you have a small universe size. E.g., your bitmaps represent sets of integers -from [0,n) where n is small (e.g., n=64 or n=128). If you can use uncompressed BitSet and -it does not blow up your memory usage, then compressed bitmaps are probably not useful -to you. 
In fact, if you do not need compression, then a BitSet offers remarkable speed. - -The sparse scenario is another use case where compressed bitmaps should not be used. -Keep in mind that random-looking data is usually not compressible. E.g., if you have a small set of -32-bit random integers, it is not mathematically possible to use far less than 32 bits per integer, -and attempts at compression can be counterproductive. - -How does Roaring compares with the alternatives? -================================================== - - -Most alternatives to Roaring are part of a larger family of compressed bitmaps that are run-length-encoded -bitmaps. They identify long runs of 1s or 0s and they represent them with a marker word. -If you have a local mix of 1s and 0, you use an uncompressed word. - -There are many formats in this family: - -* Oracle's BBC is an obsolete format at this point: though it may provide good compression, -it is likely much slower than more recent alternatives due to excessive branching. -* WAH is a patented variation on BBC that provides better performance. -* Concise is a variation on the patented WAH. It some specific instances, it can compress -much better than WAH (up to 2x better), but it is generally slower. -* EWAH is both free of patent, and it is faster than all the above. On the downside, it -does not compress quite as well. It is faster because it allows some form of "skipping" -over uncompressed words. So though none of these formats are great at random access, EWAH -is better than the alternatives. - - - -There is a big problem with these formats however that can hurt you badly in some cases: there is no random access. If you want to check whether a given value is present in the set, you have to start from the beginning and "uncompress" the whole thing. This means that if you want to intersect a big set with a large set, you still have to uncompress the whole big set in the worst case... - -Roaring solves this problem. 
It works in the following manner. It divides the data into chunks of 216 integers -(e.g., [0, 216), [216, 2 x 216), ...). Within a chunk, it can use an uncompressed bitmap, a simple list of integers, -or a list of runs. Whatever format it uses, they all allow you to check for the presence of any one value quickly -(e.g., with a binary search). The net result is that Roaring can compute many operations much faster than run-length-encoded -formats like WAH, EWAH, Concise... Maybe surprisingly, Roaring also generally offers better compression ratios. - - - - - -### References - -- Daniel Lemire, Owen Kaser, Nathan Kurz, Luca Deri, Chris O'Hara, François Saint-Jacques, Gregory Ssi-Yan-Kai, Roaring Bitmaps: Implementation of an Optimized Software Library, Software: Practice and Experience 48 (4), 2018 [arXiv:1709.07821](https://arxiv.org/abs/1709.07821) -- Samy Chambi, Daniel Lemire, Owen Kaser, Robert Godin, -Better bitmap performance with Roaring bitmaps, -Software: Practice and Experience 46 (5), 2016.[arXiv:1402.6407](http://arxiv.org/abs/1402.6407) This paper used data from http://lemire.me/data/realroaring2014.html -- Daniel Lemire, Gregory Ssi-Yan-Kai, Owen Kaser, Consistently faster and smaller compressed bitmaps with Roaring, Software: Practice and Experience 46 (11), 2016. [arXiv:1603.06549](http://arxiv.org/abs/1603.06549) - -### Dependencies - -Dependencies are fetched automatically by giving the `-t` flag to `go get`. - -they include - - github.com/bits-and-blooms/bitset - - github.com/mschoch/smat - - github.com/glycerine/go-unsnap-stream - - github.com/philhofer/fwd - - github.com/jtolds/gls - -Note that the smat library requires Go 1.6 or better. 
- -#### Installation - - - go get -t github.com/RoaringBitmap/roaring - -### Instructions for contributors - -Using bash or other common shells: -``` -$ git clone git@github.com:RoaringBitmap/roaring.git -$ export GO111MODULE=on -$ go mod tidy -$ go test -v -``` - -### Example - -Here is a simplified but complete example: - -```go -package main - -import ( - "fmt" - "github.com/RoaringBitmap/roaring" - "bytes" -) - - -func main() { - // example inspired by https://github.com/fzandona/goroar - fmt.Println("==roaring==") - rb1 := roaring.BitmapOf(1, 2, 3, 4, 5, 100, 1000) - fmt.Println(rb1.String()) - - rb2 := roaring.BitmapOf(3, 4, 1000) - fmt.Println(rb2.String()) - - rb3 := roaring.New() - fmt.Println(rb3.String()) - - fmt.Println("Cardinality: ", rb1.GetCardinality()) - - fmt.Println("Contains 3? ", rb1.Contains(3)) - - rb1.And(rb2) - - rb3.Add(1) - rb3.Add(5) - - rb3.Or(rb1) - - // computes union of the three bitmaps in parallel using 4 workers - roaring.ParOr(4, rb1, rb2, rb3) - // computes intersection of the three bitmaps in parallel using 4 workers - roaring.ParAnd(4, rb1, rb2, rb3) - - - // prints 1, 3, 4, 5, 1000 - i := rb3.Iterator() - for i.HasNext() { - fmt.Println(i.Next()) - } - fmt.Println() - - // next we include an example of serialization - buf := new(bytes.Buffer) - rb1.WriteTo(buf) // we omit error handling - newrb:= roaring.New() - newrb.ReadFrom(buf) - if rb1.Equals(newrb) { - fmt.Println("I wrote the content to a byte stream and read it back.") - } - // you can iterate over bitmaps using ReverseIterator(), Iterator, ManyIterator() -} -``` - -If you wish to use serialization and handle errors, you might want to -consider the following sample of code: - -```go - rb := BitmapOf(1, 2, 3, 4, 5, 100, 1000) - buf := new(bytes.Buffer) - size,err:=rb.WriteTo(buf) - if err != nil { - t.Errorf("Failed writing") - } - newrb:= New() - size,err=newrb.ReadFrom(buf) - if err != nil { - t.Errorf("Failed reading") - } - if ! 
rb.Equals(newrb) { - t.Errorf("Cannot retrieve serialized version") - } -``` - -Given N integers in [0,x), then the serialized size in bytes of -a Roaring bitmap should never exceed this bound: - -`` 8 + 9 * ((long)x+65535)/65536 + 2 * N `` - -That is, given a fixed overhead for the universe size (x), Roaring -bitmaps never use more than 2 bytes per integer. You can call -``BoundSerializedSizeInBytes`` for a more precise estimate. - -### 64-bit Roaring - -By default, roaring is used to stored unsigned 32-bit integers. However, we also offer -an extension dedicated to 64-bit integers. It supports roughly the same functions: - -```go -package main - -import ( - "fmt" - "github.com/RoaringBitmap/roaring/roaring64" - "bytes" -) - - -func main() { - // example inspired by https://github.com/fzandona/goroar - fmt.Println("==roaring64==") - rb1 := roaring64.BitmapOf(1, 2, 3, 4, 5, 100, 1000) - fmt.Println(rb1.String()) - - rb2 := roaring64.BitmapOf(3, 4, 1000) - fmt.Println(rb2.String()) - - rb3 := roaring64.New() - fmt.Println(rb3.String()) - - fmt.Println("Cardinality: ", rb1.GetCardinality()) - - fmt.Println("Contains 3? ", rb1.Contains(3)) - - rb1.And(rb2) - - rb3.Add(1) - rb3.Add(5) - - rb3.Or(rb1) - - - - // prints 1, 3, 4, 5, 1000 - i := rb3.Iterator() - for i.HasNext() { - fmt.Println(i.Next()) - } - fmt.Println() - - // next we include an example of serialization - buf := new(bytes.Buffer) - rb1.WriteTo(buf) // we omit error handling - newrb:= roaring64.New() - newrb.ReadFrom(buf) - if rb1.Equals(newrb) { - fmt.Println("I wrote the content to a byte stream and read it back.") - } - // you can iterate over bitmaps using ReverseIterator(), Iterator, ManyIterator() -} -``` - -Only the 32-bit roaring format is standard and cross-operable between Java, C++, C and Go. There is no guarantee that the 64-bit versions are compatible. 
- -### Documentation - -Current documentation is available at https://pkg.go.dev/github.com/RoaringBitmap/roaring and https://pkg.go.dev/github.com/RoaringBitmap/roaring/roaring64 - -### Goroutine safety - -In general, it should not generally be considered safe to access -the same bitmaps using different goroutines--they are left -unsynchronized for performance. Should you want to access -a Bitmap from more than one goroutine, you should -provide synchronization. Typically this is done by using channels to pass -the *Bitmap around (in Go style; so there is only ever one owner), -or by using `sync.Mutex` to serialize operations on Bitmaps. - -### Coverage - -We test our software. For a report on our test coverage, see - -https://coveralls.io/github/RoaringBitmap/roaring?branch=master - -### Benchmark - -Type - - go test -bench Benchmark -run - - -To run benchmarks on [Real Roaring Datasets](https://github.com/RoaringBitmap/real-roaring-datasets) -run the following: - -```sh -go get github.com/RoaringBitmap/real-roaring-datasets -BENCH_REAL_DATA=1 go test -bench BenchmarkRealData -run - -``` - -### Iterative use - -You can use roaring with gore: - -- go get -u github.com/motemen/gore -- Make sure that ``$GOPATH/bin`` is in your ``$PATH``. -- go get github.com/RoaringBitmap/roaring - -```go -$ gore -gore version 0.2.6 :help for help -gore> :import github.com/RoaringBitmap/roaring -gore> x:=roaring.New() -gore> x.Add(1) -gore> x.String() -"{1}" -``` - - -### Fuzzy testing - -You can help us test further the library with fuzzy testing: - - go get github.com/dvyukov/go-fuzz/go-fuzz - go get github.com/dvyukov/go-fuzz/go-fuzz-build - go test -tags=gofuzz -run=TestGenerateSmatCorpus - go-fuzz-build github.com/RoaringBitmap/roaring - go-fuzz -bin=./roaring-fuzz.zip -workdir=workdir/ -timeout=200 -func FuzzSmat - -Let it run, and if the # of crashers is > 0, check out the reports in -the workdir where you should be able to find the panic goroutine stack -traces. 
- -You may also replace `-func FuzzSmat` by `-func FuzzSerializationBuffer` or `-func FuzzSerializationStream`. - -### Alternative in Go - -There is a Go version wrapping the C/C++ implementation https://github.com/RoaringBitmap/gocroaring - -For an alternative implementation in Go, see https://github.com/fzandona/goroar -The two versions were written independently. - - -### Mailing list/discussion group - -https://groups.google.com/forum/#!forum/roaring-bitmaps diff --git a/vendor/github.com/RoaringBitmap/roaring/arraycontainer.go b/vendor/github.com/RoaringBitmap/roaring/arraycontainer.go deleted file mode 100644 index 80fa676..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/arraycontainer.go +++ /dev/null @@ -1,1101 +0,0 @@ -package roaring - -import ( - "fmt" -) - -type arrayContainer struct { - content []uint16 -} - -func (ac *arrayContainer) String() string { - s := "{" - for it := ac.getShortIterator(); it.hasNext(); { - s += fmt.Sprintf("%v, ", it.next()) - } - return s + "}" -} - -func (ac *arrayContainer) fillLeastSignificant16bits(x []uint32, i int, mask uint32) int { - if i < 0 { - panic("negative index") - } - if len(ac.content) == 0 { - return i - } - _ = x[len(ac.content)-1+i] - _ = ac.content[len(ac.content)-1] - for k := 0; k < len(ac.content); k++ { - x[k+i] = - uint32(ac.content[k]) | mask - } - return i + len(ac.content) -} - -func (ac *arrayContainer) iterate(cb func(x uint16) bool) bool { - iterator := shortIterator{ac.content, 0} - - for iterator.hasNext() { - if !cb(iterator.next()) { - return false - } - } - - return true -} - -func (ac *arrayContainer) getShortIterator() shortPeekable { - return &shortIterator{ac.content, 0} -} - -func (ac *arrayContainer) getReverseIterator() shortIterable { - return &reverseIterator{ac.content, len(ac.content) - 1} -} - -func (ac *arrayContainer) getManyIterator() manyIterable { - return &shortIterator{ac.content, 0} -} - -func (ac *arrayContainer) minimum() uint16 { - return ac.content[0] // 
assume not empty -} - -func (ac *arrayContainer) maximum() uint16 { - return ac.content[len(ac.content)-1] // assume not empty -} - -func (ac *arrayContainer) getSizeInBytes() int { - return ac.getCardinality() * 2 -} - -func (ac *arrayContainer) serializedSizeInBytes() int { - return ac.getCardinality() * 2 -} - -func arrayContainerSizeInBytes(card int) int { - return card * 2 -} - -// add the values in the range [firstOfRange,endx) -func (ac *arrayContainer) iaddRange(firstOfRange, endx int) container { - if firstOfRange >= endx { - return ac - } - indexstart := binarySearch(ac.content, uint16(firstOfRange)) - if indexstart < 0 { - indexstart = -indexstart - 1 - } - indexend := binarySearch(ac.content, uint16(endx-1)) - if indexend < 0 { - indexend = -indexend - 1 - } else { - indexend++ - } - rangelength := endx - firstOfRange - newcardinality := indexstart + (ac.getCardinality() - indexend) + rangelength - if newcardinality > arrayDefaultMaxSize { - a := ac.toBitmapContainer() - return a.iaddRange(firstOfRange, endx) - } - if cap(ac.content) < newcardinality { - tmp := make([]uint16, newcardinality, newcardinality) - copy(tmp[:indexstart], ac.content[:indexstart]) - copy(tmp[indexstart+rangelength:], ac.content[indexend:]) - - ac.content = tmp - } else { - ac.content = ac.content[:newcardinality] - copy(ac.content[indexstart+rangelength:], ac.content[indexend:]) - - } - for k := 0; k < rangelength; k++ { - ac.content[k+indexstart] = uint16(firstOfRange + k) - } - return ac -} - -// remove the values in the range [firstOfRange,endx) -func (ac *arrayContainer) iremoveRange(firstOfRange, endx int) container { - if firstOfRange >= endx { - return ac - } - indexstart := binarySearch(ac.content, uint16(firstOfRange)) - if indexstart < 0 { - indexstart = -indexstart - 1 - } - indexend := binarySearch(ac.content, uint16(endx-1)) - if indexend < 0 { - indexend = -indexend - 1 - } else { - indexend++ - } - rangelength := indexend - indexstart - answer := ac - 
copy(answer.content[indexstart:], ac.content[indexstart+rangelength:]) - answer.content = answer.content[:ac.getCardinality()-rangelength] - return answer -} - -// flip the values in the range [firstOfRange,endx) -func (ac *arrayContainer) not(firstOfRange, endx int) container { - if firstOfRange >= endx { - return ac.clone() - } - return ac.notClose(firstOfRange, endx-1) // remove everything in [firstOfRange,endx-1] -} - -// flip the values in the range [firstOfRange,lastOfRange] -func (ac *arrayContainer) notClose(firstOfRange, lastOfRange int) container { - if firstOfRange > lastOfRange { // unlike add and remove, not uses an inclusive range [firstOfRange,lastOfRange] - return ac.clone() - } - - // determine the span of array indices to be affected^M - startIndex := binarySearch(ac.content, uint16(firstOfRange)) - if startIndex < 0 { - startIndex = -startIndex - 1 - } - lastIndex := binarySearch(ac.content, uint16(lastOfRange)) - if lastIndex < 0 { - lastIndex = -lastIndex - 2 - } - currentValuesInRange := lastIndex - startIndex + 1 - spanToBeFlipped := lastOfRange - firstOfRange + 1 - newValuesInRange := spanToBeFlipped - currentValuesInRange - cardinalityChange := newValuesInRange - currentValuesInRange - newCardinality := len(ac.content) + cardinalityChange - if newCardinality > arrayDefaultMaxSize { - return ac.toBitmapContainer().not(firstOfRange, lastOfRange+1) - } - answer := newArrayContainer() - answer.content = make([]uint16, newCardinality, newCardinality) //a hack for sure - - copy(answer.content, ac.content[:startIndex]) - outPos := startIndex - inPos := startIndex - valInRange := firstOfRange - for ; valInRange <= lastOfRange && inPos <= lastIndex; valInRange++ { - if uint16(valInRange) != ac.content[inPos] { - answer.content[outPos] = uint16(valInRange) - outPos++ - } else { - inPos++ - } - } - - for ; valInRange <= lastOfRange; valInRange++ { - answer.content[outPos] = uint16(valInRange) - outPos++ - } - - for i := lastIndex + 1; i < 
len(ac.content); i++ { - answer.content[outPos] = ac.content[i] - outPos++ - } - answer.content = answer.content[:newCardinality] - return answer - -} - -func (ac *arrayContainer) equals(o container) bool { - - srb, ok := o.(*arrayContainer) - if ok { - // Check if the containers are the same object. - if ac == srb { - return true - } - - if len(srb.content) != len(ac.content) { - return false - } - - for i, v := range ac.content { - if v != srb.content[i] { - return false - } - } - return true - } - - // use generic comparison - bCard := o.getCardinality() - aCard := ac.getCardinality() - if bCard != aCard { - return false - } - - ait := ac.getShortIterator() - bit := o.getShortIterator() - for ait.hasNext() { - if bit.next() != ait.next() { - return false - } - } - return true -} - -func (ac *arrayContainer) toBitmapContainer() *bitmapContainer { - bc := newBitmapContainer() - bc.loadData(ac) - return bc - -} -func (ac *arrayContainer) iadd(x uint16) (wasNew bool) { - // Special case adding to the end of the container. - l := len(ac.content) - if l > 0 && l < arrayDefaultMaxSize && ac.content[l-1] < x { - ac.content = append(ac.content, x) - return true - } - - loc := binarySearch(ac.content, x) - - if loc < 0 { - s := ac.content - i := -loc - 1 - s = append(s, 0) - copy(s[i+1:], s[i:]) - s[i] = x - ac.content = s - return true - } - return false -} - -func (ac *arrayContainer) iaddReturnMinimized(x uint16) container { - // Special case adding to the end of the container. 
- l := len(ac.content) - if l > 0 && l < arrayDefaultMaxSize && ac.content[l-1] < x { - ac.content = append(ac.content, x) - return ac - } - - loc := binarySearch(ac.content, x) - - if loc < 0 { - if len(ac.content) >= arrayDefaultMaxSize { - a := ac.toBitmapContainer() - a.iadd(x) - return a - } - s := ac.content - i := -loc - 1 - s = append(s, 0) - copy(s[i+1:], s[i:]) - s[i] = x - ac.content = s - } - return ac -} - -// iremoveReturnMinimized is allowed to change the return type to minimize storage. -func (ac *arrayContainer) iremoveReturnMinimized(x uint16) container { - ac.iremove(x) - return ac -} - -func (ac *arrayContainer) iremove(x uint16) bool { - loc := binarySearch(ac.content, x) - if loc >= 0 { - s := ac.content - s = append(s[:loc], s[loc+1:]...) - ac.content = s - return true - } - return false -} - -func (ac *arrayContainer) remove(x uint16) container { - out := &arrayContainer{make([]uint16, len(ac.content))} - copy(out.content, ac.content[:]) - - loc := binarySearch(out.content, x) - if loc >= 0 { - s := out.content - s = append(s[:loc], s[loc+1:]...) 
- out.content = s - } - return out -} - -func (ac *arrayContainer) or(a container) container { - switch x := a.(type) { - case *arrayContainer: - return ac.orArray(x) - case *bitmapContainer: - return x.orArray(ac) - case *runContainer16: - if x.isFull() { - return x.clone() - } - return x.orArray(ac) - } - panic("unsupported container type") -} - -func (ac *arrayContainer) orCardinality(a container) int { - switch x := a.(type) { - case *arrayContainer: - return ac.orArrayCardinality(x) - case *bitmapContainer: - return x.orArrayCardinality(ac) - case *runContainer16: - return x.orArrayCardinality(ac) - } - panic("unsupported container type") -} - -func (ac *arrayContainer) ior(a container) container { - switch x := a.(type) { - case *arrayContainer: - return ac.iorArray(x) - case *bitmapContainer: - return a.(*bitmapContainer).orArray(ac) - //return ac.iorBitmap(x) // note: this does not make sense - case *runContainer16: - if x.isFull() { - return x.clone() - } - return ac.iorRun16(x) - } - panic("unsupported container type") -} - -func (ac *arrayContainer) iorArray(value2 *arrayContainer) container { - value1 := ac - len1 := value1.getCardinality() - len2 := value2.getCardinality() - maxPossibleCardinality := len1 + len2 - if maxPossibleCardinality > cap(value1.content) { - // doubling the capacity reduces new slice allocations in the case of - // repeated calls to iorArray(). 
- newSize := 2 * maxPossibleCardinality - // the second check is to handle overly large array containers - // and should not occur in normal usage, - // as all array containers should be at most arrayDefaultMaxSize - if newSize > 2*arrayDefaultMaxSize && maxPossibleCardinality <= 2*arrayDefaultMaxSize { - newSize = 2 * arrayDefaultMaxSize - } - newcontent := make([]uint16, 0, newSize) - copy(newcontent[len2:maxPossibleCardinality], ac.content[0:len1]) - ac.content = newcontent - } else { - copy(ac.content[len2:maxPossibleCardinality], ac.content[0:len1]) - } - nl := union2by2(value1.content[len2:maxPossibleCardinality], value2.content, ac.content) - ac.content = ac.content[:nl] // reslice to match actual used capacity - - if nl > arrayDefaultMaxSize { - // Only converting to a bitmap when arrayDefaultMaxSize - // is actually exceeded minimizes conversions in the case of repeated - // calls to iorArray(). - return ac.toBitmapContainer() - } - return ac -} - -// Note: such code does not make practical sense, except for lazy evaluations -func (ac *arrayContainer) iorBitmap(bc2 *bitmapContainer) container { - bc1 := ac.toBitmapContainer() - bc1.iorBitmap(bc2) - *ac = *newArrayContainerFromBitmap(bc1) - return ac -} - -func (ac *arrayContainer) iorRun16(rc *runContainer16) container { - runCardinality := rc.getCardinality() - // heuristic for if the container should maybe be an - // array container. 
- if runCardinality < ac.getCardinality() && - runCardinality+ac.getCardinality() < arrayDefaultMaxSize { - var result container - result = ac - for _, run := range rc.iv { - result = result.iaddRange(int(run.start), int(run.start)+int(run.length)+1) - } - return result - } - return rc.orArray(ac) -} - -func (ac *arrayContainer) lazyIOR(a container) container { - switch x := a.(type) { - case *arrayContainer: - return ac.lazyIorArray(x) - case *bitmapContainer: - return ac.lazyIorBitmap(x) - case *runContainer16: - if x.isFull() { - return x.clone() - } - return ac.lazyIorRun16(x) - - } - panic("unsupported container type") -} - -func (ac *arrayContainer) lazyIorArray(ac2 *arrayContainer) container { - // TODO actually make this lazy - return ac.iorArray(ac2) -} - -func (ac *arrayContainer) lazyIorBitmap(bc *bitmapContainer) container { - // TODO actually make this lazy - return ac.iorBitmap(bc) -} - -func (ac *arrayContainer) lazyIorRun16(rc *runContainer16) container { - // TODO actually make this lazy - return ac.iorRun16(rc) -} - -func (ac *arrayContainer) lazyOR(a container) container { - switch x := a.(type) { - case *arrayContainer: - return ac.lazyorArray(x) - case *bitmapContainer: - return a.lazyOR(ac) - case *runContainer16: - if x.isFull() { - return x.clone() - } - return x.orArray(ac) - } - panic("unsupported container type") -} - -func (ac *arrayContainer) orArray(value2 *arrayContainer) container { - value1 := ac - maxPossibleCardinality := value1.getCardinality() + value2.getCardinality() - if maxPossibleCardinality > arrayDefaultMaxSize { // it could be a bitmap! 
- bc := newBitmapContainer() - for k := 0; k < len(value2.content); k++ { - v := value2.content[k] - i := uint(v) >> 6 - mask := uint64(1) << (v % 64) - bc.bitmap[i] |= mask - } - for k := 0; k < len(ac.content); k++ { - v := ac.content[k] - i := uint(v) >> 6 - mask := uint64(1) << (v % 64) - bc.bitmap[i] |= mask - } - bc.cardinality = int(popcntSlice(bc.bitmap)) - if bc.cardinality <= arrayDefaultMaxSize { - return bc.toArrayContainer() - } - return bc - } - answer := newArrayContainerCapacity(maxPossibleCardinality) - nl := union2by2(value1.content, value2.content, answer.content) - answer.content = answer.content[:nl] // reslice to match actual used capacity - return answer -} - -func (ac *arrayContainer) orArrayCardinality(value2 *arrayContainer) int { - return union2by2Cardinality(ac.content, value2.content) -} - -func (ac *arrayContainer) lazyorArray(value2 *arrayContainer) container { - value1 := ac - maxPossibleCardinality := value1.getCardinality() + value2.getCardinality() - if maxPossibleCardinality > arrayLazyLowerBound { // it could be a bitmap! 
- bc := newBitmapContainer() - for k := 0; k < len(value2.content); k++ { - v := value2.content[k] - i := uint(v) >> 6 - mask := uint64(1) << (v % 64) - bc.bitmap[i] |= mask - } - for k := 0; k < len(ac.content); k++ { - v := ac.content[k] - i := uint(v) >> 6 - mask := uint64(1) << (v % 64) - bc.bitmap[i] |= mask - } - bc.cardinality = invalidCardinality - return bc - } - answer := newArrayContainerCapacity(maxPossibleCardinality) - nl := union2by2(value1.content, value2.content, answer.content) - answer.content = answer.content[:nl] // reslice to match actual used capacity - return answer -} - -func (ac *arrayContainer) and(a container) container { - switch x := a.(type) { - case *arrayContainer: - return ac.andArray(x) - case *bitmapContainer: - return x.and(ac) - case *runContainer16: - if x.isFull() { - return ac.clone() - } - return x.andArray(ac) - } - panic("unsupported container type") -} - -func (ac *arrayContainer) andCardinality(a container) int { - switch x := a.(type) { - case *arrayContainer: - return ac.andArrayCardinality(x) - case *bitmapContainer: - return x.andCardinality(ac) - case *runContainer16: - return x.andArrayCardinality(ac) - } - panic("unsupported container type") -} - -func (ac *arrayContainer) intersects(a container) bool { - switch x := a.(type) { - case *arrayContainer: - return ac.intersectsArray(x) - case *bitmapContainer: - return x.intersects(ac) - case *runContainer16: - return x.intersects(ac) - } - panic("unsupported container type") -} - -func (ac *arrayContainer) iand(a container) container { - switch x := a.(type) { - case *arrayContainer: - return ac.iandArray(x) - case *bitmapContainer: - return ac.iandBitmap(x) - case *runContainer16: - if x.isFull() { - return ac - } - return x.andArray(ac) - } - panic("unsupported container type") -} - -func (ac *arrayContainer) iandBitmap(bc *bitmapContainer) container { - pos := 0 - c := ac.getCardinality() - for k := 0; k < c; k++ { - // branchless - v := ac.content[k] - 
ac.content[pos] = v - pos += int(bc.bitValue(v)) - } - ac.content = ac.content[:pos] - return ac - -} - -func (ac *arrayContainer) xor(a container) container { - switch x := a.(type) { - case *arrayContainer: - return ac.xorArray(x) - case *bitmapContainer: - return a.xor(ac) - case *runContainer16: - return x.xorArray(ac) - } - panic("unsupported container type") -} - -func (ac *arrayContainer) xorArray(value2 *arrayContainer) container { - value1 := ac - totalCardinality := value1.getCardinality() + value2.getCardinality() - if totalCardinality > arrayDefaultMaxSize { // it could be a bitmap! - bc := newBitmapContainer() - for k := 0; k < len(value2.content); k++ { - v := value2.content[k] - i := uint(v) >> 6 - bc.bitmap[i] ^= (uint64(1) << (v % 64)) - } - for k := 0; k < len(ac.content); k++ { - v := ac.content[k] - i := uint(v) >> 6 - bc.bitmap[i] ^= (uint64(1) << (v % 64)) - } - bc.computeCardinality() - if bc.cardinality <= arrayDefaultMaxSize { - return bc.toArrayContainer() - } - return bc - } - desiredCapacity := totalCardinality - answer := newArrayContainerCapacity(desiredCapacity) - length := exclusiveUnion2by2(value1.content, value2.content, answer.content) - answer.content = answer.content[:length] - return answer - -} - -func (ac *arrayContainer) andNot(a container) container { - switch x := a.(type) { - case *arrayContainer: - return ac.andNotArray(x) - case *bitmapContainer: - return ac.andNotBitmap(x) - case *runContainer16: - return ac.andNotRun16(x) - } - panic("unsupported container type") -} - -func (ac *arrayContainer) andNotRun16(rc *runContainer16) container { - acb := ac.toBitmapContainer() - rcb := rc.toBitmapContainer() - return acb.andNotBitmap(rcb) -} - -func (ac *arrayContainer) iandNot(a container) container { - switch x := a.(type) { - case *arrayContainer: - return ac.iandNotArray(x) - case *bitmapContainer: - return ac.iandNotBitmap(x) - case *runContainer16: - return ac.iandNotRun16(x) - } - panic("unsupported container type") -} 
- -func (ac *arrayContainer) iandNotRun16(rc *runContainer16) container { - // Fast path: if either the array container or the run container is empty, the result is the array. - if ac.isEmpty() || rc.isEmpty() { - // Empty - return ac - } - // Fast path: if the run container is full, the result is empty. - if rc.isFull() { - ac.content = ac.content[:0] - return ac - } - current_run := 0 - // All values in [start_run, end_end] are part of the run - start_run := rc.iv[current_run].start - end_end := start_run + rc.iv[current_run].length - // We are going to read values in the array at index i, and we are - // going to write them at index pos. So we do in-place processing. - // We always have that pos <= i by construction. So we can either - // overwrite a value just read, or a value that was previous read. - pos := 0 - i := 0 - for ; i < len(ac.content); i++ { - if ac.content[i] < start_run { - // the value in the array appears before the run [start_run, end_end] - ac.content[pos] = ac.content[i] - pos++ - } else if ac.content[i] <= end_end { - // nothing to do, the value is in the array but also in the run. - } else { - // We have the value in the array after the run. We cannot tell - // whether we need to keep it or not. So let us move to another run. - if current_run+1 < len(rc.iv) { - current_run++ - start_run = rc.iv[current_run].start - end_end = start_run + rc.iv[current_run].length - i-- // retry with the same i - } else { - // We have exhausted the number of runs. We can keep the rest of the values - // from i to len(ac.content) - 1 inclusively. - break // We are done, the rest of the array will be kept - } - } - } - for ; i < len(ac.content); i++ { - ac.content[pos] = ac.content[i] - pos++ - } - // We 'shink' the slice. 
- ac.content = ac.content[:pos] - return ac -} - -func (ac *arrayContainer) andNotArray(value2 *arrayContainer) container { - value1 := ac - desiredcapacity := value1.getCardinality() - answer := newArrayContainerCapacity(desiredcapacity) - length := difference(value1.content, value2.content, answer.content) - answer.content = answer.content[:length] - return answer -} - -func (ac *arrayContainer) iandNotArray(value2 *arrayContainer) container { - length := difference(ac.content, value2.content, ac.content) - ac.content = ac.content[:length] - return ac -} - -func (ac *arrayContainer) andNotBitmap(value2 *bitmapContainer) container { - desiredcapacity := ac.getCardinality() - answer := newArrayContainerCapacity(desiredcapacity) - answer.content = answer.content[:desiredcapacity] - pos := 0 - for _, v := range ac.content { - answer.content[pos] = v - pos += 1 - int(value2.bitValue(v)) - } - answer.content = answer.content[:pos] - return answer -} - -func (ac *arrayContainer) andBitmap(value2 *bitmapContainer) container { - desiredcapacity := ac.getCardinality() - answer := newArrayContainerCapacity(desiredcapacity) - answer.content = answer.content[:desiredcapacity] - pos := 0 - for _, v := range ac.content { - answer.content[pos] = v - pos += int(value2.bitValue(v)) - } - answer.content = answer.content[:pos] - return answer -} - -func (ac *arrayContainer) iandNotBitmap(value2 *bitmapContainer) container { - pos := 0 - for _, v := range ac.content { - ac.content[pos] = v - pos += 1 - int(value2.bitValue(v)) - } - ac.content = ac.content[:pos] - return ac -} - -func copyOf(array []uint16, size int) []uint16 { - result := make([]uint16, size) - for i, x := range array { - if i == size { - break - } - result[i] = x - } - return result -} - -// flip the values in the range [firstOfRange,endx) -func (ac *arrayContainer) inot(firstOfRange, endx int) container { - if firstOfRange >= endx { - return ac - } - return ac.inotClose(firstOfRange, endx-1) // remove everything in 
[firstOfRange,endx-1] -} - -// flip the values in the range [firstOfRange,lastOfRange] -func (ac *arrayContainer) inotClose(firstOfRange, lastOfRange int) container { - if firstOfRange > lastOfRange { // unlike add and remove, not uses an inclusive range [firstOfRange,lastOfRange] - return ac - } - // determine the span of array indices to be affected - startIndex := binarySearch(ac.content, uint16(firstOfRange)) - if startIndex < 0 { - startIndex = -startIndex - 1 - } - lastIndex := binarySearch(ac.content, uint16(lastOfRange)) - if lastIndex < 0 { - lastIndex = -lastIndex - 1 - 1 - } - currentValuesInRange := lastIndex - startIndex + 1 - spanToBeFlipped := lastOfRange - firstOfRange + 1 - - newValuesInRange := spanToBeFlipped - currentValuesInRange - buffer := make([]uint16, newValuesInRange) - cardinalityChange := newValuesInRange - currentValuesInRange - newCardinality := len(ac.content) + cardinalityChange - if cardinalityChange > 0 { - if newCardinality > len(ac.content) { - if newCardinality > arrayDefaultMaxSize { - bcRet := ac.toBitmapContainer() - bcRet.inot(firstOfRange, lastOfRange+1) - *ac = *bcRet.toArrayContainer() - return bcRet - } - ac.content = copyOf(ac.content, newCardinality) - } - base := lastIndex + 1 - copy(ac.content[lastIndex+1+cardinalityChange:], ac.content[base:base+len(ac.content)-1-lastIndex]) - ac.negateRange(buffer, startIndex, lastIndex, firstOfRange, lastOfRange+1) - } else { // no expansion needed - ac.negateRange(buffer, startIndex, lastIndex, firstOfRange, lastOfRange+1) - if cardinalityChange < 0 { - - for i := startIndex + newValuesInRange; i < newCardinality; i++ { - ac.content[i] = ac.content[i-cardinalityChange] - } - } - } - ac.content = ac.content[:newCardinality] - return ac -} - -func (ac *arrayContainer) negateRange(buffer []uint16, startIndex, lastIndex, startRange, lastRange int) { - // compute the negation into buffer - outPos := 0 - inPos := startIndex // value here always >= valInRange, - // until it is 
exhausted - // n.b., we can start initially exhausted. - - valInRange := startRange - for ; valInRange < lastRange && inPos <= lastIndex; valInRange++ { - if uint16(valInRange) != ac.content[inPos] { - buffer[outPos] = uint16(valInRange) - outPos++ - } else { - inPos++ - } - } - - // if there are extra items (greater than the biggest - // pre-existing one in range), buffer them - for ; valInRange < lastRange; valInRange++ { - buffer[outPos] = uint16(valInRange) - outPos++ - } - - if outPos != len(buffer) { - panic("negateRange: internal bug") - } - - for i, item := range buffer { - ac.content[i+startIndex] = item - } -} - -func (ac *arrayContainer) isFull() bool { - return false -} - -func (ac *arrayContainer) andArray(value2 *arrayContainer) container { - desiredcapacity := minOfInt(ac.getCardinality(), value2.getCardinality()) - answer := newArrayContainerCapacity(desiredcapacity) - length := intersection2by2( - ac.content, - value2.content, - answer.content) - answer.content = answer.content[:length] - return answer -} - -func (ac *arrayContainer) andArrayCardinality(value2 *arrayContainer) int { - return intersection2by2Cardinality( - ac.content, - value2.content) -} - -func (ac *arrayContainer) intersectsArray(value2 *arrayContainer) bool { - return intersects2by2( - ac.content, - value2.content) -} - -func (ac *arrayContainer) iandArray(value2 *arrayContainer) container { - length := intersection2by2( - ac.content, - value2.content, - ac.content) - ac.content = ac.content[:length] - return ac -} - -func (ac *arrayContainer) getCardinality() int { - return len(ac.content) -} - -func (ac *arrayContainer) isEmpty() bool { - return len(ac.content) == 0 -} - -func (ac *arrayContainer) rank(x uint16) int { - answer := binarySearch(ac.content, x) - if answer >= 0 { - return answer + 1 - } - return -answer - 1 - -} - -func (ac *arrayContainer) selectInt(x uint16) int { - return int(ac.content[x]) -} - -func (ac *arrayContainer) clone() container { - ptr := 
arrayContainer{make([]uint16, len(ac.content))} - copy(ptr.content, ac.content[:]) - return &ptr -} - -func (ac *arrayContainer) contains(x uint16) bool { - return binarySearch(ac.content, x) >= 0 -} - -func (ac *arrayContainer) loadData(bitmapContainer *bitmapContainer) { - ac.content = make([]uint16, bitmapContainer.cardinality, bitmapContainer.cardinality) - bitmapContainer.fillArray(ac.content) -} - -func (ac *arrayContainer) resetTo(a container) { - switch x := a.(type) { - case *arrayContainer: - ac.realloc(len(x.content)) - copy(ac.content, x.content) - - case *bitmapContainer: - ac.realloc(x.cardinality) - x.fillArray(ac.content) - - case *runContainer16: - card := int(x.getCardinality()) - ac.realloc(card) - cur := 0 - for _, r := range x.iv { - for val := r.start; val <= r.last(); val++ { - ac.content[cur] = val - cur++ - } - } - - default: - panic("unsupported container type") - } -} - -func (ac *arrayContainer) realloc(size int) { - if cap(ac.content) < size { - ac.content = make([]uint16, size) - } else { - ac.content = ac.content[:size] - } -} - -func newArrayContainer() *arrayContainer { - p := new(arrayContainer) - return p -} - -func newArrayContainerFromBitmap(bc *bitmapContainer) *arrayContainer { - ac := &arrayContainer{} - ac.loadData(bc) - return ac -} - -func newArrayContainerCapacity(size int) *arrayContainer { - p := new(arrayContainer) - p.content = make([]uint16, 0, size) - return p -} - -func newArrayContainerSize(size int) *arrayContainer { - p := new(arrayContainer) - p.content = make([]uint16, size, size) - return p -} - -func newArrayContainerRange(firstOfRun, lastOfRun int) *arrayContainer { - valuesInRange := lastOfRun - firstOfRun + 1 - this := newArrayContainerCapacity(valuesInRange) - for i := 0; i < valuesInRange; i++ { - this.content = append(this.content, uint16(firstOfRun+i)) - } - return this -} - -func (ac *arrayContainer) numberOfRuns() (nr int) { - n := len(ac.content) - var runlen uint16 - var cur, prev uint16 - - 
switch n { - case 0: - return 0 - case 1: - return 1 - default: - for i := 1; i < n; i++ { - prev = ac.content[i-1] - cur = ac.content[i] - - if cur == prev+1 { - runlen++ - } else { - if cur < prev { - panic("the fundamental arrayContainer assumption of sorted ac.content was broken") - } - if cur == prev { - panic("the fundamental arrayContainer assumption of deduplicated content was broken") - } else { - nr++ - runlen = 0 - } - } - } - nr++ - } - return -} - -// convert to run or array *if needed* -func (ac *arrayContainer) toEfficientContainer() container { - - numRuns := ac.numberOfRuns() - - sizeAsRunContainer := runContainer16SerializedSizeInBytes(numRuns) - sizeAsBitmapContainer := bitmapContainerSizeInBytes() - card := ac.getCardinality() - sizeAsArrayContainer := arrayContainerSizeInBytes(card) - - if sizeAsRunContainer <= minOfInt(sizeAsBitmapContainer, sizeAsArrayContainer) { - return newRunContainer16FromArray(ac) - } - if card <= arrayDefaultMaxSize { - return ac - } - return ac.toBitmapContainer() -} - -func (ac *arrayContainer) containerType() contype { - return arrayContype -} - -func (ac *arrayContainer) addOffset(x uint16) (container, container) { - var low, high *arrayContainer - - if len(ac.content) == 0 { - return nil, nil - } - - if y := uint32(ac.content[0]) + uint32(x); highbits(y) == 0 { - // Some elements will fall into low part, allocate a container. - // Checking the first one is enough because they are ordered. - low = &arrayContainer{} - } - if y := uint32(ac.content[len(ac.content)-1]) + uint32(x); highbits(y) > 0 { - // Some elements will fall into high part, allocate a container. - // Checking the last one is enough because they are ordered. - high = &arrayContainer{} - } - - for _, val := range ac.content { - y := uint32(val) + uint32(x) - if highbits(y) > 0 { - // OK, if high == nil then highbits(y) == 0 for all y. - high.content = append(high.content, lowbits(y)) - } else { - // OK, if low == nil then highbits(y) > 0 for all y. 
- low.content = append(low.content, lowbits(y)) - } - } - - // Ensure proper nil interface. - if low == nil { - return nil, high - } - if high == nil { - return low, nil - } - - return low, high -} diff --git a/vendor/github.com/RoaringBitmap/roaring/bitmapcontainer.go b/vendor/github.com/RoaringBitmap/roaring/bitmapcontainer.go deleted file mode 100644 index bf08bfc..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/bitmapcontainer.go +++ /dev/null @@ -1,1236 +0,0 @@ -package roaring - -import ( - "fmt" - "unsafe" -) - -type bitmapContainer struct { - cardinality int - bitmap []uint64 -} - -func (bc bitmapContainer) String() string { - var s string - for it := bc.getShortIterator(); it.hasNext(); { - s += fmt.Sprintf("%v, ", it.next()) - } - return s -} - -func newBitmapContainer() *bitmapContainer { - p := new(bitmapContainer) - size := (1 << 16) / 64 - p.bitmap = make([]uint64, size, size) - return p -} - -func newBitmapContainerwithRange(firstOfRun, lastOfRun int) *bitmapContainer { - bc := newBitmapContainer() - bc.cardinality = lastOfRun - firstOfRun + 1 - if bc.cardinality == maxCapacity { - fill(bc.bitmap, uint64(0xffffffffffffffff)) - } else { - firstWord := firstOfRun / 64 - lastWord := lastOfRun / 64 - zeroPrefixLength := uint64(firstOfRun & 63) - zeroSuffixLength := uint64(63 - (lastOfRun & 63)) - - fillRange(bc.bitmap, firstWord, lastWord+1, uint64(0xffffffffffffffff)) - bc.bitmap[firstWord] ^= ((uint64(1) << zeroPrefixLength) - 1) - blockOfOnes := (uint64(1) << zeroSuffixLength) - 1 - maskOnLeft := blockOfOnes << (uint64(64) - zeroSuffixLength) - bc.bitmap[lastWord] ^= maskOnLeft - } - return bc -} - -func (bc *bitmapContainer) minimum() uint16 { - for i := 0; i < len(bc.bitmap); i++ { - w := bc.bitmap[i] - if w != 0 { - r := countTrailingZeros(w) - return uint16(r + i*64) - } - } - return MaxUint16 -} - -// i should be non-zero -func clz(i uint64) int { - n := 1 - x := uint32(i >> 32) - if x == 0 { - n += 32 - x = uint32(i) - } - if x>>16 == 0 { 
- n += 16 - x = x << 16 - } - if x>>24 == 0 { - n += 8 - x = x << 8 - } - if x>>28 == 0 { - n += 4 - x = x << 4 - } - if x>>30 == 0 { - n += 2 - x = x << 2 - } - return n - int(x>>31) -} - -func (bc *bitmapContainer) maximum() uint16 { - for i := len(bc.bitmap); i > 0; i-- { - w := bc.bitmap[i-1] - if w != 0 { - r := clz(w) - return uint16((i-1)*64 + 63 - r) - } - } - return uint16(0) -} - -func (bc *bitmapContainer) iterate(cb func(x uint16) bool) bool { - iterator := bitmapContainerShortIterator{bc, bc.NextSetBit(0)} - - for iterator.hasNext() { - if !cb(iterator.next()) { - return false - } - } - - return true -} - -type bitmapContainerShortIterator struct { - ptr *bitmapContainer - i int -} - -func (bcsi *bitmapContainerShortIterator) next() uint16 { - j := bcsi.i - bcsi.i = bcsi.ptr.NextSetBit(uint(bcsi.i) + 1) - return uint16(j) -} -func (bcsi *bitmapContainerShortIterator) hasNext() bool { - return bcsi.i >= 0 -} - -func (bcsi *bitmapContainerShortIterator) peekNext() uint16 { - return uint16(bcsi.i) -} - -func (bcsi *bitmapContainerShortIterator) advanceIfNeeded(minval uint16) { - if bcsi.hasNext() && bcsi.peekNext() < minval { - bcsi.i = bcsi.ptr.NextSetBit(uint(minval)) - } -} - -func newBitmapContainerShortIterator(a *bitmapContainer) *bitmapContainerShortIterator { - return &bitmapContainerShortIterator{a, a.NextSetBit(0)} -} - -func (bc *bitmapContainer) getShortIterator() shortPeekable { - return newBitmapContainerShortIterator(bc) -} - -type reverseBitmapContainerShortIterator struct { - ptr *bitmapContainer - i int -} - -func (bcsi *reverseBitmapContainerShortIterator) next() uint16 { - if bcsi.i == -1 { - panic("reverseBitmapContainerShortIterator.next() going beyond what is available") - } - - j := bcsi.i - bcsi.i = bcsi.ptr.PrevSetBit(bcsi.i - 1) - return uint16(j) -} - -func (bcsi *reverseBitmapContainerShortIterator) hasNext() bool { - return bcsi.i >= 0 -} - -func newReverseBitmapContainerShortIterator(a *bitmapContainer) 
*reverseBitmapContainerShortIterator { - if a.cardinality == 0 { - return &reverseBitmapContainerShortIterator{a, -1} - } - return &reverseBitmapContainerShortIterator{a, int(a.maximum())} -} - -func (bc *bitmapContainer) getReverseIterator() shortIterable { - return newReverseBitmapContainerShortIterator(bc) -} - -type bitmapContainerManyIterator struct { - ptr *bitmapContainer - base int - bitset uint64 -} - -func (bcmi *bitmapContainerManyIterator) nextMany(hs uint32, buf []uint32) int { - n := 0 - base := bcmi.base - bitset := bcmi.bitset - - for n < len(buf) { - if bitset == 0 { - base++ - if base >= len(bcmi.ptr.bitmap) { - bcmi.base = base - bcmi.bitset = bitset - return n - } - bitset = bcmi.ptr.bitmap[base] - continue - } - t := bitset & -bitset - buf[n] = uint32(((base * 64) + int(popcount(t-1)))) | hs - n = n + 1 - bitset ^= t - } - - bcmi.base = base - bcmi.bitset = bitset - return n -} - -func (bcmi *bitmapContainerManyIterator) nextMany64(hs uint64, buf []uint64) int { - n := 0 - base := bcmi.base - bitset := bcmi.bitset - - for n < len(buf) { - if bitset == 0 { - base++ - if base >= len(bcmi.ptr.bitmap) { - bcmi.base = base - bcmi.bitset = bitset - return n - } - bitset = bcmi.ptr.bitmap[base] - continue - } - t := bitset & -bitset - buf[n] = uint64(((base * 64) + int(popcount(t-1)))) | hs - n = n + 1 - bitset ^= t - } - - bcmi.base = base - bcmi.bitset = bitset - return n -} - -func newBitmapContainerManyIterator(a *bitmapContainer) *bitmapContainerManyIterator { - return &bitmapContainerManyIterator{a, -1, 0} -} - -func (bc *bitmapContainer) getManyIterator() manyIterable { - return newBitmapContainerManyIterator(bc) -} - -func (bc *bitmapContainer) getSizeInBytes() int { - return len(bc.bitmap) * 8 // + bcBaseBytes -} - -func (bc *bitmapContainer) serializedSizeInBytes() int { - //return bc.Msgsize()// NOO! 
This breaks GetSerializedSizeInBytes - return len(bc.bitmap) * 8 -} - -const bcBaseBytes = int(unsafe.Sizeof(bitmapContainer{})) - -// bitmapContainer doesn't depend on card, always fully allocated -func bitmapContainerSizeInBytes() int { - return bcBaseBytes + (1<<16)/8 -} - -func bitmapEquals(a, b []uint64) bool { - if len(a) != len(b) { - return false - } - for i, v := range a { - if v != b[i] { - return false - } - } - return true -} - -func (bc *bitmapContainer) fillLeastSignificant16bits(x []uint32, i int, mask uint32) int { - // TODO: should be written as optimized assembly - pos := i - base := mask - for k := 0; k < len(bc.bitmap); k++ { - bitset := bc.bitmap[k] - for bitset != 0 { - t := bitset & -bitset - x[pos] = base + uint32(popcount(t-1)) - pos++ - bitset ^= t - } - base += 64 - } - return pos -} - -func (bc *bitmapContainer) equals(o container) bool { - srb, ok := o.(*bitmapContainer) - if ok { - if srb.cardinality != bc.cardinality { - return false - } - return bitmapEquals(bc.bitmap, srb.bitmap) - } - - // use generic comparison - if bc.getCardinality() != o.getCardinality() { - return false - } - ait := o.getShortIterator() - bit := bc.getShortIterator() - - for ait.hasNext() { - if bit.next() != ait.next() { - return false - } - } - return true -} - -func (bc *bitmapContainer) iaddReturnMinimized(i uint16) container { - bc.iadd(i) - if bc.isFull() { - return newRunContainer16Range(0, MaxUint16) - } - return bc -} - -func (bc *bitmapContainer) iadd(i uint16) bool { - x := int(i) - previous := bc.bitmap[x/64] - mask := uint64(1) << (uint(x) % 64) - newb := previous | mask - bc.bitmap[x/64] = newb - bc.cardinality += int((previous ^ newb) >> (uint(x) % 64)) - return newb != previous -} - -func (bc *bitmapContainer) iremoveReturnMinimized(i uint16) container { - if bc.iremove(i) { - if bc.cardinality == arrayDefaultMaxSize { - return bc.toArrayContainer() - } - } - return bc -} - -// iremove returns true if i was found. 
-func (bc *bitmapContainer) iremove(i uint16) bool { - if bc.contains(i) { - bc.cardinality-- - bc.bitmap[i/64] &^= (uint64(1) << (i % 64)) - return true - } - return false -} - -func (bc *bitmapContainer) isFull() bool { - return bc.cardinality == int(MaxUint16)+1 -} - -func (bc *bitmapContainer) getCardinality() int { - return bc.cardinality -} - -func (bc *bitmapContainer) isEmpty() bool { - return bc.cardinality == 0 -} - -func (bc *bitmapContainer) clone() container { - ptr := bitmapContainer{bc.cardinality, make([]uint64, len(bc.bitmap))} - copy(ptr.bitmap, bc.bitmap[:]) - return &ptr -} - -// add all values in range [firstOfRange,lastOfRange) -func (bc *bitmapContainer) iaddRange(firstOfRange, lastOfRange int) container { - bc.cardinality += setBitmapRangeAndCardinalityChange(bc.bitmap, firstOfRange, lastOfRange) - return bc -} - -// remove all values in range [firstOfRange,lastOfRange) -func (bc *bitmapContainer) iremoveRange(firstOfRange, lastOfRange int) container { - bc.cardinality += resetBitmapRangeAndCardinalityChange(bc.bitmap, firstOfRange, lastOfRange) - if bc.getCardinality() <= arrayDefaultMaxSize { - return bc.toArrayContainer() - } - return bc -} - -// flip all values in range [firstOfRange,endx) -func (bc *bitmapContainer) inot(firstOfRange, endx int) container { - if endx-firstOfRange == maxCapacity { - flipBitmapRange(bc.bitmap, firstOfRange, endx) - bc.cardinality = maxCapacity - bc.cardinality - } else if endx-firstOfRange > maxCapacity/2 { - flipBitmapRange(bc.bitmap, firstOfRange, endx) - bc.computeCardinality() - } else { - bc.cardinality += flipBitmapRangeAndCardinalityChange(bc.bitmap, firstOfRange, endx) - } - if bc.getCardinality() <= arrayDefaultMaxSize { - return bc.toArrayContainer() - } - return bc -} - -// flip all values in range [firstOfRange,endx) -func (bc *bitmapContainer) not(firstOfRange, endx int) container { - answer := bc.clone() - return answer.inot(firstOfRange, endx) -} - -func (bc *bitmapContainer) or(a container) 
container { - switch x := a.(type) { - case *arrayContainer: - return bc.orArray(x) - case *bitmapContainer: - return bc.orBitmap(x) - case *runContainer16: - if x.isFull() { - return x.clone() - } - return x.orBitmapContainer(bc) - } - panic("unsupported container type") -} - -func (bc *bitmapContainer) orCardinality(a container) int { - switch x := a.(type) { - case *arrayContainer: - return bc.orArrayCardinality(x) - case *bitmapContainer: - return bc.orBitmapCardinality(x) - case *runContainer16: - return x.orBitmapContainerCardinality(bc) - } - panic("unsupported container type") -} - -func (bc *bitmapContainer) ior(a container) container { - switch x := a.(type) { - case *arrayContainer: - return bc.iorArray(x) - case *bitmapContainer: - return bc.iorBitmap(x) - case *runContainer16: - if x.isFull() { - return x.clone() - } - for i := range x.iv { - bc.iaddRange(int(x.iv[i].start), int(x.iv[i].last())+1) - } - if bc.isFull() { - return newRunContainer16Range(0, MaxUint16) - } - //bc.computeCardinality() - return bc - } - panic(fmt.Errorf("unsupported container type %T", a)) -} - -func (bc *bitmapContainer) lazyIOR(a container) container { - switch x := a.(type) { - case *arrayContainer: - return bc.lazyIORArray(x) - case *bitmapContainer: - return bc.lazyIORBitmap(x) - case *runContainer16: - if x.isFull() { - return x.clone() - } - - // Manually inlined setBitmapRange function - bitmap := bc.bitmap - for _, iv := range x.iv { - start := int(iv.start) - end := int(iv.last()) + 1 - if start >= end { - continue - } - firstword := start / 64 - endword := (end - 1) / 64 - if firstword == endword { - bitmap[firstword] |= (^uint64(0) << uint(start%64)) & (^uint64(0) >> (uint(-end) % 64)) - continue - } - bitmap[firstword] |= ^uint64(0) << uint(start%64) - for i := firstword + 1; i < endword; i++ { - bitmap[i] = ^uint64(0) - } - bitmap[endword] |= ^uint64(0) >> (uint(-end) % 64) - } - bc.cardinality = invalidCardinality - return bc - } - panic("unsupported container 
type") -} - -func (bc *bitmapContainer) lazyOR(a container) container { - switch x := a.(type) { - case *arrayContainer: - return bc.lazyORArray(x) - case *bitmapContainer: - return bc.lazyORBitmap(x) - case *runContainer16: - if x.isFull() { - return x.clone() - } - // TODO: implement lazy OR - return x.orBitmapContainer(bc) - - } - panic("unsupported container type") -} - -func (bc *bitmapContainer) orArray(value2 *arrayContainer) container { - answer := bc.clone().(*bitmapContainer) - c := value2.getCardinality() - for k := 0; k < c; k++ { - v := value2.content[k] - i := uint(v) >> 6 - bef := answer.bitmap[i] - aft := bef | (uint64(1) << (v % 64)) - answer.bitmap[i] = aft - answer.cardinality += int((bef - aft) >> 63) - } - return answer -} - -func (bc *bitmapContainer) orArrayCardinality(value2 *arrayContainer) int { - answer := 0 - c := value2.getCardinality() - for k := 0; k < c; k++ { - // branchless: - v := value2.content[k] - i := uint(v) >> 6 - bef := bc.bitmap[i] - aft := bef | (uint64(1) << (v % 64)) - answer += int((bef - aft) >> 63) - } - return answer -} - -func (bc *bitmapContainer) orBitmap(value2 *bitmapContainer) container { - answer := newBitmapContainer() - for k := 0; k < len(answer.bitmap); k++ { - answer.bitmap[k] = bc.bitmap[k] | value2.bitmap[k] - } - answer.computeCardinality() - if answer.isFull() { - return newRunContainer16Range(0, MaxUint16) - } - return answer -} - -func (bc *bitmapContainer) orBitmapCardinality(value2 *bitmapContainer) int { - return int(popcntOrSlice(bc.bitmap, value2.bitmap)) -} - -func (bc *bitmapContainer) andBitmapCardinality(value2 *bitmapContainer) int { - return int(popcntAndSlice(bc.bitmap, value2.bitmap)) -} - -func (bc *bitmapContainer) computeCardinality() { - bc.cardinality = int(popcntSlice(bc.bitmap)) -} - -func (bc *bitmapContainer) iorArray(ac *arrayContainer) container { - for k := range ac.content { - vc := ac.content[k] - i := uint(vc) >> 6 - bef := bc.bitmap[i] - aft := bef | (uint64(1) << (vc % 
64)) - bc.bitmap[i] = aft - bc.cardinality += int((bef - aft) >> 63) - } - if bc.isFull() { - return newRunContainer16Range(0, MaxUint16) - } - return bc -} - -func (bc *bitmapContainer) iorBitmap(value2 *bitmapContainer) container { - answer := bc - answer.cardinality = 0 - for k := 0; k < len(answer.bitmap); k++ { - answer.bitmap[k] = bc.bitmap[k] | value2.bitmap[k] - } - answer.computeCardinality() - if bc.isFull() { - return newRunContainer16Range(0, MaxUint16) - } - return answer -} - -func (bc *bitmapContainer) lazyIORArray(value2 *arrayContainer) container { - answer := bc - c := value2.getCardinality() - for k := 0; k+3 < c; k += 4 { - content := (*[4]uint16)(unsafe.Pointer(&value2.content[k])) - vc0 := content[0] - i0 := uint(vc0) >> 6 - answer.bitmap[i0] = answer.bitmap[i0] | (uint64(1) << (vc0 % 64)) - - vc1 := content[1] - i1 := uint(vc1) >> 6 - answer.bitmap[i1] = answer.bitmap[i1] | (uint64(1) << (vc1 % 64)) - - vc2 := content[2] - i2 := uint(vc2) >> 6 - answer.bitmap[i2] = answer.bitmap[i2] | (uint64(1) << (vc2 % 64)) - - vc3 := content[3] - i3 := uint(vc3) >> 6 - answer.bitmap[i3] = answer.bitmap[i3] | (uint64(1) << (vc3 % 64)) - } - - for k := c &^ 3; k < c; k++ { - vc := value2.content[k] - i := uint(vc) >> 6 - answer.bitmap[i] = answer.bitmap[i] | (uint64(1) << (vc % 64)) - } - - answer.cardinality = invalidCardinality - return answer -} - -func (bc *bitmapContainer) lazyORArray(value2 *arrayContainer) container { - answer := bc.clone().(*bitmapContainer) - return answer.lazyIORArray(value2) -} - -func (bc *bitmapContainer) lazyIORBitmap(value2 *bitmapContainer) container { - answer := bc - for k := 0; k < len(answer.bitmap); k++ { - answer.bitmap[k] = bc.bitmap[k] | value2.bitmap[k] - } - bc.cardinality = invalidCardinality - return answer -} - -func (bc *bitmapContainer) lazyORBitmap(value2 *bitmapContainer) container { - answer := bc.clone().(*bitmapContainer) - return answer.lazyIORBitmap(value2) -} - -func (bc *bitmapContainer) xor(a 
container) container { - switch x := a.(type) { - case *arrayContainer: - return bc.xorArray(x) - case *bitmapContainer: - return bc.xorBitmap(x) - case *runContainer16: - return x.xorBitmap(bc) - } - panic("unsupported container type") -} - -func (bc *bitmapContainer) xorArray(value2 *arrayContainer) container { - answer := bc.clone().(*bitmapContainer) - c := value2.getCardinality() - for k := 0; k < c; k++ { - vc := value2.content[k] - index := uint(vc) >> 6 - abi := answer.bitmap[index] - mask := uint64(1) << (vc % 64) - answer.cardinality += 1 - 2*int((abi&mask)>>(vc%64)) - answer.bitmap[index] = abi ^ mask - } - if answer.cardinality <= arrayDefaultMaxSize { - return answer.toArrayContainer() - } - return answer -} - -func (bc *bitmapContainer) rank(x uint16) int { - // TODO: rewrite in assembly - leftover := (uint(x) + 1) & 63 - if leftover == 0 { - return int(popcntSlice(bc.bitmap[:(uint(x)+1)/64])) - } - return int(popcntSlice(bc.bitmap[:(uint(x)+1)/64]) + popcount(bc.bitmap[(uint(x)+1)/64]<<(64-leftover))) -} - -func (bc *bitmapContainer) selectInt(x uint16) int { - remaining := x - for k := 0; k < len(bc.bitmap); k++ { - w := popcount(bc.bitmap[k]) - if uint16(w) > remaining { - return k*64 + selectBitPosition(bc.bitmap[k], int(remaining)) - } - remaining -= uint16(w) - } - return -1 -} - -func (bc *bitmapContainer) xorBitmap(value2 *bitmapContainer) container { - newCardinality := int(popcntXorSlice(bc.bitmap, value2.bitmap)) - - if newCardinality > arrayDefaultMaxSize { - answer := newBitmapContainer() - for k := 0; k < len(answer.bitmap); k++ { - answer.bitmap[k] = bc.bitmap[k] ^ value2.bitmap[k] - } - answer.cardinality = newCardinality - if answer.isFull() { - return newRunContainer16Range(0, MaxUint16) - } - return answer - } - ac := newArrayContainerSize(newCardinality) - fillArrayXOR(ac.content, bc.bitmap, value2.bitmap) - ac.content = ac.content[:newCardinality] - return ac -} - -func (bc *bitmapContainer) and(a container) container { - switch x 
:= a.(type) { - case *arrayContainer: - return bc.andArray(x) - case *bitmapContainer: - return bc.andBitmap(x) - case *runContainer16: - if x.isFull() { - return bc.clone() - } - return x.andBitmapContainer(bc) - } - panic("unsupported container type") -} - -func (bc *bitmapContainer) andCardinality(a container) int { - switch x := a.(type) { - case *arrayContainer: - return bc.andArrayCardinality(x) - case *bitmapContainer: - return bc.andBitmapCardinality(x) - case *runContainer16: - return x.andBitmapContainerCardinality(bc) - } - panic("unsupported container type") -} - -func (bc *bitmapContainer) intersects(a container) bool { - switch x := a.(type) { - case *arrayContainer: - return bc.intersectsArray(x) - case *bitmapContainer: - return bc.intersectsBitmap(x) - case *runContainer16: - return x.intersects(bc) - - } - panic("unsupported container type") -} - -func (bc *bitmapContainer) iand(a container) container { - switch x := a.(type) { - case *arrayContainer: - return bc.iandArray(x) - case *bitmapContainer: - return bc.iandBitmap(x) - case *runContainer16: - if x.isFull() { - return bc.clone() - } - return bc.iandRun16(x) - } - panic("unsupported container type") -} - -func (bc *bitmapContainer) iandRun16(rc *runContainer16) container { - rcb := newBitmapContainerFromRun(rc) - return bc.iandBitmap(rcb) -} - -func (bc *bitmapContainer) iandArray(ac *arrayContainer) container { - acb := ac.toBitmapContainer() - return bc.iandBitmap(acb) -} - -func (bc *bitmapContainer) andArray(value2 *arrayContainer) *arrayContainer { - answer := newArrayContainerCapacity(len(value2.content)) - answer.content = answer.content[:cap(answer.content)] - c := value2.getCardinality() - pos := 0 - for k := 0; k < c; k++ { - v := value2.content[k] - answer.content[pos] = v - pos += int(bc.bitValue(v)) - } - answer.content = answer.content[:pos] - return answer -} - -func (bc *bitmapContainer) andArrayCardinality(value2 *arrayContainer) int { - c := value2.getCardinality() - pos 
:= 0 - for k := 0; k < c; k++ { - v := value2.content[k] - pos += int(bc.bitValue(v)) - } - return pos -} - -func (bc *bitmapContainer) getCardinalityInRange(start, end uint) int { - if start >= end { - return 0 - } - firstword := start / 64 - endword := (end - 1) / 64 - const allones = ^uint64(0) - if firstword == endword { - return int(popcount(bc.bitmap[firstword] & ((allones << (start % 64)) & (allones >> ((64 - end) & 63))))) - } - answer := popcount(bc.bitmap[firstword] & (allones << (start % 64))) - answer += popcntSlice(bc.bitmap[firstword+1 : endword]) - answer += popcount(bc.bitmap[endword] & (allones >> ((64 - end) & 63))) - return int(answer) -} - -func (bc *bitmapContainer) andBitmap(value2 *bitmapContainer) container { - newcardinality := int(popcntAndSlice(bc.bitmap, value2.bitmap)) - if newcardinality > arrayDefaultMaxSize { - answer := newBitmapContainer() - for k := 0; k < len(answer.bitmap); k++ { - answer.bitmap[k] = bc.bitmap[k] & value2.bitmap[k] - } - answer.cardinality = newcardinality - return answer - } - ac := newArrayContainerSize(newcardinality) - fillArrayAND(ac.content, bc.bitmap, value2.bitmap) - ac.content = ac.content[:newcardinality] //not sure why i need this - return ac - -} - -func (bc *bitmapContainer) intersectsArray(value2 *arrayContainer) bool { - c := value2.getCardinality() - for k := 0; k < c; k++ { - v := value2.content[k] - if bc.contains(v) { - return true - } - } - return false -} - -func (bc *bitmapContainer) intersectsBitmap(value2 *bitmapContainer) bool { - for k := 0; k < len(bc.bitmap); k++ { - if (bc.bitmap[k] & value2.bitmap[k]) != 0 { - return true - } - } - return false - -} - -func (bc *bitmapContainer) iandBitmap(value2 *bitmapContainer) container { - newcardinality := int(popcntAndSlice(bc.bitmap, value2.bitmap)) - for k := 0; k < len(bc.bitmap); k++ { - bc.bitmap[k] = bc.bitmap[k] & value2.bitmap[k] - } - bc.cardinality = newcardinality - - if newcardinality <= arrayDefaultMaxSize { - return 
newArrayContainerFromBitmap(bc) - } - return bc -} - -func (bc *bitmapContainer) andNot(a container) container { - switch x := a.(type) { - case *arrayContainer: - return bc.andNotArray(x) - case *bitmapContainer: - return bc.andNotBitmap(x) - case *runContainer16: - return bc.andNotRun16(x) - } - panic("unsupported container type") -} - -func (bc *bitmapContainer) andNotRun16(rc *runContainer16) container { - rcb := rc.toBitmapContainer() - return bc.andNotBitmap(rcb) -} - -func (bc *bitmapContainer) iandNot(a container) container { - switch x := a.(type) { - case *arrayContainer: - return bc.iandNotArray(x) - case *bitmapContainer: - return bc.iandNotBitmapSurely(x) - case *runContainer16: - return bc.iandNotRun16(x) - } - panic("unsupported container type") -} - -func (bc *bitmapContainer) iandNotArray(ac *arrayContainer) container { - if ac.isEmpty() || bc.isEmpty() { - // Nothing to do. - return bc - } - - // Word by word, we remove the elements in ac from bc. The approach is to build - // a mask of the elements to remove, and then apply it to the bitmap. - wordIdx := uint16(0) - mask := uint64(0) - for i, v := range ac.content { - if v/64 != wordIdx { - // Flush the current word. - if i != 0 { - // We're removing bits that are set in the mask and in the current word. - // To figure out the cardinality change, we count the number of bits that - // are set in the mask and in the current word. - mask &= bc.bitmap[wordIdx] - bc.bitmap[wordIdx] &= ^mask - bc.cardinality -= int(popcount(mask)) - } - - wordIdx = v / 64 - mask = 0 - } - mask |= 1 << (v % 64) - } - - // Flush the last word. - mask &= bc.bitmap[wordIdx] - bc.bitmap[wordIdx] &= ^mask - bc.cardinality -= int(popcount(mask)) - - if bc.getCardinality() <= arrayDefaultMaxSize { - return bc.toArrayContainer() - } - return bc -} - -func (bc *bitmapContainer) iandNotRun16(rc *runContainer16) container { - if rc.isEmpty() || bc.isEmpty() { - // Nothing to do. 
- return bc - } - - wordRangeStart := rc.iv[0].start / 64 - wordRangeEnd := (rc.iv[len(rc.iv)-1].last()) / 64 // inclusive - - cardinalityChange := popcntSlice(bc.bitmap[wordRangeStart : wordRangeEnd+1]) // before cardinality - after cardinality (for word range) - - for _, iv := range rc.iv { - resetBitmapRange(bc.bitmap, int(iv.start), int(iv.last())+1) - } - - cardinalityChange -= popcntSlice(bc.bitmap[wordRangeStart : wordRangeEnd+1]) - - bc.cardinality -= int(cardinalityChange) - - if bc.getCardinality() <= arrayDefaultMaxSize { - return bc.toArrayContainer() - } - return bc -} - -func (bc *bitmapContainer) andNotArray(value2 *arrayContainer) container { - answer := bc.clone().(*bitmapContainer) - c := value2.getCardinality() - for k := 0; k < c; k++ { - vc := value2.content[k] - i := uint(vc) >> 6 - oldv := answer.bitmap[i] - newv := oldv &^ (uint64(1) << (vc % 64)) - answer.bitmap[i] = newv - answer.cardinality -= int((oldv ^ newv) >> (vc % 64)) - } - if answer.cardinality <= arrayDefaultMaxSize { - return answer.toArrayContainer() - } - return answer -} - -func (bc *bitmapContainer) andNotBitmap(value2 *bitmapContainer) container { - newCardinality := int(popcntMaskSlice(bc.bitmap, value2.bitmap)) - if newCardinality > arrayDefaultMaxSize { - answer := newBitmapContainer() - for k := 0; k < len(answer.bitmap); k++ { - answer.bitmap[k] = bc.bitmap[k] &^ value2.bitmap[k] - } - answer.cardinality = newCardinality - return answer - } - ac := newArrayContainerSize(newCardinality) - fillArrayANDNOT(ac.content, bc.bitmap, value2.bitmap) - return ac -} - -func (bc *bitmapContainer) iandNotBitmapSurely(value2 *bitmapContainer) container { - newCardinality := int(popcntMaskSlice(bc.bitmap, value2.bitmap)) - for k := 0; k < len(bc.bitmap); k++ { - bc.bitmap[k] = bc.bitmap[k] &^ value2.bitmap[k] - } - bc.cardinality = newCardinality - if bc.getCardinality() <= arrayDefaultMaxSize { - return bc.toArrayContainer() - } - return bc -} - -func (bc *bitmapContainer) 
contains(i uint16) bool { //testbit - x := uint(i) - w := bc.bitmap[x>>6] - mask := uint64(1) << (x & 63) - return (w & mask) != 0 -} - -func (bc *bitmapContainer) bitValue(i uint16) uint64 { - x := uint(i) - w := bc.bitmap[x>>6] - return (w >> (x & 63)) & 1 -} - -func (bc *bitmapContainer) loadData(arrayContainer *arrayContainer) { - bc.cardinality = arrayContainer.getCardinality() - c := arrayContainer.getCardinality() - for k := 0; k < c; k++ { - x := arrayContainer.content[k] - i := int(x) / 64 - bc.bitmap[i] |= (uint64(1) << uint(x%64)) - } -} - -func (bc *bitmapContainer) resetTo(a container) { - switch x := a.(type) { - case *arrayContainer: - fill(bc.bitmap, 0) - bc.loadData(x) - - case *bitmapContainer: - bc.cardinality = x.cardinality - copy(bc.bitmap, x.bitmap) - - case *runContainer16: - bc.cardinality = len(x.iv) - lastEnd := 0 - for _, r := range x.iv { - bc.cardinality += int(r.length) - resetBitmapRange(bc.bitmap, lastEnd, int(r.start)) - lastEnd = int(r.start+r.length) + 1 - setBitmapRange(bc.bitmap, int(r.start), lastEnd) - } - resetBitmapRange(bc.bitmap, lastEnd, maxCapacity) - - default: - panic("unsupported container type") - } -} - -func (bc *bitmapContainer) toArrayContainer() *arrayContainer { - ac := &arrayContainer{} - ac.loadData(bc) - return ac -} - -func (bc *bitmapContainer) fillArray(container []uint16) { - //TODO: rewrite in assembly - pos := 0 - base := 0 - for k := 0; k < len(bc.bitmap); k++ { - bitset := bc.bitmap[k] - for bitset != 0 { - t := bitset & -bitset - container[pos] = uint16((base + int(popcount(t-1)))) - pos = pos + 1 - bitset ^= t - } - base += 64 - } -} - -func (bc *bitmapContainer) NextSetBit(i uint) int { - var ( - x = i / 64 - length = uint(len(bc.bitmap)) - ) - if x >= length { - return -1 - } - w := bc.bitmap[x] - w = w >> uint(i%64) - if w != 0 { - return int(i) + countTrailingZeros(w) - } - x++ - for ; x < length; x++ { - if bc.bitmap[x] != 0 { - return int(x*64) + countTrailingZeros(bc.bitmap[x]) - } - } - 
return -1 -} - -func (bc *bitmapContainer) PrevSetBit(i int) int { - if i < 0 { - return -1 - } - x := i / 64 - if x >= len(bc.bitmap) { - return -1 - } - - w := bc.bitmap[x] - - b := i % 64 - - w = w << uint(63-b) - if w != 0 { - return i - countLeadingZeros(w) - } - x-- - for ; x >= 0; x-- { - if bc.bitmap[x] != 0 { - return (x * 64) + 63 - countLeadingZeros(bc.bitmap[x]) - } - } - return -1 -} - -// reference the java implementation -// https://github.com/RoaringBitmap/RoaringBitmap/blob/master/src/main/java/org/roaringbitmap/BitmapContainer.java#L875-L892 -func (bc *bitmapContainer) numberOfRuns() int { - if bc.cardinality == 0 { - return 0 - } - - var numRuns uint64 - nextWord := bc.bitmap[0] - - for i := 0; i < len(bc.bitmap)-1; i++ { - word := nextWord - nextWord = bc.bitmap[i+1] - numRuns += popcount((^word)&(word<<1)) + ((word >> 63) &^ nextWord) - } - - word := nextWord - numRuns += popcount((^word) & (word << 1)) - if (word & 0x8000000000000000) != 0 { - numRuns++ - } - - return int(numRuns) -} - -// convert to run or array *if needed* -func (bc *bitmapContainer) toEfficientContainer() container { - - numRuns := bc.numberOfRuns() - - sizeAsRunContainer := runContainer16SerializedSizeInBytes(numRuns) - sizeAsBitmapContainer := bitmapContainerSizeInBytes() - card := bc.getCardinality() - sizeAsArrayContainer := arrayContainerSizeInBytes(card) - - if sizeAsRunContainer <= minOfInt(sizeAsBitmapContainer, sizeAsArrayContainer) { - return newRunContainer16FromBitmapContainer(bc) - } - if card <= arrayDefaultMaxSize { - return bc.toArrayContainer() - } - return bc -} - -func newBitmapContainerFromRun(rc *runContainer16) *bitmapContainer { - - if len(rc.iv) == 1 { - return newBitmapContainerwithRange(int(rc.iv[0].start), int(rc.iv[0].last())) - } - - bc := newBitmapContainer() - for i := range rc.iv { - setBitmapRange(bc.bitmap, int(rc.iv[i].start), int(rc.iv[i].last())+1) - bc.cardinality += int(rc.iv[i].last()) + 1 - int(rc.iv[i].start) - } - 
//bc.computeCardinality() - return bc -} - -func (bc *bitmapContainer) containerType() contype { - return bitmapContype -} - -func (bc *bitmapContainer) addOffset(x uint16) (container, container) { - var low, high *bitmapContainer - - if bc.cardinality == 0 { - return nil, nil - } - - b := uint32(x) >> 6 - i := uint32(x) % 64 - end := uint32(1024) - b - - low = newBitmapContainer() - if i == 0 { - copy(low.bitmap[b:], bc.bitmap[:end]) - } else { - low.bitmap[b] = bc.bitmap[0] << i - for k := uint32(1); k < end; k++ { - newval := bc.bitmap[k] << i - newval |= bc.bitmap[k-1] >> (64 - i) - low.bitmap[b+k] = newval - } - } - low.computeCardinality() - - if low.cardinality == bc.cardinality { - // All elements from bc ended up in low, meaning high will be empty. - return low, nil - } - - if low.cardinality == 0 { - // low is empty, let's reuse the container for high. - high = low - low = nil - } else { - // None of the containers will be empty, so allocate both. - high = newBitmapContainer() - } - - if i == 0 { - copy(high.bitmap[:b], bc.bitmap[end:]) - } else { - for k := end; k < 1024; k++ { - newval := bc.bitmap[k] << i - newval |= bc.bitmap[k-1] >> (64 - i) - high.bitmap[k-end] = newval - } - high.bitmap[b] = bc.bitmap[1023] >> (64 - i) - } - high.computeCardinality() - - // Ensure proper nil interface. 
- if low == nil { - return nil, high - } - - return low, high -} diff --git a/vendor/github.com/RoaringBitmap/roaring/clz.go b/vendor/github.com/RoaringBitmap/roaring/clz.go deleted file mode 100644 index ee0ebc6..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/clz.go +++ /dev/null @@ -1,13 +0,0 @@ -//go:build go1.9 -// +build go1.9 - -// "go1.9", from Go version 1.9 onward -// See https://golang.org/pkg/go/build/#hdr-Build_Constraints - -package roaring - -import "math/bits" - -func countLeadingZeros(x uint64) int { - return bits.LeadingZeros64(x) -} diff --git a/vendor/github.com/RoaringBitmap/roaring/clz_compat.go b/vendor/github.com/RoaringBitmap/roaring/clz_compat.go deleted file mode 100644 index 7ee16b4..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/clz_compat.go +++ /dev/null @@ -1,37 +0,0 @@ -//go:build !go1.9 -// +build !go1.9 - -package roaring - -// LeadingZeroBits returns the number of consecutive most significant zero -// bits of x. -func countLeadingZeros(i uint64) int { - if i == 0 { - return 64 - } - n := 1 - x := uint32(i >> 32) - if x == 0 { - n += 32 - x = uint32(i) - } - if (x >> 16) == 0 { - n += 16 - x <<= 16 - } - if (x >> 24) == 0 { - n += 8 - x <<= 8 - } - if x>>28 == 0 { - n += 4 - x <<= 4 - } - if x>>30 == 0 { - n += 2 - x <<= 2 - - } - n -= int(x >> 31) - return n -} diff --git a/vendor/github.com/RoaringBitmap/roaring/ctz.go b/vendor/github.com/RoaringBitmap/roaring/ctz.go deleted file mode 100644 index fbcfe91..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/ctz.go +++ /dev/null @@ -1,13 +0,0 @@ -//go:build go1.9 -// +build go1.9 - -// "go1.9", from Go version 1.9 onward -// See https://golang.org/pkg/go/build/#hdr-Build_Constraints - -package roaring - -import "math/bits" - -func countTrailingZeros(x uint64) int { - return bits.TrailingZeros64(x) -} diff --git a/vendor/github.com/RoaringBitmap/roaring/ctz_compat.go b/vendor/github.com/RoaringBitmap/roaring/ctz_compat.go deleted file mode 100644 index 
d01df82..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/ctz_compat.go +++ /dev/null @@ -1,72 +0,0 @@ -//go:build !go1.9 -// +build !go1.9 - -package roaring - -// Reuse of portions of go/src/math/big standard lib code -// under this license: -/* -Copyright (c) 2009 The Go Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -const deBruijn32 = 0x077CB531 - -var deBruijn32Lookup = []byte{ - 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, - 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9, -} - -const deBruijn64 = 0x03f79d71b4ca8b09 - -var deBruijn64Lookup = []byte{ - 0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4, - 62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5, - 63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11, - 54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6, -} - -// trailingZeroBits returns the number of consecutive least significant zero -// bits of x. -func countTrailingZeros(x uint64) int { - // x & -x leaves only the right-most bit set in the word. Let k be the - // index of that bit. Since only a single bit is set, the value is two - // to the power of k. Multiplying by a power of two is equivalent to - // left shifting, in this case by k bits. The de Bruijn constant is - // such that all six bit, consecutive substrings are distinct. - // Therefore, if we have a left shifted version of this constant we can - // find by how many bits it was shifted by looking at which six bit - // substring ended up at the top of the word. - // (Knuth, volume 4, section 7.3.1) - if x == 0 { - // We have to special case 0; the fomula - // below doesn't work for 0. 
- return 64 - } - return int(deBruijn64Lookup[((x&-x)*(deBruijn64))>>58]) -} diff --git a/vendor/github.com/RoaringBitmap/roaring/fastaggregation.go b/vendor/github.com/RoaringBitmap/roaring/fastaggregation.go deleted file mode 100644 index 7d0a92f..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/fastaggregation.go +++ /dev/null @@ -1,313 +0,0 @@ -package roaring - -import ( - "container/heap" -) - -// Or function that requires repairAfterLazy -func lazyOR(x1, x2 *Bitmap) *Bitmap { - answer := NewBitmap() - pos1 := 0 - pos2 := 0 - length1 := x1.highlowcontainer.size() - length2 := x2.highlowcontainer.size() -main: - for (pos1 < length1) && (pos2 < length2) { - s1 := x1.highlowcontainer.getKeyAtIndex(pos1) - s2 := x2.highlowcontainer.getKeyAtIndex(pos2) - - for { - if s1 < s2 { - answer.highlowcontainer.appendCopy(x1.highlowcontainer, pos1) - pos1++ - if pos1 == length1 { - break main - } - s1 = x1.highlowcontainer.getKeyAtIndex(pos1) - } else if s1 > s2 { - answer.highlowcontainer.appendCopy(x2.highlowcontainer, pos2) - pos2++ - if pos2 == length2 { - break main - } - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } else { - c1 := x1.highlowcontainer.getContainerAtIndex(pos1) - answer.highlowcontainer.appendContainer(s1, c1.lazyOR(x2.highlowcontainer.getContainerAtIndex(pos2)), false) - pos1++ - pos2++ - if (pos1 == length1) || (pos2 == length2) { - break main - } - s1 = x1.highlowcontainer.getKeyAtIndex(pos1) - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } - } - } - if pos1 == length1 { - answer.highlowcontainer.appendCopyMany(x2.highlowcontainer, pos2, length2) - } else if pos2 == length2 { - answer.highlowcontainer.appendCopyMany(x1.highlowcontainer, pos1, length1) - } - return answer -} - -// In-place Or function that requires repairAfterLazy -func (x1 *Bitmap) lazyOR(x2 *Bitmap) *Bitmap { - pos1 := 0 - pos2 := 0 - length1 := x1.highlowcontainer.size() - length2 := x2.highlowcontainer.size() -main: - for (pos1 < length1) && (pos2 < length2) { - s1 := 
x1.highlowcontainer.getKeyAtIndex(pos1) - s2 := x2.highlowcontainer.getKeyAtIndex(pos2) - - for { - if s1 < s2 { - pos1++ - if pos1 == length1 { - break main - } - s1 = x1.highlowcontainer.getKeyAtIndex(pos1) - } else if s1 > s2 { - x1.highlowcontainer.insertNewKeyValueAt(pos1, s2, x2.highlowcontainer.getContainerAtIndex(pos2).clone()) - pos2++ - pos1++ - length1++ - if pos2 == length2 { - break main - } - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } else { - c1 := x1.highlowcontainer.getWritableContainerAtIndex(pos1) - x1.highlowcontainer.containers[pos1] = c1.lazyIOR(x2.highlowcontainer.getContainerAtIndex(pos2)) - x1.highlowcontainer.needCopyOnWrite[pos1] = false - pos1++ - pos2++ - if (pos1 == length1) || (pos2 == length2) { - break main - } - s1 = x1.highlowcontainer.getKeyAtIndex(pos1) - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } - } - } - if pos1 == length1 { - x1.highlowcontainer.appendCopyMany(x2.highlowcontainer, pos2, length2) - } - return x1 -} - -// to be called after lazy aggregates -func (x1 *Bitmap) repairAfterLazy() { - for pos := 0; pos < x1.highlowcontainer.size(); pos++ { - c := x1.highlowcontainer.getContainerAtIndex(pos) - switch c.(type) { - case *bitmapContainer: - if c.(*bitmapContainer).cardinality == invalidCardinality { - c = x1.highlowcontainer.getWritableContainerAtIndex(pos) - c.(*bitmapContainer).computeCardinality() - if c.(*bitmapContainer).getCardinality() <= arrayDefaultMaxSize { - x1.highlowcontainer.setContainerAtIndex(pos, c.(*bitmapContainer).toArrayContainer()) - } else if c.(*bitmapContainer).isFull() { - x1.highlowcontainer.setContainerAtIndex(pos, newRunContainer16Range(0, MaxUint16)) - } - } - } - } -} - -// FastAnd computes the intersection between many bitmaps quickly -// Compared to the And function, it can take many bitmaps as input, thus saving the trouble -// of manually calling "And" many times. 
-// -// Performance hints: if you have very large and tiny bitmaps, -// it may be beneficial performance-wise to put a tiny bitmap -// in first position. -func FastAnd(bitmaps ...*Bitmap) *Bitmap { - if len(bitmaps) == 0 { - return NewBitmap() - } else if len(bitmaps) == 1 { - return bitmaps[0].Clone() - } - answer := And(bitmaps[0], bitmaps[1]) - for _, bm := range bitmaps[2:] { - answer.And(bm) - } - return answer -} - -// FastOr computes the union between many bitmaps quickly, as opposed to having to call Or repeatedly. -// It might also be faster than calling Or repeatedly. -func FastOr(bitmaps ...*Bitmap) *Bitmap { - if len(bitmaps) == 0 { - return NewBitmap() - } else if len(bitmaps) == 1 { - return bitmaps[0].Clone() - } - answer := lazyOR(bitmaps[0], bitmaps[1]) - for _, bm := range bitmaps[2:] { - answer = answer.lazyOR(bm) - } - // here is where repairAfterLazy is called. - answer.repairAfterLazy() - return answer -} - -// HeapOr computes the union between many bitmaps quickly using a heap. -// It might be faster than calling Or repeatedly. -func HeapOr(bitmaps ...*Bitmap) *Bitmap { - if len(bitmaps) == 0 { - return NewBitmap() - } - // TODO: for better speed, we could do the operation lazily, see Java implementation - pq := make(priorityQueue, len(bitmaps)) - for i, bm := range bitmaps { - pq[i] = &item{bm, i} - } - heap.Init(&pq) - - for pq.Len() > 1 { - x1 := heap.Pop(&pq).(*item) - x2 := heap.Pop(&pq).(*item) - heap.Push(&pq, &item{Or(x1.value, x2.value), 0}) - } - return heap.Pop(&pq).(*item).value -} - -// HeapXor computes the symmetric difference between many bitmaps quickly (as opposed to calling Xor repeated). -// Internally, this function uses a heap. -// It might be faster than calling Xor repeatedly. 
-func HeapXor(bitmaps ...*Bitmap) *Bitmap { - if len(bitmaps) == 0 { - return NewBitmap() - } - - pq := make(priorityQueue, len(bitmaps)) - for i, bm := range bitmaps { - pq[i] = &item{bm, i} - } - heap.Init(&pq) - - for pq.Len() > 1 { - x1 := heap.Pop(&pq).(*item) - x2 := heap.Pop(&pq).(*item) - heap.Push(&pq, &item{Xor(x1.value, x2.value), 0}) - } - return heap.Pop(&pq).(*item).value -} - -// AndAny provides a result equivalent to x1.And(FastOr(bitmaps)). -// It's optimized to minimize allocations. It also might be faster than separate calls. -func (x1 *Bitmap) AndAny(bitmaps ...*Bitmap) { - if len(bitmaps) == 0 { - return - } else if len(bitmaps) == 1 { - x1.And(bitmaps[0]) - return - } - - type withPos struct { - bitmap *roaringArray - pos int - key uint16 - } - filters := make([]withPos, 0, len(bitmaps)) - - for _, b := range bitmaps { - if b.highlowcontainer.size() > 0 { - filters = append(filters, withPos{ - bitmap: &b.highlowcontainer, - pos: 0, - key: b.highlowcontainer.getKeyAtIndex(0), - }) - } - } - - basePos := 0 - intersections := 0 - keyContainers := make([]container, 0, len(filters)) - var ( - tmpArray *arrayContainer - tmpBitmap *bitmapContainer - minNextKey uint16 - ) - - for basePos < x1.highlowcontainer.size() && len(filters) > 0 { - baseKey := x1.highlowcontainer.getKeyAtIndex(basePos) - - // accumulate containers for current key, find next minimal key in filters - // and exclude filters that do not have related values anymore - i := 0 - maxPossibleOr := 0 - minNextKey = MaxUint16 - for _, f := range filters { - if f.key < baseKey { - f.pos = f.bitmap.advanceUntil(baseKey, f.pos) - if f.pos == f.bitmap.size() { - continue - } - f.key = f.bitmap.getKeyAtIndex(f.pos) - } - - if f.key == baseKey { - cont := f.bitmap.getContainerAtIndex(f.pos) - keyContainers = append(keyContainers, cont) - maxPossibleOr += cont.getCardinality() - - f.pos++ - if f.pos == f.bitmap.size() { - continue - } - f.key = f.bitmap.getKeyAtIndex(f.pos) - } - - minNextKey = 
minOfUint16(minNextKey, f.key) - filters[i] = f - i++ - } - filters = filters[:i] - - if len(keyContainers) == 0 { - basePos = x1.highlowcontainer.advanceUntil(minNextKey, basePos) - continue - } - - var ored container - - if len(keyContainers) == 1 { - ored = keyContainers[0] - } else { - //TODO: special case for run containers? - if maxPossibleOr > arrayDefaultMaxSize { - if tmpBitmap == nil { - tmpBitmap = newBitmapContainer() - } - tmpBitmap.resetTo(keyContainers[0]) - ored = tmpBitmap - } else { - if tmpArray == nil { - tmpArray = newArrayContainerCapacity(maxPossibleOr) - } - tmpArray.realloc(maxPossibleOr) - tmpArray.resetTo(keyContainers[0]) - ored = tmpArray - } - for _, c := range keyContainers[1:] { - ored = ored.ior(c) - } - } - - result := x1.highlowcontainer.getWritableContainerAtIndex(basePos).iand(ored) - if !result.isEmpty() { - x1.highlowcontainer.replaceKeyAndContainerAtIndex(intersections, baseKey, result, false) - intersections++ - } - - keyContainers = keyContainers[:0] - basePos = x1.highlowcontainer.advanceUntil(minNextKey, basePos) - } - - x1.highlowcontainer.resize(intersections) -} diff --git a/vendor/github.com/RoaringBitmap/roaring/internal/byte_input.go b/vendor/github.com/RoaringBitmap/roaring/internal/byte_input.go deleted file mode 100644 index d5ebb91..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/internal/byte_input.go +++ /dev/null @@ -1,215 +0,0 @@ -package internal - -import ( - "encoding/binary" - "io" -) - -// ByteInput typed interface around io.Reader or raw bytes -type ByteInput interface { - // Next returns a slice containing the next n bytes from the buffer, - // advancing the buffer as if the bytes had been returned by Read. - Next(n int) ([]byte, error) - // NextReturnsSafeSlice returns true if Next() returns a safe slice as opposed - // to a slice that points to an underlying buffer possibly owned by another system. 
- // When NextReturnsSafeSlice returns false, the result from Next() should be copied - // before it is modified (i.e., it is immutable). - NextReturnsSafeSlice() bool - // ReadUInt32 reads uint32 with LittleEndian order - ReadUInt32() (uint32, error) - // ReadUInt16 reads uint16 with LittleEndian order - ReadUInt16() (uint16, error) - // GetReadBytes returns read bytes - GetReadBytes() int64 - // SkipBytes skips exactly n bytes - SkipBytes(n int) error -} - -// NewByteInputFromReader creates reader wrapper -func NewByteInputFromReader(reader io.Reader) ByteInput { - return &ByteInputAdapter{ - r: reader, - readBytes: 0, - } -} - -// NewByteInput creates raw bytes wrapper -func NewByteInput(buf []byte) ByteInput { - return &ByteBuffer{ - buf: buf, - off: 0, - } -} - -// ByteBuffer raw bytes wrapper -type ByteBuffer struct { - buf []byte - off int -} - -// NewByteBuffer creates a new ByteBuffer. -func NewByteBuffer(buf []byte) *ByteBuffer { - return &ByteBuffer{ - buf: buf, - } -} - -var _ io.Reader = (*ByteBuffer)(nil) - -// Read implements io.Reader. -func (b *ByteBuffer) Read(p []byte) (int, error) { - data, err := b.Next(len(p)) - if err != nil { - return 0, err - } - copy(p, data) - return len(data), nil -} - -// Next returns a slice containing the next n bytes from the reader -// If there are fewer bytes than the given n, io.ErrUnexpectedEOF will be returned -func (b *ByteBuffer) Next(n int) ([]byte, error) { - m := len(b.buf) - b.off - - if n > m { - return nil, io.ErrUnexpectedEOF - } - - data := b.buf[b.off : b.off+n] - b.off += n - - return data, nil -} - -// NextReturnsSafeSlice returns false since ByteBuffer might hold -// an array owned by some other systems. 
-func (b *ByteBuffer) NextReturnsSafeSlice() bool { - return false -} - -// ReadUInt32 reads uint32 with LittleEndian order -func (b *ByteBuffer) ReadUInt32() (uint32, error) { - if len(b.buf)-b.off < 4 { - return 0, io.ErrUnexpectedEOF - } - - v := binary.LittleEndian.Uint32(b.buf[b.off:]) - b.off += 4 - - return v, nil -} - -// ReadUInt16 reads uint16 with LittleEndian order -func (b *ByteBuffer) ReadUInt16() (uint16, error) { - if len(b.buf)-b.off < 2 { - return 0, io.ErrUnexpectedEOF - } - - v := binary.LittleEndian.Uint16(b.buf[b.off:]) - b.off += 2 - - return v, nil -} - -// GetReadBytes returns read bytes -func (b *ByteBuffer) GetReadBytes() int64 { - return int64(b.off) -} - -// SkipBytes skips exactly n bytes -func (b *ByteBuffer) SkipBytes(n int) error { - m := len(b.buf) - b.off - - if n > m { - return io.ErrUnexpectedEOF - } - - b.off += n - - return nil -} - -// Reset resets the given buffer with a new byte slice -func (b *ByteBuffer) Reset(buf []byte) { - b.buf = buf - b.off = 0 -} - -// ByteInputAdapter reader wrapper -type ByteInputAdapter struct { - r io.Reader - readBytes int - buf [4]byte -} - -var _ io.Reader = (*ByteInputAdapter)(nil) - -// Read implements io.Reader. -func (b *ByteInputAdapter) Read(buf []byte) (int, error) { - m, err := io.ReadAtLeast(b.r, buf, len(buf)) - b.readBytes += m - - if err != nil { - return 0, err - } - - return m, nil -} - -// Next returns a slice containing the next n bytes from the buffer, -// advancing the buffer as if the bytes had been returned by Read. -func (b *ByteInputAdapter) Next(n int) ([]byte, error) { - buf := make([]byte, n) - _, err := b.Read(buf) - - if err != nil { - return nil, err - } - return buf, nil -} - -// NextReturnsSafeSlice returns true since ByteInputAdapter always returns a slice -// allocated with make([]byte, ...) 
-func (b *ByteInputAdapter) NextReturnsSafeSlice() bool { - return true -} - -// ReadUInt32 reads uint32 with LittleEndian order -func (b *ByteInputAdapter) ReadUInt32() (uint32, error) { - buf := b.buf[:4] - _, err := b.Read(buf) - if err != nil { - return 0, err - } - - return binary.LittleEndian.Uint32(buf), nil -} - -// ReadUInt16 reads uint16 with LittleEndian order -func (b *ByteInputAdapter) ReadUInt16() (uint16, error) { - buf := b.buf[:2] - _, err := b.Read(buf) - if err != nil { - return 0, err - } - - return binary.LittleEndian.Uint16(buf), nil -} - -// GetReadBytes returns read bytes -func (b *ByteInputAdapter) GetReadBytes() int64 { - return int64(b.readBytes) -} - -// SkipBytes skips exactly n bytes -func (b *ByteInputAdapter) SkipBytes(n int) error { - _, err := b.Next(n) - - return err -} - -// Reset resets the given buffer with a new stream -func (b *ByteInputAdapter) Reset(stream io.Reader) { - b.r = stream - b.readBytes = 0 -} diff --git a/vendor/github.com/RoaringBitmap/roaring/internal/pools.go b/vendor/github.com/RoaringBitmap/roaring/internal/pools.go deleted file mode 100644 index d258356..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/internal/pools.go +++ /dev/null @@ -1,21 +0,0 @@ -package internal - -import ( - "sync" -) - -var ( - // ByteInputAdapterPool shared pool - ByteInputAdapterPool = sync.Pool{ - New: func() interface{} { - return &ByteInputAdapter{} - }, - } - - // ByteBufferPool shared pool - ByteBufferPool = sync.Pool{ - New: func() interface{} { - return &ByteBuffer{} - }, - } -) diff --git a/vendor/github.com/RoaringBitmap/roaring/manyiterator.go b/vendor/github.com/RoaringBitmap/roaring/manyiterator.go deleted file mode 100644 index eaa5b79..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/manyiterator.go +++ /dev/null @@ -1,32 +0,0 @@ -package roaring - -type manyIterable interface { - nextMany(hs uint32, buf []uint32) int - nextMany64(hs uint64, buf []uint64) int -} - -func (si *shortIterator) nextMany(hs 
uint32, buf []uint32) int { - n := 0 - l := si.loc - s := si.slice - for n < len(buf) && l < len(s) { - buf[n] = uint32(s[l]) | hs - l++ - n++ - } - si.loc = l - return n -} - -func (si *shortIterator) nextMany64(hs uint64, buf []uint64) int { - n := 0 - l := si.loc - s := si.slice - for n < len(buf) && l < len(s) { - buf[n] = uint64(s[l]) | hs - l++ - n++ - } - si.loc = l - return n -} diff --git a/vendor/github.com/RoaringBitmap/roaring/parallel.go b/vendor/github.com/RoaringBitmap/roaring/parallel.go deleted file mode 100644 index 9208e3e..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/parallel.go +++ /dev/null @@ -1,612 +0,0 @@ -package roaring - -import ( - "container/heap" - "fmt" - "runtime" - "sync" -) - -var defaultWorkerCount = runtime.NumCPU() - -type bitmapContainerKey struct { - key uint16 - idx int - bitmap *Bitmap -} - -type multipleContainers struct { - key uint16 - containers []container - idx int -} - -type keyedContainer struct { - key uint16 - container container - idx int -} - -type bitmapContainerHeap []bitmapContainerKey - -func (h bitmapContainerHeap) Len() int { return len(h) } -func (h bitmapContainerHeap) Less(i, j int) bool { return h[i].key < h[j].key } -func (h bitmapContainerHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } - -func (h *bitmapContainerHeap) Push(x interface{}) { - // Push and Pop use pointer receivers because they modify the slice's length, - // not just its contents. 
- *h = append(*h, x.(bitmapContainerKey)) -} - -func (h *bitmapContainerHeap) Pop() interface{} { - old := *h - n := len(old) - x := old[n-1] - *h = old[0 : n-1] - return x -} - -func (h bitmapContainerHeap) Peek() bitmapContainerKey { - return h[0] -} - -func (h *bitmapContainerHeap) popIncrementing() (key uint16, container container) { - k := h.Peek() - key = k.key - container = k.bitmap.highlowcontainer.containers[k.idx] - - newIdx := k.idx + 1 - if newIdx < k.bitmap.highlowcontainer.size() { - k = bitmapContainerKey{ - k.bitmap.highlowcontainer.keys[newIdx], - newIdx, - k.bitmap, - } - (*h)[0] = k - heap.Fix(h, 0) - } else { - heap.Pop(h) - } - - return -} - -func (h *bitmapContainerHeap) Next(containers []container) multipleContainers { - if h.Len() == 0 { - return multipleContainers{} - } - - key, container := h.popIncrementing() - containers = append(containers, container) - - for h.Len() > 0 && key == h.Peek().key { - _, container = h.popIncrementing() - containers = append(containers, container) - } - - return multipleContainers{ - key, - containers, - -1, - } -} - -func newBitmapContainerHeap(bitmaps ...*Bitmap) bitmapContainerHeap { - // Initialize heap - var h bitmapContainerHeap = make([]bitmapContainerKey, 0, len(bitmaps)) - for _, bitmap := range bitmaps { - if !bitmap.IsEmpty() { - key := bitmapContainerKey{ - bitmap.highlowcontainer.keys[0], - 0, - bitmap, - } - h = append(h, key) - } - } - - heap.Init(&h) - - return h -} - -func repairAfterLazy(c container) container { - switch t := c.(type) { - case *bitmapContainer: - if t.cardinality == invalidCardinality { - t.computeCardinality() - } - - if t.getCardinality() <= arrayDefaultMaxSize { - return t.toArrayContainer() - } else if c.(*bitmapContainer).isFull() { - return newRunContainer16Range(0, MaxUint16) - } - } - - return c -} - -func toBitmapContainer(c container) container { - switch t := c.(type) { - case *arrayContainer: - return t.toBitmapContainer() - case *runContainer16: - if 
!t.isFull() { - return t.toBitmapContainer() - } - } - return c -} - -func appenderRoutine(bitmapChan chan<- *Bitmap, resultChan <-chan keyedContainer, expectedKeysChan <-chan int) { - expectedKeys := -1 - appendedKeys := 0 - var keys []uint16 - var containers []container - for appendedKeys != expectedKeys { - select { - case item := <-resultChan: - if len(keys) <= item.idx { - keys = append(keys, make([]uint16, item.idx-len(keys)+1)...) - containers = append(containers, make([]container, item.idx-len(containers)+1)...) - } - keys[item.idx] = item.key - containers[item.idx] = item.container - - appendedKeys++ - case msg := <-expectedKeysChan: - expectedKeys = msg - } - } - answer := &Bitmap{ - roaringArray{ - make([]uint16, 0, expectedKeys), - make([]container, 0, expectedKeys), - make([]bool, 0, expectedKeys), - false, - }, - } - for i := range keys { - if containers[i] != nil { // in case a resulting container was empty, see ParAnd function - answer.highlowcontainer.appendContainer(keys[i], containers[i], false) - } - } - - bitmapChan <- answer -} - -// ParHeapOr computes the union (OR) of all provided bitmaps in parallel, -// where the parameter "parallelism" determines how many workers are to be used -// (if it is set to 0, a default number of workers is chosen) -// ParHeapOr uses a heap to compute the union. For rare cases it might be faster than ParOr -func ParHeapOr(parallelism int, bitmaps ...*Bitmap) *Bitmap { - - bitmapCount := len(bitmaps) - if bitmapCount == 0 { - return NewBitmap() - } else if bitmapCount == 1 { - return bitmaps[0].Clone() - } - - if parallelism == 0 { - parallelism = defaultWorkerCount - } - - h := newBitmapContainerHeap(bitmaps...) 
- - bitmapChan := make(chan *Bitmap) - inputChan := make(chan multipleContainers, 128) - resultChan := make(chan keyedContainer, 32) - expectedKeysChan := make(chan int) - - pool := sync.Pool{ - New: func() interface{} { - return make([]container, 0, len(bitmaps)) - }, - } - - orFunc := func() { - // Assumes only structs with >=2 containers are passed - for input := range inputChan { - c := toBitmapContainer(input.containers[0]).lazyOR(input.containers[1]) - for _, next := range input.containers[2:] { - c = c.lazyIOR(next) - } - c = repairAfterLazy(c) - kx := keyedContainer{ - input.key, - c, - input.idx, - } - resultChan <- kx - pool.Put(input.containers[:0]) - } - } - - go appenderRoutine(bitmapChan, resultChan, expectedKeysChan) - - for i := 0; i < parallelism; i++ { - go orFunc() - } - - idx := 0 - for h.Len() > 0 { - ck := h.Next(pool.Get().([]container)) - if len(ck.containers) == 1 { - resultChan <- keyedContainer{ - ck.key, - ck.containers[0], - idx, - } - pool.Put(ck.containers[:0]) - } else { - ck.idx = idx - inputChan <- ck - } - idx++ - } - expectedKeysChan <- idx - - bitmap := <-bitmapChan - - close(inputChan) - close(resultChan) - close(expectedKeysChan) - - return bitmap -} - -// ParAnd computes the intersection (AND) of all provided bitmaps in parallel, -// where the parameter "parallelism" determines how many workers are to be used -// (if it is set to 0, a default number of workers is chosen) -func ParAnd(parallelism int, bitmaps ...*Bitmap) *Bitmap { - bitmapCount := len(bitmaps) - if bitmapCount == 0 { - return NewBitmap() - } else if bitmapCount == 1 { - return bitmaps[0].Clone() - } - - if parallelism == 0 { - parallelism = defaultWorkerCount - } - - h := newBitmapContainerHeap(bitmaps...) 
- - bitmapChan := make(chan *Bitmap) - inputChan := make(chan multipleContainers, 128) - resultChan := make(chan keyedContainer, 32) - expectedKeysChan := make(chan int) - - andFunc := func() { - // Assumes only structs with >=2 containers are passed - for input := range inputChan { - c := input.containers[0].and(input.containers[1]) - for _, next := range input.containers[2:] { - if c.isEmpty() { - break - } - c = c.iand(next) - } - - // Send a nil explicitly if the result of the intersection is an empty container - if c.isEmpty() { - c = nil - } - - kx := keyedContainer{ - input.key, - c, - input.idx, - } - resultChan <- kx - } - } - - go appenderRoutine(bitmapChan, resultChan, expectedKeysChan) - - for i := 0; i < parallelism; i++ { - go andFunc() - } - - idx := 0 - for h.Len() > 0 { - ck := h.Next(make([]container, 0, 4)) - if len(ck.containers) == bitmapCount { - ck.idx = idx - inputChan <- ck - idx++ - } - } - expectedKeysChan <- idx - - bitmap := <-bitmapChan - - close(inputChan) - close(resultChan) - close(expectedKeysChan) - - return bitmap -} - -// ParOr computes the union (OR) of all provided bitmaps in parallel, -// where the parameter "parallelism" determines how many workers are to be used -// (if it is set to 0, a default number of workers is chosen) -func ParOr(parallelism int, bitmaps ...*Bitmap) *Bitmap { - var lKey uint16 = MaxUint16 - var hKey uint16 - - bitmapsFiltered := bitmaps[:0] - for _, b := range bitmaps { - if !b.IsEmpty() { - bitmapsFiltered = append(bitmapsFiltered, b) - } - } - bitmaps = bitmapsFiltered - - for _, b := range bitmaps { - lKey = minOfUint16(lKey, b.highlowcontainer.keys[0]) - hKey = maxOfUint16(hKey, b.highlowcontainer.keys[b.highlowcontainer.size()-1]) - } - - if lKey == MaxUint16 && hKey == 0 { - return New() - } else if len(bitmaps) == 1 { - return bitmaps[0].Clone() - } - - keyRange := int(hKey) - int(lKey) + 1 - if keyRange == 1 { - // revert to FastOr. 
Since the key range is 0 - // no container-level aggregation parallelism is achievable - return FastOr(bitmaps...) - } - - if parallelism == 0 { - parallelism = defaultWorkerCount - } - - var chunkSize int - var chunkCount int - if parallelism*4 > int(keyRange) { - chunkSize = 1 - chunkCount = int(keyRange) - } else { - chunkCount = parallelism * 4 - chunkSize = (int(keyRange) + chunkCount - 1) / chunkCount - } - - if chunkCount*chunkSize < int(keyRange) { - // it's fine to panic to indicate an implementation error - panic(fmt.Sprintf("invariant check failed: chunkCount * chunkSize < keyRange, %d * %d < %d", chunkCount, chunkSize, keyRange)) - } - - chunks := make([]*roaringArray, chunkCount) - - chunkSpecChan := make(chan parChunkSpec, minOfInt(maxOfInt(64, 2*parallelism), int(chunkCount))) - chunkChan := make(chan parChunk, minOfInt(32, int(chunkCount))) - - orFunc := func() { - for spec := range chunkSpecChan { - ra := lazyOrOnRange(&bitmaps[0].highlowcontainer, &bitmaps[1].highlowcontainer, spec.start, spec.end) - for _, b := range bitmaps[2:] { - ra = lazyIOrOnRange(ra, &b.highlowcontainer, spec.start, spec.end) - } - - for i, c := range ra.containers { - ra.containers[i] = repairAfterLazy(c) - } - - chunkChan <- parChunk{ra, spec.idx} - } - } - - for i := 0; i < parallelism; i++ { - go orFunc() - } - - go func() { - for i := 0; i < chunkCount; i++ { - spec := parChunkSpec{ - start: uint16(int(lKey) + i*chunkSize), - end: uint16(minOfInt(int(lKey)+(i+1)*chunkSize-1, int(hKey))), - idx: int(i), - } - chunkSpecChan <- spec - } - }() - - chunksRemaining := chunkCount - for chunk := range chunkChan { - chunks[chunk.idx] = chunk.ra - chunksRemaining-- - if chunksRemaining == 0 { - break - } - } - close(chunkChan) - close(chunkSpecChan) - - containerCount := 0 - for _, chunk := range chunks { - containerCount += chunk.size() - } - - result := Bitmap{ - roaringArray{ - containers: make([]container, containerCount), - keys: make([]uint16, containerCount), - 
needCopyOnWrite: make([]bool, containerCount), - }, - } - - resultOffset := 0 - for _, chunk := range chunks { - copy(result.highlowcontainer.containers[resultOffset:], chunk.containers) - copy(result.highlowcontainer.keys[resultOffset:], chunk.keys) - copy(result.highlowcontainer.needCopyOnWrite[resultOffset:], chunk.needCopyOnWrite) - resultOffset += chunk.size() - } - - return &result -} - -type parChunkSpec struct { - start uint16 - end uint16 - idx int -} - -type parChunk struct { - ra *roaringArray - idx int -} - -func (c parChunk) size() int { - return c.ra.size() -} - -func parNaiveStartAt(ra *roaringArray, start uint16, last uint16) int { - for idx, key := range ra.keys { - if key >= start && key <= last { - return idx - } else if key > last { - break - } - } - return ra.size() -} - -func lazyOrOnRange(ra1, ra2 *roaringArray, start, last uint16) *roaringArray { - answer := newRoaringArray() - length1 := ra1.size() - length2 := ra2.size() - - idx1 := parNaiveStartAt(ra1, start, last) - idx2 := parNaiveStartAt(ra2, start, last) - - var key1 uint16 - var key2 uint16 - if idx1 < length1 && idx2 < length2 { - key1 = ra1.getKeyAtIndex(idx1) - key2 = ra2.getKeyAtIndex(idx2) - - for key1 <= last && key2 <= last { - - if key1 < key2 { - answer.appendCopy(*ra1, idx1) - idx1++ - if idx1 == length1 { - break - } - key1 = ra1.getKeyAtIndex(idx1) - } else if key1 > key2 { - answer.appendCopy(*ra2, idx2) - idx2++ - if idx2 == length2 { - break - } - key2 = ra2.getKeyAtIndex(idx2) - } else { - c1 := ra1.getFastContainerAtIndex(idx1, false) - - answer.appendContainer(key1, c1.lazyOR(ra2.getContainerAtIndex(idx2)), false) - idx1++ - idx2++ - if idx1 == length1 || idx2 == length2 { - break - } - - key1 = ra1.getKeyAtIndex(idx1) - key2 = ra2.getKeyAtIndex(idx2) - } - } - } - - if idx2 < length2 { - key2 = ra2.getKeyAtIndex(idx2) - for key2 <= last { - answer.appendCopy(*ra2, idx2) - idx2++ - if idx2 == length2 { - break - } - key2 = ra2.getKeyAtIndex(idx2) - } - } - - if idx1 
< length1 { - key1 = ra1.getKeyAtIndex(idx1) - for key1 <= last { - answer.appendCopy(*ra1, idx1) - idx1++ - if idx1 == length1 { - break - } - key1 = ra1.getKeyAtIndex(idx1) - } - } - return answer -} - -func lazyIOrOnRange(ra1, ra2 *roaringArray, start, last uint16) *roaringArray { - length1 := ra1.size() - length2 := ra2.size() - - idx1 := 0 - idx2 := parNaiveStartAt(ra2, start, last) - - var key1 uint16 - var key2 uint16 - if idx1 < length1 && idx2 < length2 { - key1 = ra1.getKeyAtIndex(idx1) - key2 = ra2.getKeyAtIndex(idx2) - - for key1 <= last && key2 <= last { - if key1 < key2 { - idx1++ - if idx1 >= length1 { - break - } - key1 = ra1.getKeyAtIndex(idx1) - } else if key1 > key2 { - ra1.insertNewKeyValueAt(idx1, key2, ra2.getContainerAtIndex(idx2)) - ra1.needCopyOnWrite[idx1] = true - idx2++ - idx1++ - length1++ - if idx2 >= length2 { - break - } - key2 = ra2.getKeyAtIndex(idx2) - } else { - c1 := ra1.getFastContainerAtIndex(idx1, true) - - ra1.containers[idx1] = c1.lazyIOR(ra2.getContainerAtIndex(idx2)) - ra1.needCopyOnWrite[idx1] = false - idx1++ - idx2++ - if idx1 >= length1 || idx2 >= length2 { - break - } - - key1 = ra1.getKeyAtIndex(idx1) - key2 = ra2.getKeyAtIndex(idx2) - } - } - } - if idx2 < length2 { - key2 = ra2.getKeyAtIndex(idx2) - for key2 <= last { - ra1.appendCopy(*ra2, idx2) - idx2++ - if idx2 >= length2 { - break - } - key2 = ra2.getKeyAtIndex(idx2) - } - } - return ra1 -} diff --git a/vendor/github.com/RoaringBitmap/roaring/popcnt.go b/vendor/github.com/RoaringBitmap/roaring/popcnt.go deleted file mode 100644 index b4980aa..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/popcnt.go +++ /dev/null @@ -1,13 +0,0 @@ -//go:build go1.9 -// +build go1.9 - -// "go1.9", from Go version 1.9 onward -// See https://golang.org/pkg/go/build/#hdr-Build_Constraints - -package roaring - -import "math/bits" - -func popcount(x uint64) uint64 { - return uint64(bits.OnesCount64(x)) -} diff --git a/vendor/github.com/RoaringBitmap/roaring/popcnt_amd64.s 
b/vendor/github.com/RoaringBitmap/roaring/popcnt_amd64.s deleted file mode 100644 index 1f13fa2..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/popcnt_amd64.s +++ /dev/null @@ -1,103 +0,0 @@ -// +build amd64,!appengine,!go1.9 - -TEXT ·hasAsm(SB),4,$0-1 -MOVQ $1, AX -CPUID -SHRQ $23, CX -ANDQ $1, CX -MOVB CX, ret+0(FP) -RET - -#define POPCNTQ_DX_DX BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xb8; BYTE $0xd2 - -TEXT ·popcntSliceAsm(SB),4,$0-32 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntSliceEnd -popcntSliceLoop: -BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xb8; BYTE $0x16 // POPCNTQ (SI), DX -ADDQ DX, AX -ADDQ $8, SI -LOOP popcntSliceLoop -popcntSliceEnd: -MOVQ AX, ret+24(FP) -RET - -TEXT ·popcntMaskSliceAsm(SB),4,$0-56 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntMaskSliceEnd -MOVQ m+24(FP), DI -popcntMaskSliceLoop: -MOVQ (DI), DX -NOTQ DX -ANDQ (SI), DX -POPCNTQ_DX_DX -ADDQ DX, AX -ADDQ $8, SI -ADDQ $8, DI -LOOP popcntMaskSliceLoop -popcntMaskSliceEnd: -MOVQ AX, ret+48(FP) -RET - -TEXT ·popcntAndSliceAsm(SB),4,$0-56 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntAndSliceEnd -MOVQ m+24(FP), DI -popcntAndSliceLoop: -MOVQ (DI), DX -ANDQ (SI), DX -POPCNTQ_DX_DX -ADDQ DX, AX -ADDQ $8, SI -ADDQ $8, DI -LOOP popcntAndSliceLoop -popcntAndSliceEnd: -MOVQ AX, ret+48(FP) -RET - -TEXT ·popcntOrSliceAsm(SB),4,$0-56 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntOrSliceEnd -MOVQ m+24(FP), DI -popcntOrSliceLoop: -MOVQ (DI), DX -ORQ (SI), DX -POPCNTQ_DX_DX -ADDQ DX, AX -ADDQ $8, SI -ADDQ $8, DI -LOOP popcntOrSliceLoop -popcntOrSliceEnd: -MOVQ AX, ret+48(FP) -RET - -TEXT ·popcntXorSliceAsm(SB),4,$0-56 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntXorSliceEnd -MOVQ m+24(FP), DI -popcntXorSliceLoop: -MOVQ (DI), DX -XORQ (SI), DX -POPCNTQ_DX_DX -ADDQ DX, AX -ADDQ $8, SI -ADDQ $8, DI -LOOP popcntXorSliceLoop 
-popcntXorSliceEnd: -MOVQ AX, ret+48(FP) -RET diff --git a/vendor/github.com/RoaringBitmap/roaring/popcnt_asm.go b/vendor/github.com/RoaringBitmap/roaring/popcnt_asm.go deleted file mode 100644 index ba2dac9..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/popcnt_asm.go +++ /dev/null @@ -1,68 +0,0 @@ -//go:build amd64 && !appengine && !go1.9 -// +build amd64,!appengine,!go1.9 - -package roaring - -// *** the following functions are defined in popcnt_amd64.s - -//go:noescape - -func hasAsm() bool - -// useAsm is a flag used to select the GO or ASM implementation of the popcnt function -var useAsm = hasAsm() - -//go:noescape - -func popcntSliceAsm(s []uint64) uint64 - -//go:noescape - -func popcntMaskSliceAsm(s, m []uint64) uint64 - -//go:noescape - -func popcntAndSliceAsm(s, m []uint64) uint64 - -//go:noescape - -func popcntOrSliceAsm(s, m []uint64) uint64 - -//go:noescape - -func popcntXorSliceAsm(s, m []uint64) uint64 - -func popcntSlice(s []uint64) uint64 { - if useAsm { - return popcntSliceAsm(s) - } - return popcntSliceGo(s) -} - -func popcntMaskSlice(s, m []uint64) uint64 { - if useAsm { - return popcntMaskSliceAsm(s, m) - } - return popcntMaskSliceGo(s, m) -} - -func popcntAndSlice(s, m []uint64) uint64 { - if useAsm { - return popcntAndSliceAsm(s, m) - } - return popcntAndSliceGo(s, m) -} - -func popcntOrSlice(s, m []uint64) uint64 { - if useAsm { - return popcntOrSliceAsm(s, m) - } - return popcntOrSliceGo(s, m) -} - -func popcntXorSlice(s, m []uint64) uint64 { - if useAsm { - return popcntXorSliceAsm(s, m) - } - return popcntXorSliceGo(s, m) -} diff --git a/vendor/github.com/RoaringBitmap/roaring/popcnt_compat.go b/vendor/github.com/RoaringBitmap/roaring/popcnt_compat.go deleted file mode 100644 index 5933e52..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/popcnt_compat.go +++ /dev/null @@ -1,18 +0,0 @@ -//go:build !go1.9 -// +build !go1.9 - -package roaring - -// bit population count, take from -// 
https://code.google.com/p/go/issues/detail?id=4988#c11 -// credit: https://code.google.com/u/arnehormann/ -// credit: https://play.golang.org/p/U7SogJ7psJ -// credit: http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel -func popcount(x uint64) uint64 { - x -= (x >> 1) & 0x5555555555555555 - x = (x>>2)&0x3333333333333333 + x&0x3333333333333333 - x += x >> 4 - x &= 0x0f0f0f0f0f0f0f0f - x *= 0x0101010101010101 - return x >> 56 -} diff --git a/vendor/github.com/RoaringBitmap/roaring/popcnt_generic.go b/vendor/github.com/RoaringBitmap/roaring/popcnt_generic.go deleted file mode 100644 index 4ae6d5a..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/popcnt_generic.go +++ /dev/null @@ -1,24 +0,0 @@ -//go:build !amd64 || appengine || go1.9 -// +build !amd64 appengine go1.9 - -package roaring - -func popcntSlice(s []uint64) uint64 { - return popcntSliceGo(s) -} - -func popcntMaskSlice(s, m []uint64) uint64 { - return popcntMaskSliceGo(s, m) -} - -func popcntAndSlice(s, m []uint64) uint64 { - return popcntAndSliceGo(s, m) -} - -func popcntOrSlice(s, m []uint64) uint64 { - return popcntOrSliceGo(s, m) -} - -func popcntXorSlice(s, m []uint64) uint64 { - return popcntXorSliceGo(s, m) -} diff --git a/vendor/github.com/RoaringBitmap/roaring/popcnt_slices.go b/vendor/github.com/RoaringBitmap/roaring/popcnt_slices.go deleted file mode 100644 index d27c5f3..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/popcnt_slices.go +++ /dev/null @@ -1,41 +0,0 @@ -package roaring - -func popcntSliceGo(s []uint64) uint64 { - cnt := uint64(0) - for _, x := range s { - cnt += popcount(x) - } - return cnt -} - -func popcntMaskSliceGo(s, m []uint64) uint64 { - cnt := uint64(0) - for i := range s { - cnt += popcount(s[i] &^ m[i]) - } - return cnt -} - -func popcntAndSliceGo(s, m []uint64) uint64 { - cnt := uint64(0) - for i := range s { - cnt += popcount(s[i] & m[i]) - } - return cnt -} - -func popcntOrSliceGo(s, m []uint64) uint64 { - cnt := uint64(0) - for i := range 
s { - cnt += popcount(s[i] | m[i]) - } - return cnt -} - -func popcntXorSliceGo(s, m []uint64) uint64 { - cnt := uint64(0) - for i := range s { - cnt += popcount(s[i] ^ m[i]) - } - return cnt -} diff --git a/vendor/github.com/RoaringBitmap/roaring/priorityqueue.go b/vendor/github.com/RoaringBitmap/roaring/priorityqueue.go deleted file mode 100644 index 9259a68..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/priorityqueue.go +++ /dev/null @@ -1,101 +0,0 @@ -package roaring - -import "container/heap" - -///////////// -// The priorityQueue is used to keep Bitmaps sorted. -//////////// - -type item struct { - value *Bitmap - index int -} - -type priorityQueue []*item - -func (pq priorityQueue) Len() int { return len(pq) } - -func (pq priorityQueue) Less(i, j int) bool { - return pq[i].value.GetSizeInBytes() < pq[j].value.GetSizeInBytes() -} - -func (pq priorityQueue) Swap(i, j int) { - pq[i], pq[j] = pq[j], pq[i] - pq[i].index = i - pq[j].index = j -} - -func (pq *priorityQueue) Push(x interface{}) { - n := len(*pq) - item := x.(*item) - item.index = n - *pq = append(*pq, item) -} - -func (pq *priorityQueue) Pop() interface{} { - old := *pq - n := len(old) - item := old[n-1] - item.index = -1 // for safety - *pq = old[0 : n-1] - return item -} - -func (pq *priorityQueue) update(item *item, value *Bitmap) { - item.value = value - heap.Fix(pq, item.index) -} - -///////////// -// The containerPriorityQueue is used to keep the containers of various Bitmaps sorted. 
-//////////// - -type containeritem struct { - value *Bitmap - keyindex int - index int -} - -type containerPriorityQueue []*containeritem - -func (pq containerPriorityQueue) Len() int { return len(pq) } - -func (pq containerPriorityQueue) Less(i, j int) bool { - k1 := pq[i].value.highlowcontainer.getKeyAtIndex(pq[i].keyindex) - k2 := pq[j].value.highlowcontainer.getKeyAtIndex(pq[j].keyindex) - if k1 != k2 { - return k1 < k2 - } - c1 := pq[i].value.highlowcontainer.getContainerAtIndex(pq[i].keyindex) - c2 := pq[j].value.highlowcontainer.getContainerAtIndex(pq[j].keyindex) - - return c1.getCardinality() > c2.getCardinality() -} - -func (pq containerPriorityQueue) Swap(i, j int) { - pq[i], pq[j] = pq[j], pq[i] - pq[i].index = i - pq[j].index = j -} - -func (pq *containerPriorityQueue) Push(x interface{}) { - n := len(*pq) - item := x.(*containeritem) - item.index = n - *pq = append(*pq, item) -} - -func (pq *containerPriorityQueue) Pop() interface{} { - old := *pq - n := len(old) - item := old[n-1] - item.index = -1 // for safety - *pq = old[0 : n-1] - return item -} - -//func (pq *containerPriorityQueue) update(item *containeritem, value *Bitmap, keyindex int) { -// item.value = value -// item.keyindex = keyindex -// heap.Fix(pq, item.index) -//} diff --git a/vendor/github.com/RoaringBitmap/roaring/roaring.go b/vendor/github.com/RoaringBitmap/roaring/roaring.go deleted file mode 100644 index a31cdbd..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/roaring.go +++ /dev/null @@ -1,1918 +0,0 @@ -// Package roaring is an implementation of Roaring Bitmaps in Go. -// They provide fast compressed bitmap data structures (also called bitset). -// They are ideally suited to represent sets of integers over -// relatively small ranges. -// See http://roaringbitmap.org for details. 
-package roaring - -import ( - "bytes" - "encoding/base64" - "fmt" - "io" - "strconv" - - "github.com/RoaringBitmap/roaring/internal" - "github.com/bits-and-blooms/bitset" -) - -// Bitmap represents a compressed bitmap where you can add integers. -type Bitmap struct { - highlowcontainer roaringArray -} - -// ToBase64 serializes a bitmap as Base64 -func (rb *Bitmap) ToBase64() (string, error) { - buf := new(bytes.Buffer) - _, err := rb.WriteTo(buf) - return base64.StdEncoding.EncodeToString(buf.Bytes()), err - -} - -// FromBase64 deserializes a bitmap from Base64 -func (rb *Bitmap) FromBase64(str string) (int64, error) { - data, err := base64.StdEncoding.DecodeString(str) - if err != nil { - return 0, err - } - buf := bytes.NewBuffer(data) - - return rb.ReadFrom(buf) -} - -// WriteTo writes a serialized version of this bitmap to stream. -// The format is compatible with other RoaringBitmap -// implementations (Java, C) and is documented here: -// https://github.com/RoaringBitmap/RoaringFormatSpec -func (rb *Bitmap) WriteTo(stream io.Writer) (int64, error) { - return rb.highlowcontainer.writeTo(stream) -} - -// ToBytes returns an array of bytes corresponding to what is written -// when calling WriteTo -func (rb *Bitmap) ToBytes() ([]byte, error) { - return rb.highlowcontainer.toBytes() -} - -const wordSize = uint64(64) -const log2WordSize = uint64(6) -const capacity = ^uint64(0) -const bitmapContainerSize = (1 << 16) / 64 // bitmap size in words - -// DenseSize returns the size of the bitmap when stored as a dense bitmap. -func (rb *Bitmap) DenseSize() uint64 { - if rb.highlowcontainer.size() == 0 { - return 0 - } - - maximum := 1 + uint64(rb.Maximum()) - if maximum > (capacity - wordSize + 1) { - return uint64(capacity >> log2WordSize) - } - - return uint64((maximum + (wordSize - 1)) >> log2WordSize) -} - -// ToDense returns a slice of uint64s representing the bitmap as a dense bitmap. 
-// Useful to convert a roaring bitmap to a format that can be used by other libraries -// like https://github.com/bits-and-blooms/bitset or https://github.com/kelindar/bitmap -func (rb *Bitmap) ToDense() []uint64 { - sz := rb.DenseSize() - if sz == 0 { - return nil - } - - bitmap := make([]uint64, sz) - rb.WriteDenseTo(bitmap) - return bitmap -} - -// FromDense creates a bitmap from a slice of uint64s representing the bitmap as a dense bitmap. -// Useful to convert bitmaps from libraries like https://github.com/bits-and-blooms/bitset or -// https://github.com/kelindar/bitmap into roaring bitmaps fast and with convenience. -// -// This function will not create any run containers, only array and bitmap containers. It's up to -// the caller to call RunOptimize if they want to further compress the runs of consecutive values. -// -// When doCopy is true, the bitmap is copied into a new slice for each bitmap container. -// This is useful when the bitmap is going to be modified after this function returns or if it's -// undesirable to hold references to large bitmaps which the GC would not be able to collect. -// One copy can still happen even when doCopy is false if the bitmap length is not divisible -// by bitmapContainerSize. -// -// See also FromBitSet. -func FromDense(bitmap []uint64, doCopy bool) *Bitmap { - sz := (len(bitmap) + bitmapContainerSize - 1) / bitmapContainerSize // round up - rb := &Bitmap{ - highlowcontainer: roaringArray{ - containers: make([]container, 0, sz), - keys: make([]uint16, 0, sz), - needCopyOnWrite: make([]bool, 0, sz), - }, - } - rb.FromDense(bitmap, doCopy) - return rb -} - -// FromDense unmarshalls from a slice of uint64s representing the bitmap as a dense bitmap. -// Useful to convert bitmaps from libraries like https://github.com/bits-and-blooms/bitset or -// https://github.com/kelindar/bitmap into roaring bitmaps fast and with convenience. -// Callers are responsible for ensuring that the bitmap is empty before calling this function. 
-// -// This function will not create any run containers, only array and bitmap containers. It is up to -// the caller to call RunOptimize if they want to further compress the runs of consecutive values. -// -// When doCopy is true, the bitmap is copied into a new slice for each bitmap container. -// This is useful when the bitmap is going to be modified after this function returns or if it's -// undesirable to hold references to large bitmaps which the GC would not be able to collect. -// One copy can still happen even when doCopy is false if the bitmap length is not divisible -// by bitmapContainerSize. -// -// See FromBitSet. -func (rb *Bitmap) FromDense(bitmap []uint64, doCopy bool) { - if len(bitmap) == 0 { - return - } - - var k uint16 - const size = bitmapContainerSize - - for len(bitmap) > 0 { - hi := size - if len(bitmap) < size { - hi = len(bitmap) - } - - words := bitmap[:hi] - count := int(popcntSlice(words)) - - switch { - case count > arrayDefaultMaxSize: - c := &bitmapContainer{cardinality: count, bitmap: words} - cow := true - - if doCopy || len(words) < size { - c.bitmap = make([]uint64, size) - copy(c.bitmap, words) - cow = false - } - - rb.highlowcontainer.appendContainer(k, c, cow) - - case count > 0: - c := &arrayContainer{content: make([]uint16, count)} - var pos, base int - for _, w := range words { - for w != 0 { - t := w & -w - c.content[pos] = uint16(base + int(popcount(t-1))) - pos++ - w ^= t - } - base += 64 - } - rb.highlowcontainer.appendContainer(k, c, false) - } - - bitmap = bitmap[hi:] - k++ - } -} - -// WriteDenseTo writes to a slice of uint64s representing the bitmap as a dense bitmap. -// Callers are responsible for allocating enough space in the bitmap using DenseSize. 
-// Useful to convert a roaring bitmap to a format that can be used by other libraries -// like https://github.com/bits-and-blooms/bitset or https://github.com/kelindar/bitmap -func (rb *Bitmap) WriteDenseTo(bitmap []uint64) { - for i, ct := range rb.highlowcontainer.containers { - hb := uint32(rb.highlowcontainer.keys[i]) << 16 - - switch c := ct.(type) { - case *arrayContainer: - for _, x := range c.content { - n := int(hb | uint32(x)) - bitmap[n>>log2WordSize] |= uint64(1) << uint(x%64) - } - - case *bitmapContainer: - copy(bitmap[int(hb)>>log2WordSize:], c.bitmap) - - case *runContainer16: - for j := range c.iv { - start := uint32(c.iv[j].start) - end := start + uint32(c.iv[j].length) + 1 - lo := int(hb|start) >> log2WordSize - hi := int(hb|(end-1)) >> log2WordSize - - if lo == hi { - bitmap[lo] |= (^uint64(0) << uint(start%64)) & - (^uint64(0) >> (uint(-end) % 64)) - continue - } - - bitmap[lo] |= ^uint64(0) << uint(start%64) - for n := lo + 1; n < hi; n++ { - bitmap[n] = ^uint64(0) - } - bitmap[hi] |= ^uint64(0) >> (uint(-end) % 64) - } - default: - panic("unsupported container type") - } - } -} - -// Checksum computes a hash (currently FNV-1a) for a bitmap that is suitable for -// using bitmaps as elements in hash sets or as keys in hash maps, as well as -// generally quicker comparisons. -// The implementation is biased towards efficiency in little endian machines, so -// expect some extra CPU cycles and memory to be used if your machine is big endian. -// Likewise, do not use this to verify integrity unless you are certain you will load -// the bitmap on a machine with the same endianess used to create it. (Thankfully -// very few people use big endian machines these days.) 
-func (rb *Bitmap) Checksum() uint64 { - const ( - offset = 14695981039346656037 - prime = 1099511628211 - ) - - var bytes []byte - - hash := uint64(offset) - - bytes = uint16SliceAsByteSlice(rb.highlowcontainer.keys) - - for _, b := range bytes { - hash ^= uint64(b) - hash *= prime - } - - for _, c := range rb.highlowcontainer.containers { - // 0 separator - hash ^= 0 - hash *= prime - - switch c := c.(type) { - case *bitmapContainer: - bytes = uint64SliceAsByteSlice(c.bitmap) - case *arrayContainer: - bytes = uint16SliceAsByteSlice(c.content) - case *runContainer16: - bytes = interval16SliceAsByteSlice(c.iv) - default: - panic("invalid container type") - } - - if len(bytes) == 0 { - panic("empty containers are not supported") - } - - for _, b := range bytes { - hash ^= uint64(b) - hash *= prime - } - } - - return hash -} - -// FromUnsafeBytes reads a serialized version of this bitmap from the byte buffer without copy. -// It is the caller's responsibility to ensure that the input data is not modified and remains valid for the entire lifetime of this bitmap. -// This method avoids small allocations but holds references to the input data buffer. It is GC-friendly, but it may consume more memory eventually. -// The containers in the resulting bitmap are immutable containers tied to the provided byte array and they rely on -// copy-on-write which means that modifying them creates copies. Thus FromUnsafeBytes is more likely to be appropriate for read-only use cases, -// when the resulting bitmap can be considered immutable. -// -// See also the FromBuffer function. -// See https://github.com/RoaringBitmap/roaring/pull/395 for more details. -func (rb *Bitmap) FromUnsafeBytes(data []byte, cookieHeader ...byte) (p int64, err error) { - stream := internal.NewByteBuffer(data) - return rb.ReadFrom(stream) -} - -// ReadFrom reads a serialized version of this bitmap from stream. 
-// The format is compatible with other RoaringBitmap -// implementations (Java, C) and is documented here: -// https://github.com/RoaringBitmap/RoaringFormatSpec -// Since io.Reader is regarded as a stream and cannot be read twice. -// So add cookieHeader to accept the 4-byte data that has been read in roaring64.ReadFrom. -// It is not necessary to pass cookieHeader when call roaring.ReadFrom to read the roaring32 data directly. -func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err error) { - stream, ok := reader.(internal.ByteInput) - if !ok { - byteInputAdapter := internal.ByteInputAdapterPool.Get().(*internal.ByteInputAdapter) - byteInputAdapter.Reset(reader) - stream = byteInputAdapter - } - - p, err = rb.highlowcontainer.readFrom(stream, cookieHeader...) - - if !ok { - internal.ByteInputAdapterPool.Put(stream.(*internal.ByteInputAdapter)) - } - return -} - -// FromBuffer creates a bitmap from its serialized version stored in buffer -// -// The format specification is available here: -// https://github.com/RoaringBitmap/RoaringFormatSpec -// -// The provided byte array (buf) is expected to be a constant. -// The function makes the best effort attempt not to copy data. -// You should take care not to modify buff as it will -// likely result in unexpected program behavior. -// -// Resulting bitmaps are effectively immutable in the following sense: -// a copy-on-write marker is used so that when you modify the resulting -// bitmap, copies of selected data (containers) are made. -// You should *not* change the copy-on-write status of the resulting -// bitmaps (SetCopyOnWrite). -// -// Thus FromBuffer is more likely to be appropriate for read-only use cases, -// when the resulting bitmap can be considered immutable. -// -// If buf becomes unavailable, then a bitmap created with -// FromBuffer would be effectively broken. Furthermore, any -// bitmap derived from this bitmap (e.g., via Or, And) might -// also be broken. 
Thus, before making buf unavailable, you should -// call CloneCopyOnWriteContainers on all such bitmaps. -// -// See also the FromUnsafeBytes function which can have better performance -// in some cases. -func (rb *Bitmap) FromBuffer(buf []byte) (p int64, err error) { - stream := internal.ByteBufferPool.Get().(*internal.ByteBuffer) - stream.Reset(buf) - - p, err = rb.highlowcontainer.readFrom(stream) - internal.ByteBufferPool.Put(stream) - - return -} - -// RunOptimize attempts to further compress the runs of consecutive values found in the bitmap -func (rb *Bitmap) RunOptimize() { - rb.highlowcontainer.runOptimize() -} - -// HasRunCompression returns true if the bitmap benefits from run compression -func (rb *Bitmap) HasRunCompression() bool { - return rb.highlowcontainer.hasRunCompression() -} - -// MarshalBinary implements the encoding.BinaryMarshaler interface for the bitmap -// (same as ToBytes) -func (rb *Bitmap) MarshalBinary() ([]byte, error) { - return rb.ToBytes() -} - -// UnmarshalBinary implements the encoding.BinaryUnmarshaler interface for the bitmap -func (rb *Bitmap) UnmarshalBinary(data []byte) error { - r := bytes.NewReader(data) - _, err := rb.ReadFrom(r) - return err -} - -// NewBitmap creates a new empty Bitmap (see also New) -func NewBitmap() *Bitmap { - return &Bitmap{} -} - -// New creates a new empty Bitmap (same as NewBitmap) -func New() *Bitmap { - return &Bitmap{} -} - -// Clear resets the Bitmap to be logically empty, but may retain -// some memory allocations that may speed up future operations -func (rb *Bitmap) Clear() { - rb.highlowcontainer.clear() -} - -// ToBitSet copies the content of the RoaringBitmap into a bitset.BitSet instance -func (rb *Bitmap) ToBitSet() *bitset.BitSet { - return bitset.From(rb.ToDense()) -} - -// FromBitSet creates a new RoaringBitmap from a bitset.BitSet instance -func FromBitSet(bitset *bitset.BitSet) *Bitmap { - return FromDense(bitset.Bytes(), false) -} - -// ToArray creates a new slice containing 
all of the integers stored in the Bitmap in sorted order -func (rb *Bitmap) ToArray() []uint32 { - array := make([]uint32, rb.GetCardinality()) - pos := 0 - pos2 := 0 - - for pos < rb.highlowcontainer.size() { - hs := uint32(rb.highlowcontainer.getKeyAtIndex(pos)) << 16 - c := rb.highlowcontainer.getContainerAtIndex(pos) - pos++ - pos2 = c.fillLeastSignificant16bits(array, pos2, hs) - } - return array -} - -// GetSizeInBytes estimates the memory usage of the Bitmap. Note that this -// might differ slightly from the amount of bytes required for persistent storage -func (rb *Bitmap) GetSizeInBytes() uint64 { - size := uint64(8) - for _, c := range rb.highlowcontainer.containers { - size += uint64(2) + uint64(c.getSizeInBytes()) - } - return size -} - -// GetSerializedSizeInBytes computes the serialized size in bytes -// of the Bitmap. It should correspond to the -// number of bytes written when invoking WriteTo. You can expect -// that this function is much cheaper computationally than WriteTo. 
-func (rb *Bitmap) GetSerializedSizeInBytes() uint64 { - return rb.highlowcontainer.serializedSizeInBytes() -} - -// BoundSerializedSizeInBytes returns an upper bound on the serialized size in bytes -// assuming that one wants to store "cardinality" integers in [0, universe_size) -func BoundSerializedSizeInBytes(cardinality uint64, universeSize uint64) uint64 { - contnbr := (universeSize + uint64(65535)) / uint64(65536) - if contnbr > cardinality { - contnbr = cardinality - // we cannot have more containers than we have values - } - headermax := 8*contnbr + 4 - if 4 > (contnbr+7)/8 { - headermax += 4 - } else { - headermax += (contnbr + 7) / 8 - } - valsarray := uint64(arrayContainerSizeInBytes(int(cardinality))) - valsbitmap := contnbr * uint64(bitmapContainerSizeInBytes()) - valsbest := valsarray - if valsbest > valsbitmap { - valsbest = valsbitmap - } - return valsbest + headermax -} - -// IntIterable allows you to iterate over the values in a Bitmap -type IntIterable interface { - HasNext() bool - Next() uint32 -} - -// IntPeekable allows you to look at the next value without advancing and -// advance as long as the next value is smaller than minval -type IntPeekable interface { - IntIterable - // PeekNext peeks the next value without advancing the iterator - PeekNext() uint32 - // AdvanceIfNeeded advances as long as the next value is smaller than minval - AdvanceIfNeeded(minval uint32) -} - -type intIterator struct { - pos int - hs uint32 - iter shortPeekable - highlowcontainer *roaringArray - - // These embedded iterators per container type help reduce load in the GC. - // This way, instead of making up-to 64k allocations per full iteration - // we get a single allocation and simply reinitialize the appropriate - // iterator and point to it in the generic `iter` member on each key bound. 
- shortIter shortIterator - runIter runIterator16 - bitmapIter bitmapContainerShortIterator -} - -// HasNext returns true if there are more integers to iterate over -func (ii *intIterator) HasNext() bool { - return ii.pos < ii.highlowcontainer.size() -} - -func (ii *intIterator) init() { - if ii.highlowcontainer.size() > ii.pos { - ii.hs = uint32(ii.highlowcontainer.getKeyAtIndex(ii.pos)) << 16 - c := ii.highlowcontainer.getContainerAtIndex(ii.pos) - switch t := c.(type) { - case *arrayContainer: - ii.shortIter = shortIterator{t.content, 0} - ii.iter = &ii.shortIter - case *runContainer16: - ii.runIter = runIterator16{rc: t, curIndex: 0, curPosInIndex: 0} - ii.iter = &ii.runIter - case *bitmapContainer: - ii.bitmapIter = bitmapContainerShortIterator{t, t.NextSetBit(0)} - ii.iter = &ii.bitmapIter - } - } -} - -// Next returns the next integer -func (ii *intIterator) Next() uint32 { - x := uint32(ii.iter.next()) | ii.hs - if !ii.iter.hasNext() { - ii.pos = ii.pos + 1 - ii.init() - } - return x -} - -// PeekNext peeks the next value without advancing the iterator -func (ii *intIterator) PeekNext() uint32 { - return uint32(ii.iter.peekNext()&maxLowBit) | ii.hs -} - -// AdvanceIfNeeded advances as long as the next value is smaller than minval -func (ii *intIterator) AdvanceIfNeeded(minval uint32) { - to := minval & 0xffff0000 - - for ii.HasNext() && ii.hs < to { - ii.pos++ - ii.init() - } - - if ii.HasNext() && ii.hs == to { - ii.iter.advanceIfNeeded(lowbits(minval)) - - if !ii.iter.hasNext() { - ii.pos++ - ii.init() - } - } -} - -// IntIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap) -type IntIterator = intIterator - -// Initialize configures the existing iterator so that it can iterate through the values of -// the provided bitmap. -// The iteration results are undefined if the bitmap is modified (e.g., with Add or Remove). 
-func (ii *intIterator) Initialize(a *Bitmap) { - ii.pos = 0 - ii.highlowcontainer = &a.highlowcontainer - ii.init() -} - -type intReverseIterator struct { - pos int - hs uint32 - iter shortIterable - highlowcontainer *roaringArray - - shortIter reverseIterator - runIter runReverseIterator16 - bitmapIter reverseBitmapContainerShortIterator -} - -// HasNext returns true if there are more integers to iterate over -func (ii *intReverseIterator) HasNext() bool { - return ii.pos >= 0 -} - -func (ii *intReverseIterator) init() { - if ii.pos >= 0 { - ii.hs = uint32(ii.highlowcontainer.getKeyAtIndex(ii.pos)) << 16 - c := ii.highlowcontainer.getContainerAtIndex(ii.pos) - switch t := c.(type) { - case *arrayContainer: - ii.shortIter = reverseIterator{t.content, len(t.content) - 1} - ii.iter = &ii.shortIter - case *runContainer16: - index := int(len(t.iv)) - 1 - pos := uint16(0) - - if index >= 0 { - pos = t.iv[index].length - } - - ii.runIter = runReverseIterator16{rc: t, curIndex: index, curPosInIndex: pos} - ii.iter = &ii.runIter - case *bitmapContainer: - pos := -1 - if t.cardinality > 0 { - pos = int(t.maximum()) - } - ii.bitmapIter = reverseBitmapContainerShortIterator{t, pos} - ii.iter = &ii.bitmapIter - } - } else { - ii.iter = nil - } -} - -// Next returns the next integer -func (ii *intReverseIterator) Next() uint32 { - x := uint32(ii.iter.next()) | ii.hs - if !ii.iter.hasNext() { - ii.pos = ii.pos - 1 - ii.init() - } - return x -} - -// IntReverseIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap) -type IntReverseIterator = intReverseIterator - -// Initialize configures the existing iterator so that it can iterate through the values of -// the provided bitmap. -// The iteration results are undefined if the bitmap is modified (e.g., with Add or Remove). 
-func (ii *intReverseIterator) Initialize(a *Bitmap) { - ii.highlowcontainer = &a.highlowcontainer - ii.pos = a.highlowcontainer.size() - 1 - ii.init() -} - -// ManyIntIterable allows you to iterate over the values in a Bitmap -type ManyIntIterable interface { - // NextMany fills buf up with values, returns how many values were returned - NextMany(buf []uint32) int - // NextMany64 fills up buf with 64 bit values, uses hs as a mask (OR), returns how many values were returned - NextMany64(hs uint64, buf []uint64) int -} - -type manyIntIterator struct { - pos int - hs uint32 - iter manyIterable - highlowcontainer *roaringArray - - shortIter shortIterator - runIter runIterator16 - bitmapIter bitmapContainerManyIterator -} - -func (ii *manyIntIterator) init() { - if ii.highlowcontainer.size() > ii.pos { - ii.hs = uint32(ii.highlowcontainer.getKeyAtIndex(ii.pos)) << 16 - c := ii.highlowcontainer.getContainerAtIndex(ii.pos) - switch t := c.(type) { - case *arrayContainer: - ii.shortIter = shortIterator{t.content, 0} - ii.iter = &ii.shortIter - case *runContainer16: - ii.runIter = runIterator16{rc: t, curIndex: 0, curPosInIndex: 0} - ii.iter = &ii.runIter - case *bitmapContainer: - ii.bitmapIter = bitmapContainerManyIterator{t, -1, 0} - ii.iter = &ii.bitmapIter - } - } else { - ii.iter = nil - } -} - -func (ii *manyIntIterator) NextMany(buf []uint32) int { - n := 0 - for n < len(buf) { - if ii.iter == nil { - break - } - moreN := ii.iter.nextMany(ii.hs, buf[n:]) - n += moreN - if moreN == 0 { - ii.pos = ii.pos + 1 - ii.init() - } - } - - return n -} - -func (ii *manyIntIterator) NextMany64(hs64 uint64, buf []uint64) int { - n := 0 - for n < len(buf) { - if ii.iter == nil { - break - } - - hs := uint64(ii.hs) | hs64 - moreN := ii.iter.nextMany64(hs, buf[n:]) - n += moreN - if moreN == 0 { - ii.pos = ii.pos + 1 - ii.init() - } - } - - return n -} - -// ManyIntIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap) -type 
ManyIntIterator = manyIntIterator - -// Initialize configures the existing iterator so that it can iterate through the values of -// the provided bitmap. -// The iteration results are undefined if the bitmap is modified (e.g., with Add or Remove). -func (ii *manyIntIterator) Initialize(a *Bitmap) { - ii.pos = 0 - ii.highlowcontainer = &a.highlowcontainer - ii.init() -} - -// String creates a string representation of the Bitmap -func (rb *Bitmap) String() string { - // inspired by https://github.com/fzandona/goroar/ - var buffer bytes.Buffer - start := []byte("{") - buffer.Write(start) - i := rb.Iterator() - counter := 0 - if i.HasNext() { - counter = counter + 1 - buffer.WriteString(strconv.FormatInt(int64(i.Next()), 10)) - } - for i.HasNext() { - buffer.WriteString(",") - counter = counter + 1 - // to avoid exhausting the memory - if counter > 0x40000 { - buffer.WriteString("...") - break - } - buffer.WriteString(strconv.FormatInt(int64(i.Next()), 10)) - } - buffer.WriteString("}") - return buffer.String() -} - -// Iterate iterates over the bitmap, calling the given callback with each value in the bitmap. If the callback returns -// false, the iteration is halted. -// The iteration results are undefined if the bitmap is modified (e.g., with Add or Remove). -// There is no guarantee as to what order the values will be iterated. 
-func (rb *Bitmap) Iterate(cb func(x uint32) bool) { - for i := 0; i < rb.highlowcontainer.size(); i++ { - hs := uint32(rb.highlowcontainer.getKeyAtIndex(i)) << 16 - c := rb.highlowcontainer.getContainerAtIndex(i) - - var shouldContinue bool - // This is hacky but it avoids allocations from invoking an interface method with a closure - switch t := c.(type) { - case *arrayContainer: - shouldContinue = t.iterate(func(x uint16) bool { - return cb(uint32(x) | hs) - }) - case *runContainer16: - shouldContinue = t.iterate(func(x uint16) bool { - return cb(uint32(x) | hs) - }) - case *bitmapContainer: - shouldContinue = t.iterate(func(x uint16) bool { - return cb(uint32(x) | hs) - }) - } - - if !shouldContinue { - break - } - } -} - -// Iterator creates a new IntPeekable to iterate over the integers contained in the bitmap, in sorted order; -// the iterator becomes invalid if the bitmap is modified (e.g., with Add or Remove). -func (rb *Bitmap) Iterator() IntPeekable { - p := new(intIterator) - p.Initialize(rb) - return p -} - -// ReverseIterator creates a new IntIterable to iterate over the integers contained in the bitmap, in sorted order; -// the iterator becomes invalid if the bitmap is modified (e.g., with Add or Remove). -func (rb *Bitmap) ReverseIterator() IntIterable { - p := new(intReverseIterator) - p.Initialize(rb) - return p -} - -// ManyIterator creates a new ManyIntIterable to iterate over the integers contained in the bitmap, in sorted order; -// the iterator becomes invalid if the bitmap is modified (e.g., with Add or Remove). 
-func (rb *Bitmap) ManyIterator() ManyIntIterable { - p := new(manyIntIterator) - p.Initialize(rb) - return p -} - -// Clone creates a copy of the Bitmap -func (rb *Bitmap) Clone() *Bitmap { - ptr := new(Bitmap) - ptr.highlowcontainer = *rb.highlowcontainer.clone() - return ptr -} - -// Minimum get the smallest value stored in this roaring bitmap, assumes that it is not empty -func (rb *Bitmap) Minimum() uint32 { - if len(rb.highlowcontainer.containers) == 0 { - panic("Empty bitmap") - } - return uint32(rb.highlowcontainer.containers[0].minimum()) | (uint32(rb.highlowcontainer.keys[0]) << 16) -} - -// Maximum get the largest value stored in this roaring bitmap, assumes that it is not empty -func (rb *Bitmap) Maximum() uint32 { - if len(rb.highlowcontainer.containers) == 0 { - panic("Empty bitmap") - } - lastindex := len(rb.highlowcontainer.containers) - 1 - return uint32(rb.highlowcontainer.containers[lastindex].maximum()) | (uint32(rb.highlowcontainer.keys[lastindex]) << 16) -} - -// Contains returns true if the integer is contained in the bitmap -func (rb *Bitmap) Contains(x uint32) bool { - hb := highbits(x) - c := rb.highlowcontainer.getContainer(hb) - return c != nil && c.contains(lowbits(x)) -} - -// ContainsInt returns true if the integer is contained in the bitmap (this is a convenience method, the parameter is casted to uint32 and Contains is called) -func (rb *Bitmap) ContainsInt(x int) bool { - return rb.Contains(uint32(x)) -} - -// Equals returns true if the two bitmaps contain the same integers -func (rb *Bitmap) Equals(o interface{}) bool { - srb, ok := o.(*Bitmap) - if ok { - return srb.highlowcontainer.equals(rb.highlowcontainer) - } - return false -} - -// AddOffset adds the value 'offset' to each and every value in a bitmap, generating a new bitmap in the process -func AddOffset(x *Bitmap, offset uint32) (answer *Bitmap) { - return AddOffset64(x, int64(offset)) -} - -// AddOffset64 adds the value 'offset' to each and every value in a bitmap, 
generating a new bitmap in the process -// If offset + element is outside of the range [0,2^32), that the element will be dropped -func AddOffset64(x *Bitmap, offset int64) (answer *Bitmap) { - // we need "offset" to be a long because we want to support values - // between -0xFFFFFFFF up to +-0xFFFFFFFF - var containerOffset64 int64 - - if offset < 0 { - containerOffset64 = (offset - (1 << 16) + 1) / (1 << 16) - } else { - containerOffset64 = offset >> 16 - } - - answer = New() - - if containerOffset64 >= (1<<16) || containerOffset64 < -(1<<16) { - return answer - } - - containerOffset := int32(containerOffset64) - inOffset := (uint16)(offset - containerOffset64*(1<<16)) - - if inOffset == 0 { - for pos := 0; pos < x.highlowcontainer.size(); pos++ { - key := int32(x.highlowcontainer.getKeyAtIndex(pos)) - key += containerOffset - - if key >= 0 && key <= MaxUint16 { - c := x.highlowcontainer.getContainerAtIndex(pos).clone() - answer.highlowcontainer.appendContainer(uint16(key), c, false) - } - } - } else { - for pos := 0; pos < x.highlowcontainer.size(); pos++ { - key := int32(x.highlowcontainer.getKeyAtIndex(pos)) - key += containerOffset - - if key+1 < 0 || key > MaxUint16 { - continue - } - - c := x.highlowcontainer.getContainerAtIndex(pos) - lo, hi := c.addOffset(inOffset) - - if lo != nil && key >= 0 { - curSize := answer.highlowcontainer.size() - lastkey := int32(0) - - if curSize > 0 { - lastkey = int32(answer.highlowcontainer.getKeyAtIndex(curSize - 1)) - } - - if curSize > 0 && lastkey == key { - prev := answer.highlowcontainer.getContainerAtIndex(curSize - 1) - orresult := prev.ior(lo) - answer.highlowcontainer.setContainerAtIndex(curSize-1, orresult) - } else { - answer.highlowcontainer.appendContainer(uint16(key), lo, false) - } - } - - if hi != nil && key+1 <= MaxUint16 { - answer.highlowcontainer.appendContainer(uint16(key+1), hi, false) - } - } - } - - return answer -} - -// Add the integer x to the bitmap -func (rb *Bitmap) Add(x uint32) { - hb := 
highbits(x) - ra := &rb.highlowcontainer - i := ra.getIndex(hb) - if i >= 0 { - var c container - c = ra.getWritableContainerAtIndex(i).iaddReturnMinimized(lowbits(x)) - rb.highlowcontainer.setContainerAtIndex(i, c) - } else { - newac := newArrayContainer() - rb.highlowcontainer.insertNewKeyValueAt(-i-1, hb, newac.iaddReturnMinimized(lowbits(x))) - } -} - -// add the integer x to the bitmap, return the container and its index -func (rb *Bitmap) addwithptr(x uint32) (int, container) { - hb := highbits(x) - ra := &rb.highlowcontainer - i := ra.getIndex(hb) - var c container - if i >= 0 { - c = ra.getWritableContainerAtIndex(i).iaddReturnMinimized(lowbits(x)) - rb.highlowcontainer.setContainerAtIndex(i, c) - return i, c - } - newac := newArrayContainer() - c = newac.iaddReturnMinimized(lowbits(x)) - rb.highlowcontainer.insertNewKeyValueAt(-i-1, hb, c) - return -i - 1, c -} - -// CheckedAdd adds the integer x to the bitmap and return true if it was added (false if the integer was already present) -func (rb *Bitmap) CheckedAdd(x uint32) bool { - // TODO: add unit tests for this method - hb := highbits(x) - i := rb.highlowcontainer.getIndex(hb) - if i >= 0 { - C := rb.highlowcontainer.getWritableContainerAtIndex(i) - oldcard := C.getCardinality() - C = C.iaddReturnMinimized(lowbits(x)) - rb.highlowcontainer.setContainerAtIndex(i, C) - return C.getCardinality() > oldcard - } - newac := newArrayContainer() - rb.highlowcontainer.insertNewKeyValueAt(-i-1, hb, newac.iaddReturnMinimized(lowbits(x))) - return true - -} - -// AddInt adds the integer x to the bitmap (convenience method: the parameter is casted to uint32 and we call Add) -func (rb *Bitmap) AddInt(x int) { - rb.Add(uint32(x)) -} - -// Remove the integer x from the bitmap -func (rb *Bitmap) Remove(x uint32) { - hb := highbits(x) - i := rb.highlowcontainer.getIndex(hb) - if i >= 0 { - c := rb.highlowcontainer.getWritableContainerAtIndex(i).iremoveReturnMinimized(lowbits(x)) - 
rb.highlowcontainer.setContainerAtIndex(i, c) - if rb.highlowcontainer.getContainerAtIndex(i).isEmpty() { - rb.highlowcontainer.removeAtIndex(i) - } - } -} - -// CheckedRemove removes the integer x from the bitmap and return true if the integer was effectively removed (and false if the integer was not present) -func (rb *Bitmap) CheckedRemove(x uint32) bool { - // TODO: add unit tests for this method - hb := highbits(x) - i := rb.highlowcontainer.getIndex(hb) - if i >= 0 { - C := rb.highlowcontainer.getWritableContainerAtIndex(i) - oldcard := C.getCardinality() - C = C.iremoveReturnMinimized(lowbits(x)) - rb.highlowcontainer.setContainerAtIndex(i, C) - if rb.highlowcontainer.getContainerAtIndex(i).isEmpty() { - rb.highlowcontainer.removeAtIndex(i) - return true - } - return C.getCardinality() < oldcard - } - return false - -} - -// IsEmpty returns true if the Bitmap is empty (it is faster than doing (GetCardinality() == 0)) -func (rb *Bitmap) IsEmpty() bool { - return rb.highlowcontainer.size() == 0 -} - -// GetCardinality returns the number of integers contained in the bitmap -func (rb *Bitmap) GetCardinality() uint64 { - size := uint64(0) - for _, c := range rb.highlowcontainer.containers { - size += uint64(c.getCardinality()) - } - return size -} - -// Rank returns the number of integers that are smaller or equal to x (Rank(infinity) would be GetCardinality()). -// If you pass the smallest value, you get the value 1. If you pass a value that is smaller than the smallest -// value, you get 0. Note that this function differs in convention from the Select function since it -// return 1 and not 0 on the smallest value. 
-func (rb *Bitmap) Rank(x uint32) uint64 { - size := uint64(0) - for i := 0; i < rb.highlowcontainer.size(); i++ { - key := rb.highlowcontainer.getKeyAtIndex(i) - if key > highbits(x) { - return size - } - if key < highbits(x) { - size += uint64(rb.highlowcontainer.getContainerAtIndex(i).getCardinality()) - } else { - return size + uint64(rb.highlowcontainer.getContainerAtIndex(i).rank(lowbits(x))) - } - } - return size -} - -// Select returns the xth integer in the bitmap. If you pass 0, you get -// the smallest element. Note that this function differs in convention from -// the Rank function which returns 1 on the smallest value. -func (rb *Bitmap) Select(x uint32) (uint32, error) { - remaining := x - for i := 0; i < rb.highlowcontainer.size(); i++ { - c := rb.highlowcontainer.getContainerAtIndex(i) - card := uint32(c.getCardinality()) - if remaining >= card { - remaining -= card - } else { - key := rb.highlowcontainer.getKeyAtIndex(i) - return uint32(key)<<16 + uint32(c.selectInt(uint16(remaining))), nil - } - } - return 0, fmt.Errorf("cannot find %dth integer in a bitmap with only %d items", x, rb.GetCardinality()) -} - -// And computes the intersection between two bitmaps and stores the result in the current bitmap -func (rb *Bitmap) And(x2 *Bitmap) { - pos1 := 0 - pos2 := 0 - intersectionsize := 0 - length1 := rb.highlowcontainer.size() - length2 := x2.highlowcontainer.size() - -main: - for { - if pos1 < length1 && pos2 < length2 { - s1 := rb.highlowcontainer.getKeyAtIndex(pos1) - s2 := x2.highlowcontainer.getKeyAtIndex(pos2) - for { - if s1 == s2 { - c1 := rb.highlowcontainer.getWritableContainerAtIndex(pos1) - c2 := x2.highlowcontainer.getContainerAtIndex(pos2) - diff := c1.iand(c2) - if !diff.isEmpty() { - rb.highlowcontainer.replaceKeyAndContainerAtIndex(intersectionsize, s1, diff, false) - intersectionsize++ - } - pos1++ - pos2++ - if (pos1 == length1) || (pos2 == length2) { - break main - } - s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - s2 = 
x2.highlowcontainer.getKeyAtIndex(pos2) - } else if s1 < s2 { - pos1 = rb.highlowcontainer.advanceUntil(s2, pos1) - if pos1 == length1 { - break main - } - s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - } else { //s1 > s2 - pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) - if pos2 == length2 { - break main - } - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } - } - } else { - break - } - } - rb.highlowcontainer.resize(intersectionsize) -} - -// OrCardinality returns the cardinality of the union between two bitmaps, bitmaps are not modified -func (rb *Bitmap) OrCardinality(x2 *Bitmap) uint64 { - pos1 := 0 - pos2 := 0 - length1 := rb.highlowcontainer.size() - length2 := x2.highlowcontainer.size() - answer := uint64(0) -main: - for { - if (pos1 < length1) && (pos2 < length2) { - s1 := rb.highlowcontainer.getKeyAtIndex(pos1) - s2 := x2.highlowcontainer.getKeyAtIndex(pos2) - - for { - if s1 < s2 { - answer += uint64(rb.highlowcontainer.getContainerAtIndex(pos1).getCardinality()) - pos1++ - if pos1 == length1 { - break main - } - s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - } else if s1 > s2 { - answer += uint64(x2.highlowcontainer.getContainerAtIndex(pos2).getCardinality()) - pos2++ - if pos2 == length2 { - break main - } - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } else { - // TODO: could be faster if we did not have to materialize the container - answer += uint64(rb.highlowcontainer.getContainerAtIndex(pos1).or(x2.highlowcontainer.getContainerAtIndex(pos2)).getCardinality()) - pos1++ - pos2++ - if (pos1 == length1) || (pos2 == length2) { - break main - } - s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } - } - } else { - break - } - } - for ; pos1 < length1; pos1++ { - answer += uint64(rb.highlowcontainer.getContainerAtIndex(pos1).getCardinality()) - } - for ; pos2 < length2; pos2++ { - answer += uint64(x2.highlowcontainer.getContainerAtIndex(pos2).getCardinality()) - } - return answer -} - -// AndCardinality 
returns the cardinality of the intersection between two bitmaps, bitmaps are not modified -func (rb *Bitmap) AndCardinality(x2 *Bitmap) uint64 { - pos1 := 0 - pos2 := 0 - answer := uint64(0) - length1 := rb.highlowcontainer.size() - length2 := x2.highlowcontainer.size() - -main: - for { - if pos1 < length1 && pos2 < length2 { - s1 := rb.highlowcontainer.getKeyAtIndex(pos1) - s2 := x2.highlowcontainer.getKeyAtIndex(pos2) - for { - if s1 == s2 { - c1 := rb.highlowcontainer.getContainerAtIndex(pos1) - c2 := x2.highlowcontainer.getContainerAtIndex(pos2) - answer += uint64(c1.andCardinality(c2)) - pos1++ - pos2++ - if (pos1 == length1) || (pos2 == length2) { - break main - } - s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } else if s1 < s2 { - pos1 = rb.highlowcontainer.advanceUntil(s2, pos1) - if pos1 == length1 { - break main - } - s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - } else { //s1 > s2 - pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) - if pos2 == length2 { - break main - } - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } - } - } else { - break - } - } - return answer -} - -// IntersectsWithInterval checks whether a bitmap 'rb' and an open interval '[x,y)' intersect. 
-func (rb *Bitmap) IntersectsWithInterval(x, y uint64) bool { - if x >= y { - return false - } - if x > MaxUint32 { - return false - } - - it := intIterator{} - it.Initialize(rb) - it.AdvanceIfNeeded(uint32(x)) - if !it.HasNext() { - return false - } - if uint64(it.Next()) >= y { - return false - } - - return true -} - -// Intersects checks whether two bitmap intersects, bitmaps are not modified -func (rb *Bitmap) Intersects(x2 *Bitmap) bool { - pos1 := 0 - pos2 := 0 - length1 := rb.highlowcontainer.size() - length2 := x2.highlowcontainer.size() - -main: - for { - if pos1 < length1 && pos2 < length2 { - s1 := rb.highlowcontainer.getKeyAtIndex(pos1) - s2 := x2.highlowcontainer.getKeyAtIndex(pos2) - for { - if s1 == s2 { - c1 := rb.highlowcontainer.getContainerAtIndex(pos1) - c2 := x2.highlowcontainer.getContainerAtIndex(pos2) - if c1.intersects(c2) { - return true - } - pos1++ - pos2++ - if (pos1 == length1) || (pos2 == length2) { - break main - } - s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } else if s1 < s2 { - pos1 = rb.highlowcontainer.advanceUntil(s2, pos1) - if pos1 == length1 { - break main - } - s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - } else { //s1 > s2 - pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) - if pos2 == length2 { - break main - } - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } - } - } else { - break - } - } - return false -} - -// Xor computes the symmetric difference between two bitmaps and stores the result in the current bitmap -func (rb *Bitmap) Xor(x2 *Bitmap) { - pos1 := 0 - pos2 := 0 - length1 := rb.highlowcontainer.size() - length2 := x2.highlowcontainer.size() - for { - if (pos1 < length1) && (pos2 < length2) { - s1 := rb.highlowcontainer.getKeyAtIndex(pos1) - s2 := x2.highlowcontainer.getKeyAtIndex(pos2) - if s1 < s2 { - pos1 = rb.highlowcontainer.advanceUntil(s2, pos1) - if pos1 == length1 { - break - } - } else if s1 > s2 { - c := 
x2.highlowcontainer.getWritableContainerAtIndex(pos2) - rb.highlowcontainer.insertNewKeyValueAt(pos1, x2.highlowcontainer.getKeyAtIndex(pos2), c) - length1++ - pos1++ - pos2++ - } else { - // TODO: couple be computed in-place for reduced memory usage - c := rb.highlowcontainer.getContainerAtIndex(pos1).xor(x2.highlowcontainer.getContainerAtIndex(pos2)) - if !c.isEmpty() { - rb.highlowcontainer.setContainerAtIndex(pos1, c) - pos1++ - } else { - rb.highlowcontainer.removeAtIndex(pos1) - length1-- - } - pos2++ - } - } else { - break - } - } - if pos1 == length1 { - rb.highlowcontainer.appendCopyMany(x2.highlowcontainer, pos2, length2) - } -} - -// Or computes the union between two bitmaps and stores the result in the current bitmap -func (rb *Bitmap) Or(x2 *Bitmap) { - pos1 := 0 - pos2 := 0 - length1 := rb.highlowcontainer.size() - length2 := x2.highlowcontainer.size() -main: - for (pos1 < length1) && (pos2 < length2) { - s1 := rb.highlowcontainer.getKeyAtIndex(pos1) - s2 := x2.highlowcontainer.getKeyAtIndex(pos2) - - for { - if s1 < s2 { - pos1++ - if pos1 == length1 { - break main - } - s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - } else if s1 > s2 { - rb.highlowcontainer.insertNewKeyValueAt(pos1, s2, x2.highlowcontainer.getContainerAtIndex(pos2).clone()) - pos1++ - length1++ - pos2++ - if pos2 == length2 { - break main - } - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } else { - rb.highlowcontainer.replaceKeyAndContainerAtIndex(pos1, s1, rb.highlowcontainer.getUnionedWritableContainer(pos1, x2.highlowcontainer.getContainerAtIndex(pos2)), false) - pos1++ - pos2++ - if (pos1 == length1) || (pos2 == length2) { - break main - } - s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } - } - } - if pos1 == length1 { - rb.highlowcontainer.appendCopyMany(x2.highlowcontainer, pos2, length2) - } -} - -// AndNot computes the difference between two bitmaps and stores the result in the current bitmap -func (rb *Bitmap) AndNot(x2 
*Bitmap) { - pos1 := 0 - pos2 := 0 - intersectionsize := 0 - length1 := rb.highlowcontainer.size() - length2 := x2.highlowcontainer.size() - -main: - for { - if pos1 < length1 && pos2 < length2 { - s1 := rb.highlowcontainer.getKeyAtIndex(pos1) - s2 := x2.highlowcontainer.getKeyAtIndex(pos2) - for { - if s1 == s2 { - c1 := rb.highlowcontainer.getWritableContainerAtIndex(pos1) - c2 := x2.highlowcontainer.getContainerAtIndex(pos2) - diff := c1.iandNot(c2) - if !diff.isEmpty() { - rb.highlowcontainer.replaceKeyAndContainerAtIndex(intersectionsize, s1, diff, false) - intersectionsize++ - } - pos1++ - pos2++ - if (pos1 == length1) || (pos2 == length2) { - break main - } - s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } else if s1 < s2 { - c1 := rb.highlowcontainer.getContainerAtIndex(pos1) - mustCopyOnWrite := rb.highlowcontainer.needsCopyOnWrite(pos1) - rb.highlowcontainer.replaceKeyAndContainerAtIndex(intersectionsize, s1, c1, mustCopyOnWrite) - intersectionsize++ - pos1++ - if pos1 == length1 { - break main - } - s1 = rb.highlowcontainer.getKeyAtIndex(pos1) - } else { //s1 > s2 - pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) - if pos2 == length2 { - break main - } - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } - } - } else { - break - } - } - // TODO:implement as a copy - for pos1 < length1 { - c1 := rb.highlowcontainer.getContainerAtIndex(pos1) - s1 := rb.highlowcontainer.getKeyAtIndex(pos1) - mustCopyOnWrite := rb.highlowcontainer.needsCopyOnWrite(pos1) - rb.highlowcontainer.replaceKeyAndContainerAtIndex(intersectionsize, s1, c1, mustCopyOnWrite) - intersectionsize++ - pos1++ - } - rb.highlowcontainer.resize(intersectionsize) -} - -// Or computes the union between two bitmaps and returns the result -func Or(x1, x2 *Bitmap) *Bitmap { - answer := NewBitmap() - pos1 := 0 - pos2 := 0 - length1 := x1.highlowcontainer.size() - length2 := x2.highlowcontainer.size() -main: - for (pos1 < length1) && (pos2 < length2) { - 
s1 := x1.highlowcontainer.getKeyAtIndex(pos1) - s2 := x2.highlowcontainer.getKeyAtIndex(pos2) - - for { - if s1 < s2 { - answer.highlowcontainer.appendCopy(x1.highlowcontainer, pos1) - pos1++ - if pos1 == length1 { - break main - } - s1 = x1.highlowcontainer.getKeyAtIndex(pos1) - } else if s1 > s2 { - answer.highlowcontainer.appendCopy(x2.highlowcontainer, pos2) - pos2++ - if pos2 == length2 { - break main - } - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } else { - - answer.highlowcontainer.appendContainer(s1, x1.highlowcontainer.getContainerAtIndex(pos1).or(x2.highlowcontainer.getContainerAtIndex(pos2)), false) - pos1++ - pos2++ - if (pos1 == length1) || (pos2 == length2) { - break main - } - s1 = x1.highlowcontainer.getKeyAtIndex(pos1) - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } - } - } - if pos1 == length1 { - answer.highlowcontainer.appendCopyMany(x2.highlowcontainer, pos2, length2) - } else if pos2 == length2 { - answer.highlowcontainer.appendCopyMany(x1.highlowcontainer, pos1, length1) - } - return answer -} - -// And computes the intersection between two bitmaps and returns the result -func And(x1, x2 *Bitmap) *Bitmap { - answer := NewBitmap() - pos1 := 0 - pos2 := 0 - length1 := x1.highlowcontainer.size() - length2 := x2.highlowcontainer.size() -main: - for pos1 < length1 && pos2 < length2 { - s1 := x1.highlowcontainer.getKeyAtIndex(pos1) - s2 := x2.highlowcontainer.getKeyAtIndex(pos2) - for { - if s1 == s2 { - C := x1.highlowcontainer.getContainerAtIndex(pos1) - C = C.and(x2.highlowcontainer.getContainerAtIndex(pos2)) - - if !C.isEmpty() { - answer.highlowcontainer.appendContainer(s1, C, false) - } - pos1++ - pos2++ - if (pos1 == length1) || (pos2 == length2) { - break main - } - s1 = x1.highlowcontainer.getKeyAtIndex(pos1) - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } else if s1 < s2 { - pos1 = x1.highlowcontainer.advanceUntil(s2, pos1) - if pos1 == length1 { - break main - } - s1 = x1.highlowcontainer.getKeyAtIndex(pos1) - } else { // s1 
> s2 - pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) - if pos2 == length2 { - break main - } - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } - } - } - return answer -} - -// Xor computes the symmetric difference between two bitmaps and returns the result -func Xor(x1, x2 *Bitmap) *Bitmap { - answer := NewBitmap() - pos1 := 0 - pos2 := 0 - length1 := x1.highlowcontainer.size() - length2 := x2.highlowcontainer.size() - for { - if (pos1 < length1) && (pos2 < length2) { - s1 := x1.highlowcontainer.getKeyAtIndex(pos1) - s2 := x2.highlowcontainer.getKeyAtIndex(pos2) - if s1 < s2 { - answer.highlowcontainer.appendCopy(x1.highlowcontainer, pos1) - pos1++ - } else if s1 > s2 { - answer.highlowcontainer.appendCopy(x2.highlowcontainer, pos2) - pos2++ - } else { - c := x1.highlowcontainer.getContainerAtIndex(pos1).xor(x2.highlowcontainer.getContainerAtIndex(pos2)) - if !c.isEmpty() { - answer.highlowcontainer.appendContainer(s1, c, false) - } - pos1++ - pos2++ - } - } else { - break - } - } - if pos1 == length1 { - answer.highlowcontainer.appendCopyMany(x2.highlowcontainer, pos2, length2) - } else if pos2 == length2 { - answer.highlowcontainer.appendCopyMany(x1.highlowcontainer, pos1, length1) - } - return answer -} - -// AndNot computes the difference between two bitmaps and returns the result -func AndNot(x1, x2 *Bitmap) *Bitmap { - answer := NewBitmap() - pos1 := 0 - pos2 := 0 - length1 := x1.highlowcontainer.size() - length2 := x2.highlowcontainer.size() - -main: - for { - if pos1 < length1 && pos2 < length2 { - s1 := x1.highlowcontainer.getKeyAtIndex(pos1) - s2 := x2.highlowcontainer.getKeyAtIndex(pos2) - for { - if s1 < s2 { - answer.highlowcontainer.appendCopy(x1.highlowcontainer, pos1) - pos1++ - if pos1 == length1 { - break main - } - s1 = x1.highlowcontainer.getKeyAtIndex(pos1) - } else if s1 == s2 { - c1 := x1.highlowcontainer.getContainerAtIndex(pos1) - c2 := x2.highlowcontainer.getContainerAtIndex(pos2) - diff := c1.andNot(c2) - if !diff.isEmpty() { - 
answer.highlowcontainer.appendContainer(s1, diff, false) - } - pos1++ - pos2++ - if (pos1 == length1) || (pos2 == length2) { - break main - } - s1 = x1.highlowcontainer.getKeyAtIndex(pos1) - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } else { //s1 > s2 - pos2 = x2.highlowcontainer.advanceUntil(s1, pos2) - if pos2 == length2 { - break main - } - s2 = x2.highlowcontainer.getKeyAtIndex(pos2) - } - } - } else { - break - } - } - if pos2 == length2 { - answer.highlowcontainer.appendCopyMany(x1.highlowcontainer, pos1, length1) - } - return answer -} - -// AddMany add all of the values in dat -func (rb *Bitmap) AddMany(dat []uint32) { - if len(dat) == 0 { - return - } - prev := dat[0] - idx, c := rb.addwithptr(prev) - for _, i := range dat[1:] { - if highbits(prev) == highbits(i) { - c = c.iaddReturnMinimized(lowbits(i)) - rb.highlowcontainer.setContainerAtIndex(idx, c) - } else { - idx, c = rb.addwithptr(i) - } - prev = i - } -} - -// BitmapOf generates a new bitmap filled with the specified integers -func BitmapOf(dat ...uint32) *Bitmap { - ans := NewBitmap() - ans.AddMany(dat) - return ans -} - -// Flip negates the bits in the given range (i.e., [rangeStart,rangeEnd)), any integer present in this range and in the bitmap is removed, -// and any integer present in the range and not in the bitmap is added. -// The function uses 64-bit parameters even though a Bitmap stores 32-bit values because it is allowed and meaningful to use [0,uint64(0x100000000)) as a range -// while uint64(0x100000000) cannot be represented as a 32-bit value. 
-func (rb *Bitmap) Flip(rangeStart, rangeEnd uint64) { - - if rangeEnd > MaxUint32+1 { - panic("rangeEnd > MaxUint32+1") - } - if rangeStart > MaxUint32+1 { - panic("rangeStart > MaxUint32+1") - } - - if rangeStart >= rangeEnd { - return - } - - hbStart := uint32(highbits(uint32(rangeStart))) - lbStart := uint32(lowbits(uint32(rangeStart))) - hbLast := uint32(highbits(uint32(rangeEnd - 1))) - lbLast := uint32(lowbits(uint32(rangeEnd - 1))) - - var max uint32 = maxLowBit - for hb := hbStart; hb <= hbLast; hb++ { - var containerStart uint32 - if hb == hbStart { - containerStart = uint32(lbStart) - } - containerLast := max - if hb == hbLast { - containerLast = uint32(lbLast) - } - - i := rb.highlowcontainer.getIndex(uint16(hb)) - - if i >= 0 { - c := rb.highlowcontainer.getWritableContainerAtIndex(i).inot(int(containerStart), int(containerLast)+1) - if !c.isEmpty() { - rb.highlowcontainer.setContainerAtIndex(i, c) - } else { - rb.highlowcontainer.removeAtIndex(i) - } - } else { // *think* the range of ones must never be - // empty. - rb.highlowcontainer.insertNewKeyValueAt(-i-1, uint16(hb), rangeOfOnes(int(containerStart), int(containerLast))) - } - } -} - -// FlipInt calls Flip after casting the parameters (convenience method) -func (rb *Bitmap) FlipInt(rangeStart, rangeEnd int) { - rb.Flip(uint64(rangeStart), uint64(rangeEnd)) -} - -// AddRange adds the integers in [rangeStart, rangeEnd) to the bitmap. -// The function uses 64-bit parameters even though a Bitmap stores 32-bit values because it is allowed and meaningful to use [0,uint64(0x100000000)) as a range -// while uint64(0x100000000) cannot be represented as a 32-bit value. 
-func (rb *Bitmap) AddRange(rangeStart, rangeEnd uint64) { - if rangeStart >= rangeEnd { - return - } - if rangeEnd-1 > MaxUint32 { - panic("rangeEnd-1 > MaxUint32") - } - hbStart := uint32(highbits(uint32(rangeStart))) - lbStart := uint32(lowbits(uint32(rangeStart))) - hbLast := uint32(highbits(uint32(rangeEnd - 1))) - lbLast := uint32(lowbits(uint32(rangeEnd - 1))) - - var max uint32 = maxLowBit - for hb := hbStart; hb <= hbLast; hb++ { - containerStart := uint32(0) - if hb == hbStart { - containerStart = lbStart - } - containerLast := max - if hb == hbLast { - containerLast = lbLast - } - - i := rb.highlowcontainer.getIndex(uint16(hb)) - - if i >= 0 { - c := rb.highlowcontainer.getWritableContainerAtIndex(i).iaddRange(int(containerStart), int(containerLast)+1) - rb.highlowcontainer.setContainerAtIndex(i, c) - } else { // *think* the range of ones must never be - // empty. - rb.highlowcontainer.insertNewKeyValueAt(-i-1, uint16(hb), rangeOfOnes(int(containerStart), int(containerLast))) - } - } -} - -// RemoveRange removes the integers in [rangeStart, rangeEnd) from the bitmap. -// The function uses 64-bit parameters even though a Bitmap stores 32-bit values because it is allowed and meaningful to use [0,uint64(0x100000000)) as a range -// while uint64(0x100000000) cannot be represented as a 32-bit value. 
-func (rb *Bitmap) RemoveRange(rangeStart, rangeEnd uint64) { - if rangeStart >= rangeEnd { - return - } - if rangeEnd-1 > MaxUint32 { - // logically, we should assume that the user wants to - // remove all values from rangeStart to infinity - // see https://github.com/RoaringBitmap/roaring/issues/141 - rangeEnd = uint64(0x100000000) - } - hbStart := uint32(highbits(uint32(rangeStart))) - lbStart := uint32(lowbits(uint32(rangeStart))) - hbLast := uint32(highbits(uint32(rangeEnd - 1))) - lbLast := uint32(lowbits(uint32(rangeEnd - 1))) - - var max uint32 = maxLowBit - - if hbStart == hbLast { - i := rb.highlowcontainer.getIndex(uint16(hbStart)) - if i < 0 { - return - } - c := rb.highlowcontainer.getWritableContainerAtIndex(i).iremoveRange(int(lbStart), int(lbLast+1)) - if !c.isEmpty() { - rb.highlowcontainer.setContainerAtIndex(i, c) - } else { - rb.highlowcontainer.removeAtIndex(i) - } - return - } - ifirst := rb.highlowcontainer.getIndex(uint16(hbStart)) - ilast := rb.highlowcontainer.getIndex(uint16(hbLast)) - - if ifirst >= 0 { - if lbStart != 0 { - c := rb.highlowcontainer.getWritableContainerAtIndex(ifirst).iremoveRange(int(lbStart), int(max+1)) - if !c.isEmpty() { - rb.highlowcontainer.setContainerAtIndex(ifirst, c) - ifirst++ - } - } - } else { - ifirst = -ifirst - 1 - } - if ilast >= 0 { - if lbLast != max { - c := rb.highlowcontainer.getWritableContainerAtIndex(ilast).iremoveRange(int(0), int(lbLast+1)) - if !c.isEmpty() { - rb.highlowcontainer.setContainerAtIndex(ilast, c) - } else { - ilast++ - } - } else { - ilast++ - } - } else { - ilast = -ilast - 1 - } - rb.highlowcontainer.removeIndexRange(ifirst, ilast) -} - -// Flip negates the bits in the given range (i.e., [rangeStart,rangeEnd)), any integer present in this range and in the bitmap is removed, -// and any integer present in the range and not in the bitmap is added, a new bitmap is returned leaving -// the current bitmap unchanged. 
-// The function uses 64-bit parameters even though a Bitmap stores 32-bit values because it is allowed and meaningful to use [0,uint64(0x100000000)) as a range -// while uint64(0x100000000) cannot be represented as a 32-bit value. -func Flip(bm *Bitmap, rangeStart, rangeEnd uint64) *Bitmap { - if rangeStart >= rangeEnd { - return bm.Clone() - } - - if rangeStart > MaxUint32 { - panic("rangeStart > MaxUint32") - } - if rangeEnd-1 > MaxUint32 { - panic("rangeEnd-1 > MaxUint32") - } - - answer := NewBitmap() - hbStart := uint32(highbits(uint32(rangeStart))) - lbStart := uint32(lowbits(uint32(rangeStart))) - hbLast := uint32(highbits(uint32(rangeEnd - 1))) - lbLast := uint32(lowbits(uint32(rangeEnd - 1))) - - // copy the containers before the active area - answer.highlowcontainer.appendCopiesUntil(bm.highlowcontainer, uint16(hbStart)) - - var max uint32 = maxLowBit - for hb := hbStart; hb <= hbLast; hb++ { - var containerStart uint32 - if hb == hbStart { - containerStart = uint32(lbStart) - } - containerLast := max - if hb == hbLast { - containerLast = uint32(lbLast) - } - - i := bm.highlowcontainer.getIndex(uint16(hb)) - j := answer.highlowcontainer.getIndex(uint16(hb)) - - if i >= 0 { - c := bm.highlowcontainer.getContainerAtIndex(i).not(int(containerStart), int(containerLast)+1) - if !c.isEmpty() { - answer.highlowcontainer.insertNewKeyValueAt(-j-1, uint16(hb), c) - } - - } else { // *think* the range of ones must never be - // empty. - answer.highlowcontainer.insertNewKeyValueAt(-j-1, uint16(hb), - rangeOfOnes(int(containerStart), int(containerLast))) - } - } - // copy the containers after the active area. - answer.highlowcontainer.appendCopiesAfter(bm.highlowcontainer, uint16(hbLast)) - - return answer -} - -// SetCopyOnWrite sets this bitmap to use copy-on-write so that copies are fast and memory conscious -// if the parameter is true, otherwise we leave the default where hard copies are made -// (copy-on-write requires extra care in a threaded context). 
-// Calling SetCopyOnWrite(true) on a bitmap created with FromBuffer is unsafe. -func (rb *Bitmap) SetCopyOnWrite(val bool) { - rb.highlowcontainer.copyOnWrite = val -} - -// GetCopyOnWrite gets this bitmap's copy-on-write property -func (rb *Bitmap) GetCopyOnWrite() (val bool) { - return rb.highlowcontainer.copyOnWrite -} - -// CloneCopyOnWriteContainers clones all containers which have -// needCopyOnWrite set to true. -// This can be used to make sure it is safe to munmap a []byte -// that the roaring array may still have a reference to, after -// calling FromBuffer. -// More generally this function is useful if you call FromBuffer -// to construct a bitmap with a backing array buf -// and then later discard the buf array. Note that you should call -// CloneCopyOnWriteContainers on all bitmaps that were derived -// from the 'FromBuffer' bitmap since they map have dependencies -// on the buf array as well. -func (rb *Bitmap) CloneCopyOnWriteContainers() { - rb.highlowcontainer.cloneCopyOnWriteContainers() -} - -// FlipInt calls Flip after casting the parameters (convenience method) -func FlipInt(bm *Bitmap, rangeStart, rangeEnd int) *Bitmap { - return Flip(bm, uint64(rangeStart), uint64(rangeEnd)) -} - -// Statistics provides details on the container types in use. -type Statistics struct { - Cardinality uint64 - Containers uint64 - - ArrayContainers uint64 - ArrayContainerBytes uint64 - ArrayContainerValues uint64 - - BitmapContainers uint64 - BitmapContainerBytes uint64 - BitmapContainerValues uint64 - - RunContainers uint64 - RunContainerBytes uint64 - RunContainerValues uint64 -} - -// Stats returns details on container type usage in a Statistics struct. 
-func (rb *Bitmap) Stats() Statistics { - stats := Statistics{} - stats.Containers = uint64(len(rb.highlowcontainer.containers)) - for _, c := range rb.highlowcontainer.containers { - stats.Cardinality += uint64(c.getCardinality()) - - switch c.(type) { - case *arrayContainer: - stats.ArrayContainers++ - stats.ArrayContainerBytes += uint64(c.getSizeInBytes()) - stats.ArrayContainerValues += uint64(c.getCardinality()) - case *bitmapContainer: - stats.BitmapContainers++ - stats.BitmapContainerBytes += uint64(c.getSizeInBytes()) - stats.BitmapContainerValues += uint64(c.getCardinality()) - case *runContainer16: - stats.RunContainers++ - stats.RunContainerBytes += uint64(c.getSizeInBytes()) - stats.RunContainerValues += uint64(c.getCardinality()) - } - } - return stats -} diff --git a/vendor/github.com/RoaringBitmap/roaring/roaringarray.go b/vendor/github.com/RoaringBitmap/roaring/roaringarray.go deleted file mode 100644 index 079195d..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/roaringarray.go +++ /dev/null @@ -1,761 +0,0 @@ -package roaring - -import ( - "bytes" - "encoding/binary" - "fmt" - "io" - - "github.com/RoaringBitmap/roaring/internal" -) - -type container interface { - // addOffset returns the (low, high) parts of the shifted container. - // Whenever one of them would be empty, nil will be returned instead to - // avoid unnecessary allocations. - addOffset(uint16) (container, container) - - clone() container - and(container) container - andCardinality(container) int - iand(container) container // i stands for inplace - andNot(container) container - iandNot(container) container // i stands for inplace - isEmpty() bool - getCardinality() int - // rank returns the number of integers that are - // smaller or equal to x. rank(infinity) would be getCardinality(). - rank(uint16) int - - iadd(x uint16) bool // inplace, returns true if x was new. - iaddReturnMinimized(uint16) container // may change return type to minimize storage. 
- - //addRange(start, final int) container // range is [firstOfRange,lastOfRange) (unused) - iaddRange(start, endx int) container // i stands for inplace, range is [firstOfRange,endx) - - iremove(x uint16) bool // inplace, returns true if x was present. - iremoveReturnMinimized(uint16) container // may change return type to minimize storage. - - not(start, final int) container // range is [firstOfRange,lastOfRange) - inot(firstOfRange, endx int) container // i stands for inplace, range is [firstOfRange,endx) - xor(r container) container - getShortIterator() shortPeekable - iterate(cb func(x uint16) bool) bool - getReverseIterator() shortIterable - getManyIterator() manyIterable - contains(i uint16) bool - maximum() uint16 - minimum() uint16 - - // equals is now logical equals; it does not require the - // same underlying container types, but compares across - // any of the implementations. - equals(r container) bool - - fillLeastSignificant16bits(array []uint32, i int, mask uint32) int - or(r container) container - orCardinality(r container) int - isFull() bool - ior(r container) container // i stands for inplace - intersects(r container) bool // whether the two containers intersect - lazyOR(r container) container - lazyIOR(r container) container - getSizeInBytes() int - //removeRange(start, final int) container // range is [firstOfRange,lastOfRange) (unused) - iremoveRange(start, final int) container // i stands for inplace, range is [firstOfRange,lastOfRange) - selectInt(x uint16) int // selectInt returns the xth integer in the container - serializedSizeInBytes() int - writeTo(io.Writer) (int, error) - - numberOfRuns() int - toEfficientContainer() container - String() string - containerType() contype -} - -type contype uint8 - -const ( - bitmapContype contype = iota - arrayContype - run16Contype - run32Contype -) - -// careful: range is [firstOfRange,lastOfRange] -func rangeOfOnes(start, last int) container { - if start > MaxUint16 { - panic("rangeOfOnes called 
with start > MaxUint16") - } - if last > MaxUint16 { - panic("rangeOfOnes called with last > MaxUint16") - } - if start < 0 { - panic("rangeOfOnes called with start < 0") - } - if last < 0 { - panic("rangeOfOnes called with last < 0") - } - return newRunContainer16Range(uint16(start), uint16(last)) -} - -type roaringArray struct { - keys []uint16 - containers []container `msg:"-"` // don't try to serialize directly. - needCopyOnWrite []bool - copyOnWrite bool -} - -func newRoaringArray() *roaringArray { - return &roaringArray{} -} - -// runOptimize compresses the element containers to minimize space consumed. -// Q: how does this interact with copyOnWrite and needCopyOnWrite? -// A: since we aren't changing the logical content, just the representation, -// -// we don't bother to check the needCopyOnWrite bits. We replace -// (possibly all) elements of ra.containers in-place with space -// optimized versions. -func (ra *roaringArray) runOptimize() { - for i := range ra.containers { - ra.containers[i] = ra.containers[i].toEfficientContainer() - } -} - -func (ra *roaringArray) appendContainer(key uint16, value container, mustCopyOnWrite bool) { - ra.keys = append(ra.keys, key) - ra.containers = append(ra.containers, value) - ra.needCopyOnWrite = append(ra.needCopyOnWrite, mustCopyOnWrite) -} - -func (ra *roaringArray) appendWithoutCopy(sa roaringArray, startingindex int) { - mustCopyOnWrite := sa.needCopyOnWrite[startingindex] - ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex], mustCopyOnWrite) -} - -func (ra *roaringArray) appendCopy(sa roaringArray, startingindex int) { - // cow only if the two request it, or if we already have a lightweight copy - copyonwrite := (ra.copyOnWrite && sa.copyOnWrite) || sa.needsCopyOnWrite(startingindex) - if !copyonwrite { - // since there is no copy-on-write, we need to clone the container (this is important) - ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex].clone(), copyonwrite) - } 
else { - ra.appendContainer(sa.keys[startingindex], sa.containers[startingindex], copyonwrite) - if !sa.needsCopyOnWrite(startingindex) { - sa.setNeedsCopyOnWrite(startingindex) - } - } -} - -func (ra *roaringArray) appendWithoutCopyMany(sa roaringArray, startingindex, end int) { - for i := startingindex; i < end; i++ { - ra.appendWithoutCopy(sa, i) - } -} - -func (ra *roaringArray) appendCopyMany(sa roaringArray, startingindex, end int) { - for i := startingindex; i < end; i++ { - ra.appendCopy(sa, i) - } -} - -func (ra *roaringArray) appendCopiesUntil(sa roaringArray, stoppingKey uint16) { - // cow only if the two request it, or if we already have a lightweight copy - copyonwrite := ra.copyOnWrite && sa.copyOnWrite - - for i := 0; i < sa.size(); i++ { - if sa.keys[i] >= stoppingKey { - break - } - thiscopyonewrite := copyonwrite || sa.needsCopyOnWrite(i) - if thiscopyonewrite { - ra.appendContainer(sa.keys[i], sa.containers[i], thiscopyonewrite) - if !sa.needsCopyOnWrite(i) { - sa.setNeedsCopyOnWrite(i) - } - - } else { - // since there is no copy-on-write, we need to clone the container (this is important) - ra.appendContainer(sa.keys[i], sa.containers[i].clone(), thiscopyonewrite) - - } - } -} - -func (ra *roaringArray) appendCopiesAfter(sa roaringArray, beforeStart uint16) { - // cow only if the two request it, or if we already have a lightweight copy - copyonwrite := ra.copyOnWrite && sa.copyOnWrite - - startLocation := sa.getIndex(beforeStart) - if startLocation >= 0 { - startLocation++ - } else { - startLocation = -startLocation - 1 - } - - for i := startLocation; i < sa.size(); i++ { - thiscopyonewrite := copyonwrite || sa.needsCopyOnWrite(i) - if thiscopyonewrite { - ra.appendContainer(sa.keys[i], sa.containers[i], thiscopyonewrite) - if !sa.needsCopyOnWrite(i) { - sa.setNeedsCopyOnWrite(i) - } - } else { - // since there is no copy-on-write, we need to clone the container (this is important) - ra.appendContainer(sa.keys[i], sa.containers[i].clone(), 
thiscopyonewrite) - - } - } -} - -func (ra *roaringArray) removeIndexRange(begin, end int) { - if end <= begin { - return - } - - r := end - begin - - copy(ra.keys[begin:], ra.keys[end:]) - copy(ra.containers[begin:], ra.containers[end:]) - copy(ra.needCopyOnWrite[begin:], ra.needCopyOnWrite[end:]) - - ra.resize(len(ra.keys) - r) -} - -func (ra *roaringArray) resize(newsize int) { - for k := newsize; k < len(ra.containers); k++ { - ra.containers[k] = nil - } - - ra.keys = ra.keys[:newsize] - ra.containers = ra.containers[:newsize] - ra.needCopyOnWrite = ra.needCopyOnWrite[:newsize] -} - -func (ra *roaringArray) clear() { - ra.resize(0) - ra.copyOnWrite = false -} - -func (ra *roaringArray) clone() *roaringArray { - - sa := roaringArray{} - sa.copyOnWrite = ra.copyOnWrite - - // this is where copyOnWrite is used. - if ra.copyOnWrite { - sa.keys = make([]uint16, len(ra.keys)) - copy(sa.keys, ra.keys) - sa.containers = make([]container, len(ra.containers)) - copy(sa.containers, ra.containers) - sa.needCopyOnWrite = make([]bool, len(ra.needCopyOnWrite)) - - ra.markAllAsNeedingCopyOnWrite() - sa.markAllAsNeedingCopyOnWrite() - - // sa.needCopyOnWrite is shared - } else { - // make a full copy - - sa.keys = make([]uint16, len(ra.keys)) - copy(sa.keys, ra.keys) - - sa.containers = make([]container, len(ra.containers)) - for i := range sa.containers { - sa.containers[i] = ra.containers[i].clone() - } - - sa.needCopyOnWrite = make([]bool, len(ra.needCopyOnWrite)) - } - return &sa -} - -// clone all containers which have needCopyOnWrite set to true -// This can be used to make sure it is safe to munmap a []byte -// that the roaring array may still have a reference to. 
-func (ra *roaringArray) cloneCopyOnWriteContainers() { - for i, needCopyOnWrite := range ra.needCopyOnWrite { - if needCopyOnWrite { - ra.containers[i] = ra.containers[i].clone() - ra.needCopyOnWrite[i] = false - } - } -} - -// unused function: -//func (ra *roaringArray) containsKey(x uint16) bool { -// return (ra.binarySearch(0, int64(len(ra.keys)), x) >= 0) -//} - -func (ra *roaringArray) getContainer(x uint16) container { - i := ra.binarySearch(0, int64(len(ra.keys)), x) - if i < 0 { - return nil - } - return ra.containers[i] -} - -func (ra *roaringArray) getContainerAtIndex(i int) container { - return ra.containers[i] -} - -func (ra *roaringArray) getFastContainerAtIndex(i int, needsWriteable bool) container { - c := ra.getContainerAtIndex(i) - switch t := c.(type) { - case *arrayContainer: - c = t.toBitmapContainer() - case *runContainer16: - if !t.isFull() { - c = t.toBitmapContainer() - } - case *bitmapContainer: - if needsWriteable && ra.needCopyOnWrite[i] { - c = ra.containers[i].clone() - } - } - return c -} - -// getUnionedWritableContainer switches behavior for in-place Or -// depending on whether the container requires a copy on write. -// If it does using the non-inplace or() method leads to fewer allocations. 
-func (ra *roaringArray) getUnionedWritableContainer(pos int, other container) container { - if ra.needCopyOnWrite[pos] { - return ra.getContainerAtIndex(pos).or(other) - } - return ra.getContainerAtIndex(pos).ior(other) - -} - -func (ra *roaringArray) getWritableContainerAtIndex(i int) container { - if ra.needCopyOnWrite[i] { - ra.containers[i] = ra.containers[i].clone() - ra.needCopyOnWrite[i] = false - } - return ra.containers[i] -} - -func (ra *roaringArray) getIndex(x uint16) int { - // before the binary search, we optimize for frequent cases - size := len(ra.keys) - if (size == 0) || (ra.keys[size-1] == x) { - return size - 1 - } - return ra.binarySearch(0, int64(size), x) -} - -func (ra *roaringArray) getKeyAtIndex(i int) uint16 { - return ra.keys[i] -} - -func (ra *roaringArray) insertNewKeyValueAt(i int, key uint16, value container) { - ra.keys = append(ra.keys, 0) - ra.containers = append(ra.containers, nil) - - copy(ra.keys[i+1:], ra.keys[i:]) - copy(ra.containers[i+1:], ra.containers[i:]) - - ra.keys[i] = key - ra.containers[i] = value - - ra.needCopyOnWrite = append(ra.needCopyOnWrite, false) - copy(ra.needCopyOnWrite[i+1:], ra.needCopyOnWrite[i:]) - ra.needCopyOnWrite[i] = false -} - -func (ra *roaringArray) remove(key uint16) bool { - i := ra.binarySearch(0, int64(len(ra.keys)), key) - if i >= 0 { // if a new key - ra.removeAtIndex(i) - return true - } - return false -} - -func (ra *roaringArray) removeAtIndex(i int) { - copy(ra.keys[i:], ra.keys[i+1:]) - copy(ra.containers[i:], ra.containers[i+1:]) - - copy(ra.needCopyOnWrite[i:], ra.needCopyOnWrite[i+1:]) - - ra.resize(len(ra.keys) - 1) -} - -func (ra *roaringArray) setContainerAtIndex(i int, c container) { - ra.containers[i] = c -} - -func (ra *roaringArray) replaceKeyAndContainerAtIndex(i int, key uint16, c container, mustCopyOnWrite bool) { - ra.keys[i] = key - ra.containers[i] = c - ra.needCopyOnWrite[i] = mustCopyOnWrite -} - -func (ra *roaringArray) size() int { - return len(ra.keys) -} - 
-func (ra *roaringArray) binarySearch(begin, end int64, ikey uint16) int { - low := begin - high := end - 1 - for low+16 <= high { - middleIndex := low + (high-low)/2 // avoid overflow - middleValue := ra.keys[middleIndex] - - if middleValue < ikey { - low = middleIndex + 1 - } else if middleValue > ikey { - high = middleIndex - 1 - } else { - return int(middleIndex) - } - } - for ; low <= high; low++ { - val := ra.keys[low] - if val >= ikey { - if val == ikey { - return int(low) - } - break - } - } - return -int(low + 1) -} - -func (ra *roaringArray) equals(o interface{}) bool { - srb, ok := o.(roaringArray) - if ok { - - if srb.size() != ra.size() { - return false - } - for i, k := range ra.keys { - if k != srb.keys[i] { - return false - } - } - - for i, c := range ra.containers { - if !c.equals(srb.containers[i]) { - return false - } - } - return true - } - return false -} - -func (ra *roaringArray) headerSize() uint64 { - size := uint64(len(ra.keys)) - if ra.hasRunCompression() { - if size < noOffsetThreshold { // for small bitmaps, we omit the offsets - return 4 + (size+7)/8 + 4*size - } - return 4 + (size+7)/8 + 8*size // - 4 because we pack the size with the cookie - } - return 4 + 4 + 8*size - -} - -// should be dirt cheap -func (ra *roaringArray) serializedSizeInBytes() uint64 { - answer := ra.headerSize() - for _, c := range ra.containers { - answer += uint64(c.serializedSizeInBytes()) - } - return answer -} - -// spec: https://github.com/RoaringBitmap/RoaringFormatSpec -func (ra *roaringArray) writeTo(w io.Writer) (n int64, err error) { - hasRun := ra.hasRunCompression() - isRunSizeInBytes := 0 - cookieSize := 8 - if hasRun { - cookieSize = 4 - isRunSizeInBytes = (len(ra.keys) + 7) / 8 - } - descriptiveHeaderSize := 4 * len(ra.keys) - preambleSize := cookieSize + isRunSizeInBytes + descriptiveHeaderSize - - buf := make([]byte, preambleSize+4*len(ra.keys)) - - nw := 0 - - if hasRun { - binary.LittleEndian.PutUint16(buf[0:], uint16(serialCookie)) - nw += 2 
- binary.LittleEndian.PutUint16(buf[2:], uint16(len(ra.keys)-1)) - nw += 2 - // compute isRun bitmap without temporary allocation - var runbitmapslice = buf[nw : nw+isRunSizeInBytes] - for i, c := range ra.containers { - switch c.(type) { - case *runContainer16: - runbitmapslice[i/8] |= 1 << (uint(i) % 8) - } - } - nw += isRunSizeInBytes - } else { - binary.LittleEndian.PutUint32(buf[0:], uint32(serialCookieNoRunContainer)) - nw += 4 - binary.LittleEndian.PutUint32(buf[4:], uint32(len(ra.keys))) - nw += 4 - } - - // descriptive header - for i, key := range ra.keys { - binary.LittleEndian.PutUint16(buf[nw:], key) - nw += 2 - c := ra.containers[i] - binary.LittleEndian.PutUint16(buf[nw:], uint16(c.getCardinality()-1)) - nw += 2 - } - - startOffset := int64(preambleSize + 4*len(ra.keys)) - if !hasRun || (len(ra.keys) >= noOffsetThreshold) { - // offset header - for _, c := range ra.containers { - binary.LittleEndian.PutUint32(buf[nw:], uint32(startOffset)) - nw += 4 - switch rc := c.(type) { - case *runContainer16: - startOffset += 2 + int64(len(rc.iv))*4 - default: - startOffset += int64(getSizeInBytesFromCardinality(c.getCardinality())) - } - } - } - - written, err := w.Write(buf[:nw]) - if err != nil { - return n, err - } - n += int64(written) - - for _, c := range ra.containers { - written, err := c.writeTo(w) - if err != nil { - return n, err - } - n += int64(written) - } - return n, nil -} - -// spec: https://github.com/RoaringBitmap/RoaringFormatSpec -func (ra *roaringArray) toBytes() ([]byte, error) { - var buf bytes.Buffer - _, err := ra.writeTo(&buf) - return buf.Bytes(), err -} - -// Reads a serialized roaringArray from a byte slice. 
-func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte) (int64, error) { - var cookie uint32 - var err error - if len(cookieHeader) > 0 && len(cookieHeader) != 4 { - return int64(len(cookieHeader)), fmt.Errorf("error in roaringArray.readFrom: could not read initial cookie: incorrect size of cookie header") - } - if len(cookieHeader) == 4 { - cookie = binary.LittleEndian.Uint32(cookieHeader) - } else { - cookie, err = stream.ReadUInt32() - if err != nil { - return stream.GetReadBytes(), fmt.Errorf("error in roaringArray.readFrom: could not read initial cookie: %s", err) - } - } - // If NextReturnsSafeSlice is false, then willNeedCopyOnWrite should be true - willNeedCopyOnWrite := !stream.NextReturnsSafeSlice() - - var size uint32 - var isRunBitmap []byte - - if cookie&0x0000FFFF == serialCookie { - size = uint32(cookie>>16 + 1) - // create is-run-container bitmap - isRunBitmapSize := (int(size) + 7) / 8 - isRunBitmap, err = stream.Next(isRunBitmapSize) - - if err != nil { - return stream.GetReadBytes(), fmt.Errorf("malformed bitmap, failed to read is-run bitmap, got: %s", err) - } - } else if cookie == serialCookieNoRunContainer { - size, err = stream.ReadUInt32() - if err != nil { - return stream.GetReadBytes(), fmt.Errorf("malformed bitmap, failed to read a bitmap size: %s", err) - } - } else { - return stream.GetReadBytes(), fmt.Errorf("error in roaringArray.readFrom: did not find expected serialCookie in header") - } - - if size > (1 << 16) { - return stream.GetReadBytes(), fmt.Errorf("it is logically impossible to have more than (1<<16) containers") - } - - // descriptive header - buf, err := stream.Next(2 * 2 * int(size)) - - if err != nil { - return stream.GetReadBytes(), fmt.Errorf("failed to read descriptive header: %s", err) - } - - keycard := byteSliceAsUint16Slice(buf) - - if isRunBitmap == nil || size >= noOffsetThreshold { - if err := stream.SkipBytes(int(size) * 4); err != nil { - return stream.GetReadBytes(), 
fmt.Errorf("failed to skip bytes: %s", err) - } - } - - // Allocate slices upfront as number of containers is known - if cap(ra.containers) >= int(size) { - ra.containers = ra.containers[:size] - } else { - ra.containers = make([]container, size) - } - - if cap(ra.keys) >= int(size) { - ra.keys = ra.keys[:size] - } else { - ra.keys = make([]uint16, size) - } - - if cap(ra.needCopyOnWrite) >= int(size) { - ra.needCopyOnWrite = ra.needCopyOnWrite[:size] - } else { - ra.needCopyOnWrite = make([]bool, size) - } - - for i := uint32(0); i < size; i++ { - key := keycard[2*i] - card := int(keycard[2*i+1]) + 1 - ra.keys[i] = key - ra.needCopyOnWrite[i] = willNeedCopyOnWrite - - if isRunBitmap != nil && isRunBitmap[i/8]&(1<<(i%8)) != 0 { - // run container - nr, err := stream.ReadUInt16() - - if err != nil { - return 0, fmt.Errorf("failed to read runtime container size: %s", err) - } - - buf, err := stream.Next(int(nr) * 4) - - if err != nil { - return stream.GetReadBytes(), fmt.Errorf("failed to read runtime container content: %s", err) - } - - nb := runContainer16{ - iv: byteSliceAsInterval16Slice(buf), - } - - ra.containers[i] = &nb - } else if card > arrayDefaultMaxSize { - // bitmap container - buf, err := stream.Next(arrayDefaultMaxSize * 2) - - if err != nil { - return stream.GetReadBytes(), fmt.Errorf("failed to read bitmap container: %s", err) - } - - nb := bitmapContainer{ - cardinality: card, - bitmap: byteSliceAsUint64Slice(buf), - } - - ra.containers[i] = &nb - } else { - // array container - buf, err := stream.Next(card * 2) - - if err != nil { - return stream.GetReadBytes(), fmt.Errorf("failed to read array container: %s", err) - } - - nb := arrayContainer{ - byteSliceAsUint16Slice(buf), - } - - ra.containers[i] = &nb - } - } - - return stream.GetReadBytes(), nil -} - -func (ra *roaringArray) hasRunCompression() bool { - for _, c := range ra.containers { - switch c.(type) { - case *runContainer16: - return true - } - } - return false -} - -func (ra 
*roaringArray) advanceUntil(min uint16, pos int) int { - lower := pos + 1 - - if lower >= len(ra.keys) || ra.keys[lower] >= min { - return lower - } - - spansize := 1 - - for lower+spansize < len(ra.keys) && ra.keys[lower+spansize] < min { - spansize *= 2 - } - var upper int - if lower+spansize < len(ra.keys) { - upper = lower + spansize - } else { - upper = len(ra.keys) - 1 - } - - if ra.keys[upper] == min { - return upper - } - - if ra.keys[upper] < min { - // means - // array - // has no - // item - // >= min - // pos = array.length; - return len(ra.keys) - } - - // we know that the next-smallest span was too small - lower += (spansize >> 1) - - mid := 0 - for lower+1 != upper { - mid = (lower + upper) >> 1 - if ra.keys[mid] == min { - return mid - } else if ra.keys[mid] < min { - lower = mid - } else { - upper = mid - } - } - return upper -} - -func (ra *roaringArray) markAllAsNeedingCopyOnWrite() { - for i := range ra.needCopyOnWrite { - ra.needCopyOnWrite[i] = true - } -} - -func (ra *roaringArray) needsCopyOnWrite(i int) bool { - return ra.needCopyOnWrite[i] -} - -func (ra *roaringArray) setNeedsCopyOnWrite(i int) { - ra.needCopyOnWrite[i] = true -} diff --git a/vendor/github.com/RoaringBitmap/roaring/runcontainer.go b/vendor/github.com/RoaringBitmap/roaring/runcontainer.go deleted file mode 100644 index 7098ba2..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/runcontainer.go +++ /dev/null @@ -1,2624 +0,0 @@ -package roaring - -// -// Copyright (c) 2016 by the roaring authors. -// Licensed under the Apache License, Version 2.0. -// -// We derive a few lines of code from the sort.Search -// function in the golang standard library. That function -// is Copyright 2009 The Go Authors, and licensed -// under the following BSD-style license. -/* -Copyright (c) 2009 The Go Authors. All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -import ( - "fmt" - "sort" - "unsafe" -) - -// runContainer16 does run-length encoding of sets of -// uint16 integers. -type runContainer16 struct { - // iv is a slice of sorted, non-overlapping, non-adjacent intervals. - iv []interval16 -} - -// interval16 is the internal to runContainer16 -// structure that maintains the individual [start, last] -// closed intervals. 
-type interval16 struct { - start uint16 - length uint16 // length minus 1 -} - -func newInterval16Range(start, last uint16) interval16 { - if last < start { - panic(fmt.Sprintf("last (%d) cannot be smaller than start (%d)", last, start)) - } - - return interval16{ - start, - last - start, - } -} - -// runlen returns the count of integers in the interval. -func (iv interval16) runlen() int { - return int(iv.length) + 1 -} - -func (iv interval16) last() uint16 { - return iv.start + iv.length -} - -// String produces a human viewable string of the contents. -func (iv interval16) String() string { - return fmt.Sprintf("[%d, %d]", iv.start, iv.length) -} - -func ivalString16(iv []interval16) string { - var s string - var j int - var p interval16 - for j, p = range iv { - s += fmt.Sprintf("%v:[%d, %d], ", j, p.start, p.last()) - } - return s -} - -// String produces a human viewable string of the contents. -func (rc *runContainer16) String() string { - if len(rc.iv) == 0 { - return "runContainer16{}" - } - is := ivalString16(rc.iv) - return `runContainer16{` + is + `}` -} - -// uint16Slice is a sort.Sort convenience method -type uint16Slice []uint16 - -// Len returns the length of p. -func (p uint16Slice) Len() int { return len(p) } - -// Less returns p[i] < p[j] -func (p uint16Slice) Less(i, j int) bool { return p[i] < p[j] } - -// Swap swaps elements i and j. -func (p uint16Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } - -// addHelper helps build a runContainer16. 
-type addHelper16 struct { - runstart uint16 - runlen uint16 - actuallyAdded uint16 - m []interval16 - rc *runContainer16 -} - -func (ah *addHelper16) storeIval(runstart, runlen uint16) { - mi := interval16{start: runstart, length: runlen} - ah.m = append(ah.m, mi) -} - -func (ah *addHelper16) add(cur, prev uint16, i int) { - if cur == prev+1 { - ah.runlen++ - ah.actuallyAdded++ - } else { - if cur < prev { - panic(fmt.Sprintf("newRunContainer16FromVals sees "+ - "unsorted vals; vals[%v]=cur=%v < prev=%v. Sort your vals"+ - " before calling us with alreadySorted == true.", i, cur, prev)) - } - if cur == prev { - // ignore duplicates - } else { - ah.actuallyAdded++ - ah.storeIval(ah.runstart, ah.runlen) - ah.runstart = cur - ah.runlen = 0 - } - } -} - -// newRunContainerRange makes a new container made of just the specified closed interval [rangestart,rangelast] -func newRunContainer16Range(rangestart uint16, rangelast uint16) *runContainer16 { - rc := &runContainer16{} - rc.iv = append(rc.iv, newInterval16Range(rangestart, rangelast)) - return rc -} - -// newRunContainer16FromVals makes a new container from vals. -// -// For efficiency, vals should be sorted in ascending order. -// Ideally vals should not contain duplicates, but we detect and -// ignore them. If vals is already sorted in ascending order, then -// pass alreadySorted = true. Otherwise, for !alreadySorted, -// we will sort vals before creating a runContainer16 of them. -// We sort the original vals, so this will change what the -// caller sees in vals as a side effect. 
-func newRunContainer16FromVals(alreadySorted bool, vals ...uint16) *runContainer16 { - // keep this in sync with newRunContainer16FromArray below - - rc := &runContainer16{} - ah := addHelper16{rc: rc} - - if !alreadySorted { - sort.Sort(uint16Slice(vals)) - } - n := len(vals) - var cur, prev uint16 - switch { - case n == 0: - // nothing more - case n == 1: - ah.m = append(ah.m, newInterval16Range(vals[0], vals[0])) - ah.actuallyAdded++ - default: - ah.runstart = vals[0] - ah.actuallyAdded++ - for i := 1; i < n; i++ { - prev = vals[i-1] - cur = vals[i] - ah.add(cur, prev, i) - } - ah.storeIval(ah.runstart, ah.runlen) - } - rc.iv = ah.m - return rc -} - -// newRunContainer16FromBitmapContainer makes a new run container from bc, -// somewhat efficiently. For reference, see the Java -// https://github.com/RoaringBitmap/RoaringBitmap/blob/master/src/main/java/org/roaringbitmap/RunContainer.java#L145-L192 -func newRunContainer16FromBitmapContainer(bc *bitmapContainer) *runContainer16 { - - rc := &runContainer16{} - nbrRuns := bc.numberOfRuns() - if nbrRuns == 0 { - return rc - } - rc.iv = make([]interval16, nbrRuns) - - longCtr := 0 // index of current long in bitmap - curWord := bc.bitmap[0] // its value - runCount := 0 - for { - // potentially multiword advance to first 1 bit - for curWord == 0 && longCtr < len(bc.bitmap)-1 { - longCtr++ - curWord = bc.bitmap[longCtr] - } - - if curWord == 0 { - // wrap up, no more runs - return rc - } - localRunStart := countTrailingZeros(curWord) - runStart := localRunStart + 64*longCtr - // stuff 1s into number's LSBs - curWordWith1s := curWord | (curWord - 1) - - // find the next 0, potentially in a later word - runEnd := 0 - for curWordWith1s == maxWord && longCtr < len(bc.bitmap)-1 { - longCtr++ - curWordWith1s = bc.bitmap[longCtr] - } - - if curWordWith1s == maxWord { - // a final unterminated run of 1s - runEnd = wordSizeInBits + longCtr*64 - rc.iv[runCount].start = uint16(runStart) - rc.iv[runCount].length = uint16(runEnd) - 
uint16(runStart) - 1 - return rc - } - localRunEnd := countTrailingZeros(^curWordWith1s) - runEnd = localRunEnd + longCtr*64 - rc.iv[runCount].start = uint16(runStart) - rc.iv[runCount].length = uint16(runEnd) - 1 - uint16(runStart) - runCount++ - // now, zero out everything right of runEnd. - curWord = curWordWith1s & (curWordWith1s + 1) - // We've lathered and rinsed, so repeat... - } - -} - -// newRunContainer16FromArray populates a new -// runContainer16 from the contents of arr. -func newRunContainer16FromArray(arr *arrayContainer) *runContainer16 { - // keep this in sync with newRunContainer16FromVals above - - rc := &runContainer16{} - ah := addHelper16{rc: rc} - - n := arr.getCardinality() - var cur, prev uint16 - switch { - case n == 0: - // nothing more - case n == 1: - ah.m = append(ah.m, newInterval16Range(arr.content[0], arr.content[0])) - ah.actuallyAdded++ - default: - ah.runstart = arr.content[0] - ah.actuallyAdded++ - for i := 1; i < n; i++ { - prev = arr.content[i-1] - cur = arr.content[i] - ah.add(cur, prev, i) - } - ah.storeIval(ah.runstart, ah.runlen) - } - rc.iv = ah.m - return rc -} - -// set adds the integers in vals to the set. Vals -// must be sorted in increasing order; if not, you should set -// alreadySorted to false, and we will sort them in place for you. -// (Be aware of this side effect -- it will affect the callers -// view of vals). -// -// If you have a small number of additions to an already -// big runContainer16, calling Add() may be faster. -func (rc *runContainer16) set(alreadySorted bool, vals ...uint16) { - - rc2 := newRunContainer16FromVals(alreadySorted, vals...) - un := rc.union(rc2) - rc.iv = un.iv -} - -// canMerge returns true iff the intervals -// a and b either overlap or they are -// contiguous and so can be merged into -// a single interval. 
-func canMerge16(a, b interval16) bool { - if int(a.last())+1 < int(b.start) { - return false - } - return int(b.last())+1 >= int(a.start) -} - -// haveOverlap differs from canMerge in that -// it tells you if the intersection of a -// and b would contain an element (otherwise -// it would be the empty set, and we return -// false). -func haveOverlap16(a, b interval16) bool { - if int(a.last())+1 <= int(b.start) { - return false - } - return int(b.last())+1 > int(a.start) -} - -// mergeInterval16s joins a and b into a -// new interval, and panics if it cannot. -func mergeInterval16s(a, b interval16) (res interval16) { - if !canMerge16(a, b) { - panic(fmt.Sprintf("cannot merge %#v and %#v", a, b)) - } - - if b.start < a.start { - res.start = b.start - } else { - res.start = a.start - } - - if b.last() > a.last() { - res.length = b.last() - res.start - } else { - res.length = a.last() - res.start - } - - return -} - -// intersectInterval16s returns the intersection -// of a and b. The isEmpty flag will be true if -// a and b were disjoint. -func intersectInterval16s(a, b interval16) (res interval16, isEmpty bool) { - if !haveOverlap16(a, b) { - isEmpty = true - return - } - if b.start > a.start { - res.start = b.start - } else { - res.start = a.start - } - - bEnd := b.last() - aEnd := a.last() - var resEnd uint16 - - if bEnd < aEnd { - resEnd = bEnd - } else { - resEnd = aEnd - } - res.length = resEnd - res.start - return -} - -// union merges two runContainer16s, producing -// a new runContainer16 with the union of rc and b. -func (rc *runContainer16) union(b *runContainer16) *runContainer16 { - - // rc is also known as 'a' here, but golint insisted we - // call it rc for consistency with the rest of the methods. - - var m []interval16 - - alim := int(len(rc.iv)) - blim := int(len(b.iv)) - - var na int // next from a - var nb int // next from b - - // merged holds the current merge output, which might - // get additional merges before being appended to m. 
- var merged interval16 - var mergedUsed bool // is merged being used at the moment? - - var cura interval16 // currently considering this interval16 from a - var curb interval16 // currently considering this interval16 from b - - pass := 0 - for na < alim && nb < blim { - pass++ - cura = rc.iv[na] - curb = b.iv[nb] - - if mergedUsed { - mergedUpdated := false - if canMerge16(cura, merged) { - merged = mergeInterval16s(cura, merged) - na = rc.indexOfIntervalAtOrAfter(int(merged.last())+1, na+1) - mergedUpdated = true - } - if canMerge16(curb, merged) { - merged = mergeInterval16s(curb, merged) - nb = b.indexOfIntervalAtOrAfter(int(merged.last())+1, nb+1) - mergedUpdated = true - } - if !mergedUpdated { - // we know that merged is disjoint from cura and curb - m = append(m, merged) - mergedUsed = false - } - continue - - } else { - // !mergedUsed - if !canMerge16(cura, curb) { - if cura.start < curb.start { - m = append(m, cura) - na++ - } else { - m = append(m, curb) - nb++ - } - } else { - merged = mergeInterval16s(cura, curb) - mergedUsed = true - na = rc.indexOfIntervalAtOrAfter(int(merged.last())+1, na+1) - nb = b.indexOfIntervalAtOrAfter(int(merged.last())+1, nb+1) - } - } - } - var aDone, bDone bool - if na >= alim { - aDone = true - } - if nb >= blim { - bDone = true - } - // finish by merging anything remaining into merged we can: - if mergedUsed { - if !aDone { - aAdds: - for na < alim { - cura = rc.iv[na] - if canMerge16(cura, merged) { - merged = mergeInterval16s(cura, merged) - na = rc.indexOfIntervalAtOrAfter(int(merged.last())+1, na+1) - } else { - break aAdds - } - } - - } - - if !bDone { - bAdds: - for nb < blim { - curb = b.iv[nb] - if canMerge16(curb, merged) { - merged = mergeInterval16s(curb, merged) - nb = b.indexOfIntervalAtOrAfter(int(merged.last())+1, nb+1) - } else { - break bAdds - } - } - - } - - m = append(m, merged) - } - if na < alim { - m = append(m, rc.iv[na:]...) - } - if nb < blim { - m = append(m, b.iv[nb:]...) 
- } - - res := &runContainer16{iv: m} - return res -} - -// unionCardinality returns the cardinality of the merger of two runContainer16s, the union of rc and b. -func (rc *runContainer16) unionCardinality(b *runContainer16) uint { - - // rc is also known as 'a' here, but golint insisted we - // call it rc for consistency with the rest of the methods. - answer := uint(0) - - alim := int(len(rc.iv)) - blim := int(len(b.iv)) - - var na int // next from a - var nb int // next from b - - // merged holds the current merge output, which might - // get additional merges before being appended to m. - var merged interval16 - var mergedUsed bool // is merged being used at the moment? - - var cura interval16 // currently considering this interval16 from a - var curb interval16 // currently considering this interval16 from b - - pass := 0 - for na < alim && nb < blim { - pass++ - cura = rc.iv[na] - curb = b.iv[nb] - - if mergedUsed { - mergedUpdated := false - if canMerge16(cura, merged) { - merged = mergeInterval16s(cura, merged) - na = rc.indexOfIntervalAtOrAfter(int(merged.last())+1, na+1) - mergedUpdated = true - } - if canMerge16(curb, merged) { - merged = mergeInterval16s(curb, merged) - nb = b.indexOfIntervalAtOrAfter(int(merged.last())+1, nb+1) - mergedUpdated = true - } - if !mergedUpdated { - // we know that merged is disjoint from cura and curb - //m = append(m, merged) - answer += uint(merged.last()) - uint(merged.start) + 1 - mergedUsed = false - } - continue - - } else { - // !mergedUsed - if !canMerge16(cura, curb) { - if cura.start < curb.start { - answer += uint(cura.last()) - uint(cura.start) + 1 - //m = append(m, cura) - na++ - } else { - answer += uint(curb.last()) - uint(curb.start) + 1 - //m = append(m, curb) - nb++ - } - } else { - merged = mergeInterval16s(cura, curb) - mergedUsed = true - na = rc.indexOfIntervalAtOrAfter(int(merged.last())+1, na+1) - nb = b.indexOfIntervalAtOrAfter(int(merged.last())+1, nb+1) - } - } - } - var aDone, bDone bool - if na 
>= alim { - aDone = true - } - if nb >= blim { - bDone = true - } - // finish by merging anything remaining into merged we can: - if mergedUsed { - if !aDone { - aAdds: - for na < alim { - cura = rc.iv[na] - if canMerge16(cura, merged) { - merged = mergeInterval16s(cura, merged) - na = rc.indexOfIntervalAtOrAfter(int(merged.last())+1, na+1) - } else { - break aAdds - } - } - - } - - if !bDone { - bAdds: - for nb < blim { - curb = b.iv[nb] - if canMerge16(curb, merged) { - merged = mergeInterval16s(curb, merged) - nb = b.indexOfIntervalAtOrAfter(int(merged.last())+1, nb+1) - } else { - break bAdds - } - } - - } - - //m = append(m, merged) - answer += uint(merged.last()) - uint(merged.start) + 1 - } - for _, r := range rc.iv[na:] { - answer += uint(r.last()) - uint(r.start) + 1 - } - for _, r := range b.iv[nb:] { - answer += uint(r.last()) - uint(r.start) + 1 - } - return answer -} - -// indexOfIntervalAtOrAfter is a helper for union. -func (rc *runContainer16) indexOfIntervalAtOrAfter(key int, startIndex int) int { - w, already, _ := rc.searchRange(key, startIndex, 0) - if already { - return w - } - return w + 1 -} - -// intersect returns a new runContainer16 holding the -// intersection of rc (also known as 'a') and b. 
-func (rc *runContainer16) intersect(b *runContainer16) *runContainer16 { - - a := rc - numa := int(len(a.iv)) - numb := int(len(b.iv)) - res := &runContainer16{} - if numa == 0 || numb == 0 { - return res - } - - if numa == 1 && numb == 1 { - if !haveOverlap16(a.iv[0], b.iv[0]) { - return res - } - } - - var output []interval16 - - var acuri int - var bcuri int - - astart := int(a.iv[acuri].start) - bstart := int(b.iv[bcuri].start) - - var intersection interval16 - var leftoverstart int - var isOverlap, isLeftoverA, isLeftoverB bool - var done bool -toploop: - for acuri < numa && bcuri < numb { - - isOverlap, isLeftoverA, isLeftoverB, leftoverstart, intersection = - intersectWithLeftover16(astart, int(a.iv[acuri].last()), bstart, int(b.iv[bcuri].last())) - - if !isOverlap { - switch { - case astart < bstart: - acuri, done = a.findNextIntervalThatIntersectsStartingFrom(acuri+1, bstart) - if done { - break toploop - } - astart = int(a.iv[acuri].start) - - case astart > bstart: - bcuri, done = b.findNextIntervalThatIntersectsStartingFrom(bcuri+1, astart) - if done { - break toploop - } - bstart = int(b.iv[bcuri].start) - } - - } else { - // isOverlap - output = append(output, intersection) - switch { - case isLeftoverA: - // note that we change astart without advancing acuri, - // since we need to capture any 2ndary intersections with a.iv[acuri] - astart = leftoverstart - bcuri++ - if bcuri >= numb { - break toploop - } - bstart = int(b.iv[bcuri].start) - case isLeftoverB: - // note that we change bstart without advancing bcuri, - // since we need to capture any 2ndary intersections with b.iv[bcuri] - bstart = leftoverstart - acuri++ - if acuri >= numa { - break toploop - } - astart = int(a.iv[acuri].start) - default: - // neither had leftover, both completely consumed - - // advance to next a interval - acuri++ - if acuri >= numa { - break toploop - } - astart = int(a.iv[acuri].start) - - // advance to next b interval - bcuri++ - if bcuri >= numb { - break toploop 
- } - bstart = int(b.iv[bcuri].start) - } - } - } // end for toploop - - if len(output) == 0 { - return res - } - - res.iv = output - return res -} - -// intersectCardinality returns the cardinality of the -// intersection of rc (also known as 'a') and b. -func (rc *runContainer16) intersectCardinality(b *runContainer16) int { - answer := int(0) - - a := rc - numa := int(len(a.iv)) - numb := int(len(b.iv)) - if numa == 0 || numb == 0 { - return 0 - } - - if numa == 1 && numb == 1 { - if !haveOverlap16(a.iv[0], b.iv[0]) { - return 0 - } - } - - var acuri int - var bcuri int - - astart := int(a.iv[acuri].start) - bstart := int(b.iv[bcuri].start) - - var intersection interval16 - var leftoverstart int - var isOverlap, isLeftoverA, isLeftoverB bool - var done bool - pass := 0 -toploop: - for acuri < numa && bcuri < numb { - pass++ - - isOverlap, isLeftoverA, isLeftoverB, leftoverstart, intersection = - intersectWithLeftover16(astart, int(a.iv[acuri].last()), bstart, int(b.iv[bcuri].last())) - - if !isOverlap { - switch { - case astart < bstart: - acuri, done = a.findNextIntervalThatIntersectsStartingFrom(acuri+1, bstart) - if done { - break toploop - } - astart = int(a.iv[acuri].start) - - case astart > bstart: - bcuri, done = b.findNextIntervalThatIntersectsStartingFrom(bcuri+1, astart) - if done { - break toploop - } - bstart = int(b.iv[bcuri].start) - } - - } else { - // isOverlap - answer += int(intersection.last()) - int(intersection.start) + 1 - switch { - case isLeftoverA: - // note that we change astart without advancing acuri, - // since we need to capture any 2ndary intersections with a.iv[acuri] - astart = leftoverstart - bcuri++ - if bcuri >= numb { - break toploop - } - bstart = int(b.iv[bcuri].start) - case isLeftoverB: - // note that we change bstart without advancing bcuri, - // since we need to capture any 2ndary intersections with b.iv[bcuri] - bstart = leftoverstart - acuri++ - if acuri >= numa { - break toploop - } - astart = int(a.iv[acuri].start) 
- default: - // neither had leftover, both completely consumed - - // advance to next a interval - acuri++ - if acuri >= numa { - break toploop - } - astart = int(a.iv[acuri].start) - - // advance to next b interval - bcuri++ - if bcuri >= numb { - break toploop - } - bstart = int(b.iv[bcuri].start) - } - } - } // end for toploop - - return answer -} - -// get returns true iff key is in the container. -func (rc *runContainer16) contains(key uint16) bool { - _, in, _ := rc.search(int(key)) - return in -} - -// numIntervals returns the count of intervals in the container. -func (rc *runContainer16) numIntervals() int { - return len(rc.iv) -} - -// searchRange returns alreadyPresent to indicate if the -// key is already in one of our interval16s. -// -// If key is alreadyPresent, then whichInterval16 tells -// you where. -// -// If key is not already present, then whichInterval16 is -// set as follows: -// -// a) whichInterval16 == len(rc.iv)-1 if key is beyond our -// last interval16 in rc.iv; -// -// b) whichInterval16 == -1 if key is before our first -// interval16 in rc.iv; -// -// c) whichInterval16 is set to the minimum index of rc.iv -// which comes strictly before the key; -// so rc.iv[whichInterval16].last < key, -// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start -// (Note that whichInterval16+1 won't exist when -// whichInterval16 is the last interval.) -// -// runContainer16.search always returns whichInterval16 < len(rc.iv). -// -// The search space is from startIndex to endxIndex. If endxIndex is set to zero, then there -// no upper bound. 
-func (rc *runContainer16) searchRange(key int, startIndex int, endxIndex int) (whichInterval16 int, alreadyPresent bool, numCompares int) { - n := int(len(rc.iv)) - if n == 0 { - return -1, false, 0 - } - if endxIndex == 0 { - endxIndex = n - } - - // sort.Search returns the smallest index i - // in [0, n) at which f(i) is true, assuming that on the range [0, n), - // f(i) == true implies f(i+1) == true. - // If there is no such index, Search returns n. - - // For correctness, this began as verbatim snippet from - // sort.Search in the Go standard lib. - // We inline our comparison function for speed, and - // annotate with numCompares - // to observe and test that extra bounds are utilized. - i, j := startIndex, endxIndex - for i < j { - h := i + (j-i)/2 // avoid overflow when computing h as the bisector - // i <= h < j - numCompares++ - if !(key < int(rc.iv[h].start)) { - i = h + 1 - } else { - j = h - } - } - below := i - // end std lib snippet. - - // The above is a simple in-lining and annotation of: - /* below := sort.Search(n, - func(i int) bool { - return key < rc.iv[i].start - }) - */ - whichInterval16 = below - 1 - - if below == n { - // all falses => key is >= start of all interval16s - // ... so does it belong to the last interval16? - if key < int(rc.iv[n-1].last())+1 { - // yes, it belongs to the last interval16 - alreadyPresent = true - return - } - // no, it is beyond the last interval16. - // leave alreadyPreset = false - return - } - - // INVAR: key is below rc.iv[below] - if below == 0 { - // key is before the first first interval16. - // leave alreadyPresent = false - return - } - - // INVAR: key is >= rc.iv[below-1].start and - // key is < rc.iv[below].start - - // is key in below-1 interval16? - if key >= int(rc.iv[below-1].start) && key < int(rc.iv[below-1].last())+1 { - // yes, it is. key is in below-1 interval16. 
- alreadyPresent = true - return - } - - // INVAR: key >= rc.iv[below-1].endx && key < rc.iv[below].start - // leave alreadyPresent = false - return -} - -// search returns alreadyPresent to indicate if the -// key is already in one of our interval16s. -// -// If key is alreadyPresent, then whichInterval16 tells -// you where. -// -// If key is not already present, then whichInterval16 is -// set as follows: -// -// a) whichInterval16 == len(rc.iv)-1 if key is beyond our -// last interval16 in rc.iv; -// -// b) whichInterval16 == -1 if key is before our first -// interval16 in rc.iv; -// -// c) whichInterval16 is set to the minimum index of rc.iv -// which comes strictly before the key; -// so rc.iv[whichInterval16].last < key, -// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start -// (Note that whichInterval16+1 won't exist when -// whichInterval16 is the last interval.) -// -// runContainer16.search always returns whichInterval16 < len(rc.iv). -func (rc *runContainer16) search(key int) (whichInterval16 int, alreadyPresent bool, numCompares int) { - return rc.searchRange(key, 0, 0) -} - -// getCardinality returns the count of the integers stored in the -// runContainer16. The running complexity depends on the size -// of the container. -func (rc *runContainer16) getCardinality() int { - // have to compute it - n := 0 - for _, p := range rc.iv { - n += p.runlen() - } - return n -} - -// isEmpty returns true if the container is empty. -// It runs in constant time. -func (rc *runContainer16) isEmpty() bool { - return len(rc.iv) == 0 -} - -// AsSlice decompresses the contents into a []uint16 slice. -func (rc *runContainer16) AsSlice() []uint16 { - s := make([]uint16, rc.getCardinality()) - j := 0 - for _, p := range rc.iv { - for i := p.start; i <= p.last(); i++ { - s[j] = i - j++ - } - } - return s -} - -// newRunContainer16 creates an empty run container. 
-func newRunContainer16() *runContainer16 { - return &runContainer16{} -} - -// newRunContainer16CopyIv creates a run container, initializing -// with a copy of the supplied iv slice. -func newRunContainer16CopyIv(iv []interval16) *runContainer16 { - rc := &runContainer16{ - iv: make([]interval16, len(iv)), - } - copy(rc.iv, iv) - return rc -} - -func (rc *runContainer16) Clone() *runContainer16 { - rc2 := newRunContainer16CopyIv(rc.iv) - return rc2 -} - -// newRunContainer16TakeOwnership returns a new runContainer16 -// backed by the provided iv slice, which we will -// assume exclusive control over from now on. -func newRunContainer16TakeOwnership(iv []interval16) *runContainer16 { - rc := &runContainer16{ - iv: iv, - } - return rc -} - -const baseRc16Size = int(unsafe.Sizeof(runContainer16{})) -const perIntervalRc16Size = int(unsafe.Sizeof(interval16{})) - -const baseDiskRc16Size = int(unsafe.Sizeof(uint16(0))) - -// see also runContainer16SerializedSizeInBytes(numRuns int) int - -// getSizeInBytes returns the number of bytes of memory -// required by this runContainer16. -func (rc *runContainer16) getSizeInBytes() int { - return perIntervalRc16Size*len(rc.iv) + baseRc16Size -} - -// runContainer16SerializedSizeInBytes returns the number of bytes of disk -// required to hold numRuns in a runContainer16. -func runContainer16SerializedSizeInBytes(numRuns int) int { - return perIntervalRc16Size*numRuns + baseDiskRc16Size -} - -// Add adds a single value k to the set. 
-func (rc *runContainer16) Add(k uint16) (wasNew bool) { - // TODO comment from runContainer16.java: - // it might be better and simpler to do return - // toBitmapOrArrayContainer(getCardinality()).add(k) - // but note that some unit tests use this method to build up test - // runcontainers without calling runOptimize - - k64 := int(k) - - index, present, _ := rc.search(k64) - if present { - return // already there - } - wasNew = true - - n := int(len(rc.iv)) - if index == -1 { - // we may need to extend the first run - if n > 0 { - if rc.iv[0].start == k+1 { - rc.iv[0].start = k - rc.iv[0].length++ - return - } - } - // nope, k stands alone, starting the new first interval16. - rc.iv = append([]interval16{newInterval16Range(k, k)}, rc.iv...) - return - } - - // are we off the end? handle both index == n and index == n-1: - if index >= n-1 { - if int(rc.iv[n-1].last())+1 == k64 { - rc.iv[n-1].length++ - return - } - rc.iv = append(rc.iv, newInterval16Range(k, k)) - return - } - - // INVAR: index and index+1 both exist, and k goes between them. - // - // Now: add k into the middle, - // possibly fusing with index or index+1 interval16 - // and possibly resulting in fusing of two interval16s - // that had a one integer gap. - - left := index - right := index + 1 - - // are we fusing left and right by adding k? - if int(rc.iv[left].last())+1 == k64 && int(rc.iv[right].start) == k64+1 { - // fuse into left - rc.iv[left].length = rc.iv[right].last() - rc.iv[left].start - // remove redundant right - rc.iv = append(rc.iv[:left+1], rc.iv[right+1:]...) - return - } - - // are we an addition to left? - if int(rc.iv[left].last())+1 == k64 { - // yes - rc.iv[left].length++ - return - } - - // are we an addition to right? - if int(rc.iv[right].start) == k64+1 { - // yes - rc.iv[right].start = k - rc.iv[right].length++ - return - } - - // k makes a standalone new interval16, inserted in the middle - tail := append([]interval16{newInterval16Range(k, k)}, rc.iv[right:]...) 
- rc.iv = append(rc.iv[:left+1], tail...) - return -} - -// runIterator16 advice: you must call hasNext() -// before calling next()/peekNext() to insure there are contents. -type runIterator16 struct { - rc *runContainer16 - curIndex int - curPosInIndex uint16 -} - -// newRunIterator16 returns a new empty run container. -func (rc *runContainer16) newRunIterator16() *runIterator16 { - return &runIterator16{rc: rc, curIndex: 0, curPosInIndex: 0} -} - -func (rc *runContainer16) iterate(cb func(x uint16) bool) bool { - iterator := runIterator16{rc, 0, 0} - - for iterator.hasNext() { - if !cb(iterator.next()) { - return false - } - } - - return true -} - -// hasNext returns false if calling next will panic. It -// returns true when there is at least one more value -// available in the iteration sequence. -func (ri *runIterator16) hasNext() bool { - return int(len(ri.rc.iv)) > ri.curIndex+1 || - (int(len(ri.rc.iv)) == ri.curIndex+1 && ri.rc.iv[ri.curIndex].length >= ri.curPosInIndex) -} - -// next returns the next value in the iteration sequence. 
-func (ri *runIterator16) next() uint16 { - next := ri.rc.iv[ri.curIndex].start + ri.curPosInIndex - - if ri.curPosInIndex == ri.rc.iv[ri.curIndex].length { - ri.curPosInIndex = 0 - ri.curIndex++ - } else { - ri.curPosInIndex++ - } - - return next -} - -// peekNext returns the next value in the iteration sequence without advancing the iterator -func (ri *runIterator16) peekNext() uint16 { - return ri.rc.iv[ri.curIndex].start + ri.curPosInIndex -} - -// advanceIfNeeded advances as long as the next value is smaller than minval -func (ri *runIterator16) advanceIfNeeded(minval uint16) { - if !ri.hasNext() || ri.peekNext() >= minval { - return - } - - // interval cannot be -1 because of minval > peekNext - interval, isPresent, _ := ri.rc.searchRange(int(minval), ri.curIndex, int(len(ri.rc.iv))) - - // if the minval is present, set the curPosIndex at the right position - if isPresent { - ri.curIndex = interval - ri.curPosInIndex = minval - ri.rc.iv[ri.curIndex].start - } else { - // otherwise interval is set to to the minimum index of rc.iv - // which comes strictly before the key, that's why we set the next interval - ri.curIndex = interval + 1 - ri.curPosInIndex = 0 - } -} - -// runReverseIterator16 advice: you must call hasNext() -// before calling next() to insure there are contents. -type runReverseIterator16 struct { - rc *runContainer16 - curIndex int // index into rc.iv - curPosInIndex uint16 // offset in rc.iv[curIndex] -} - -// newRunReverseIterator16 returns a new empty run iterator. -func (rc *runContainer16) newRunReverseIterator16() *runReverseIterator16 { - index := int(len(rc.iv)) - 1 - pos := uint16(0) - - if index >= 0 { - pos = rc.iv[index].length - } - - return &runReverseIterator16{ - rc: rc, - curIndex: index, - curPosInIndex: pos, - } -} - -// hasNext returns false if calling next will panic. It -// returns true when there is at least one more value -// available in the iteration sequence. 
-func (ri *runReverseIterator16) hasNext() bool { - return ri.curIndex > 0 || ri.curIndex == 0 && ri.curPosInIndex >= 0 -} - -// next returns the next value in the iteration sequence. -func (ri *runReverseIterator16) next() uint16 { - next := ri.rc.iv[ri.curIndex].start + ri.curPosInIndex - - if ri.curPosInIndex > 0 { - ri.curPosInIndex-- - } else { - ri.curIndex-- - - if ri.curIndex >= 0 { - ri.curPosInIndex = ri.rc.iv[ri.curIndex].length - } - } - - return next -} - -func (rc *runContainer16) newManyRunIterator16() *runIterator16 { - return rc.newRunIterator16() -} - -// hs are the high bits to include to avoid needing to reiterate over the buffer in NextMany -func (ri *runIterator16) nextMany(hs uint32, buf []uint32) int { - n := 0 - - if !ri.hasNext() { - return n - } - - // start and end are inclusive - for n < len(buf) { - moreVals := 0 - - if ri.rc.iv[ri.curIndex].length >= ri.curPosInIndex { - // add as many as you can from this seq - moreVals = minOfInt(int(ri.rc.iv[ri.curIndex].length-ri.curPosInIndex)+1, len(buf)-n) - base := uint32(ri.rc.iv[ri.curIndex].start+ri.curPosInIndex) | hs - - // allows BCE - buf2 := buf[n : n+moreVals] - for i := range buf2 { - buf2[i] = base + uint32(i) - } - - // update values - n += moreVals - } - - if moreVals+int(ri.curPosInIndex) > int(ri.rc.iv[ri.curIndex].length) { - ri.curPosInIndex = 0 - ri.curIndex++ - - if ri.curIndex == int(len(ri.rc.iv)) { - break - } - } else { - ri.curPosInIndex += uint16(moreVals) //moreVals always fits in uint16 - } - } - - return n -} - -func (ri *runIterator16) nextMany64(hs uint64, buf []uint64) int { - n := 0 - - if !ri.hasNext() { - return n - } - - // start and end are inclusive - for n < len(buf) { - moreVals := 0 - - if ri.rc.iv[ri.curIndex].length >= ri.curPosInIndex { - // add as many as you can from this seq - moreVals = minOfInt(int(ri.rc.iv[ri.curIndex].length-ri.curPosInIndex)+1, len(buf)-n) - base := uint64(ri.rc.iv[ri.curIndex].start+ri.curPosInIndex) | hs - - // allows BCE - 
buf2 := buf[n : n+moreVals] - for i := range buf2 { - buf2[i] = base + uint64(i) - } - - // update values - n += moreVals - } - - if moreVals+int(ri.curPosInIndex) > int(ri.rc.iv[ri.curIndex].length) { - ri.curPosInIndex = 0 - ri.curIndex++ - - if ri.curIndex == int(len(ri.rc.iv)) { - break - } - } else { - ri.curPosInIndex += uint16(moreVals) //moreVals always fits in uint16 - } - } - - return n -} - -// remove removes key from the container. -func (rc *runContainer16) removeKey(key uint16) (wasPresent bool) { - - var index int - index, wasPresent, _ = rc.search(int(key)) - if !wasPresent { - return // already removed, nothing to do. - } - pos := key - rc.iv[index].start - rc.deleteAt(&index, &pos) - return -} - -// internal helper functions - -func (rc *runContainer16) deleteAt(curIndex *int, curPosInIndex *uint16) { - ci := *curIndex - pos := *curPosInIndex - - // are we first, last, or in the middle of our interval16? - switch { - case pos == 0: - if int(rc.iv[ci].length) == 0 { - // our interval disappears - rc.iv = append(rc.iv[:ci], rc.iv[ci+1:]...) - // curIndex stays the same, since the delete did - // the advance for us. - *curPosInIndex = 0 - } else { - rc.iv[ci].start++ // no longer overflowable - rc.iv[ci].length-- - } - case pos == rc.iv[ci].length: - // length - rc.iv[ci].length-- - // our interval16 cannot disappear, else we would have been pos == 0, case first above. - *curPosInIndex-- - // if we leave *curIndex alone, then Next() will work properly even after the delete. - default: - //middle - // split into two, adding an interval16 - new0 := newInterval16Range(rc.iv[ci].start, rc.iv[ci].start+*curPosInIndex-1) - - new1start := int(rc.iv[ci].start+*curPosInIndex) + 1 - if new1start > int(MaxUint16) { - panic("overflow?!?!") - } - new1 := newInterval16Range(uint16(new1start), rc.iv[ci].last()) - tail := append([]interval16{new0, new1}, rc.iv[ci+1:]...) - rc.iv = append(rc.iv[:ci], tail...) 
- // update curIndex and curPosInIndex - *curIndex++ - *curPosInIndex = 0 - } - -} - -func have4Overlap16(astart, alast, bstart, blast int) bool { - if alast+1 <= bstart { - return false - } - return blast+1 > astart -} - -func intersectWithLeftover16(astart, alast, bstart, blast int) (isOverlap, isLeftoverA, isLeftoverB bool, leftoverstart int, intersection interval16) { - if !have4Overlap16(astart, alast, bstart, blast) { - return - } - isOverlap = true - - // do the intersection: - if bstart > astart { - intersection.start = uint16(bstart) - } else { - intersection.start = uint16(astart) - } - - switch { - case blast < alast: - isLeftoverA = true - leftoverstart = blast + 1 - intersection.length = uint16(blast) - intersection.start - case alast < blast: - isLeftoverB = true - leftoverstart = alast + 1 - intersection.length = uint16(alast) - intersection.start - default: - // alast == blast - intersection.length = uint16(alast) - intersection.start - } - - return -} - -func (rc *runContainer16) findNextIntervalThatIntersectsStartingFrom(startIndex int, key int) (index int, done bool) { - w, _, _ := rc.searchRange(key, startIndex, 0) - // rc.search always returns w < len(rc.iv) - if w < startIndex { - // not found and comes before lower bound startIndex, - // so just use the lower bound. 
- if startIndex == int(len(rc.iv)) { - // also this bump up means that we are done - return startIndex, true - } - return startIndex, false - } - - return w, false -} - -func sliceToString16(m []interval16) string { - s := "" - for i := range m { - s += fmt.Sprintf("%v: %s, ", i, m[i]) - } - return s -} - -// helper for invert -func (rc *runContainer16) invertlastInterval(origin uint16, lastIdx int) []interval16 { - cur := rc.iv[lastIdx] - if cur.last() == MaxUint16 { - if cur.start == origin { - return nil // empty container - } - return []interval16{newInterval16Range(origin, cur.start-1)} - } - if cur.start == origin { - return []interval16{newInterval16Range(cur.last()+1, MaxUint16)} - } - // invert splits - return []interval16{ - newInterval16Range(origin, cur.start-1), - newInterval16Range(cur.last()+1, MaxUint16), - } -} - -// invert returns a new container (not inplace), that is -// the inversion of rc. For each bit b in rc, the -// returned value has !b -func (rc *runContainer16) invert() *runContainer16 { - ni := len(rc.iv) - var m []interval16 - switch ni { - case 0: - return &runContainer16{iv: []interval16{newInterval16Range(0, MaxUint16)}} - case 1: - return &runContainer16{iv: rc.invertlastInterval(0, 0)} - } - var invstart int - ult := ni - 1 - for i, cur := range rc.iv { - if i == ult { - // invertlastInteval will add both intervals (b) and (c) in - // diagram below. - m = append(m, rc.invertlastInterval(uint16(invstart), i)...) - break - } - // INVAR: i and cur are not the last interval, there is a next at i+1 - // - // ........[cur.start, cur.last] ...... [next.start, next.last].... - // ^ ^ ^ - // (a) (b) (c) - // - // Now: we add interval (a); but if (a) is empty, for cur.start==0, we skip it. 
- if cur.start > 0 { - m = append(m, newInterval16Range(uint16(invstart), cur.start-1)) - } - invstart = int(cur.last() + 1) - } - return &runContainer16{iv: m} -} - -func (iv interval16) equal(b interval16) bool { - return iv.start == b.start && iv.length == b.length -} - -func (iv interval16) isSuperSetOf(b interval16) bool { - return iv.start <= b.start && b.last() <= iv.last() -} - -func (iv interval16) subtractInterval(del interval16) (left []interval16, delcount int) { - isect, isEmpty := intersectInterval16s(iv, del) - - if isEmpty { - return nil, 0 - } - if del.isSuperSetOf(iv) { - return nil, iv.runlen() - } - - switch { - case isect.start > iv.start && isect.last() < iv.last(): - new0 := newInterval16Range(iv.start, isect.start-1) - new1 := newInterval16Range(isect.last()+1, iv.last()) - return []interval16{new0, new1}, isect.runlen() - case isect.start == iv.start: - return []interval16{newInterval16Range(isect.last()+1, iv.last())}, isect.runlen() - default: - return []interval16{newInterval16Range(iv.start, isect.start-1)}, isect.runlen() - } -} - -func (rc *runContainer16) isubtract(del interval16) { - origiv := make([]interval16, len(rc.iv)) - copy(origiv, rc.iv) - n := int(len(rc.iv)) - if n == 0 { - return // already done. - } - - _, isEmpty := intersectInterval16s(newInterval16Range(rc.iv[0].start, rc.iv[n-1].last()), del) - if isEmpty { - return // done - } - - // INVAR there is some intersection between rc and del - istart, startAlready, _ := rc.search(int(del.start)) - ilast, lastAlready, _ := rc.search(int(del.last())) - if istart == -1 { - if ilast == n-1 && !lastAlready { - rc.iv = nil - return - } - } - // some intervals will remain - switch { - case startAlready && lastAlready: - res0, _ := rc.iv[istart].subtractInterval(del) - - // would overwrite values in iv b/c res0 can have len 2. so - // write to origiv instead. 
- lost := 1 + ilast - istart - changeSize := int(len(res0)) - lost - newSize := int(len(rc.iv)) + changeSize - - // rc.iv = append(pre, caboose...) - // return - - if ilast != istart { - res1, _ := rc.iv[ilast].subtractInterval(del) - res0 = append(res0, res1...) - changeSize = int(len(res0)) - lost - newSize = int(len(rc.iv)) + changeSize - } - switch { - case changeSize < 0: - // shrink - copy(rc.iv[istart+int(len(res0)):], rc.iv[ilast+1:]) - copy(rc.iv[istart:istart+int(len(res0))], res0) - rc.iv = rc.iv[:newSize] - return - case changeSize == 0: - // stay the same - copy(rc.iv[istart:istart+int(len(res0))], res0) - return - default: - // changeSize > 0 is only possible when ilast == istart. - // Hence we now know: changeSize == 1 and len(res0) == 2 - rc.iv = append(rc.iv, interval16{}) - // len(rc.iv) is correct now, no need to rc.iv = rc.iv[:newSize] - - // copy the tail into place - copy(rc.iv[ilast+2:], rc.iv[ilast+1:]) - // copy the new item(s) into place - copy(rc.iv[istart:istart+2], res0) - return - } - - case !startAlready && !lastAlready: - // we get to discard whole intervals - - // from the search() definition: - - // if del.start is not present, then istart is - // set as follows: - // - // a) istart == n-1 if del.start is beyond our - // last interval16 in rc.iv; - // - // b) istart == -1 if del.start is before our first - // interval16 in rc.iv; - // - // c) istart is set to the minimum index of rc.iv - // which comes strictly before the del.start; - // so del.start > rc.iv[istart].last, - // and if istart+1 exists, then del.start < rc.iv[istart+1].startx - - // if del.last is not present, then ilast is - // set as follows: - // - // a) ilast == n-1 if del.last is beyond our - // last interval16 in rc.iv; - // - // b) ilast == -1 if del.last is before our first - // interval16 in rc.iv; - // - // c) ilast is set to the minimum index of rc.iv - // which comes strictly before the del.last; - // so del.last > rc.iv[ilast].last, - // and if ilast+1 
exists, then del.last < rc.iv[ilast+1].start - - // INVAR: istart >= 0 - pre := rc.iv[:istart+1] - if ilast == n-1 { - rc.iv = pre - return - } - // INVAR: ilast < n-1 - lost := ilast - istart - changeSize := -lost - newSize := int(len(rc.iv)) + changeSize - if changeSize != 0 { - copy(rc.iv[ilast+1+changeSize:], rc.iv[ilast+1:]) - } - rc.iv = rc.iv[:newSize] - return - - case startAlready && !lastAlready: - // we can only shrink or stay the same size - // i.e. we either eliminate the whole interval, - // or just cut off the right side. - res0, _ := rc.iv[istart].subtractInterval(del) - if len(res0) > 0 { - // len(res) must be 1 - rc.iv[istart] = res0[0] - } - lost := 1 + (ilast - istart) - changeSize := int(len(res0)) - lost - newSize := int(len(rc.iv)) + changeSize - if changeSize != 0 { - copy(rc.iv[ilast+1+changeSize:], rc.iv[ilast+1:]) - } - rc.iv = rc.iv[:newSize] - return - - case !startAlready && lastAlready: - // we can only shrink or stay the same size - res1, _ := rc.iv[ilast].subtractInterval(del) - lost := ilast - istart - changeSize := int(len(res1)) - lost - newSize := int(len(rc.iv)) + changeSize - if changeSize != 0 { - // move the tail first to make room for res1 - copy(rc.iv[ilast+1+changeSize:], rc.iv[ilast+1:]) - } - copy(rc.iv[istart+1:], res1) - rc.iv = rc.iv[:newSize] - return - } -} - -// compute rc minus b, and return the result as a new value (not inplace). -// port of run_container_andnot from CRoaring... 
-// https://github.com/RoaringBitmap/CRoaring/blob/master/src/containers/run.c#L435-L496 -func (rc *runContainer16) AndNotRunContainer16(b *runContainer16) *runContainer16 { - - if len(b.iv) == 0 || len(rc.iv) == 0 { - return rc - } - - dst := newRunContainer16() - apos := 0 - bpos := 0 - - a := rc - - astart := a.iv[apos].start - alast := a.iv[apos].last() - bstart := b.iv[bpos].start - blast := b.iv[bpos].last() - - alen := len(a.iv) - blen := len(b.iv) - - for apos < alen && bpos < blen { - switch { - case alast < bstart: - // output the first run - dst.iv = append(dst.iv, newInterval16Range(astart, alast)) - apos++ - if apos < alen { - astart = a.iv[apos].start - alast = a.iv[apos].last() - } - case blast < astart: - // exit the second run - bpos++ - if bpos < blen { - bstart = b.iv[bpos].start - blast = b.iv[bpos].last() - } - default: - // a: [ ] - // b: [ ] - // alast >= bstart - // blast >= astart - if astart < bstart { - dst.iv = append(dst.iv, newInterval16Range(astart, bstart-1)) - } - if alast > blast { - astart = blast + 1 - } else { - apos++ - if apos < alen { - astart = a.iv[apos].start - alast = a.iv[apos].last() - } - } - } - } - if apos < alen { - dst.iv = append(dst.iv, newInterval16Range(astart, alast)) - apos++ - if apos < alen { - dst.iv = append(dst.iv, a.iv[apos:]...) - } - } - - return dst -} - -func (rc *runContainer16) numberOfRuns() (nr int) { - return len(rc.iv) -} - -func (rc *runContainer16) containerType() contype { - return run16Contype -} - -func (rc *runContainer16) equals16(srb *runContainer16) bool { - // Check if the containers are the same object. 
- if rc == srb { - return true - } - - if len(srb.iv) != len(rc.iv) { - return false - } - - for i, v := range rc.iv { - if v != srb.iv[i] { - return false - } - } - return true -} - -// compile time verify we meet interface requirements -var _ container = &runContainer16{} - -func (rc *runContainer16) clone() container { - return newRunContainer16CopyIv(rc.iv) -} - -func (rc *runContainer16) minimum() uint16 { - return rc.iv[0].start // assume not empty -} - -func (rc *runContainer16) maximum() uint16 { - return rc.iv[len(rc.iv)-1].last() // assume not empty -} - -func (rc *runContainer16) isFull() bool { - return (len(rc.iv) == 1) && ((rc.iv[0].start == 0) && (rc.iv[0].last() == MaxUint16)) -} - -func (rc *runContainer16) and(a container) container { - if rc.isFull() { - return a.clone() - } - switch c := a.(type) { - case *runContainer16: - return rc.intersect(c) - case *arrayContainer: - return rc.andArray(c) - case *bitmapContainer: - return rc.andBitmapContainer(c) - } - panic("unsupported container type") -} - -func (rc *runContainer16) andCardinality(a container) int { - switch c := a.(type) { - case *runContainer16: - return int(rc.intersectCardinality(c)) - case *arrayContainer: - return rc.andArrayCardinality(c) - case *bitmapContainer: - return rc.andBitmapContainerCardinality(c) - } - panic("unsupported container type") -} - -// andBitmapContainer finds the intersection of rc and b. 
-func (rc *runContainer16) andBitmapContainer(bc *bitmapContainer) container { - bc2 := newBitmapContainerFromRun(rc) - return bc2.andBitmap(bc) -} - -func (rc *runContainer16) andArrayCardinality(ac *arrayContainer) int { - pos := 0 - answer := 0 - maxpos := ac.getCardinality() - if maxpos == 0 { - return 0 // won't happen in actual code - } - v := ac.content[pos] -mainloop: - for _, p := range rc.iv { - for v < p.start { - pos++ - if pos == maxpos { - break mainloop - } - v = ac.content[pos] - } - for v <= p.last() { - answer++ - pos++ - if pos == maxpos { - break mainloop - } - v = ac.content[pos] - } - } - return answer -} - -func (rc *runContainer16) iand(a container) container { - if rc.isFull() { - return a.clone() - } - switch c := a.(type) { - case *runContainer16: - return rc.inplaceIntersect(c) - case *arrayContainer: - return rc.andArray(c) - case *bitmapContainer: - return rc.iandBitmapContainer(c) - } - panic("unsupported container type") -} - -func (rc *runContainer16) inplaceIntersect(rc2 *runContainer16) container { - sect := rc.intersect(rc2) - *rc = *sect - return rc -} - -func (rc *runContainer16) iandBitmapContainer(bc *bitmapContainer) container { - isect := rc.andBitmapContainer(bc) - *rc = *newRunContainer16FromContainer(isect) - return rc -} - -func (rc *runContainer16) andArray(ac *arrayContainer) container { - if len(rc.iv) == 0 { - return newArrayContainer() - } - - acCardinality := ac.getCardinality() - c := newArrayContainerCapacity(acCardinality) - - for rlePos, arrayPos := 0, 0; arrayPos < acCardinality; { - iv := rc.iv[rlePos] - arrayVal := ac.content[arrayPos] - - for iv.last() < arrayVal { - rlePos++ - if rlePos == len(rc.iv) { - return c - } - iv = rc.iv[rlePos] - } - - if iv.start > arrayVal { - arrayPos = advanceUntil(ac.content, arrayPos, len(ac.content), iv.start) - } else { - c.content = append(c.content, arrayVal) - arrayPos++ - } - } - return c -} - -func (rc *runContainer16) andNot(a container) container { - switch c := 
a.(type) { - case *arrayContainer: - return rc.andNotArray(c) - case *bitmapContainer: - return rc.andNotBitmap(c) - case *runContainer16: - return rc.andNotRunContainer16(c) - } - panic("unsupported container type") -} - -func (rc *runContainer16) fillLeastSignificant16bits(x []uint32, i int, mask uint32) int { - k := i - var val int - for _, p := range rc.iv { - n := p.runlen() - for j := int(0); j < n; j++ { - val = int(p.start) + j - x[k] = uint32(val) | mask - k++ - } - } - return k -} - -func (rc *runContainer16) getShortIterator() shortPeekable { - return rc.newRunIterator16() -} - -func (rc *runContainer16) getReverseIterator() shortIterable { - return rc.newRunReverseIterator16() -} - -func (rc *runContainer16) getManyIterator() manyIterable { - return rc.newManyRunIterator16() -} - -// add the values in the range [firstOfRange, endx). endx -// is still abe to express 2^16 because it is an int not an uint16. -func (rc *runContainer16) iaddRange(firstOfRange, endx int) container { - - if firstOfRange > endx { - panic(fmt.Sprintf("invalid %v = endx > firstOfRange", endx)) - } - if firstOfRange == endx { - return rc - } - addme := newRunContainer16TakeOwnership([]interval16{ - { - start: uint16(firstOfRange), - length: uint16(endx - 1 - firstOfRange), - }, - }) - *rc = *rc.union(addme) - return rc -} - -// remove the values in the range [firstOfRange,endx) -func (rc *runContainer16) iremoveRange(firstOfRange, endx int) container { - if firstOfRange > endx { - panic(fmt.Sprintf("request to iremove empty set [%v, %v),"+ - " nothing to do.", firstOfRange, endx)) - } - // empty removal - if firstOfRange == endx { - return rc - } - x := newInterval16Range(uint16(firstOfRange), uint16(endx-1)) - rc.isubtract(x) - return rc -} - -// not flip the values in the range [firstOfRange,endx) -func (rc *runContainer16) not(firstOfRange, endx int) container { - if firstOfRange > endx { - panic(fmt.Sprintf("invalid %v = endx > firstOfRange = %v", endx, firstOfRange)) - } - - 
return rc.Not(firstOfRange, endx) -} - -// Not flips the values in the range [firstOfRange,endx). -// This is not inplace. Only the returned value has the flipped bits. -// -// Currently implemented as (!A intersect B) union (A minus B), -// where A is rc, and B is the supplied [firstOfRange, endx) interval. -// -// TODO(time optimization): convert this to a single pass -// algorithm by copying AndNotRunContainer16() and modifying it. -// Current routine is correct but -// makes 2 more passes through the arrays than should be -// strictly necessary. Measure both ways though--this may not matter. -func (rc *runContainer16) Not(firstOfRange, endx int) *runContainer16 { - - if firstOfRange > endx { - panic(fmt.Sprintf("invalid %v = endx > firstOfRange == %v", endx, firstOfRange)) - } - - if firstOfRange >= endx { - return rc.Clone() - } - - a := rc - // algo: - // (!A intersect B) union (A minus B) - - nota := a.invert() - - bs := []interval16{newInterval16Range(uint16(firstOfRange), uint16(endx-1))} - b := newRunContainer16TakeOwnership(bs) - - notAintersectB := nota.intersect(b) - - aMinusB := a.AndNotRunContainer16(b) - - rc2 := notAintersectB.union(aMinusB) - return rc2 -} - -// equals is now logical equals; it does not require the -// same underlying container type. -func (rc *runContainer16) equals(o container) bool { - srb, ok := o.(*runContainer16) - - if !ok { - // maybe value instead of pointer - val, valok := o.(*runContainer16) - if valok { - srb = val - ok = true - } - } - if ok { - // Check if the containers are the same object. 
- if rc == srb { - return true - } - - if len(srb.iv) != len(rc.iv) { - return false - } - - for i, v := range rc.iv { - if v != srb.iv[i] { - return false - } - } - return true - } - - // use generic comparison - if o.getCardinality() != rc.getCardinality() { - return false - } - rit := rc.getShortIterator() - bit := o.getShortIterator() - - //k := 0 - for rit.hasNext() { - if bit.next() != rit.next() { - return false - } - //k++ - } - return true -} - -func (rc *runContainer16) iaddReturnMinimized(x uint16) container { - rc.Add(x) - return rc -} - -func (rc *runContainer16) iadd(x uint16) (wasNew bool) { - return rc.Add(x) -} - -func (rc *runContainer16) iremoveReturnMinimized(x uint16) container { - rc.removeKey(x) - return rc -} - -func (rc *runContainer16) iremove(x uint16) bool { - return rc.removeKey(x) -} - -func (rc *runContainer16) or(a container) container { - if rc.isFull() { - return rc.clone() - } - switch c := a.(type) { - case *runContainer16: - return rc.union(c) - case *arrayContainer: - return rc.orArray(c) - case *bitmapContainer: - return rc.orBitmapContainer(c) - } - panic("unsupported container type") -} - -func (rc *runContainer16) orCardinality(a container) int { - switch c := a.(type) { - case *runContainer16: - return int(rc.unionCardinality(c)) - case *arrayContainer: - return rc.orArrayCardinality(c) - case *bitmapContainer: - return rc.orBitmapContainerCardinality(c) - } - panic("unsupported container type") -} - -// orBitmapContainer finds the union of rc and bc. 
-func (rc *runContainer16) orBitmapContainer(bc *bitmapContainer) container { - bc2 := newBitmapContainerFromRun(rc) - return bc2.iorBitmap(bc) -} - -func (rc *runContainer16) andBitmapContainerCardinality(bc *bitmapContainer) int { - answer := 0 - for i := range rc.iv { - answer += bc.getCardinalityInRange(uint(rc.iv[i].start), uint(rc.iv[i].last())+1) - } - //bc.computeCardinality() - return answer -} - -func (rc *runContainer16) orBitmapContainerCardinality(bc *bitmapContainer) int { - return rc.getCardinality() + bc.getCardinality() - rc.andBitmapContainerCardinality(bc) -} - -// orArray finds the union of rc and ac. -func (rc *runContainer16) orArray(ac *arrayContainer) container { - if ac.isEmpty() { - return rc.clone() - } - if rc.isEmpty() { - return ac.clone() - } - intervals, cardMinusOne := runArrayUnionToRuns(rc, ac) - result := newRunContainer16TakeOwnership(intervals) - if len(intervals) >= 2048 && cardMinusOne >= arrayDefaultMaxSize { - return newBitmapContainerFromRun(result) - } - if len(intervals)*2 > 1+int(cardMinusOne) { - return result.toArrayContainer() - } - return result -} - -// orArray finds the union of rc and ac. 
-func (rc *runContainer16) orArrayCardinality(ac *arrayContainer) int { - return ac.getCardinality() + rc.getCardinality() - rc.andArrayCardinality(ac) -} - -func (rc *runContainer16) ior(a container) container { - if rc.isFull() { - return rc - } - switch c := a.(type) { - case *runContainer16: - return rc.inplaceUnion(c) - case *arrayContainer: - return rc.iorArray(c) - case *bitmapContainer: - return rc.iorBitmapContainer(c) - } - panic("unsupported container type") -} - -func (rc *runContainer16) inplaceUnion(rc2 *runContainer16) container { - for _, p := range rc2.iv { - last := int(p.last()) - for i := int(p.start); i <= last; i++ { - rc.Add(uint16(i)) - } - } - return rc -} - -func (rc *runContainer16) iorBitmapContainer(bc *bitmapContainer) container { - - it := bc.getShortIterator() - for it.hasNext() { - rc.Add(it.next()) - } - return rc -} - -func (rc *runContainer16) iorArray(ac *arrayContainer) container { - if rc.isEmpty() { - return ac.clone() - } - if ac.isEmpty() { - return rc - } - var cardMinusOne uint16 - //TODO: perform the union algorithm in-place using rc.iv - // this can be done with methods like the in-place array container union - // but maybe lazily moving the remaining elements back. - rc.iv, cardMinusOne = runArrayUnionToRuns(rc, ac) - if len(rc.iv) >= 2048 && cardMinusOne >= arrayDefaultMaxSize { - return newBitmapContainerFromRun(rc) - } - if len(rc.iv)*2 > 1+int(cardMinusOne) { - return rc.toArrayContainer() - } - return rc -} - -func runArrayUnionToRuns(rc *runContainer16, ac *arrayContainer) ([]interval16, uint16) { - pos1 := 0 - pos2 := 0 - length1 := len(ac.content) - length2 := len(rc.iv) - target := make([]interval16, 0, len(rc.iv)) - // have to find the first range - // options are - // 1. from array container - // 2. 
from run container - var previousInterval interval16 - var cardMinusOne uint16 - if ac.content[0] < rc.iv[0].start { - previousInterval.start = ac.content[0] - previousInterval.length = 0 - pos1++ - } else { - previousInterval.start = rc.iv[0].start - previousInterval.length = rc.iv[0].length - pos2++ - } - - for pos1 < length1 || pos2 < length2 { - if pos1 < length1 { - s1 := ac.content[pos1] - if s1 <= previousInterval.start+previousInterval.length { - pos1++ - continue - } - if previousInterval.last() < MaxUint16 && previousInterval.last()+1 == s1 { - previousInterval.length++ - pos1++ - continue - } - } - if pos2 < length2 { - range2 := rc.iv[pos2] - if range2.start <= previousInterval.last() || range2.start > 0 && range2.start-1 == previousInterval.last() { - pos2++ - if previousInterval.last() < range2.last() { - previousInterval.length = range2.last() - previousInterval.start - } - continue - } - } - cardMinusOne += previousInterval.length + 1 - target = append(target, previousInterval) - if pos2 == length2 || pos1 < length1 && ac.content[pos1] < rc.iv[pos2].start { - previousInterval.start = ac.content[pos1] - previousInterval.length = 0 - pos1++ - } else { - previousInterval = rc.iv[pos2] - pos2++ - } - } - cardMinusOne += previousInterval.length - target = append(target, previousInterval) - - return target, cardMinusOne -} - -// lazyIOR is described (not yet implemented) in -// this nice note from @lemire on -// https://github.com/RoaringBitmap/roaring/pull/70#issuecomment-263613737 -// -// Description of lazyOR and lazyIOR from @lemire: -// -// Lazy functions are optional and can be simply -// wrapper around non-lazy functions. -// -// The idea of "laziness" is as follows. It is -// inspired by the concept of lazy evaluation -// you might be familiar with (functional programming -// and all that). So a roaring bitmap is -// such that all its containers are, in some -// sense, chosen to use as little memory as -// possible. This is nice. 
Also, all bitsets -// are "cardinality aware" so that you can do -// fast rank/select queries, or query the -// cardinality of the whole bitmap... very fast, -// without latency. -// -// However, imagine that you are aggregating 100 -// bitmaps together. So you OR the first two, then OR -// that with the third one and so forth. Clearly, -// intermediate bitmaps don't need to be as -// compressed as possible, right? They can be -// in a "dirty state". You only need the end -// result to be in a nice state... which you -// can achieve by calling repairAfterLazy at the end. -// -// The Java/C code does something special for -// the in-place lazy OR runs. The idea is that -// instead of taking two run containers and -// generating a new one, we actually try to -// do the computation in-place through a -// technique invented by @gssiyankai (pinging him!). -// What you do is you check whether the host -// run container has lots of extra capacity. -// If it does, you move its data at the end of -// the backing array, and then you write -// the answer at the beginning. What this -// trick does is minimize memory allocations. -func (rc *runContainer16) lazyIOR(a container) container { - // not lazy at the moment - return rc.ior(a) -} - -// lazyOR is described above in lazyIOR. 
-func (rc *runContainer16) lazyOR(a container) container { - // not lazy at the moment - return rc.or(a) -} - -func (rc *runContainer16) intersects(a container) bool { - // TODO: optimize by doing inplace/less allocation - isect := rc.and(a) - return !isect.isEmpty() -} - -func (rc *runContainer16) xor(a container) container { - switch c := a.(type) { - case *arrayContainer: - return rc.xorArray(c) - case *bitmapContainer: - return rc.xorBitmap(c) - case *runContainer16: - return rc.xorRunContainer16(c) - } - panic("unsupported container type") -} - -func (rc *runContainer16) iandNot(a container) container { - switch c := a.(type) { - case *arrayContainer: - return rc.iandNotArray(c) - case *bitmapContainer: - return rc.iandNotBitmap(c) - case *runContainer16: - return rc.iandNotRunContainer16(c) - } - panic("unsupported container type") -} - -// flip the values in the range [firstOfRange,endx) -func (rc *runContainer16) inot(firstOfRange, endx int) container { - if firstOfRange > endx { - panic(fmt.Sprintf("invalid %v = endx > firstOfRange = %v", endx, firstOfRange)) - } - if firstOfRange > endx { - return rc - } - // TODO: minimize copies, do it all inplace; not() makes a copy. 
- rc = rc.Not(firstOfRange, endx) - return rc -} - -func (rc *runContainer16) rank(x uint16) int { - n := int(len(rc.iv)) - xx := int(x) - w, already, _ := rc.search(xx) - if w < 0 { - return 0 - } - if !already && w == n-1 { - return rc.getCardinality() - } - var rnk int - if !already { - for i := int(0); i <= w; i++ { - rnk += rc.iv[i].runlen() - } - return int(rnk) - } - for i := int(0); i < w; i++ { - rnk += rc.iv[i].runlen() - } - rnk += int(x-rc.iv[w].start) + 1 - return int(rnk) -} - -func (rc *runContainer16) selectInt(x uint16) int { - var offset int - for k := range rc.iv { - nextOffset := offset + rc.iv[k].runlen() - if nextOffset > int(x) { - return int(int(rc.iv[k].start) + (int(x) - offset)) - } - offset = nextOffset - } - panic("cannot select x") -} - -func (rc *runContainer16) andNotRunContainer16(b *runContainer16) container { - return rc.AndNotRunContainer16(b) -} - -func (rc *runContainer16) andNotArray(ac *arrayContainer) container { - rcb := rc.toBitmapContainer() - acb := ac.toBitmapContainer() - return rcb.andNotBitmap(acb) -} - -func (rc *runContainer16) andNotBitmap(bc *bitmapContainer) container { - rcb := rc.toBitmapContainer() - return rcb.andNotBitmap(bc) -} - -func (rc *runContainer16) toBitmapContainer() *bitmapContainer { - bc := newBitmapContainer() - for i := range rc.iv { - bc.iaddRange(int(rc.iv[i].start), int(rc.iv[i].last())+1) - } - bc.computeCardinality() - return bc -} - -func (rc *runContainer16) iandNotRunContainer16(x2 *runContainer16) container { - rcb := rc.toBitmapContainer() - x2b := x2.toBitmapContainer() - rcb.iandNotBitmapSurely(x2b) - // TODO: check size and optimize the return value - // TODO: is inplace modification really required? If not, elide the copy. 
- rc2 := newRunContainer16FromBitmapContainer(rcb) - *rc = *rc2 - return rc -} - -func (rc *runContainer16) iandNotArray(ac *arrayContainer) container { - rcb := rc.toBitmapContainer() - acb := ac.toBitmapContainer() - rcb.iandNotBitmapSurely(acb) - // TODO: check size and optimize the return value - // TODO: is inplace modification really required? If not, elide the copy. - rc2 := newRunContainer16FromBitmapContainer(rcb) - *rc = *rc2 - return rc -} - -func (rc *runContainer16) iandNotBitmap(bc *bitmapContainer) container { - rcb := rc.toBitmapContainer() - rcb.iandNotBitmapSurely(bc) - // TODO: check size and optimize the return value - // TODO: is inplace modification really required? If not, elide the copy. - rc2 := newRunContainer16FromBitmapContainer(rcb) - *rc = *rc2 - return rc -} - -func (rc *runContainer16) xorRunContainer16(x2 *runContainer16) container { - rcb := rc.toBitmapContainer() - x2b := x2.toBitmapContainer() - return rcb.xorBitmap(x2b) -} - -func (rc *runContainer16) xorArray(ac *arrayContainer) container { - rcb := rc.toBitmapContainer() - acb := ac.toBitmapContainer() - return rcb.xorBitmap(acb) -} - -func (rc *runContainer16) xorBitmap(bc *bitmapContainer) container { - rcb := rc.toBitmapContainer() - return rcb.xorBitmap(bc) -} - -// convert to bitmap or array *if needed* -func (rc *runContainer16) toEfficientContainer() container { - sizeAsRunContainer := rc.getSizeInBytes() - sizeAsBitmapContainer := bitmapContainerSizeInBytes() - card := rc.getCardinality() - sizeAsArrayContainer := arrayContainerSizeInBytes(card) - if sizeAsRunContainer <= minOfInt(sizeAsBitmapContainer, sizeAsArrayContainer) { - return rc - } - if card <= arrayDefaultMaxSize { - return rc.toArrayContainer() - } - bc := newBitmapContainerFromRun(rc) - return bc -} - -func (rc *runContainer16) toArrayContainer() *arrayContainer { - ac := newArrayContainer() - for i := range rc.iv { - ac.iaddRange(int(rc.iv[i].start), int(rc.iv[i].last())+1) - } - return ac -} - -func 
newRunContainer16FromContainer(c container) *runContainer16 { - - switch x := c.(type) { - case *runContainer16: - return x.Clone() - case *arrayContainer: - return newRunContainer16FromArray(x) - case *bitmapContainer: - return newRunContainer16FromBitmapContainer(x) - } - panic("unsupported container type") -} - -// And finds the intersection of rc and b. -func (rc *runContainer16) And(b *Bitmap) *Bitmap { - out := NewBitmap() - for _, p := range rc.iv { - plast := p.last() - for i := p.start; i <= plast; i++ { - if b.Contains(uint32(i)) { - out.Add(uint32(i)) - } - } - } - return out -} - -// Xor returns the exclusive-or of rc and b. -func (rc *runContainer16) Xor(b *Bitmap) *Bitmap { - out := b.Clone() - for _, p := range rc.iv { - plast := p.last() - for v := p.start; v <= plast; v++ { - w := uint32(v) - if out.Contains(w) { - out.RemoveRange(uint64(w), uint64(w+1)) - } else { - out.Add(w) - } - } - } - return out -} - -// Or returns the union of rc and b. -func (rc *runContainer16) Or(b *Bitmap) *Bitmap { - out := b.Clone() - for _, p := range rc.iv { - plast := p.last() - for v := p.start; v <= plast; v++ { - out.Add(uint32(v)) - } - } - return out -} - -// serializedSizeInBytes returns the number of bytes of memory -// required by this runContainer16. This is for the -// Roaring format, as specified https://github.com/RoaringBitmap/RoaringFormatSpec/ -func (rc *runContainer16) serializedSizeInBytes() int { - // number of runs in one uint16, then each run - // needs two more uint16 - return 2 + len(rc.iv)*4 -} - -func (rc *runContainer16) addOffset(x uint16) (container, container) { - var low, high *runContainer16 - - if len(rc.iv) == 0 { - return nil, nil - } - - first := uint32(rc.iv[0].start) + uint32(x) - if highbits(first) == 0 { - // Some elements will fall into low part, allocate a container. - // Checking the first one is enough because they are ordered. 
- low = newRunContainer16() - } - last := uint32(rc.iv[len(rc.iv)-1].start) - last += uint32(rc.iv[len(rc.iv)-1].length) - last += uint32(x) - if highbits(last) > 0 { - // Some elements will fall into high part, allocate a container. - // Checking the last one is enough because they are ordered. - high = newRunContainer16() - } - - for _, iv := range rc.iv { - val := int(iv.start) + int(x) - finalVal := int(val) + int(iv.length) - if val <= 0xffff { - if finalVal <= 0xffff { - low.iv = append(low.iv, interval16{uint16(val), iv.length}) - } else { - low.iv = append(low.iv, interval16{uint16(val), uint16(0xffff - val)}) - high.iv = append(high.iv, interval16{uint16(0), uint16(finalVal & 0xffff)}) - } - } else { - high.iv = append(high.iv, interval16{uint16(val & 0xffff), iv.length}) - } - } - - // Ensure proper nil interface. - if low == nil { - return nil, high - } - if high == nil { - return low, nil - } - - return low, high -} diff --git a/vendor/github.com/RoaringBitmap/roaring/serialization.go b/vendor/github.com/RoaringBitmap/roaring/serialization.go deleted file mode 100644 index dbfecc8..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/serialization.go +++ /dev/null @@ -1,18 +0,0 @@ -package roaring - -import ( - "encoding/binary" - "io" -) - -// writeTo for runContainer16 follows this -// spec: https://github.com/RoaringBitmap/RoaringFormatSpec -func (b *runContainer16) writeTo(stream io.Writer) (int, error) { - buf := make([]byte, 2+4*len(b.iv)) - binary.LittleEndian.PutUint16(buf[0:], uint16(len(b.iv))) - for i, v := range b.iv { - binary.LittleEndian.PutUint16(buf[2+i*4:], v.start) - binary.LittleEndian.PutUint16(buf[2+2+i*4:], v.length) - } - return stream.Write(buf) -} diff --git a/vendor/github.com/RoaringBitmap/roaring/serialization_generic.go b/vendor/github.com/RoaringBitmap/roaring/serialization_generic.go deleted file mode 100644 index 7e1f180..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/serialization_generic.go +++ /dev/null @@ 
-1,145 +0,0 @@ -//go:build (!amd64 && !386 && !arm && !arm64 && !ppc64le && !mipsle && !mips64le && !mips64p32le && !wasm) || appengine -// +build !amd64,!386,!arm,!arm64,!ppc64le,!mipsle,!mips64le,!mips64p32le,!wasm appengine - -package roaring - -import ( - "encoding/binary" - "errors" - "io" -) - -func (b *arrayContainer) writeTo(stream io.Writer) (int, error) { - buf := make([]byte, 2*len(b.content)) - for i, v := range b.content { - base := i * 2 - buf[base] = byte(v) - buf[base+1] = byte(v >> 8) - } - return stream.Write(buf) -} - -func (b *arrayContainer) readFrom(stream io.Reader) (int, error) { - err := binary.Read(stream, binary.LittleEndian, b.content) - if err != nil { - return 0, err - } - return 2 * len(b.content), nil -} - -func (b *bitmapContainer) writeTo(stream io.Writer) (int, error) { - if b.cardinality <= arrayDefaultMaxSize { - return 0, errors.New("refusing to write bitmap container with cardinality of array container") - } - - // Write set - buf := make([]byte, 8*len(b.bitmap)) - for i, v := range b.bitmap { - base := i * 8 - buf[base] = byte(v) - buf[base+1] = byte(v >> 8) - buf[base+2] = byte(v >> 16) - buf[base+3] = byte(v >> 24) - buf[base+4] = byte(v >> 32) - buf[base+5] = byte(v >> 40) - buf[base+6] = byte(v >> 48) - buf[base+7] = byte(v >> 56) - } - return stream.Write(buf) -} - -func (b *bitmapContainer) readFrom(stream io.Reader) (int, error) { - err := binary.Read(stream, binary.LittleEndian, b.bitmap) - if err != nil { - return 0, err - } - b.computeCardinality() - return 8 * len(b.bitmap), nil -} - -func (bc *bitmapContainer) asLittleEndianByteSlice() []byte { - by := make([]byte, len(bc.bitmap)*8) - for i := range bc.bitmap { - binary.LittleEndian.PutUint64(by[i*8:], bc.bitmap[i]) - } - return by -} - -func uint64SliceAsByteSlice(slice []uint64) []byte { - by := make([]byte, len(slice)*8) - - for i, v := range slice { - binary.LittleEndian.PutUint64(by[i*8:], v) - } - - return by -} - -func uint16SliceAsByteSlice(slice []uint16) 
[]byte { - by := make([]byte, len(slice)*2) - - for i, v := range slice { - binary.LittleEndian.PutUint16(by[i*2:], v) - } - - return by -} - -func interval16SliceAsByteSlice(slice []interval16) []byte { - by := make([]byte, len(slice)*4) - - for i, v := range slice { - binary.LittleEndian.PutUint16(by[i*2:], v.start) - binary.LittleEndian.PutUint16(by[i*2+2:], v.length) - } - - return by -} - -func byteSliceAsUint16Slice(slice []byte) []uint16 { - if len(slice)%2 != 0 { - panic("Slice size should be divisible by 2") - } - - b := make([]uint16, len(slice)/2) - - for i := range b { - b[i] = binary.LittleEndian.Uint16(slice[2*i:]) - } - - return b -} - -func byteSliceAsUint64Slice(slice []byte) []uint64 { - if len(slice)%8 != 0 { - panic("Slice size should be divisible by 8") - } - - b := make([]uint64, len(slice)/8) - - for i := range b { - b[i] = binary.LittleEndian.Uint64(slice[8*i:]) - } - - return b -} - -// Converts a byte slice to a interval16 slice. -// The function assumes that the slice byte buffer is run container data -// encoded according to Roaring Format Spec -func byteSliceAsInterval16Slice(byteSlice []byte) []interval16 { - if len(byteSlice)%4 != 0 { - panic("Slice size should be divisible by 4") - } - - intervalSlice := make([]interval16, len(byteSlice)/4) - - for i := range intervalSlice { - intervalSlice[i] = interval16{ - start: binary.LittleEndian.Uint16(byteSlice[i*4:]), - length: binary.LittleEndian.Uint16(byteSlice[i*4+2:]), - } - } - - return intervalSlice -} diff --git a/vendor/github.com/RoaringBitmap/roaring/serialization_littleendian.go b/vendor/github.com/RoaringBitmap/roaring/serialization_littleendian.go deleted file mode 100644 index 6e3a5d5..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/serialization_littleendian.go +++ /dev/null @@ -1,662 +0,0 @@ -//go:build (386 && !appengine) || (amd64 && !appengine) || (arm && !appengine) || (arm64 && !appengine) || (ppc64le && !appengine) || (mipsle && !appengine) || (mips64le && 
!appengine) || (mips64p32le && !appengine) || (wasm && !appengine) -// +build 386,!appengine amd64,!appengine arm,!appengine arm64,!appengine ppc64le,!appengine mipsle,!appengine mips64le,!appengine mips64p32le,!appengine wasm,!appengine - -package roaring - -import ( - "encoding/binary" - "errors" - "io" - "reflect" - "runtime" - "unsafe" -) - -func (ac *arrayContainer) writeTo(stream io.Writer) (int, error) { - buf := uint16SliceAsByteSlice(ac.content) - return stream.Write(buf) -} - -func (bc *bitmapContainer) writeTo(stream io.Writer) (int, error) { - if bc.cardinality <= arrayDefaultMaxSize { - return 0, errors.New("refusing to write bitmap container with cardinality of array container") - } - buf := uint64SliceAsByteSlice(bc.bitmap) - return stream.Write(buf) -} - -func uint64SliceAsByteSlice(slice []uint64) []byte { - // make a new slice header - header := *(*reflect.SliceHeader)(unsafe.Pointer(&slice)) - - // update its capacity and length - header.Len *= 8 - header.Cap *= 8 - - // instantiate result and use KeepAlive so data isn't unmapped. - result := *(*[]byte)(unsafe.Pointer(&header)) - runtime.KeepAlive(&slice) - - // return it - return result -} - -func uint16SliceAsByteSlice(slice []uint16) []byte { - // make a new slice header - header := *(*reflect.SliceHeader)(unsafe.Pointer(&slice)) - - // update its capacity and length - header.Len *= 2 - header.Cap *= 2 - - // instantiate result and use KeepAlive so data isn't unmapped. - result := *(*[]byte)(unsafe.Pointer(&header)) - runtime.KeepAlive(&slice) - - // return it - return result -} - -func interval16SliceAsByteSlice(slice []interval16) []byte { - // make a new slice header - header := *(*reflect.SliceHeader)(unsafe.Pointer(&slice)) - - // update its capacity and length - header.Len *= 4 - header.Cap *= 4 - - // instantiate result and use KeepAlive so data isn't unmapped. 
- result := *(*[]byte)(unsafe.Pointer(&header)) - runtime.KeepAlive(&slice) - - // return it - return result -} - -func (bc *bitmapContainer) asLittleEndianByteSlice() []byte { - return uint64SliceAsByteSlice(bc.bitmap) -} - -// Deserialization code follows - -// // -// These methods (byteSliceAsUint16Slice,...) do not make copies, -// they are pointer-based (unsafe). The caller is responsible to -// ensure that the input slice does not get garbage collected, deleted -// or modified while you hold the returned slince. -// // -func byteSliceAsUint16Slice(slice []byte) (result []uint16) { // here we create a new slice holder - if len(slice)%2 != 0 { - panic("Slice size should be divisible by 2") - } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / 2 - rHeader.Cap = bHeader.Cap / 2 - - // instantiate result and use KeepAlive so data isn't unmapped. - runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return -} - -func byteSliceAsUint64Slice(slice []byte) (result []uint64) { - if len(slice)%8 != 0 { - panic("Slice size should be divisible by 8") - } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / 8 - rHeader.Cap = bHeader.Cap / 8 - - // instantiate result and use KeepAlive so data isn't unmapped. 
- runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return -} - -func byteSliceAsInterval16Slice(slice []byte) (result []interval16) { - if len(slice)%4 != 0 { - panic("Slice size should be divisible by 4") - } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / 4 - rHeader.Cap = bHeader.Cap / 4 - - // instantiate result and use KeepAlive so data isn't unmapped. - runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return -} - -func byteSliceAsContainerSlice(slice []byte) (result []container) { - var c container - containerSize := int(unsafe.Sizeof(c)) - - if len(slice)%containerSize != 0 { - panic("Slice size should be divisible by unsafe.Sizeof(container)") - } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / containerSize - rHeader.Cap = bHeader.Cap / containerSize - - // instantiate result and use KeepAlive so data isn't unmapped. 
- runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return -} - -func byteSliceAsBitsetSlice(slice []byte) (result []bitmapContainer) { - bitsetSize := int(unsafe.Sizeof(bitmapContainer{})) - if len(slice)%bitsetSize != 0 { - panic("Slice size should be divisible by unsafe.Sizeof(bitmapContainer)") - } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / bitsetSize - rHeader.Cap = bHeader.Cap / bitsetSize - - // instantiate result and use KeepAlive so data isn't unmapped. - runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return -} - -func byteSliceAsArraySlice(slice []byte) (result []arrayContainer) { - arraySize := int(unsafe.Sizeof(arrayContainer{})) - if len(slice)%arraySize != 0 { - panic("Slice size should be divisible by unsafe.Sizeof(arrayContainer)") - } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / arraySize - rHeader.Cap = bHeader.Cap / arraySize - - // instantiate result and use KeepAlive so data isn't unmapped. 
- runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return -} - -func byteSliceAsRun16Slice(slice []byte) (result []runContainer16) { - run16Size := int(unsafe.Sizeof(runContainer16{})) - if len(slice)%run16Size != 0 { - panic("Slice size should be divisible by unsafe.Sizeof(runContainer16)") - } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / run16Size - rHeader.Cap = bHeader.Cap / run16Size - - // instantiate result and use KeepAlive so data isn't unmapped. - runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return -} - -func byteSliceAsBoolSlice(slice []byte) (result []bool) { - boolSize := int(unsafe.Sizeof(true)) - if len(slice)%boolSize != 0 { - panic("Slice size should be divisible by unsafe.Sizeof(bool)") - } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / boolSize - rHeader.Cap = bHeader.Cap / boolSize - - // instantiate result and use KeepAlive so data isn't unmapped. - runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return -} - -// FrozenView creates a static view of a serialized bitmap stored in buf. -// It uses CRoaring's frozen bitmap format. 
-// -// The format specification is available here: -// https://github.com/RoaringBitmap/CRoaring/blob/2c867e9f9c9e2a3a7032791f94c4c7ae3013f6e0/src/roaring.c#L2756-L2783 -// -// The provided byte array (buf) is expected to be a constant. -// The function makes the best effort attempt not to copy data. -// Only little endian is supported. The function will err if it detects a big -// endian serialized file. -// You should take care not to modify buff as it will likely result in -// unexpected program behavior. -// If said buffer comes from a memory map, it's advisable to give it read -// only permissions, either at creation or by calling Mprotect from the -// golang.org/x/sys/unix package. -// -// Resulting bitmaps are effectively immutable in the following sense: -// a copy-on-write marker is used so that when you modify the resulting -// bitmap, copies of selected data (containers) are made. -// You should *not* change the copy-on-write status of the resulting -// bitmaps (SetCopyOnWrite). -// -// If buf becomes unavailable, then a bitmap created with -// FromBuffer would be effectively broken. Furthermore, any -// bitmap derived from this bitmap (e.g., via Or, And) might -// also be broken. Thus, before making buf unavailable, you should -// call CloneCopyOnWriteContainers on all such bitmaps. -func (rb *Bitmap) FrozenView(buf []byte) error { - return rb.highlowcontainer.frozenView(buf) -} - -/* Verbatim specification from CRoaring. - * - * FROZEN SERIALIZATION FORMAT DESCRIPTION - * - * -- (beginning must be aligned by 32 bytes) -- - * uint64_t[BITSET_CONTAINER_SIZE_IN_WORDS * num_bitset_containers] - * rle16_t[total number of rle elements in all run containers] - * uint16_t[total number of array elements in all array containers] - * uint16_t[num_containers] - * uint16_t[num_containers] - * uint8_t[num_containers] - *
uint32_t - * - *
is a 4-byte value which is a bit union of frozenCookie (15 bits) - * and the number of containers (17 bits). - * - * stores number of elements for every container. - * Its meaning depends on container type. - * For array and bitset containers, this value is the container cardinality minus one. - * For run container, it is the number of rle_t elements (n_runs). - * - * ,, are flat arrays of elements of - * all containers of respective type. - * - * <*_data> and are kept close together because they are not accessed - * during deserilization. This may reduce IO in case of large mmaped bitmaps. - * All members have their native alignments during deserilization except
, - * which is not guaranteed to be aligned by 4 bytes. - */ -const frozenCookie = 13766 - -var ( - // ErrFrozenBitmapInvalidCookie is returned when the header does not contain the frozenCookie. - ErrFrozenBitmapInvalidCookie = errors.New("header does not contain the frozenCookie") - // ErrFrozenBitmapBigEndian is returned when the header is big endian. - ErrFrozenBitmapBigEndian = errors.New("loading big endian frozen bitmaps is not supported") - // ErrFrozenBitmapIncomplete is returned when the buffer is too small to contain a frozen bitmap. - ErrFrozenBitmapIncomplete = errors.New("input buffer too small to contain a frozen bitmap") - // ErrFrozenBitmapOverpopulated is returned when the number of containers is too large. - ErrFrozenBitmapOverpopulated = errors.New("too many containers") - // ErrFrozenBitmapUnexpectedData is returned when the buffer contains unexpected data. - ErrFrozenBitmapUnexpectedData = errors.New("spurious data in input") - // ErrFrozenBitmapInvalidTypecode is returned when the typecode is invalid. - ErrFrozenBitmapInvalidTypecode = errors.New("unrecognized typecode") - // ErrFrozenBitmapBufferTooSmall is returned when the buffer is too small. - ErrFrozenBitmapBufferTooSmall = errors.New("buffer too small") -) - -func (ra *roaringArray) frozenView(buf []byte) error { - if len(buf) < 4 { - return ErrFrozenBitmapIncomplete - } - - headerBE := binary.BigEndian.Uint32(buf[len(buf)-4:]) - if headerBE&0x7fff == frozenCookie { - return ErrFrozenBitmapBigEndian - } - - header := binary.LittleEndian.Uint32(buf[len(buf)-4:]) - buf = buf[:len(buf)-4] - - if header&0x7fff != frozenCookie { - return ErrFrozenBitmapInvalidCookie - } - - nCont := int(header >> 15) - if nCont > (1 << 16) { - return ErrFrozenBitmapOverpopulated - } - - // 1 byte per type, 2 bytes per key, 2 bytes per count. 
- if len(buf) < 5*nCont { - return ErrFrozenBitmapIncomplete - } - - types := buf[len(buf)-nCont:] - buf = buf[:len(buf)-nCont] - - counts := byteSliceAsUint16Slice(buf[len(buf)-2*nCont:]) - buf = buf[:len(buf)-2*nCont] - - keys := byteSliceAsUint16Slice(buf[len(buf)-2*nCont:]) - buf = buf[:len(buf)-2*nCont] - - nBitmap, nArray, nRun := 0, 0, 0 - nArrayEl, nRunEl := 0, 0 - for i, t := range types { - switch t { - case 1: - nBitmap++ - case 2: - nArray++ - nArrayEl += int(counts[i]) + 1 - case 3: - nRun++ - nRunEl += int(counts[i]) - default: - return ErrFrozenBitmapInvalidTypecode - } - } - - if len(buf) < (1<<13)*nBitmap+4*nRunEl+2*nArrayEl { - return ErrFrozenBitmapIncomplete - } - - bitsetsArena := byteSliceAsUint64Slice(buf[:(1<<13)*nBitmap]) - buf = buf[(1<<13)*nBitmap:] - - runsArena := byteSliceAsInterval16Slice(buf[:4*nRunEl]) - buf = buf[4*nRunEl:] - - arraysArena := byteSliceAsUint16Slice(buf[:2*nArrayEl]) - buf = buf[2*nArrayEl:] - - if len(buf) != 0 { - return ErrFrozenBitmapUnexpectedData - } - - var c container - containersSz := int(unsafe.Sizeof(c)) * nCont - bitsetsSz := int(unsafe.Sizeof(bitmapContainer{})) * nBitmap - arraysSz := int(unsafe.Sizeof(arrayContainer{})) * nArray - runsSz := int(unsafe.Sizeof(runContainer16{})) * nRun - needCOWSz := int(unsafe.Sizeof(true)) * nCont - - bitmapArenaSz := containersSz + bitsetsSz + arraysSz + runsSz + needCOWSz - bitmapArena := make([]byte, bitmapArenaSz) - - containers := byteSliceAsContainerSlice(bitmapArena[:containersSz]) - bitmapArena = bitmapArena[containersSz:] - - bitsets := byteSliceAsBitsetSlice(bitmapArena[:bitsetsSz]) - bitmapArena = bitmapArena[bitsetsSz:] - - arrays := byteSliceAsArraySlice(bitmapArena[:arraysSz]) - bitmapArena = bitmapArena[arraysSz:] - - runs := byteSliceAsRun16Slice(bitmapArena[:runsSz]) - bitmapArena = bitmapArena[runsSz:] - - needCOW := byteSliceAsBoolSlice(bitmapArena) - - iBitset, iArray, iRun := 0, 0, 0 - for i, t := range types { - needCOW[i] = true - - switch t { - 
case 1: - containers[i] = &bitsets[iBitset] - bitsets[iBitset].cardinality = int(counts[i]) + 1 - bitsets[iBitset].bitmap = bitsetsArena[:1024] - bitsetsArena = bitsetsArena[1024:] - iBitset++ - case 2: - containers[i] = &arrays[iArray] - sz := int(counts[i]) + 1 - arrays[iArray].content = arraysArena[:sz] - arraysArena = arraysArena[sz:] - iArray++ - case 3: - containers[i] = &runs[iRun] - runs[iRun].iv = runsArena[:counts[i]] - runsArena = runsArena[counts[i]:] - iRun++ - } - } - - // Not consuming the full input is a bug. - if iBitset != nBitmap || len(bitsetsArena) != 0 || - iArray != nArray || len(arraysArena) != 0 || - iRun != nRun || len(runsArena) != 0 { - panic("we missed something") - } - - ra.keys = keys - ra.containers = containers - ra.needCopyOnWrite = needCOW - ra.copyOnWrite = true - - return nil -} - -// GetFrozenSizeInBytes returns the size in bytes of the frozen bitmap. -func (rb *Bitmap) GetFrozenSizeInBytes() uint64 { - nBits, nArrayEl, nRunEl := uint64(0), uint64(0), uint64(0) - for _, c := range rb.highlowcontainer.containers { - switch v := c.(type) { - case *bitmapContainer: - nBits++ - case *arrayContainer: - nArrayEl += uint64(len(v.content)) - case *runContainer16: - nRunEl += uint64(len(v.iv)) - } - } - return 4 + 5*uint64(len(rb.highlowcontainer.containers)) + - (nBits << 13) + 2*nArrayEl + 4*nRunEl -} - -// Freeze serializes the bitmap in the CRoaring's frozen format. -func (rb *Bitmap) Freeze() ([]byte, error) { - sz := rb.GetFrozenSizeInBytes() - buf := make([]byte, sz) - _, err := rb.FreezeTo(buf) - return buf, err -} - -// FreezeTo serializes the bitmap in the CRoaring's frozen format. 
-func (rb *Bitmap) FreezeTo(buf []byte) (int, error) { - containers := rb.highlowcontainer.containers - nCont := len(containers) - - nBits, nArrayEl, nRunEl := 0, 0, 0 - for _, c := range containers { - switch v := c.(type) { - case *bitmapContainer: - nBits++ - case *arrayContainer: - nArrayEl += len(v.content) - case *runContainer16: - nRunEl += len(v.iv) - } - } - - serialSize := 4 + 5*nCont + (1<<13)*nBits + 4*nRunEl + 2*nArrayEl - if len(buf) < serialSize { - return 0, ErrFrozenBitmapBufferTooSmall - } - - bitsArena := byteSliceAsUint64Slice(buf[:(1<<13)*nBits]) - buf = buf[(1<<13)*nBits:] - - runsArena := byteSliceAsInterval16Slice(buf[:4*nRunEl]) - buf = buf[4*nRunEl:] - - arraysArena := byteSliceAsUint16Slice(buf[:2*nArrayEl]) - buf = buf[2*nArrayEl:] - - keys := byteSliceAsUint16Slice(buf[:2*nCont]) - buf = buf[2*nCont:] - - counts := byteSliceAsUint16Slice(buf[:2*nCont]) - buf = buf[2*nCont:] - - types := buf[:nCont] - buf = buf[nCont:] - - header := uint32(frozenCookie | (nCont << 15)) - binary.LittleEndian.PutUint32(buf[:4], header) - - copy(keys, rb.highlowcontainer.keys[:]) - - for i, c := range containers { - switch v := c.(type) { - case *bitmapContainer: - copy(bitsArena, v.bitmap) - bitsArena = bitsArena[1024:] - counts[i] = uint16(v.cardinality - 1) - types[i] = 1 - case *arrayContainer: - copy(arraysArena, v.content) - arraysArena = arraysArena[len(v.content):] - elems := len(v.content) - counts[i] = uint16(elems - 1) - types[i] = 2 - case *runContainer16: - copy(runsArena, v.iv) - runs := len(v.iv) - runsArena = runsArena[runs:] - counts[i] = uint16(runs) - types[i] = 3 - } - } - - return serialSize, nil -} - -// WriteFrozenTo serializes the bitmap in the CRoaring's frozen format. -func (rb *Bitmap) WriteFrozenTo(wr io.Writer) (int, error) { - // FIXME: this is a naive version that iterates 4 times through the - // containers and allocates 3*len(containers) bytes; it's quite likely - // it can be done more efficiently. 
- containers := rb.highlowcontainer.containers - written := 0 - - for _, c := range containers { - c, ok := c.(*bitmapContainer) - if !ok { - continue - } - n, err := wr.Write(uint64SliceAsByteSlice(c.bitmap)) - written += n - if err != nil { - return written, err - } - } - - for _, c := range containers { - c, ok := c.(*runContainer16) - if !ok { - continue - } - n, err := wr.Write(interval16SliceAsByteSlice(c.iv)) - written += n - if err != nil { - return written, err - } - } - - for _, c := range containers { - c, ok := c.(*arrayContainer) - if !ok { - continue - } - n, err := wr.Write(uint16SliceAsByteSlice(c.content)) - written += n - if err != nil { - return written, err - } - } - - n, err := wr.Write(uint16SliceAsByteSlice(rb.highlowcontainer.keys)) - written += n - if err != nil { - return written, err - } - - countTypeBuf := make([]byte, 3*len(containers)) - counts := byteSliceAsUint16Slice(countTypeBuf[:2*len(containers)]) - types := countTypeBuf[2*len(containers):] - - for i, c := range containers { - switch c := c.(type) { - case *bitmapContainer: - counts[i] = uint16(c.cardinality - 1) - types[i] = 1 - case *arrayContainer: - elems := len(c.content) - counts[i] = uint16(elems - 1) - types[i] = 2 - case *runContainer16: - runs := len(c.iv) - counts[i] = uint16(runs) - types[i] = 3 - } - } - - n, err = wr.Write(countTypeBuf) - written += n - if err != nil { - return written, err - } - - header := uint32(frozenCookie | (len(containers) << 15)) - if err := binary.Write(wr, binary.LittleEndian, header); err != nil { - return written, err - } - written += 4 - - return written, nil -} diff --git a/vendor/github.com/RoaringBitmap/roaring/serializationfuzz.go b/vendor/github.com/RoaringBitmap/roaring/serializationfuzz.go deleted file mode 100644 index c7fed02..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/serializationfuzz.go +++ /dev/null @@ -1,22 +0,0 @@ -//go:build gofuzz -// +build gofuzz - -package roaring - -import "bytes" - -func 
FuzzSerializationStream(data []byte) int { - newrb := NewBitmap() - if _, err := newrb.ReadFrom(bytes.NewReader(data)); err != nil { - return 0 - } - return 1 -} - -func FuzzSerializationBuffer(data []byte) int { - newrb := NewBitmap() - if _, err := newrb.FromBuffer(data); err != nil { - return 0 - } - return 1 -} diff --git a/vendor/github.com/RoaringBitmap/roaring/setutil.go b/vendor/github.com/RoaringBitmap/roaring/setutil.go deleted file mode 100644 index 663c4fa..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/setutil.go +++ /dev/null @@ -1,550 +0,0 @@ -package roaring - -func equal(a, b []uint16) bool { - if len(a) != len(b) { - return false - } - for i := range a { - if a[i] != b[i] { - return false - } - } - return true -} - -func difference(set1 []uint16, set2 []uint16, buffer []uint16) int { - if 0 == len(set2) { - buffer = buffer[:len(set1)] - for k := 0; k < len(set1); k++ { - buffer[k] = set1[k] - } - return len(set1) - } - if 0 == len(set1) { - return 0 - } - pos := 0 - k1 := 0 - k2 := 0 - buffer = buffer[:cap(buffer)] - s1 := set1[k1] - s2 := set2[k2] - for { - if s1 < s2 { - buffer[pos] = s1 - pos++ - k1++ - if k1 >= len(set1) { - break - } - s1 = set1[k1] - } else if s1 == s2 { - k1++ - k2++ - if k1 >= len(set1) { - break - } - s1 = set1[k1] - if k2 >= len(set2) { - for ; k1 < len(set1); k1++ { - buffer[pos] = set1[k1] - pos++ - } - break - } - s2 = set2[k2] - } else { // if (val1>val2) - k2++ - if k2 >= len(set2) { - for ; k1 < len(set1); k1++ { - buffer[pos] = set1[k1] - pos++ - } - break - } - s2 = set2[k2] - } - } - return pos - -} - -func exclusiveUnion2by2(set1 []uint16, set2 []uint16, buffer []uint16) int { - if 0 == len(set2) { - buffer = buffer[:len(set1)] - copy(buffer, set1[:]) - return len(set1) - } - if 0 == len(set1) { - buffer = buffer[:len(set2)] - copy(buffer, set2[:]) - return len(set2) - } - pos := 0 - k1 := 0 - k2 := 0 - s1 := set1[k1] - s2 := set2[k2] - buffer = buffer[:cap(buffer)] - for { - if s1 < s2 { - buffer[pos] = 
s1 - pos++ - k1++ - if k1 >= len(set1) { - for ; k2 < len(set2); k2++ { - buffer[pos] = set2[k2] - pos++ - } - break - } - s1 = set1[k1] - } else if s1 == s2 { - k1++ - k2++ - if k1 >= len(set1) { - for ; k2 < len(set2); k2++ { - buffer[pos] = set2[k2] - pos++ - } - break - } - if k2 >= len(set2) { - for ; k1 < len(set1); k1++ { - buffer[pos] = set1[k1] - pos++ - } - break - } - s1 = set1[k1] - s2 = set2[k2] - } else { // if (val1>val2) - buffer[pos] = s2 - pos++ - k2++ - if k2 >= len(set2) { - for ; k1 < len(set1); k1++ { - buffer[pos] = set1[k1] - pos++ - } - break - } - s2 = set2[k2] - } - } - return pos -} - -func union2by2Cardinality(set1 []uint16, set2 []uint16) int { - pos := 0 - k1 := 0 - k2 := 0 - if 0 == len(set2) { - return len(set1) - } - if 0 == len(set1) { - return len(set2) - } - s1 := set1[k1] - s2 := set2[k2] - for { - if s1 < s2 { - pos++ - k1++ - if k1 >= len(set1) { - pos += len(set2) - k2 - break - } - s1 = set1[k1] - } else if s1 == s2 { - pos++ - k1++ - k2++ - if k1 >= len(set1) { - pos += len(set2) - k2 - break - } - if k2 >= len(set2) { - pos += len(set1) - k1 - break - } - s1 = set1[k1] - s2 = set2[k2] - } else { // if (set1[k1]>set2[k2]) - pos++ - k2++ - if k2 >= len(set2) { - pos += len(set1) - k1 - break - } - s2 = set2[k2] - } - } - return pos -} - -func intersection2by2( - set1 []uint16, - set2 []uint16, - buffer []uint16) int { - - if len(set1)*64 < len(set2) { - return onesidedgallopingintersect2by2(set1, set2, buffer) - } else if len(set2)*64 < len(set1) { - return onesidedgallopingintersect2by2(set2, set1, buffer) - } else { - return localintersect2by2(set1, set2, buffer) - } -} - -func intersection2by2Cardinality( - set1 []uint16, - set2 []uint16) int { - - if len(set1)*64 < len(set2) { - return onesidedgallopingintersect2by2Cardinality(set1, set2) - } else if len(set2)*64 < len(set1) { - return onesidedgallopingintersect2by2Cardinality(set2, set1) - } else { - return localintersect2by2Cardinality(set1, set2) - } -} - -func 
intersects2by2( - set1 []uint16, - set2 []uint16) bool { - // could be optimized if one set is much larger than the other one - if (0 == len(set1)) || (0 == len(set2)) { - return false - } - k1 := 0 - k2 := 0 - s1 := set1[k1] - s2 := set2[k2] -mainwhile: - for { - - if s2 < s1 { - for { - k2++ - if k2 == len(set2) { - break mainwhile - } - s2 = set2[k2] - if s2 >= s1 { - break - } - } - } - if s1 < s2 { - for { - k1++ - if k1 == len(set1) { - break mainwhile - } - s1 = set1[k1] - if s1 >= s2 { - break - } - } - - } else { - // (set2[k2] == set1[k1]) - return true - } - } - return false -} - -func localintersect2by2( - set1 []uint16, - set2 []uint16, - buffer []uint16) int { - - if (0 == len(set1)) || (0 == len(set2)) { - return 0 - } - k1 := 0 - k2 := 0 - pos := 0 - buffer = buffer[:cap(buffer)] - s1 := set1[k1] - s2 := set2[k2] -mainwhile: - for { - if s2 < s1 { - for { - k2++ - if k2 == len(set2) { - break mainwhile - } - s2 = set2[k2] - if s2 >= s1 { - break - } - } - } - if s1 < s2 { - for { - k1++ - if k1 == len(set1) { - break mainwhile - } - s1 = set1[k1] - if s1 >= s2 { - break - } - } - - } else { - // (set2[k2] == set1[k1]) - buffer[pos] = s1 - pos++ - k1++ - if k1 == len(set1) { - break - } - s1 = set1[k1] - k2++ - if k2 == len(set2) { - break - } - s2 = set2[k2] - } - } - return pos -} - -func localintersect2by2Cardinality( - set1 []uint16, - set2 []uint16) int { - - if (0 == len(set1)) || (0 == len(set2)) { - return 0 - } - k1 := 0 - k2 := 0 - pos := 0 - s1 := set1[k1] - s2 := set2[k2] -mainwhile: - for { - if s2 < s1 { - for { - k2++ - if k2 == len(set2) { - break mainwhile - } - s2 = set2[k2] - if s2 >= s1 { - break - } - } - } - if s1 < s2 { - for { - k1++ - if k1 == len(set1) { - break mainwhile - } - s1 = set1[k1] - if s1 >= s2 { - break - } - } - - } else { - // (set2[k2] == set1[k1]) - pos++ - k1++ - if k1 == len(set1) { - break - } - s1 = set1[k1] - k2++ - if k2 == len(set2) { - break - } - s2 = set2[k2] - } - } - return pos -} - -func 
advanceUntil( - array []uint16, - pos int, - length int, - min uint16) int { - lower := pos + 1 - - if lower >= length || array[lower] >= min { - return lower - } - - spansize := 1 - - for lower+spansize < length && array[lower+spansize] < min { - spansize *= 2 - } - var upper int - if lower+spansize < length { - upper = lower + spansize - } else { - upper = length - 1 - } - - if array[upper] == min { - return upper - } - - if array[upper] < min { - // means - // array - // has no - // item - // >= min - // pos = array.length; - return length - } - - // we know that the next-smallest span was too small - lower += (spansize >> 1) - - mid := 0 - for lower+1 != upper { - mid = (lower + upper) >> 1 - if array[mid] == min { - return mid - } else if array[mid] < min { - lower = mid - } else { - upper = mid - } - } - return upper - -} - -func onesidedgallopingintersect2by2( - smallset []uint16, - largeset []uint16, - buffer []uint16) int { - - if 0 == len(smallset) { - return 0 - } - buffer = buffer[:cap(buffer)] - k1 := 0 - k2 := 0 - pos := 0 - s1 := largeset[k1] - s2 := smallset[k2] -mainwhile: - - for { - if s1 < s2 { - k1 = advanceUntil(largeset, k1, len(largeset), s2) - if k1 == len(largeset) { - break mainwhile - } - s1 = largeset[k1] - } - if s2 < s1 { - k2++ - if k2 == len(smallset) { - break mainwhile - } - s2 = smallset[k2] - } else { - - buffer[pos] = s2 - pos++ - k2++ - if k2 == len(smallset) { - break - } - s2 = smallset[k2] - k1 = advanceUntil(largeset, k1, len(largeset), s2) - if k1 == len(largeset) { - break mainwhile - } - s1 = largeset[k1] - } - - } - return pos -} - -func onesidedgallopingintersect2by2Cardinality( - smallset []uint16, - largeset []uint16) int { - - if 0 == len(smallset) { - return 0 - } - k1 := 0 - k2 := 0 - pos := 0 - s1 := largeset[k1] - s2 := smallset[k2] -mainwhile: - - for { - if s1 < s2 { - k1 = advanceUntil(largeset, k1, len(largeset), s2) - if k1 == len(largeset) { - break mainwhile - } - s1 = largeset[k1] - } - if s2 < s1 { - 
k2++ - if k2 == len(smallset) { - break mainwhile - } - s2 = smallset[k2] - } else { - - pos++ - k2++ - if k2 == len(smallset) { - break - } - s2 = smallset[k2] - k1 = advanceUntil(largeset, k1, len(largeset), s2) - if k1 == len(largeset) { - break mainwhile - } - s1 = largeset[k1] - } - - } - return pos -} - -func binarySearch(array []uint16, ikey uint16) int { - low := 0 - high := len(array) - 1 - for low+16 <= high { - middleIndex := int(uint32(low+high) >> 1) - middleValue := array[middleIndex] - if middleValue < ikey { - low = middleIndex + 1 - } else if middleValue > ikey { - high = middleIndex - 1 - } else { - return middleIndex - } - } - for ; low <= high; low++ { - val := array[low] - if val >= ikey { - if val == ikey { - return low - } - break - } - } - return -(low + 1) -} diff --git a/vendor/github.com/RoaringBitmap/roaring/setutil_arm64.go b/vendor/github.com/RoaringBitmap/roaring/setutil_arm64.go deleted file mode 100644 index 3e08965..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/setutil_arm64.go +++ /dev/null @@ -1,7 +0,0 @@ -//go:build arm64 && !gccgo && !appengine -// +build arm64,!gccgo,!appengine - -package roaring - -//go:noescape -func union2by2(set1 []uint16, set2 []uint16, buffer []uint16) (size int) diff --git a/vendor/github.com/RoaringBitmap/roaring/setutil_arm64.s b/vendor/github.com/RoaringBitmap/roaring/setutil_arm64.s deleted file mode 100644 index e4f0f20..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/setutil_arm64.s +++ /dev/null @@ -1,132 +0,0 @@ -// +build arm64,!gccgo,!appengine - -#include "textflag.h" - - -// This implements union2by2 using golang's version of arm64 assembly -// The algorithm is very similar to the generic one, -// but makes better use of arm64 features so is notably faster. -// The basic algorithm structure is as follows: -// 1. If either set is empty, copy the other set into the buffer and return the length -// 2. Otherwise, load the first element of each set into a variable (s1 and s2). 
-// 3. a. Compare the values of s1 and s2. - // b. add the smaller one to the buffer. - // c. perform a bounds check before incrementing. - // If one set is finished, copy the rest of the other set over. - // d. update s1 and or s2 to the next value, continue loop. - // - // Past the fact of the algorithm, this code makes use of several arm64 features - // Condition Codes: - // arm64's CMP operation sets 4 bits that can be used for branching, - // rather than just true or false. - // As a consequence, a single comparison gives enough information to distinguish the three cases - // - // Post-increment pointers after load/store: - // Instructions like `MOVHU.P 2(R0), R6` - // increment the register by a specified amount, in this example 2. - // Because uint16's are exactly 2 bytes and the length of the slices - // is part of the slice header, - // there is no need to separately track the index into the slice. - // Instead, the code can calculate the final read value and compare against that, - // using the post-increment reads to move the pointers along. - // - // TODO: CALL out to memmove once the list is exhausted. - // Right now it moves the necessary shorts so that the remaining count - // is a multiple of 4 and then copies 64 bits at a time. - -TEXT ·union2by2(SB), NOSPLIT, $0-80 - // R0, R1, and R2 for the pointers to the three slices - MOVD set1+0(FP), R0 - MOVD set2+24(FP), R1 - MOVD buffer+48(FP), R2 - - //R3 and R4 will be the values at which we will have finished reading set1 and set2. - // R3 should be R0 + 2 * set1_len+8(FP) - MOVD set1_len+8(FP), R3 - MOVD set2_len+32(FP), R4 - - ADD R3<<1, R0, R3 - ADD R4<<1, R1, R4 - - - //Rather than counting the number of elements added separately - //Save the starting register of buffer. 
- MOVD buffer+48(FP), R5 - - // set1 is empty, just flush set2 - CMP R0, R3 - BEQ flush_right - - // set2 is empty, just flush set1 - CMP R1, R4 - BEQ flush_left - - // R6, R7 are the working space for s1 and s2 - MOVD ZR, R6 - MOVD ZR, R7 - - MOVHU.P 2(R0), R6 - MOVHU.P 2(R1), R7 -loop: - - CMP R6, R7 - BEQ pop_both // R6 == R7 - BLS pop_right // R6 > R7 -//pop_left: // R6 < R7 - MOVHU.P R6, 2(R2) - CMP R0, R3 - BEQ pop_then_flush_right - MOVHU.P 2(R0), R6 - JMP loop -pop_both: - MOVHU.P R6, 2(R2) //could also use R7, since they are equal - CMP R0, R3 - BEQ flush_right - CMP R1, R4 - BEQ flush_left - MOVHU.P 2(R0), R6 - MOVHU.P 2(R1), R7 - JMP loop -pop_right: - MOVHU.P R7, 2(R2) - CMP R1, R4 - BEQ pop_then_flush_left - MOVHU.P 2(R1), R7 - JMP loop - -pop_then_flush_right: - MOVHU.P R7, 2(R2) -flush_right: - MOVD R1, R0 - MOVD R4, R3 - JMP flush_left -pop_then_flush_left: - MOVHU.P R6, 2(R2) -flush_left: - CMP R0, R3 - BEQ return - //figure out how many bytes to slough off. Must be a multiple of two - SUB R0, R3, R4 - ANDS $6, R4 - BEQ long_flush //handles the 0 mod 8 case - SUBS $4, R4, R4 // since possible values are 2, 4, 6, this splits evenly - BLT pop_single // exactly the 2 case - MOVW.P 4(R0), R6 - MOVW.P R6, 4(R2) - BEQ long_flush // we're now aligned by 64 bits, as R4==4, otherwise 2 more -pop_single: - MOVHU.P 2(R0), R6 - MOVHU.P R6, 2(R2) -long_flush: - // at this point we know R3 - R0 is a multiple of 8. 
- CMP R0, R3 - BEQ return - MOVD.P 8(R0), R6 - MOVD.P R6, 8(R2) - JMP long_flush -return: - // number of shorts written is (R5 - R2) >> 1 - SUB R5, R2 - LSR $1, R2, R2 - MOVD R2, size+72(FP) - RET diff --git a/vendor/github.com/RoaringBitmap/roaring/setutil_generic.go b/vendor/github.com/RoaringBitmap/roaring/setutil_generic.go deleted file mode 100644 index 4755fd5..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/setutil_generic.go +++ /dev/null @@ -1,64 +0,0 @@ -//go:build !arm64 || gccgo || appengine -// +build !arm64 gccgo appengine - -package roaring - -func union2by2(set1 []uint16, set2 []uint16, buffer []uint16) int { - pos := 0 - k1 := 0 - k2 := 0 - if 0 == len(set2) { - buffer = buffer[:len(set1)] - copy(buffer, set1[:]) - return len(set1) - } - if 0 == len(set1) { - buffer = buffer[:len(set2)] - copy(buffer, set2[:]) - return len(set2) - } - s1 := set1[k1] - s2 := set2[k2] - buffer = buffer[:cap(buffer)] - for { - if s1 < s2 { - buffer[pos] = s1 - pos++ - k1++ - if k1 >= len(set1) { - copy(buffer[pos:], set2[k2:]) - pos += len(set2) - k2 - break - } - s1 = set1[k1] - } else if s1 == s2 { - buffer[pos] = s1 - pos++ - k1++ - k2++ - if k1 >= len(set1) { - copy(buffer[pos:], set2[k2:]) - pos += len(set2) - k2 - break - } - if k2 >= len(set2) { - copy(buffer[pos:], set1[k1:]) - pos += len(set1) - k1 - break - } - s1 = set1[k1] - s2 = set2[k2] - } else { // if (set1[k1]>set2[k2]) - buffer[pos] = s2 - pos++ - k2++ - if k2 >= len(set2) { - copy(buffer[pos:], set1[k1:]) - pos += len(set1) - k1 - break - } - s2 = set2[k2] - } - } - return pos -} diff --git a/vendor/github.com/RoaringBitmap/roaring/shortiterator.go b/vendor/github.com/RoaringBitmap/roaring/shortiterator.go deleted file mode 100644 index 15b78bd..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/shortiterator.go +++ /dev/null @@ -1,52 +0,0 @@ -package roaring - -type shortIterable interface { - hasNext() bool - next() uint16 -} - -type shortPeekable interface { - shortIterable - peekNext() 
uint16 - advanceIfNeeded(minval uint16) -} - -type shortIterator struct { - slice []uint16 - loc int -} - -func (si *shortIterator) hasNext() bool { - return si.loc < len(si.slice) -} - -func (si *shortIterator) next() uint16 { - a := si.slice[si.loc] - si.loc++ - return a -} - -func (si *shortIterator) peekNext() uint16 { - return si.slice[si.loc] -} - -func (si *shortIterator) advanceIfNeeded(minval uint16) { - if si.hasNext() && si.peekNext() < minval { - si.loc = advanceUntil(si.slice, si.loc, len(si.slice), minval) - } -} - -type reverseIterator struct { - slice []uint16 - loc int -} - -func (si *reverseIterator) hasNext() bool { - return si.loc >= 0 -} - -func (si *reverseIterator) next() uint16 { - a := si.slice[si.loc] - si.loc-- - return a -} diff --git a/vendor/github.com/RoaringBitmap/roaring/smat.go b/vendor/github.com/RoaringBitmap/roaring/smat.go deleted file mode 100644 index c52c5f0..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/smat.go +++ /dev/null @@ -1,384 +0,0 @@ -//go:build gofuzz -// +build gofuzz - -/* -# Instructions for smat testing for roaring - -[smat](https://github.com/mschoch/smat) is a framework that provides -state machine assisted fuzz testing. - -To run the smat tests for roaring... - -## Prerequisites - - $ go get github.com/dvyukov/go-fuzz/go-fuzz - $ go get github.com/dvyukov/go-fuzz/go-fuzz-build - -## Steps - -1. Generate initial smat corpus: -``` - go test -tags=gofuzz -run=TestGenerateSmatCorpus -``` - -2. Build go-fuzz test program with instrumentation: -``` - go-fuzz-build -func FuzzSmat github.com/RoaringBitmap/roaring -``` - -3. Run go-fuzz: -``` - go-fuzz -bin=./roaring-fuzz.zip -workdir=workdir/ -timeout=200 -``` - -You should see output like... 
-``` -2016/09/16 13:58:35 slaves: 8, corpus: 1 (3s ago), crashers: 0, restarts: 1/0, execs: 0 (0/sec), cover: 0, uptime: 3s -2016/09/16 13:58:38 slaves: 8, corpus: 1 (6s ago), crashers: 0, restarts: 1/0, execs: 0 (0/sec), cover: 0, uptime: 6s -2016/09/16 13:58:41 slaves: 8, corpus: 1 (9s ago), crashers: 0, restarts: 1/44, execs: 44 (5/sec), cover: 0, uptime: 9s -2016/09/16 13:58:44 slaves: 8, corpus: 1 (12s ago), crashers: 0, restarts: 1/45, execs: 45 (4/sec), cover: 0, uptime: 12s -2016/09/16 13:58:47 slaves: 8, corpus: 1 (15s ago), crashers: 0, restarts: 1/46, execs: 46 (3/sec), cover: 0, uptime: 15s -2016/09/16 13:58:50 slaves: 8, corpus: 1 (18s ago), crashers: 0, restarts: 1/47, execs: 47 (3/sec), cover: 0, uptime: 18s -2016/09/16 13:58:53 slaves: 8, corpus: 1 (21s ago), crashers: 0, restarts: 1/63, execs: 63 (3/sec), cover: 0, uptime: 21s -2016/09/16 13:58:56 slaves: 8, corpus: 1 (24s ago), crashers: 0, restarts: 1/65, execs: 65 (3/sec), cover: 0, uptime: 24s -2016/09/16 13:58:59 slaves: 8, corpus: 1 (27s ago), crashers: 0, restarts: 1/66, execs: 66 (2/sec), cover: 0, uptime: 27s -2016/09/16 13:59:02 slaves: 8, corpus: 1 (30s ago), crashers: 0, restarts: 1/67, execs: 67 (2/sec), cover: 0, uptime: 30s -2016/09/16 13:59:05 slaves: 8, corpus: 1 (33s ago), crashers: 0, restarts: 1/83, execs: 83 (3/sec), cover: 0, uptime: 33s -2016/09/16 13:59:08 slaves: 8, corpus: 1 (36s ago), crashers: 0, restarts: 1/84, execs: 84 (2/sec), cover: 0, uptime: 36s -2016/09/16 13:59:11 slaves: 8, corpus: 2 (0s ago), crashers: 0, restarts: 1/85, execs: 85 (2/sec), cover: 0, uptime: 39s -2016/09/16 13:59:14 slaves: 8, corpus: 17 (2s ago), crashers: 0, restarts: 1/86, execs: 86 (2/sec), cover: 480, uptime: 42s -2016/09/16 13:59:17 slaves: 8, corpus: 17 (5s ago), crashers: 0, restarts: 1/66, execs: 132 (3/sec), cover: 487, uptime: 45s -2016/09/16 13:59:20 slaves: 8, corpus: 17 (8s ago), crashers: 0, restarts: 1/440, execs: 2645 (55/sec), cover: 487, uptime: 48s - -``` - -Let it run, and 
if the # of crashers is > 0, check out the reports in -the workdir where you should be able to find the panic goroutine stack -traces. -*/ - -package roaring - -import ( - "fmt" - "sort" - - "github.com/bits-and-blooms/bitset" - "github.com/mschoch/smat" -) - -// fuzz test using state machine driven by byte stream. -func FuzzSmat(data []byte) int { - return smat.Fuzz(&smatContext{}, smat.ActionID('S'), smat.ActionID('T'), - smatActionMap, data) -} - -var smatDebug = false - -func smatLog(prefix, format string, args ...interface{}) { - if smatDebug { - fmt.Print(prefix) - fmt.Printf(format, args...) - } -} - -type smatContext struct { - pairs []*smatPair - - // Two registers, x & y. - x int - y int - - actions int -} - -type smatPair struct { - bm *Bitmap - bs *bitset.BitSet -} - -// ------------------------------------------------------------------ - -var smatActionMap = smat.ActionMap{ - smat.ActionID('X'): smatAction("x++", smatWrap(func(c *smatContext) { c.x++ })), - smat.ActionID('x'): smatAction("x--", smatWrap(func(c *smatContext) { c.x-- })), - smat.ActionID('Y'): smatAction("y++", smatWrap(func(c *smatContext) { c.y++ })), - smat.ActionID('y'): smatAction("y--", smatWrap(func(c *smatContext) { c.y-- })), - smat.ActionID('*'): smatAction("x*y", smatWrap(func(c *smatContext) { c.x = c.x * c.y })), - smat.ActionID('<'): smatAction("x<<", smatWrap(func(c *smatContext) { c.x = c.x << 1 })), - - smat.ActionID('^'): smatAction("swap", smatWrap(func(c *smatContext) { c.x, c.y = c.y, c.x })), - - smat.ActionID('['): smatAction(" pushPair", smatWrap(smatPushPair)), - smat.ActionID(']'): smatAction(" popPair", smatWrap(smatPopPair)), - - smat.ActionID('B'): smatAction(" setBit", smatWrap(smatSetBit)), - smat.ActionID('b'): smatAction(" removeBit", smatWrap(smatRemoveBit)), - - smat.ActionID('o'): smatAction(" or", smatWrap(smatOr)), - smat.ActionID('a'): smatAction(" and", smatWrap(smatAnd)), - - smat.ActionID('#'): smatAction(" cardinality", 
smatWrap(smatCardinality)), - - smat.ActionID('O'): smatAction(" orCardinality", smatWrap(smatOrCardinality)), - smat.ActionID('A'): smatAction(" andCardinality", smatWrap(smatAndCardinality)), - - smat.ActionID('c'): smatAction(" clear", smatWrap(smatClear)), - smat.ActionID('r'): smatAction(" runOptimize", smatWrap(smatRunOptimize)), - - smat.ActionID('e'): smatAction(" isEmpty", smatWrap(smatIsEmpty)), - - smat.ActionID('i'): smatAction(" intersects", smatWrap(smatIntersects)), - - smat.ActionID('f'): smatAction(" flip", smatWrap(smatFlip)), - - smat.ActionID('-'): smatAction(" difference", smatWrap(smatDifference)), -} - -var smatRunningPercentActions []smat.PercentAction - -func init() { - var ids []int - for actionId := range smatActionMap { - ids = append(ids, int(actionId)) - } - sort.Ints(ids) - - pct := 100 / len(smatActionMap) - for _, actionId := range ids { - smatRunningPercentActions = append(smatRunningPercentActions, - smat.PercentAction{pct, smat.ActionID(actionId)}) - } - - smatActionMap[smat.ActionID('S')] = smatAction("SETUP", smatSetupFunc) - smatActionMap[smat.ActionID('T')] = smatAction("TEARDOWN", smatTeardownFunc) -} - -// We only have one smat state: running. -func smatRunning(next byte) smat.ActionID { - return smat.PercentExecute(next, smatRunningPercentActions...) -} - -func smatAction(name string, f func(ctx smat.Context) (smat.State, error)) func(smat.Context) (smat.State, error) { - return func(ctx smat.Context) (smat.State, error) { - c := ctx.(*smatContext) - c.actions++ - - smatLog(" ", "%s\n", name) - - return f(ctx) - } -} - -// Creates an smat action func based on a simple callback. -func smatWrap(cb func(c *smatContext)) func(smat.Context) (next smat.State, err error) { - return func(ctx smat.Context) (next smat.State, err error) { - c := ctx.(*smatContext) - cb(c) - return smatRunning, nil - } -} - -// Invokes a callback function with the input v bounded to len(c.pairs). 
-func (c *smatContext) withPair(v int, cb func(*smatPair)) { - if len(c.pairs) > 0 { - if v < 0 { - v = -v - } - v = v % len(c.pairs) - cb(c.pairs[v]) - } -} - -// ------------------------------------------------------------------ - -func smatSetupFunc(ctx smat.Context) (next smat.State, err error) { - return smatRunning, nil -} - -func smatTeardownFunc(ctx smat.Context) (next smat.State, err error) { - return nil, err -} - -// ------------------------------------------------------------------ - -func smatPushPair(c *smatContext) { - c.pairs = append(c.pairs, &smatPair{ - bm: NewBitmap(), - bs: bitset.New(100), - }) -} - -func smatPopPair(c *smatContext) { - if len(c.pairs) > 0 { - c.pairs = c.pairs[0 : len(c.pairs)-1] - } -} - -func smatSetBit(c *smatContext) { - c.withPair(c.x, func(p *smatPair) { - y := uint32(c.y) - p.bm.AddInt(int(y)) - p.bs.Set(uint(y)) - p.checkEquals() - }) -} - -func smatRemoveBit(c *smatContext) { - c.withPair(c.x, func(p *smatPair) { - y := uint32(c.y) - p.bm.Remove(y) - p.bs.Clear(uint(y)) - p.checkEquals() - }) -} - -func smatAnd(c *smatContext) { - c.withPair(c.x, func(px *smatPair) { - c.withPair(c.y, func(py *smatPair) { - px.bm.And(py.bm) - px.bs = px.bs.Intersection(py.bs) - px.checkEquals() - py.checkEquals() - }) - }) -} - -func smatOr(c *smatContext) { - c.withPair(c.x, func(px *smatPair) { - c.withPair(c.y, func(py *smatPair) { - px.bm.Or(py.bm) - px.bs = px.bs.Union(py.bs) - px.checkEquals() - py.checkEquals() - }) - }) -} - -func smatAndCardinality(c *smatContext) { - c.withPair(c.x, func(px *smatPair) { - c.withPair(c.y, func(py *smatPair) { - c0 := px.bm.AndCardinality(py.bm) - c1 := px.bs.IntersectionCardinality(py.bs) - if c0 != uint64(c1) { - panic("expected same add cardinality") - } - px.checkEquals() - py.checkEquals() - }) - }) -} - -func smatOrCardinality(c *smatContext) { - c.withPair(c.x, func(px *smatPair) { - c.withPair(c.y, func(py *smatPair) { - c0 := px.bm.OrCardinality(py.bm) - c1 := 
px.bs.UnionCardinality(py.bs) - if c0 != uint64(c1) { - panic("expected same or cardinality") - } - px.checkEquals() - py.checkEquals() - }) - }) -} - -func smatRunOptimize(c *smatContext) { - c.withPair(c.x, func(px *smatPair) { - px.bm.RunOptimize() - px.checkEquals() - }) -} - -func smatClear(c *smatContext) { - c.withPair(c.x, func(px *smatPair) { - px.bm.Clear() - px.bs = px.bs.ClearAll() - px.checkEquals() - }) -} - -func smatCardinality(c *smatContext) { - c.withPair(c.x, func(px *smatPair) { - c0 := px.bm.GetCardinality() - c1 := px.bs.Count() - if c0 != uint64(c1) { - panic("expected same cardinality") - } - }) -} - -func smatIsEmpty(c *smatContext) { - c.withPair(c.x, func(px *smatPair) { - c0 := px.bm.IsEmpty() - c1 := px.bs.None() - if c0 != c1 { - panic("expected same is empty") - } - }) -} - -func smatIntersects(c *smatContext) { - c.withPair(c.x, func(px *smatPair) { - c.withPair(c.y, func(py *smatPair) { - v0 := px.bm.Intersects(py.bm) - v1 := px.bs.IntersectionCardinality(py.bs) > 0 - if v0 != v1 { - panic("intersects not equal") - } - - px.checkEquals() - py.checkEquals() - }) - }) -} - -func smatFlip(c *smatContext) { - c.withPair(c.x, func(p *smatPair) { - y := uint32(c.y) - p.bm.Flip(uint64(y), uint64(y)+1) - p.bs = p.bs.Flip(uint(y)) - p.checkEquals() - }) -} - -func smatDifference(c *smatContext) { - c.withPair(c.x, func(px *smatPair) { - c.withPair(c.y, func(py *smatPair) { - px.bm.AndNot(py.bm) - px.bs = px.bs.Difference(py.bs) - px.checkEquals() - py.checkEquals() - }) - }) -} - -func (p *smatPair) checkEquals() { - if !p.equalsBitSet(p.bs, p.bm) { - panic("bitset mismatch") - } -} - -func (p *smatPair) equalsBitSet(a *bitset.BitSet, b *Bitmap) bool { - for i, e := a.NextSet(0); e; i, e = a.NextSet(i + 1) { - if !b.ContainsInt(int(i)) { - fmt.Printf("in a bitset, not b bitmap, i: %d\n", i) - fmt.Printf(" a bitset: %s\n b bitmap: %s\n", - a.String(), b.String()) - return false - } - } - - i := b.Iterator() - for i.HasNext() { - v := 
i.Next() - if !a.Test(uint(v)) { - fmt.Printf("in b bitmap, not a bitset, v: %d\n", v) - fmt.Printf(" a bitset: %s\n b bitmap: %s\n", - a.String(), b.String()) - return false - } - } - - return true -} diff --git a/vendor/github.com/RoaringBitmap/roaring/util.go b/vendor/github.com/RoaringBitmap/roaring/util.go deleted file mode 100644 index 48b9d5a..0000000 --- a/vendor/github.com/RoaringBitmap/roaring/util.go +++ /dev/null @@ -1,305 +0,0 @@ -package roaring - -import ( - "math" - "math/rand" - "sort" -) - -const ( - arrayDefaultMaxSize = 4096 // containers with 4096 or fewer integers should be array containers. - arrayLazyLowerBound = 1024 - maxCapacity = 1 << 16 - serialCookieNoRunContainer = 12346 // only arrays and bitmaps - invalidCardinality = -1 - serialCookie = 12347 // runs, arrays, and bitmaps - noOffsetThreshold = 4 - - // MaxUint32 is the largest uint32 value. - MaxUint32 = math.MaxUint32 - - // MaxRange is One more than the maximum allowed bitmap bit index. For use as an upper - // bound for ranges. - MaxRange uint64 = MaxUint32 + 1 - - // MaxUint16 is the largest 16 bit unsigned int. - // This is the largest value an interval16 can store. - MaxUint16 = math.MaxUint16 - - // Compute wordSizeInBytes, the size of a word in bytes. 
- _m = ^uint64(0) - _logS = _m>>8&1 + _m>>16&1 + _m>>32&1 - wordSizeInBytes = 1 << _logS - - // other constants used in ctz_generic.go - wordSizeInBits = wordSizeInBytes << 3 // word size in bits -) - -const maxWord = 1< arrayDefaultMaxSize { - // bitmapContainer - return maxCapacity / 8 - } - // arrayContainer - return 2 * card -} - -func fill(arr []uint64, val uint64) { - for i := range arr { - arr[i] = val - } -} -func fillRange(arr []uint64, start, end int, val uint64) { - for i := start; i < end; i++ { - arr[i] = val - } -} - -func fillArrayAND(container []uint16, bitmap1, bitmap2 []uint64) { - if len(bitmap1) != len(bitmap2) { - panic("array lengths don't match") - } - // TODO: rewrite in assembly - pos := 0 - for k := range bitmap1 { - bitset := bitmap1[k] & bitmap2[k] - for bitset != 0 { - t := bitset & -bitset - container[pos] = uint16((k*64 + int(popcount(t-1)))) - pos = pos + 1 - bitset ^= t - } - } -} - -func fillArrayANDNOT(container []uint16, bitmap1, bitmap2 []uint64) { - if len(bitmap1) != len(bitmap2) { - panic("array lengths don't match") - } - // TODO: rewrite in assembly - pos := 0 - for k := range bitmap1 { - bitset := bitmap1[k] &^ bitmap2[k] - for bitset != 0 { - t := bitset & -bitset - container[pos] = uint16((k*64 + int(popcount(t-1)))) - pos = pos + 1 - bitset ^= t - } - } -} - -func fillArrayXOR(container []uint16, bitmap1, bitmap2 []uint64) { - if len(bitmap1) != len(bitmap2) { - panic("array lengths don't match") - } - // TODO: rewrite in assembly - pos := 0 - for k := 0; k < len(bitmap1); k++ { - bitset := bitmap1[k] ^ bitmap2[k] - for bitset != 0 { - t := bitset & -bitset - container[pos] = uint16((k*64 + int(popcount(t-1)))) - pos = pos + 1 - bitset ^= t - } - } -} - -func highbits(x uint32) uint16 { - return uint16(x >> 16) -} -func lowbits(x uint32) uint16 { - return uint16(x & maxLowBit) -} - -const maxLowBit = 0xFFFF - -func flipBitmapRange(bitmap []uint64, start int, end int) { - if start >= end { - return - } - firstword := 
start / 64 - endword := (end - 1) / 64 - bitmap[firstword] ^= ^(^uint64(0) << uint(start%64)) - for i := firstword; i < endword; i++ { - bitmap[i] = ^bitmap[i] - } - bitmap[endword] ^= ^uint64(0) >> (uint(-end) % 64) -} - -func resetBitmapRange(bitmap []uint64, start int, end int) { - if start >= end { - return - } - firstword := start / 64 - endword := (end - 1) / 64 - if firstword == endword { - bitmap[firstword] &= ^((^uint64(0) << uint(start%64)) & (^uint64(0) >> (uint(-end) % 64))) - return - } - bitmap[firstword] &= ^(^uint64(0) << uint(start%64)) - for i := firstword + 1; i < endword; i++ { - bitmap[i] = 0 - } - bitmap[endword] &= ^(^uint64(0) >> (uint(-end) % 64)) - -} - -func setBitmapRange(bitmap []uint64, start int, end int) { - if start >= end { - return - } - firstword := start / 64 - endword := (end - 1) / 64 - if firstword == endword { - bitmap[firstword] |= (^uint64(0) << uint(start%64)) & (^uint64(0) >> (uint(-end) % 64)) - return - } - bitmap[firstword] |= ^uint64(0) << uint(start%64) - for i := firstword + 1; i < endword; i++ { - bitmap[i] = ^uint64(0) - } - bitmap[endword] |= ^uint64(0) >> (uint(-end) % 64) -} - -func flipBitmapRangeAndCardinalityChange(bitmap []uint64, start int, end int) int { - before := wordCardinalityForBitmapRange(bitmap, start, end) - flipBitmapRange(bitmap, start, end) - after := wordCardinalityForBitmapRange(bitmap, start, end) - return int(after - before) -} - -func resetBitmapRangeAndCardinalityChange(bitmap []uint64, start int, end int) int { - before := wordCardinalityForBitmapRange(bitmap, start, end) - resetBitmapRange(bitmap, start, end) - after := wordCardinalityForBitmapRange(bitmap, start, end) - return int(after - before) -} - -func setBitmapRangeAndCardinalityChange(bitmap []uint64, start int, end int) int { - before := wordCardinalityForBitmapRange(bitmap, start, end) - setBitmapRange(bitmap, start, end) - after := wordCardinalityForBitmapRange(bitmap, start, end) - return int(after - before) -} - -func 
wordCardinalityForBitmapRange(bitmap []uint64, start int, end int) uint64 { - answer := uint64(0) - if start >= end { - return answer - } - firstword := start / 64 - endword := (end - 1) / 64 - for i := firstword; i <= endword; i++ { - answer += popcount(bitmap[i]) - } - return answer -} - -func selectBitPosition(w uint64, j int) int { - seen := 0 - - // Divide 64bit - part := w & 0xFFFFFFFF - n := popcount(part) - if n <= uint64(j) { - part = w >> 32 - seen += 32 - j -= int(n) - } - w = part - - // Divide 32bit - part = w & 0xFFFF - n = popcount(part) - if n <= uint64(j) { - part = w >> 16 - seen += 16 - j -= int(n) - } - w = part - - // Divide 16bit - part = w & 0xFF - n = popcount(part) - if n <= uint64(j) { - part = w >> 8 - seen += 8 - j -= int(n) - } - w = part - - // Lookup in final byte - var counter uint - for counter = 0; counter < 8; counter++ { - j -= int((w >> counter) & 1) - if j < 0 { - break - } - } - return seen + int(counter) - -} - -func panicOn(err error) { - if err != nil { - panic(err) - } -} - -type ph struct { - orig int - rand int -} - -type pha []ph - -func (p pha) Len() int { return len(p) } -func (p pha) Less(i, j int) bool { return p[i].rand < p[j].rand } -func (p pha) Swap(i, j int) { p[i], p[j] = p[j], p[i] } - -func getRandomPermutation(n int) []int { - r := make([]ph, n) - for i := 0; i < n; i++ { - r[i].orig = i - r[i].rand = rand.Intn(1 << 29) - } - sort.Sort(pha(r)) - m := make([]int, n) - for i := range m { - m[i] = r[i].orig - } - return m -} - -func minOfInt(a, b int) int { - if a < b { - return a - } - return b -} - -func maxOfInt(a, b int) int { - if a > b { - return a - } - return b -} - -func maxOfUint16(a, b uint16) uint16 { - if a > b { - return a - } - return b -} - -func minOfUint16(a, b uint16) uint16 { - if a < b { - return a - } - return b -} diff --git a/vendor/github.com/bits-and-blooms/bitset/.gitignore b/vendor/github.com/bits-and-blooms/bitset/.gitignore deleted file mode 100644 index 5c204d2..0000000 --- 
a/vendor/github.com/bits-and-blooms/bitset/.gitignore +++ /dev/null @@ -1,26 +0,0 @@ -# Compiled Object files, Static and Dynamic libs (Shared Objects) -*.o -*.a -*.so - -# Folders -_obj -_test - -# Architecture specific extensions/prefixes -*.[568vq] -[568vq].out - -*.cgo1.go -*.cgo2.c -_cgo_defun.c -_cgo_gotypes.go -_cgo_export.* - -_testmain.go - -*.exe -*.test -*.prof - -target diff --git a/vendor/github.com/bits-and-blooms/bitset/.travis.yml b/vendor/github.com/bits-and-blooms/bitset/.travis.yml deleted file mode 100644 index 094aa5c..0000000 --- a/vendor/github.com/bits-and-blooms/bitset/.travis.yml +++ /dev/null @@ -1,37 +0,0 @@ -language: go - -sudo: false - -branches: - except: - - release - -branches: - only: - - master - - travis - -go: - - "1.11.x" - - tip - -matrix: - allow_failures: - - go: tip - -before_install: - - if [ -n "$GH_USER" ]; then git config --global github.user ${GH_USER}; fi; - - if [ -n "$GH_TOKEN" ]; then git config --global github.token ${GH_TOKEN}; fi; - - go get github.com/mattn/goveralls - -before_script: - - make deps - -script: - - make qa - -after_failure: - - cat ./target/test/report.xml - -after_success: - - if [ "$TRAVIS_GO_VERSION" = "1.11.1" ]; then $HOME/gopath/bin/goveralls -covermode=count -coverprofile=target/report/coverage.out -service=travis-ci; fi; diff --git a/vendor/github.com/bits-and-blooms/bitset/README.md b/vendor/github.com/bits-and-blooms/bitset/README.md deleted file mode 100644 index 848234e..0000000 --- a/vendor/github.com/bits-and-blooms/bitset/README.md +++ /dev/null @@ -1,152 +0,0 @@ -# bitset - -*Go language library to map between non-negative integers and boolean values* - -[![Test](https://github.com/bits-and-blooms/bitset/workflows/Test/badge.svg)](https://github.com/willf/bitset/actions?query=workflow%3ATest) -[![Go Report Card](https://goreportcard.com/badge/github.com/willf/bitset)](https://goreportcard.com/report/github.com/willf/bitset) 
-[![PkgGoDev](https://pkg.go.dev/badge/github.com/bits-and-blooms/bitset?tab=doc)](https://pkg.go.dev/github.com/bits-and-blooms/bitset?tab=doc) - - -This library is part of the [awesome go collection](https://github.com/avelino/awesome-go). It is used in production by several important systems: - -* [beego](https://github.com/beego/beego) -* [CubeFS](https://github.com/cubefs/cubefs) -* [Amazon EKS Distro](https://github.com/aws/eks-distro) -* [sourcegraph](https://github.com/sourcegraph/sourcegraph) -* [torrent](https://github.com/anacrolix/torrent) - - -## Description - -Package bitset implements bitsets, a mapping between non-negative integers and boolean values. -It should be more efficient than map[uint] bool. - -It provides methods for setting, clearing, flipping, and testing individual integers. - -But it also provides set intersection, union, difference, complement, and symmetric operations, as well as tests to check whether any, all, or no bits are set, and querying a bitset's current length and number of positive bits. - -BitSets are expanded to the size of the largest set bit; the memory allocation is approximately Max bits, where Max is the largest set bit. BitSets are never shrunk. On creation, a hint can be given for the number of bits that will be used. - -Many of the methods, including Set, Clear, and Flip, return a BitSet pointer, which allows for chaining. 
- -### Example use: - -```go -package main - -import ( - "fmt" - "math/rand" - - "github.com/bits-and-blooms/bitset" -) - -func main() { - fmt.Printf("Hello from BitSet!\n") - var b bitset.BitSet - // play some Go Fish - for i := 0; i < 100; i++ { - card1 := uint(rand.Intn(52)) - card2 := uint(rand.Intn(52)) - b.Set(card1) - if b.Test(card2) { - fmt.Println("Go Fish!") - } - b.Clear(card1) - } - - // Chaining - b.Set(10).Set(11) - - for i, e := b.NextSet(0); e; i, e = b.NextSet(i + 1) { - fmt.Println("The following bit is set:", i) - } - if b.Intersection(bitset.New(100).Set(10)).Count() == 1 { - fmt.Println("Intersection works.") - } else { - fmt.Println("Intersection doesn't work???") - } -} -``` - - -Package documentation is at: https://pkg.go.dev/github.com/bits-and-blooms/bitset?tab=doc - -## Serialization - - -You may serialize a bitset safely and portably to a stream -of bytes as follows: -```Go - const length = 9585 - const oneEvery = 97 - bs := bitset.New(length) - // Add some bits - for i := uint(0); i < length; i += oneEvery { - bs = bs.Set(i) - } - - var buf bytes.Buffer - n, err := bs.WriteTo(&buf) - if err != nil { - // failure - } - // Here n == buf.Len() -``` -You can later deserialize the result as follows: - -```Go - // Read back from buf - bs = bitset.New() - n, err = bs.ReadFrom(&buf) - if err != nil { - // error - } - // n is the number of bytes read -``` - -The `ReadFrom` function attempts to read the data into the existing -BitSet instance, to minimize memory allocations. - - -*Performance tip*: -When reading and writing to a file or a network connection, you may get better performance by -wrapping your streams with `bufio` instances. - -E.g., -```Go - f, err := os.Create("myfile") - w := bufio.NewWriter(f) -``` -```Go - f, err := os.Open("myfile") - r := bufio.NewReader(f) -``` - -## Memory Usage - -The memory usage of a bitset using `N` bits is at least `N/8` bytes. 
The number of bits in a bitset is at least as large as one plus the greatest bit index you have accessed. Thus it is possible to run out of memory while using a bitset. If you have lots of bits, you might prefer compressed bitsets, like the [Roaring bitmaps](http://roaringbitmap.org) and its [Go implementation](https://github.com/RoaringBitmap/roaring). - -## Implementation Note - -Go 1.9 introduced a native `math/bits` library. We provide backward compatibility to Go 1.7, which might be removed. - -It is possible that a later version will match the `math/bits` return signature for counts (which is `int`, rather than our library's `uint64`). If so, the version will be bumped. - -## Installation - -```bash -go get github.com/bits-and-blooms/bitset -``` - -## Contributing - -If you wish to contribute to this project, please branch and issue a pull request against master ("[GitHub Flow](https://guides.github.com/introduction/flow/)") - -## Running all tests - -Before committing the code, please check if it passes tests, has adequate coverage, etc. -```bash -go test -go test -cover -``` diff --git a/vendor/github.com/bits-and-blooms/bitset/SECURITY.md b/vendor/github.com/bits-and-blooms/bitset/SECURITY.md deleted file mode 100644 index f888420..0000000 --- a/vendor/github.com/bits-and-blooms/bitset/SECURITY.md +++ /dev/null @@ -1,5 +0,0 @@ -# Security Policy - -## Reporting a Vulnerability - -You can report privately a vulnerability by email at daniel@lemire.me (current maintainer). diff --git a/vendor/github.com/bits-and-blooms/bitset/azure-pipelines.yml b/vendor/github.com/bits-and-blooms/bitset/azure-pipelines.yml deleted file mode 100644 index f9b2959..0000000 --- a/vendor/github.com/bits-and-blooms/bitset/azure-pipelines.yml +++ /dev/null @@ -1,39 +0,0 @@ -# Go -# Build your Go project. 
-# Add steps that test, save build artifacts, deploy, and more: -# https://docs.microsoft.com/azure/devops/pipelines/languages/go - -trigger: -- master - -pool: - vmImage: 'Ubuntu-16.04' - -variables: - GOBIN: '$(GOPATH)/bin' # Go binaries path - GOROOT: '/usr/local/go1.11' # Go installation path - GOPATH: '$(system.defaultWorkingDirectory)/gopath' # Go workspace path - modulePath: '$(GOPATH)/src/github.com/$(build.repository.name)' # Path to the module's code - -steps: -- script: | - mkdir -p '$(GOBIN)' - mkdir -p '$(GOPATH)/pkg' - mkdir -p '$(modulePath)' - shopt -s extglob - shopt -s dotglob - mv !(gopath) '$(modulePath)' - echo '##vso[task.prependpath]$(GOBIN)' - echo '##vso[task.prependpath]$(GOROOT)/bin' - displayName: 'Set up the Go workspace' - -- script: | - go version - go get -v -t -d ./... - if [ -f Gopkg.toml ]; then - curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh - dep ensure - fi - go build -v . - workingDirectory: '$(modulePath)' - displayName: 'Get dependencies, then build' diff --git a/vendor/github.com/bits-and-blooms/bitset/bitset.go b/vendor/github.com/bits-and-blooms/bitset/bitset.go deleted file mode 100644 index 8fb9e9f..0000000 --- a/vendor/github.com/bits-and-blooms/bitset/bitset.go +++ /dev/null @@ -1,1137 +0,0 @@ -/* -Package bitset implements bitsets, a mapping -between non-negative integers and boolean values. It should be more -efficient than map[uint] bool. - -It provides methods for setting, clearing, flipping, and testing -individual integers. - -But it also provides set intersection, union, difference, -complement, and symmetric operations, as well as tests to -check whether any, all, or no bits are set, and querying a -bitset's current length and number of positive bits. - -BitSets are expanded to the size of the largest set bit; the -memory allocation is approximately Max bits, where Max is -the largest set bit. BitSets are never shrunk. 
On creation, -a hint can be given for the number of bits that will be used. - -Many of the methods, including Set,Clear, and Flip, return -a BitSet pointer, which allows for chaining. - -Example use: - - import "bitset" - var b BitSet - b.Set(10).Set(11) - if b.Test(1000) { - b.Clear(1000) - } - if B.Intersection(bitset.New(100).Set(10)).Count() > 1 { - fmt.Println("Intersection works.") - } - -As an alternative to BitSets, one should check out the 'big' package, -which provides a (less set-theoretical) view of bitsets. -*/ -package bitset - -import ( - "bytes" - "encoding/base64" - "encoding/binary" - "encoding/json" - "errors" - "fmt" - "io" - "strconv" -) - -// the wordSize of a bit set -const wordSize = uint(64) - -// the wordSize of a bit set in bytes -const wordBytes = wordSize / 8 - -// log2WordSize is lg(wordSize) -const log2WordSize = uint(6) - -// allBits has every bit set -const allBits uint64 = 0xffffffffffffffff - -// default binary BigEndian -var binaryOrder binary.ByteOrder = binary.BigEndian - -// default json encoding base64.URLEncoding -var base64Encoding = base64.URLEncoding - -// Base64StdEncoding Marshal/Unmarshal BitSet with base64.StdEncoding(Default: base64.URLEncoding) -func Base64StdEncoding() { base64Encoding = base64.StdEncoding } - -// LittleEndian Marshal/Unmarshal Binary as Little Endian(Default: binary.BigEndian) -func LittleEndian() { binaryOrder = binary.LittleEndian } - -// A BitSet is a set of bits. The zero value of a BitSet is an empty set of length 0. -type BitSet struct { - length uint - set []uint64 -} - -// Error is used to distinguish errors (panics) generated in this package. 
-type Error string - -// safeSet will fixup b.set to be non-nil and return the field value -func (b *BitSet) safeSet() []uint64 { - if b.set == nil { - b.set = make([]uint64, wordsNeeded(0)) - } - return b.set -} - -// SetBitsetFrom fills the bitset with an array of integers without creating a new BitSet instance -func (b *BitSet) SetBitsetFrom(buf []uint64) { - b.length = uint(len(buf)) * 64 - b.set = buf -} - -// From is a constructor used to create a BitSet from an array of integers -func From(buf []uint64) *BitSet { - return FromWithLength(uint(len(buf))*64, buf) -} - -// FromWithLength constructs from an array of integers and length. -func FromWithLength(len uint, set []uint64) *BitSet { - return &BitSet{len, set} -} - -// Bytes returns the bitset as array of integers -func (b *BitSet) Bytes() []uint64 { - return b.set -} - -// wordsNeeded calculates the number of words needed for i bits -func wordsNeeded(i uint) int { - if i > (Cap() - wordSize + 1) { - return int(Cap() >> log2WordSize) - } - return int((i + (wordSize - 1)) >> log2WordSize) -} - -// wordsNeededUnbound calculates the number of words needed for i bits, possibly exceeding the capacity. -// This function is useful if you know that the capacity cannot be exceeded (e.g., you have an existing bitmap). -func wordsNeededUnbound(i uint) int { - return int((i + (wordSize - 1)) >> log2WordSize) -} - -// wordsIndex calculates the index of words in a `uint64` -func wordsIndex(i uint) uint { - return i & (wordSize - 1) -} - -// New creates a new BitSet with a hint that length bits will be required -func New(length uint) (bset *BitSet) { - defer func() { - if r := recover(); r != nil { - bset = &BitSet{ - 0, - make([]uint64, 0), - } - } - }() - - bset = &BitSet{ - length, - make([]uint64, wordsNeeded(length)), - } - - return bset -} - -// Cap returns the total possible capacity, or number of bits -func Cap() uint { - return ^uint(0) -} - -// Len returns the number of bits in the BitSet. 
-// Note the difference to method Count, see example. -func (b *BitSet) Len() uint { - return b.length -} - -// extendSet adds additional words to incorporate new bits if needed -func (b *BitSet) extendSet(i uint) { - if i >= Cap() { - panic("You are exceeding the capacity") - } - nsize := wordsNeeded(i + 1) - if b.set == nil { - b.set = make([]uint64, nsize) - } else if cap(b.set) >= nsize { - b.set = b.set[:nsize] // fast resize - } else if len(b.set) < nsize { - newset := make([]uint64, nsize, 2*nsize) // increase capacity 2x - copy(newset, b.set) - b.set = newset - } - b.length = i + 1 -} - -// Test whether bit i is set. -func (b *BitSet) Test(i uint) bool { - if i >= b.length { - return false - } - return b.set[i>>log2WordSize]&(1<= Cap(), this function will panic. -// Warning: using a very large value for 'i' -// may lead to a memory shortage and a panic: the caller is responsible -// for providing sensible parameters in line with their memory capacity. -func (b *BitSet) Set(i uint) *BitSet { - if i >= b.length { // if we need more bits, make 'em - b.extendSet(i) - } - b.set[i>>log2WordSize] |= 1 << wordsIndex(i) - return b -} - -// Clear bit i to 0 -func (b *BitSet) Clear(i uint) *BitSet { - if i >= b.length { - return b - } - b.set[i>>log2WordSize] &^= 1 << wordsIndex(i) - return b -} - -// SetTo sets bit i to value. -// If i>= Cap(), this function will panic. -// Warning: using a very large value for 'i' -// may lead to a memory shortage and a panic: the caller is responsible -// for providing sensible parameters in line with their memory capacity. -func (b *BitSet) SetTo(i uint, value bool) *BitSet { - if value { - return b.Set(i) - } - return b.Clear(i) -} - -// Flip bit at i. -// If i>= Cap(), this function will panic. -// Warning: using a very large value for 'i' -// may lead to a memory shortage and a panic: the caller is responsible -// for providing sensible parameters in line with their memory capacity. 
-func (b *BitSet) Flip(i uint) *BitSet { - if i >= b.length { - return b.Set(i) - } - b.set[i>>log2WordSize] ^= 1 << wordsIndex(i) - return b -} - -// FlipRange bit in [start, end). -// If end>= Cap(), this function will panic. -// Warning: using a very large value for 'end' -// may lead to a memory shortage and a panic: the caller is responsible -// for providing sensible parameters in line with their memory capacity. -func (b *BitSet) FlipRange(start, end uint) *BitSet { - if start >= end { - return b - } - if end-1 >= b.length { // if we need more bits, make 'em - b.extendSet(end - 1) - } - var startWord uint = start >> log2WordSize - var endWord uint = end >> log2WordSize - b.set[startWord] ^= ^(^uint64(0) << wordsIndex(start)) - if endWord > 0 { - // bounds check elimination - data := b.set - _ = data[endWord-1] - for i := startWord; i < endWord; i++ { - data[i] = ^data[i] - } - } - if end&(wordSize-1) != 0 { - b.set[endWord] ^= ^uint64(0) >> wordsIndex(-end) - } - return b -} - -// Shrink shrinks BitSet so that the provided value is the last possible -// set value. It clears all bits > the provided index and reduces the size -// and length of the set. -// -// Note that the parameter value is not the new length in bits: it is the -// maximal value that can be stored in the bitset after the function call. -// The new length in bits is the parameter value + 1. Thus it is not possible -// to use this function to set the length to 0, the minimal value of the length -// after this function call is 1. -// -// A new slice is allocated to store the new bits, so you may see an increase in -// memory usage until the GC runs. Normally this should not be a problem, but if you -// have an extremely large BitSet its important to understand that the old BitSet will -// remain in memory until the GC frees it. 
-func (b *BitSet) Shrink(lastbitindex uint) *BitSet { - length := lastbitindex + 1 - idx := wordsNeeded(length) - if idx > len(b.set) { - return b - } - shrunk := make([]uint64, idx) - copy(shrunk, b.set[:idx]) - b.set = shrunk - b.length = length - lastWordUsedBits := length % 64 - if lastWordUsedBits != 0 { - b.set[idx-1] &= allBits >> uint64(64-wordsIndex(lastWordUsedBits)) - } - return b -} - -// Compact shrinks BitSet to so that we preserve all set bits, while minimizing -// memory usage. Compact calls Shrink. -func (b *BitSet) Compact() *BitSet { - idx := len(b.set) - 1 - for ; idx >= 0 && b.set[idx] == 0; idx-- { - } - newlength := uint((idx + 1) << log2WordSize) - if newlength >= b.length { - return b // nothing to do - } - if newlength > 0 { - return b.Shrink(newlength - 1) - } - // We preserve one word - return b.Shrink(63) -} - -// InsertAt takes an index which indicates where a bit should be -// inserted. Then it shifts all the bits in the set to the left by 1, starting -// from the given index position, and sets the index position to 0. -// -// Depending on the size of your BitSet, and where you are inserting the new entry, -// this method could be extremely slow and in some cases might cause the entire BitSet -// to be recopied. 
-func (b *BitSet) InsertAt(idx uint) *BitSet { - insertAtElement := idx >> log2WordSize - - // if length of set is a multiple of wordSize we need to allocate more space first - if b.isLenExactMultiple() { - b.set = append(b.set, uint64(0)) - } - - var i uint - for i = uint(len(b.set) - 1); i > insertAtElement; i-- { - // all elements above the position where we want to insert can simply by shifted - b.set[i] <<= 1 - - // we take the most significant bit of the previous element and set it as - // the least significant bit of the current element - b.set[i] |= (b.set[i-1] & 0x8000000000000000) >> 63 - } - - // generate a mask to extract the data that we need to shift left - // within the element where we insert a bit - dataMask := uint64(1)< 0x40000 { - buffer.WriteString("...") - break - } - buffer.WriteString(strconv.FormatInt(int64(i), 10)) - i, e = b.NextSet(i + 1) - if e { - buffer.WriteString(",") - } - } - buffer.WriteString("}") - return buffer.String() -} - -// DeleteAt deletes the bit at the given index position from -// within the bitset -// All the bits residing on the left of the deleted bit get -// shifted right by 1 -// The running time of this operation may potentially be -// relatively slow, O(length) -func (b *BitSet) DeleteAt(i uint) *BitSet { - // the index of the slice element where we'll delete a bit - deleteAtElement := i >> log2WordSize - - // generate a mask for the data that needs to be shifted right - // within that slice element that gets modified - dataMask := ^((uint64(1) << wordsIndex(i)) - 1) - - // extract the data that we'll shift right from the slice element - data := b.set[deleteAtElement] & dataMask - - // set the masked area to 0 while leaving the rest as it is - b.set[deleteAtElement] &= ^dataMask - - // shift the previously extracted data to the right and then - // set it in the previously masked area - b.set[deleteAtElement] |= (data >> 1) & dataMask - - // loop over all the consecutive slice elements to copy each - // lowest 
bit into the highest position of the previous element, - // then shift the entire content to the right by 1 - for i := int(deleteAtElement) + 1; i < len(b.set); i++ { - b.set[i-1] |= (b.set[i] & 1) << 63 - b.set[i] >>= 1 - } - - b.length = b.length - 1 - - return b -} - -// NextSet returns the next bit set from the specified index, -// including possibly the current index -// along with an error code (true = valid, false = no set bit found) -// for i,e := v.NextSet(0); e; i,e = v.NextSet(i + 1) {...} -// -// Users concerned with performance may want to use NextSetMany to -// retrieve several values at once. -func (b *BitSet) NextSet(i uint) (uint, bool) { - x := int(i >> log2WordSize) - if x >= len(b.set) { - return 0, false - } - w := b.set[x] - w = w >> wordsIndex(i) - if w != 0 { - return i + trailingZeroes64(w), true - } - x++ - // bounds check elimination in the loop - if x < 0 { - return 0, false - } - for x < len(b.set) { - if b.set[x] != 0 { - return uint(x)*wordSize + trailingZeroes64(b.set[x]), true - } - x++ - - } - return 0, false -} - -// NextSetMany returns many next bit sets from the specified index, -// including possibly the current index and up to cap(buffer). -// If the returned slice has len zero, then no more set bits were found -// -// buffer := make([]uint, 256) // this should be reused -// j := uint(0) -// j, buffer = bitmap.NextSetMany(j, buffer) -// for ; len(buffer) > 0; j, buffer = bitmap.NextSetMany(j,buffer) { -// for k := range buffer { -// do something with buffer[k] -// } -// j += 1 -// } -// -// It is possible to retrieve all set bits as follow: -// -// indices := make([]uint, bitmap.Count()) -// bitmap.NextSetMany(0, indices) -// -// However if bitmap.Count() is large, it might be preferable to -// use several calls to NextSetMany, for performance reasons. 
-func (b *BitSet) NextSetMany(i uint, buffer []uint) (uint, []uint) { - myanswer := buffer - capacity := cap(buffer) - x := int(i >> log2WordSize) - if x >= len(b.set) || capacity == 0 { - return 0, myanswer[:0] - } - skip := wordsIndex(i) - word := b.set[x] >> skip - myanswer = myanswer[:capacity] - size := int(0) - for word != 0 { - r := trailingZeroes64(word) - t := word & ((^word) + 1) - myanswer[size] = r + i - size++ - if size == capacity { - goto End - } - word = word ^ t - } - x++ - for idx, word := range b.set[x:] { - for word != 0 { - r := trailingZeroes64(word) - t := word & ((^word) + 1) - myanswer[size] = r + (uint(x+idx) << 6) - size++ - if size == capacity { - goto End - } - word = word ^ t - } - } -End: - if size > 0 { - return myanswer[size-1], myanswer[:size] - } - return 0, myanswer[:0] -} - -// NextClear returns the next clear bit from the specified index, -// including possibly the current index -// along with an error code (true = valid, false = no bit found i.e. all bits are set) -func (b *BitSet) NextClear(i uint) (uint, bool) { - x := int(i >> log2WordSize) - if x >= len(b.set) { - return 0, false - } - w := b.set[x] - w = w >> wordsIndex(i) - wA := allBits >> wordsIndex(i) - index := i + trailingZeroes64(^w) - if w != wA && index < b.length { - return index, true - } - x++ - // bounds check elimination in the loop - if x < 0 { - return 0, false - } - for x < len(b.set) { - if b.set[x] != allBits { - index = uint(x)*wordSize + trailingZeroes64(^b.set[x]) - if index < b.length { - return index, true - } - } - x++ - } - return 0, false -} - -// ClearAll clears the entire BitSet -func (b *BitSet) ClearAll() *BitSet { - if b != nil && b.set != nil { - for i := range b.set { - b.set[i] = 0 - } - } - return b -} - -// wordCount returns the number of words used in a bit set -func (b *BitSet) wordCount() int { - return wordsNeededUnbound(b.length) -} - -// Clone this BitSet -func (b *BitSet) Clone() *BitSet { - c := New(b.length) - if b.set != nil 
{ // Clone should not modify current object - copy(c.set, b.set) - } - return c -} - -// Copy into a destination BitSet using the Go array copy semantics: -// the number of bits copied is the minimum of the number of bits in the current -// BitSet (Len()) and the destination Bitset. -// We return the number of bits copied in the destination BitSet. -func (b *BitSet) Copy(c *BitSet) (count uint) { - if c == nil { - return - } - if b.set != nil { // Copy should not modify current object - copy(c.set, b.set) - } - count = c.length - if b.length < c.length { - count = b.length - } - // Cleaning the last word is needed to keep the invariant that other functions, such as Count, require - // that any bits in the last word that would exceed the length of the bitmask are set to 0. - c.cleanLastWord() - return -} - -// CopyFull copies into a destination BitSet such that the destination is -// identical to the source after the operation, allocating memory if necessary. -func (b *BitSet) CopyFull(c *BitSet) { - if c == nil { - return - } - c.length = b.length - if len(b.set) == 0 { - if c.set != nil { - c.set = c.set[:0] - } - } else { - if cap(c.set) < len(b.set) { - c.set = make([]uint64, len(b.set)) - } else { - c.set = c.set[:len(b.set)] - } - copy(c.set, b.set) - } -} - -// Count (number of set bits). -// Also known as "popcount" or "population count". -func (b *BitSet) Count() uint { - if b != nil && b.set != nil { - return uint(popcntSlice(b.set)) - } - return 0 -} - -// Equal tests the equivalence of two BitSets. 
-// False if they are of different sizes, otherwise true -// only if all the same bits are set -func (b *BitSet) Equal(c *BitSet) bool { - if c == nil || b == nil { - return c == b - } - if b.length != c.length { - return false - } - if b.length == 0 { // if they have both length == 0, then could have nil set - return true - } - wn := b.wordCount() - // bounds check elimination - if wn <= 0 { - return true - } - _ = b.set[wn-1] - _ = c.set[wn-1] - for p := 0; p < wn; p++ { - if c.set[p] != b.set[p] { - return false - } - } - return true -} - -func panicIfNull(b *BitSet) { - if b == nil { - panic(Error("BitSet must not be null")) - } -} - -// Difference of base set and other set -// This is the BitSet equivalent of &^ (and not) -func (b *BitSet) Difference(compare *BitSet) (result *BitSet) { - panicIfNull(b) - panicIfNull(compare) - result = b.Clone() // clone b (in case b is bigger than compare) - l := compare.wordCount() - if l > b.wordCount() { - l = b.wordCount() - } - for i := 0; i < l; i++ { - result.set[i] = b.set[i] &^ compare.set[i] - } - return -} - -// DifferenceCardinality computes the cardinality of the differnce -func (b *BitSet) DifferenceCardinality(compare *BitSet) uint { - panicIfNull(b) - panicIfNull(compare) - l := compare.wordCount() - if l > b.wordCount() { - l = b.wordCount() - } - cnt := uint64(0) - cnt += popcntMaskSlice(b.set[:l], compare.set[:l]) - cnt += popcntSlice(b.set[l:]) - return uint(cnt) -} - -// InPlaceDifference computes the difference of base set and other set -// This is the BitSet equivalent of &^ (and not) -func (b *BitSet) InPlaceDifference(compare *BitSet) { - panicIfNull(b) - panicIfNull(compare) - l := compare.wordCount() - if l > b.wordCount() { - l = b.wordCount() - } - if l <= 0 { - return - } - // bounds check elimination - data, cmpData := b.set, compare.set - _ = data[l-1] - _ = cmpData[l-1] - for i := 0; i < l; i++ { - data[i] &^= cmpData[i] - } -} - -// Convenience function: return two bitsets ordered by -// 
increasing length. Note: neither can be nil -func sortByLength(a *BitSet, b *BitSet) (ap *BitSet, bp *BitSet) { - if a.length <= b.length { - ap, bp = a, b - } else { - ap, bp = b, a - } - return -} - -// Intersection of base set and other set -// This is the BitSet equivalent of & (and) -func (b *BitSet) Intersection(compare *BitSet) (result *BitSet) { - panicIfNull(b) - panicIfNull(compare) - b, compare = sortByLength(b, compare) - result = New(b.length) - for i, word := range b.set { - result.set[i] = word & compare.set[i] - } - return -} - -// IntersectionCardinality computes the cardinality of the union -func (b *BitSet) IntersectionCardinality(compare *BitSet) uint { - panicIfNull(b) - panicIfNull(compare) - b, compare = sortByLength(b, compare) - cnt := popcntAndSlice(b.set, compare.set) - return uint(cnt) -} - -// InPlaceIntersection destructively computes the intersection of -// base set and the compare set. -// This is the BitSet equivalent of & (and) -func (b *BitSet) InPlaceIntersection(compare *BitSet) { - panicIfNull(b) - panicIfNull(compare) - l := compare.wordCount() - if l > b.wordCount() { - l = b.wordCount() - } - if l > 0 { - // bounds check elimination - data, cmpData := b.set, compare.set - _ = data[l-1] - _ = cmpData[l-1] - - for i := 0; i < l; i++ { - data[i] &= cmpData[i] - } - } - if l >= 0 { - for i := l; i < len(b.set); i++ { - b.set[i] = 0 - } - } - if compare.length > 0 { - if compare.length-1 >= b.length { - b.extendSet(compare.length - 1) - } - } -} - -// Union of base set and other set -// This is the BitSet equivalent of | (or) -func (b *BitSet) Union(compare *BitSet) (result *BitSet) { - panicIfNull(b) - panicIfNull(compare) - b, compare = sortByLength(b, compare) - result = compare.Clone() - for i, word := range b.set { - result.set[i] = word | compare.set[i] - } - return -} - -// UnionCardinality computes the cardinality of the uniton of the base set -// and the compare set. 
-func (b *BitSet) UnionCardinality(compare *BitSet) uint { - panicIfNull(b) - panicIfNull(compare) - b, compare = sortByLength(b, compare) - cnt := popcntOrSlice(b.set, compare.set) - if len(compare.set) > len(b.set) { - cnt += popcntSlice(compare.set[len(b.set):]) - } - return uint(cnt) -} - -// InPlaceUnion creates the destructive union of base set and compare set. -// This is the BitSet equivalent of | (or). -func (b *BitSet) InPlaceUnion(compare *BitSet) { - panicIfNull(b) - panicIfNull(compare) - l := compare.wordCount() - if l > b.wordCount() { - l = b.wordCount() - } - if compare.length > 0 && compare.length-1 >= b.length { - b.extendSet(compare.length - 1) - } - if l > 0 { - // bounds check elimination - data, cmpData := b.set, compare.set - _ = data[l-1] - _ = cmpData[l-1] - - for i := 0; i < l; i++ { - data[i] |= cmpData[i] - } - } - if len(compare.set) > l { - for i := l; i < len(compare.set); i++ { - b.set[i] = compare.set[i] - } - } -} - -// SymmetricDifference of base set and other set -// This is the BitSet equivalent of ^ (xor) -func (b *BitSet) SymmetricDifference(compare *BitSet) (result *BitSet) { - panicIfNull(b) - panicIfNull(compare) - b, compare = sortByLength(b, compare) - // compare is bigger, so clone it - result = compare.Clone() - for i, word := range b.set { - result.set[i] = word ^ compare.set[i] - } - return -} - -// SymmetricDifferenceCardinality computes the cardinality of the symmetric difference -func (b *BitSet) SymmetricDifferenceCardinality(compare *BitSet) uint { - panicIfNull(b) - panicIfNull(compare) - b, compare = sortByLength(b, compare) - cnt := popcntXorSlice(b.set, compare.set) - if len(compare.set) > len(b.set) { - cnt += popcntSlice(compare.set[len(b.set):]) - } - return uint(cnt) -} - -// InPlaceSymmetricDifference creates the destructive SymmetricDifference of base set and other set -// This is the BitSet equivalent of ^ (xor) -func (b *BitSet) InPlaceSymmetricDifference(compare *BitSet) { - panicIfNull(b) - 
panicIfNull(compare) - l := compare.wordCount() - if l > b.wordCount() { - l = b.wordCount() - } - if compare.length > 0 && compare.length-1 >= b.length { - b.extendSet(compare.length - 1) - } - if l > 0 { - // bounds check elimination - data, cmpData := b.set, compare.set - _ = data[l-1] - _ = cmpData[l-1] - for i := 0; i < l; i++ { - data[i] ^= cmpData[i] - } - } - if len(compare.set) > l { - for i := l; i < len(compare.set); i++ { - b.set[i] = compare.set[i] - } - } -} - -// Is the length an exact multiple of word sizes? -func (b *BitSet) isLenExactMultiple() bool { - return wordsIndex(b.length) == 0 -} - -// Clean last word by setting unused bits to 0 -func (b *BitSet) cleanLastWord() { - if !b.isLenExactMultiple() { - b.set[len(b.set)-1] &= allBits >> (wordSize - wordsIndex(b.length)) - } -} - -// Complement computes the (local) complement of a bitset (up to length bits) -func (b *BitSet) Complement() (result *BitSet) { - panicIfNull(b) - result = New(b.length) - for i, word := range b.set { - result.set[i] = ^word - } - result.cleanLastWord() - return -} - -// All returns true if all bits are set, false otherwise. Returns true for -// empty sets. -func (b *BitSet) All() bool { - panicIfNull(b) - return b.Count() == b.length -} - -// None returns true if no bit is set, false otherwise. Returns true for -// empty sets. 
-func (b *BitSet) None() bool { - panicIfNull(b) - if b != nil && b.set != nil { - for _, word := range b.set { - if word > 0 { - return false - } - } - } - return true -} - -// Any returns true if any bit is set, false otherwise -func (b *BitSet) Any() bool { - panicIfNull(b) - return !b.None() -} - -// IsSuperSet returns true if this is a superset of the other set -func (b *BitSet) IsSuperSet(other *BitSet) bool { - l := other.wordCount() - if b.wordCount() < l { - l = b.wordCount() - } - for i, word := range other.set[:l] { - if b.set[i]&word != word { - return false - } - } - return popcntSlice(other.set[l:]) == 0 -} - -// IsStrictSuperSet returns true if this is a strict superset of the other set -func (b *BitSet) IsStrictSuperSet(other *BitSet) bool { - return b.Count() > other.Count() && b.IsSuperSet(other) -} - -// DumpAsBits dumps a bit set as a string of bits -func (b *BitSet) DumpAsBits() string { - if b.set == nil { - return "." - } - buffer := bytes.NewBufferString("") - i := len(b.set) - 1 - for ; i >= 0; i-- { - fmt.Fprintf(buffer, "%064b.", b.set[i]) - } - return buffer.String() -} - -// BinaryStorageSize returns the binary storage requirements (see WriteTo) in bytes. 
-func (b *BitSet) BinaryStorageSize() int { - return int(wordBytes + wordBytes*uint(b.wordCount())) -} - -func readUint64Array(reader io.Reader, data []uint64) error { - length := len(data) - bufferSize := 128 - buffer := make([]byte, bufferSize*int(wordBytes)) - for i := 0; i < length; i += bufferSize { - end := i + bufferSize - if end > length { - end = length - buffer = buffer[:wordBytes*uint(end-i)] - } - chunk := data[i:end] - if _, err := io.ReadFull(reader, buffer); err != nil { - return err - } - for i := range chunk { - chunk[i] = uint64(binaryOrder.Uint64(buffer[8*i:])) - } - } - return nil -} - -func writeUint64Array(writer io.Writer, data []uint64) error { - bufferSize := 128 - buffer := make([]byte, bufferSize*int(wordBytes)) - for i := 0; i < len(data); i += bufferSize { - end := i + bufferSize - if end > len(data) { - end = len(data) - buffer = buffer[:wordBytes*uint(end-i)] - } - chunk := data[i:end] - for i, x := range chunk { - binaryOrder.PutUint64(buffer[8*i:], x) - } - _, err := writer.Write(buffer) - if err != nil { - return err - } - } - return nil -} - -// WriteTo writes a BitSet to a stream. The format is: -// 1. uint64 length -// 2. []uint64 set -// Upon success, the number of bytes written is returned. -// -// Performance: if this function is used to write to a disk or network -// connection, it might be beneficial to wrap the stream in a bufio.Writer. -// E.g., -// -// f, err := os.Create("myfile") -// w := bufio.NewWriter(f) -func (b *BitSet) WriteTo(stream io.Writer) (int64, error) { - length := uint64(b.length) - // Write length - err := binary.Write(stream, binaryOrder, &length) - if err != nil { - // Upon failure, we do not guarantee that we - // return the number of bytes written. - return int64(0), err - } - err = writeUint64Array(stream, b.set[:b.wordCount()]) - if err != nil { - // Upon failure, we do not guarantee that we - // return the number of bytes written. 
- return int64(wordBytes), err - } - return int64(b.BinaryStorageSize()), nil -} - -// ReadFrom reads a BitSet from a stream written using WriteTo -// The format is: -// 1. uint64 length -// 2. []uint64 set -// Upon success, the number of bytes read is returned. -// If the current BitSet is not large enough to hold the data, -// it is extended. In case of error, the BitSet is either -// left unchanged or made empty if the error occurs too late -// to preserve the content. -// -// Performance: if this function is used to read from a disk or network -// connection, it might be beneficial to wrap the stream in a bufio.Reader. -// E.g., -// -// f, err := os.Open("myfile") -// r := bufio.NewReader(f) -func (b *BitSet) ReadFrom(stream io.Reader) (int64, error) { - var length uint64 - err := binary.Read(stream, binaryOrder, &length) - if err != nil { - if err == io.EOF { - err = io.ErrUnexpectedEOF - } - return 0, err - } - newlength := uint(length) - - if uint64(newlength) != length { - return 0, errors.New("unmarshalling error: type mismatch") - } - nWords := wordsNeeded(uint(newlength)) - if cap(b.set) >= nWords { - b.set = b.set[:nWords] - } else { - b.set = make([]uint64, nWords) - } - - b.length = newlength - - err = readUint64Array(stream, b.set) - if err != nil { - if err == io.EOF { - err = io.ErrUnexpectedEOF - } - // We do not want to leave the BitSet partially filled as - // it is error prone. - b.set = b.set[:0] - b.length = 0 - return 0, err - } - - return int64(b.BinaryStorageSize()), nil -} - -// MarshalBinary encodes a BitSet into a binary form and returns the result. -func (b *BitSet) MarshalBinary() ([]byte, error) { - var buf bytes.Buffer - _, err := b.WriteTo(&buf) - if err != nil { - return []byte{}, err - } - - return buf.Bytes(), err -} - -// UnmarshalBinary decodes the binary form generated by MarshalBinary. 
-func (b *BitSet) UnmarshalBinary(data []byte) error { - buf := bytes.NewReader(data) - _, err := b.ReadFrom(buf) - return err -} - -// MarshalJSON marshals a BitSet as a JSON structure -func (b BitSet) MarshalJSON() ([]byte, error) { - buffer := bytes.NewBuffer(make([]byte, 0, b.BinaryStorageSize())) - _, err := b.WriteTo(buffer) - if err != nil { - return nil, err - } - - // URLEncode all bytes - return json.Marshal(base64Encoding.EncodeToString(buffer.Bytes())) -} - -// UnmarshalJSON unmarshals a BitSet from JSON created using MarshalJSON -func (b *BitSet) UnmarshalJSON(data []byte) error { - // Unmarshal as string - var s string - err := json.Unmarshal(data, &s) - if err != nil { - return err - } - - // URLDecode string - buf, err := base64Encoding.DecodeString(s) - if err != nil { - return err - } - - _, err = b.ReadFrom(bytes.NewReader(buf)) - return err -} diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt.go b/vendor/github.com/bits-and-blooms/bitset/popcnt.go deleted file mode 100644 index 76577a8..0000000 --- a/vendor/github.com/bits-and-blooms/bitset/popcnt.go +++ /dev/null @@ -1,53 +0,0 @@ -package bitset - -// bit population count, take from -// https://code.google.com/p/go/issues/detail?id=4988#c11 -// credit: https://code.google.com/u/arnehormann/ -func popcount(x uint64) (n uint64) { - x -= (x >> 1) & 0x5555555555555555 - x = (x>>2)&0x3333333333333333 + x&0x3333333333333333 - x += x >> 4 - x &= 0x0f0f0f0f0f0f0f0f - x *= 0x0101010101010101 - return x >> 56 -} - -func popcntSliceGo(s []uint64) uint64 { - cnt := uint64(0) - for _, x := range s { - cnt += popcount(x) - } - return cnt -} - -func popcntMaskSliceGo(s, m []uint64) uint64 { - cnt := uint64(0) - for i := range s { - cnt += popcount(s[i] &^ m[i]) - } - return cnt -} - -func popcntAndSliceGo(s, m []uint64) uint64 { - cnt := uint64(0) - for i := range s { - cnt += popcount(s[i] & m[i]) - } - return cnt -} - -func popcntOrSliceGo(s, m []uint64) uint64 { - cnt := uint64(0) - for i := 
range s { - cnt += popcount(s[i] | m[i]) - } - return cnt -} - -func popcntXorSliceGo(s, m []uint64) uint64 { - cnt := uint64(0) - for i := range s { - cnt += popcount(s[i] ^ m[i]) - } - return cnt -} diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_19.go b/vendor/github.com/bits-and-blooms/bitset/popcnt_19.go deleted file mode 100644 index 7855c04..0000000 --- a/vendor/github.com/bits-and-blooms/bitset/popcnt_19.go +++ /dev/null @@ -1,62 +0,0 @@ -//go:build go1.9 -// +build go1.9 - -package bitset - -import "math/bits" - -func popcntSlice(s []uint64) uint64 { - var cnt int - for _, x := range s { - cnt += bits.OnesCount64(x) - } - return uint64(cnt) -} - -func popcntMaskSlice(s, m []uint64) uint64 { - var cnt int - // this explicit check eliminates a bounds check in the loop - if len(m) < len(s) { - panic("mask slice is too short") - } - for i := range s { - cnt += bits.OnesCount64(s[i] &^ m[i]) - } - return uint64(cnt) -} - -func popcntAndSlice(s, m []uint64) uint64 { - var cnt int - // this explicit check eliminates a bounds check in the loop - if len(m) < len(s) { - panic("mask slice is too short") - } - for i := range s { - cnt += bits.OnesCount64(s[i] & m[i]) - } - return uint64(cnt) -} - -func popcntOrSlice(s, m []uint64) uint64 { - var cnt int - // this explicit check eliminates a bounds check in the loop - if len(m) < len(s) { - panic("mask slice is too short") - } - for i := range s { - cnt += bits.OnesCount64(s[i] | m[i]) - } - return uint64(cnt) -} - -func popcntXorSlice(s, m []uint64) uint64 { - var cnt int - // this explicit check eliminates a bounds check in the loop - if len(m) < len(s) { - panic("mask slice is too short") - } - for i := range s { - cnt += bits.OnesCount64(s[i] ^ m[i]) - } - return uint64(cnt) -} diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.go b/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.go deleted file mode 100644 index 116e044..0000000 --- 
a/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.go +++ /dev/null @@ -1,68 +0,0 @@ -//go:build !go1.9 && amd64 && !appengine -// +build !go1.9,amd64,!appengine - -package bitset - -// *** the following functions are defined in popcnt_amd64.s - -//go:noescape - -func hasAsm() bool - -// useAsm is a flag used to select the GO or ASM implementation of the popcnt function -var useAsm = hasAsm() - -//go:noescape - -func popcntSliceAsm(s []uint64) uint64 - -//go:noescape - -func popcntMaskSliceAsm(s, m []uint64) uint64 - -//go:noescape - -func popcntAndSliceAsm(s, m []uint64) uint64 - -//go:noescape - -func popcntOrSliceAsm(s, m []uint64) uint64 - -//go:noescape - -func popcntXorSliceAsm(s, m []uint64) uint64 - -func popcntSlice(s []uint64) uint64 { - if useAsm { - return popcntSliceAsm(s) - } - return popcntSliceGo(s) -} - -func popcntMaskSlice(s, m []uint64) uint64 { - if useAsm { - return popcntMaskSliceAsm(s, m) - } - return popcntMaskSliceGo(s, m) -} - -func popcntAndSlice(s, m []uint64) uint64 { - if useAsm { - return popcntAndSliceAsm(s, m) - } - return popcntAndSliceGo(s, m) -} - -func popcntOrSlice(s, m []uint64) uint64 { - if useAsm { - return popcntOrSliceAsm(s, m) - } - return popcntOrSliceGo(s, m) -} - -func popcntXorSlice(s, m []uint64) uint64 { - if useAsm { - return popcntXorSliceAsm(s, m) - } - return popcntXorSliceGo(s, m) -} diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.s b/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.s deleted file mode 100644 index 666c0dc..0000000 --- a/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.s +++ /dev/null @@ -1,104 +0,0 @@ -// +build !go1.9 -// +build amd64,!appengine - -TEXT ·hasAsm(SB),4,$0-1 -MOVQ $1, AX -CPUID -SHRQ $23, CX -ANDQ $1, CX -MOVB CX, ret+0(FP) -RET - -#define POPCNTQ_DX_DX BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xb8; BYTE $0xd2 - -TEXT ·popcntSliceAsm(SB),4,$0-32 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntSliceEnd 
-popcntSliceLoop: -BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xb8; BYTE $0x16 // POPCNTQ (SI), DX -ADDQ DX, AX -ADDQ $8, SI -LOOP popcntSliceLoop -popcntSliceEnd: -MOVQ AX, ret+24(FP) -RET - -TEXT ·popcntMaskSliceAsm(SB),4,$0-56 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntMaskSliceEnd -MOVQ m+24(FP), DI -popcntMaskSliceLoop: -MOVQ (DI), DX -NOTQ DX -ANDQ (SI), DX -POPCNTQ_DX_DX -ADDQ DX, AX -ADDQ $8, SI -ADDQ $8, DI -LOOP popcntMaskSliceLoop -popcntMaskSliceEnd: -MOVQ AX, ret+48(FP) -RET - -TEXT ·popcntAndSliceAsm(SB),4,$0-56 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntAndSliceEnd -MOVQ m+24(FP), DI -popcntAndSliceLoop: -MOVQ (DI), DX -ANDQ (SI), DX -POPCNTQ_DX_DX -ADDQ DX, AX -ADDQ $8, SI -ADDQ $8, DI -LOOP popcntAndSliceLoop -popcntAndSliceEnd: -MOVQ AX, ret+48(FP) -RET - -TEXT ·popcntOrSliceAsm(SB),4,$0-56 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntOrSliceEnd -MOVQ m+24(FP), DI -popcntOrSliceLoop: -MOVQ (DI), DX -ORQ (SI), DX -POPCNTQ_DX_DX -ADDQ DX, AX -ADDQ $8, SI -ADDQ $8, DI -LOOP popcntOrSliceLoop -popcntOrSliceEnd: -MOVQ AX, ret+48(FP) -RET - -TEXT ·popcntXorSliceAsm(SB),4,$0-56 -XORQ AX, AX -MOVQ s+0(FP), SI -MOVQ s_len+8(FP), CX -TESTQ CX, CX -JZ popcntXorSliceEnd -MOVQ m+24(FP), DI -popcntXorSliceLoop: -MOVQ (DI), DX -XORQ (SI), DX -POPCNTQ_DX_DX -ADDQ DX, AX -ADDQ $8, SI -ADDQ $8, DI -LOOP popcntXorSliceLoop -popcntXorSliceEnd: -MOVQ AX, ret+48(FP) -RET diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_generic.go b/vendor/github.com/bits-and-blooms/bitset/popcnt_generic.go deleted file mode 100644 index 9e0ad46..0000000 --- a/vendor/github.com/bits-and-blooms/bitset/popcnt_generic.go +++ /dev/null @@ -1,25 +0,0 @@ -//go:build !go1.9 && (!amd64 || appengine) -// +build !go1.9 -// +build !amd64 appengine - -package bitset - -func popcntSlice(s []uint64) uint64 { - return popcntSliceGo(s) -} - -func popcntMaskSlice(s, m []uint64) uint64 { - 
return popcntMaskSliceGo(s, m) -} - -func popcntAndSlice(s, m []uint64) uint64 { - return popcntAndSliceGo(s, m) -} - -func popcntOrSlice(s, m []uint64) uint64 { - return popcntOrSliceGo(s, m) -} - -func popcntXorSlice(s, m []uint64) uint64 { - return popcntXorSliceGo(s, m) -} diff --git a/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_18.go b/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_18.go deleted file mode 100644 index 12336e7..0000000 --- a/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_18.go +++ /dev/null @@ -1,15 +0,0 @@ -//go:build !go1.9 -// +build !go1.9 - -package bitset - -var deBruijn = [...]byte{ - 0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4, - 62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5, - 63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11, - 54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6, -} - -func trailingZeroes64(v uint64) uint { - return uint(deBruijn[((v&-v)*0x03f79d71b4ca8b09)>>58]) -} diff --git a/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_19.go b/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_19.go deleted file mode 100644 index cfb0a84..0000000 --- a/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_19.go +++ /dev/null @@ -1,10 +0,0 @@ -//go:build go1.9 -// +build go1.9 - -package bitset - -import "math/bits" - -func trailingZeroes64(v uint64) uint { - return uint(bits.TrailingZeros64(v)) -} diff --git a/vendor/github.com/RoaringBitmap/roaring/LICENSE-2.0.txt b/vendor/github.com/cockroachdb/crlib/LICENSE similarity index 99% rename from vendor/github.com/RoaringBitmap/roaring/LICENSE-2.0.txt rename to vendor/github.com/cockroachdb/crlib/LICENSE index aff5f99..261eeb9 100644 --- a/vendor/github.com/RoaringBitmap/roaring/LICENSE-2.0.txt +++ b/vendor/github.com/cockroachdb/crlib/LICENSE @@ -1,4 +1,3 @@ - Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ @@ -187,7 +186,7 @@ same "printed page" as the copyright 
notice for easier identification within third-party archives. - Copyright 2016 by the authors + Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix.go b/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix.go new file mode 100644 index 0000000..1cbf852 --- /dev/null +++ b/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix.go @@ -0,0 +1,36 @@ +// Copyright 2024 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package crbytes + +import "encoding/binary" + +// commonPrefixGeneric is used for architectures without a native +// implementation. It is defined here rather than common_generic.go so that the +// benchmarking code can have access to it even when there's a native +// implementation available. 
+func commonPrefixGeneric(a, b []byte) int { + asUint64 := func(data []byte, i int) uint64 { + return binary.LittleEndian.Uint64(data[i:]) + } + var shared int + n := min(len(a), len(b)) + for shared < n-7 && asUint64(a, shared) == asUint64(b, shared) { + shared += 8 + } + for shared < n && a[shared] == b[shared] { + shared++ + } + return shared +} diff --git a/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix_amd64.s b/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix_amd64.s new file mode 100644 index 0000000..c64d6af --- /dev/null +++ b/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix_amd64.s @@ -0,0 +1,284 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in licenses/BSD-golang.txt. + +// This code is based on compare_amd64.s from Go 1.12.5. + +TEXT ·CommonPrefix(SB),$0-56 + // SI = uintptr(unsafe.Pointer(&a[0])) + MOVQ a_base+0(FP), SI + // BX = len(a) + MOVQ a_len+8(FP), BX + // DI = uintptr(unsafe.Pointer(&b[0])) + MOVQ b_base+24(FP), DI + // DX = len(b) + MOVQ b_len+32(FP), DX + + CMPQ BX, DX + MOVQ DX, R8 + CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare + // Throughout this function, DX remembers the original min(alen, blen) and + // R8 is the number of bytes we still need to compare (with bytes 0 to + // DX-R8 known to match). + MOVQ R8, DX + CMPQ R8, $8 + JB small + + CMPQ R8, $63 + JBE loop + JMP big_loop + RET + +// loop is used when we have between 8 and 63 bytes left to compare (8 <= R8 < 64). +// Invariant: 8 <= R8 < 64 +loop: + CMPQ R8, $16 + JB _0through15 + // X0 = a[:16] + MOVOU (SI), X0 + // X0 = b[:16] + MOVOU (DI), X1 + // Compare Packed Data for Equal: + // for i := 0; i < 16; i++ { + // if X0[i] != X1[i] { + // X1[i] = 0 + // } else { + // X1[i] = 0xFF + // } + // } + PCMPEQB X0, X1 + // Move Byte Mask. 
+ // AX = 0 + // for i := 0; i < 16; i++ { + // if X1[i] & 0x80 != 0 { + // AX |= (1 << i) + // } + PMOVMSKB X1, AX + // AX ^= 0xFFFF + XORQ $0xffff, AX // convert EQ to NE + // if AX != 0 { + // goto diff16 + // } + JNE diff16 // branch if at least one byte is not equal + // a = a[16:] + ADDQ $16, SI + // b = b[16:] + ADDQ $16, DI + // R8 -= 16 + SUBQ $16, R8 + JMP loop + +// Invariant: a[0:48] matches b[0:48] and AX contains a bit mask of differences +// between a[48:64] and b[48:64]. +diff64: + // R8 -= 48 + SUBQ $48, R8 + JMP diff16 + +// Invariant: a[0:32] matches b[0:32] and AX contains a bit mask of differences +// between a[32:48] and b[32:48]. +diff48: + // R8 -= 32 + SUBQ $32, R8 + JMP diff16 + +// Invariant: a[0:16] matches b[0:16] and AX contains a bit mask of differences +// between a[16:32] and b[16:32]. +diff32: + // R8 -= 16 + SUBQ $16, R8 + +// Invariant: AX contains a bit mask of differences between a[:16] and b[:16]. +// AX & (1 << i) == 1 iff a[i] != b[i] +diff16: + // Bit Scan Forward (return the index of the least significant set bit) + // BX = bits.TrailingZeros64(AX) + BSFQ AX, BX + // BX is now the prefix of bytes that matched, advance by this much. + // R8 -= BX + SUBQ BX, R8 + + // Return DX (original min(alen, blen)) - R8 (bytes left to compare) + SUBQ R8, DX + MOVQ DX, ret+48(FP) + RET + +// Invariants: +// - original slices contained at least 8 bytes (DX >= 8) +// - we have at most 15 bytes left to compare (R8 < 16) +_0through15: + // if R8 <= 8 { + // goto _0through8 + // } + CMPQ R8, $8 + JBE _0through8 + // AX = a[:8] + MOVQ (SI), AX + // CX = b[:8] + MOVQ (DI), CX + // if AX != CX { + // goto diff8 + // } + CMPQ AX, CX + JNE diff8 + +// Invariants: +// - original slices contained at least 8 bytes (DX >= 8) +// - we have at most 8 bytes left to compare (R8 <= 8) +// +// Because the backing slices have at least 8 bytes and all the bytes so far +// matched, we can (potentially) back up to where we have exactly 8 bytes to +// compare. 
+_0through8: + // AX = b[len(b)-8:] + MOVQ -8(SI)(R8*1), AX + // CX = b[len(b)-8:] + MOVQ -8(DI)(R8*1), CX + // if AX == CX { + // goto allsame + // } + CMPQ AX, CX + JEQ allsame + // R8 = 8 + MOVQ $8, R8 + +// Invariant: AX contains a bit mask of differences between a[:8] and b[:8]. +// AX & (1 << i) == 1 iff a[i] != b[i] +diff8: + // CX ^= AX + XORQ AX, CX + // Bit Scan Forward (return the index of the least significant set bit) + // CX = bits.TrailingZeros64(CX) + BSFQ CX, CX + // CX /= 8 + SHRQ $3, CX + // CX is now the 0-based index of the first byte that differs. + // R8 -= CX + SUBQ CX, R8 + + // Return DX (original min(alen, blen)) - R8 (bytes left to compare) + SUBQ R8, DX + MOVQ DX, ret+48(FP) + RET + +// Invariant: original min(alen, blen) < 8. DX < 8, R8 = DX. +small: + // CX = R8 * 8 + LEAQ (R8*8), CX + // CX = -CX + // We only care about the lower 6 bits of CX, so this is equivalent to: + // CX = (8-min(alen, blen)) * 8 + NEGQ CX + JEQ allsame + + // We will load 8 bytes, even though some of them are outside the slice + // bounds. We go out of bounds either before or after the slice depending on + // the value of the pointer. + + // if uintptr(unsafe.Pointer(&a[0]) > 0xF8 { + // goto si_high + // } + CMPB SI, $0xf8 + JA si_high + // SI = a[:8] + MOVQ (SI), SI + // Discard the upper bytes which were out of bounds and add 0s (to be + // removed below). + SHLQ CX, SI + JMP si_finish +si_high: + // SI = a[len(a)-8:] + MOVQ -8(SI)(R8*1), SI +si_finish: + // SI = SI >> CX + // Discard the lower bytes which were added by SHLQ in one case, or that + // were out of bounds in the si_high case. + // In both cases, SI = a[:]. + SHRQ CX, SI + + // if uintptr(unsafe.Pointer(&b[0]) > 0xF8 { + // goto di_high + // } + CMPB DI, $0xf8 + JA di_high + // DI = b[:8] + MOVQ (DI), DI + // Discard the upper bytes which were out of bounds and add 0s (to be + // removed below). 
+ SHLQ CX, DI + JMP di_finish +di_high: + // DI = b[len(b)-8:] + MOVQ -8(DI)(R8*1), DI +di_finish: + // DI = DI >> CX + // Discard the lower bytes which were added by SHLQ in one case, or that + // were out of bounds in the di_high case. + // In both cases, DI = b[:]. + SHRQ CX, DI + + // DI ^= SI + XORQ SI, DI + // if DI == 0 { + // goto allsame + // } + JEQ allsame + + // Bit Scan Forward (return the index of the least significant set bit) + // DI = bits.TrailingZeros64(DI) + BSFQ DI, DI + // DI /= 8 + SHRQ $3, DI + // DI is now the 0-based index of the first byte that differs. + // R8 -= DI + SUBQ DI, R8 + + // Return DX (original min(alen, blen)) - R8 (bytes left to compare) + SUBQ R8, DX +allsame: + MOVQ DX, ret+48(FP) + RET + +// big_loop is used when we have at least 64 bytes to compare. It is similar to +// , except that we do 4 iterations at a time. +big_loop: + MOVOU (SI), X0 + MOVOU (DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX + JNE diff16 + + MOVOU 16(SI), X0 + MOVOU 16(DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX + JNE diff32 + + MOVOU 32(SI), X0 + MOVOU 32(DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX + JNE diff48 + + MOVOU 48(SI), X0 + MOVOU 48(DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX + JNE diff64 + + // a = a[64:] + ADDQ $64, SI + // b = b[64:] + ADDQ $64, DI + // R8 -= 64 + SUBQ $64, R8 + CMPQ R8, $64 + // if R8 < 64 { + // goto loop + // } + JBE loop + JMP big_loop diff --git a/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix_arm64.s b/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix_arm64.s new file mode 100644 index 0000000..cf20e22 --- /dev/null +++ b/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix_arm64.s @@ -0,0 +1,244 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in licenses/BSD-golang.txt. 
+ +// This code is based on compare_arm64.s from Go 1.12.5. + +TEXT ·CommonPrefix(SB),$0-56 + // R0 = uintptr(unsafe.Pointer(&a[0])) + MOVD a_base+0(FP), R0 + // R1 = len(a) + MOVD a_len+8(FP), R1 + // R2 = uintptr(unsafe.Pointer(&b[0])) + MOVD b_base+24(FP), R2 + // R3 = len(b) + MOVD b_len+32(FP), R3 + + CMP R1, R3 + // R6 = min(alen, blen) + CSEL LT, R3, R1, R6 + // Throughout this function, R7 remembers the original min(alen, blen) and + // R6 is the number of bytes we still need to compare (with bytes 0 to R7-R6 + // known to match). + MOVD R6, R7 + + // if R6 == 0 { + // goto samebytes + // } + CBZ R6, samebytes + // IF R6 < 16 { + // goto small + // } + CMP $16, R6 + BLT small + +// chunk16_loop compares 16 bytes at a time. +// Invariant: R6 >= 16 +chunk16_loop: + // R4, R8, a = a[:8], a[8:16], a[16:] + LDP.P 16(R0), (R4, R8) + // R5, R9, b = b[:8], b[8:16]; b[16:] + LDP.P 16(R2), (R5, R9) + // if R4 != R5 { + // goto cmp + // } + CMP R4, R5 + BNE cmp + // if R8 != R9 { + // goto cmpnext + // } + CMP R8, R9 + BNE cmpnext + // R6 -= 16 + SUB $16, R6 + // if R6 >= 16 { + // goto chunk16_loop + // } + CMP $16, R6 + BGE chunk16_loop + // if R6 == 0 { + // goto samebytes + // } + CBZ R6, samebytes + // if R6 <= 8 { + // goto tail + // } + CMP $8, R6 + BLE tail + // We have more than 8 bytes remaining; compare the first 8 bytes. + // R4, a = a[:8], a[8:] + // R5, b = b[:8], b[8:] + MOVD.P 8(R0), R4 + MOVD.P 8(R2), R5 + // if R4 != R5 { + // goto cmp + // } + CMP R4, R5 + BNE cmp + // R6 -= 8 + SUB $8, R6 + +// Invariants: +// - the original slices have at least 8 bytes (R7 >= 8) +// - there are at most 8 bytes left to compare (R6 <= 8) +tail: + // R6 -= 8 + SUB $8, R6 + // R4 = a[R6:R6+8] + MOVD (R0)(R6), R4 + // R5 = b[R6:R6+8] + MOVD (R2)(R6), R5 + // if R4 == R6 { + // goto samebytes + // } + CMP R4, R5 + BEQ samebytes + // R6 = 8 + MOVD $8, R6 + +// Invariants: R4 and R5 contain the next 8 bytes and R4 != R5. 
+cmp: + // R4 = bits.ReverseBytes64(R4) + REV R4, R4 + // R5 = bits.ReverseBytes64(R5) + REV R5, R5 +// Invariant: R4 and R5 contain the next 8 bytes in reverse order and R4 != R5. +cmprev: + // R5 ^= R4 + EOR R4, R5, R5 + // R5 = bits.LeadingZeros64(R5) + // This is the number of bits that match. + CLZ R5, R5 + // R5 /= 8 + // This is the number of bytes that match. + LSR $3, R5, R5 + // R6 -= R5 + SUBS R5, R6, R6 + // if R6 == 0 { + // goto samebytes + // } + BLT samebytes + +ret: + // return R7 - R6 + SUB R6, R7 + MOVD R7, ret+48(FP) + RET + +// Invariant: we have less than 16 bytes to compare (R6 = R7, R6 < 16). +small: + // Test Bit and Branch if Zero: + // if R6 & 8 != 0 { + // goto lt_8 + // } + TBZ $3, R6, lt_8 + // R4 = a[:8] + MOVD (R0), R4 + // R5 = b[:8] + MOVD (R2), R5 + // if R4 != R5 { + // goto cmp + // } + CMP R4, R5 + BNE cmp + // R6 -= 8 + SUBS $8, R6, R6 + // if R6 == 0 { + // goto samebytes + // } + BEQ samebytes + // a = a[8:] + ADD $8, R0 + // b = b[8:] + ADD $8, R2 + // goto tail + B tail + +// Invariant: we have less than 8 bytes to compare (R6 = R7, R6 < 8). +lt_8: + // Test Bit and Branch if Zero: + // if R6 & 4 != 0 { + // goto lt_4 + // } + TBZ $2, R6, lt_4 + // R4 = a[:4] + MOVWU (R0), R4 + // R5 = b[:4] + MOVWU (R2), R5 + // if R4 != R5 { + // goto cmp + // } + CMPW R4, R5 + BNE cmp + // R6 -= 4 + SUBS $4, R6 + // if R6 == 0 { + // goto samebytes + // } + BEQ samebytes + // a = a[4:] + ADD $4, R0 + // b = b[4:] + ADD $4, R2 + +// Invariant: we have less than 4 bytes to compare (R6 = R7, R6 < 4). +lt_4: + // Test Bit and Branch if Zero: + // if R6 & 2 != 0 { + // goto lt_2 + // } + TBZ $1, R6, lt_2 + // R4 = a[:2] + MOVHU (R0), R4 + // R5 = b[:2] + MOVHU (R2), R5 + CMPW R4, R5 + // if R4 != R5 { + // goto cmp + // } + BNE cmp + // a = a[2:] + ADD $2, R0 + // b = b[2:] + ADD $2, R2 + // R6 -= 2 + SUB $2, R6 + +// Invariant: we have less than 2 bytes to compare (R6 = R7, R6 < 2). 
+lt_2: + // if R6 == 0 { + // goto samebytes + // } + TBZ $0, R6, samebytes + +// Invariant: we have 1 byte to compare (R6 = R7 = 1). +one: + // R4 = a[:1] + MOVBU (R0), R4 + // R6 = b[:1] + MOVBU (R2), R5 + // if R4 != R5 { + // goto ret + // } + CMPW R4, R5 + BNE ret + +// Invariant: all R7 bytes matched. +samebytes: + // Return R7 + MOVD R7, ret+48(FP) + RET + +// Invariants: +// - the next 8 bytes match (a[:8] == b[:8]) +// - the following bytes R8 and R9 contain the following 8 bytes (R8 = a[8:16], R9 = b[8:16]) +// - R8 != R9 +cmpnext: + // R6 -= 8 + SUB $8, R6 + // R4 = bits.ReverseBytes64(R8) + REV R8, R4 + // R5 = bits.ReverseBytes64(R9) + REV R9, R5 + // goto cmprev + B cmprev diff --git a/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix_generic.go b/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix_generic.go new file mode 100644 index 0000000..cf107a5 --- /dev/null +++ b/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix_generic.go @@ -0,0 +1,22 @@ +// Copyright 2024 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +//go:build !amd64 && !arm64 + +package crbytes + +// CommonPrefix returns the longest prefix shared by the two slices. 
+func CommonPrefix(a, b []byte) int { + return commonPrefixGeneric(a, b) +} diff --git a/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix_native.go b/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix_native.go new file mode 100644 index 0000000..2d31bac --- /dev/null +++ b/vendor/github.com/cockroachdb/crlib/crbytes/common_prefix_native.go @@ -0,0 +1,22 @@ +// Copyright 2024 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +//go:build amd64 || arm64 + +package crbytes + +// CommonPrefix returns the longest prefix shared by the two slices. +// +//go:noescape +func CommonPrefix(a, b []byte) int diff --git a/vendor/github.com/cockroachdb/crlib/crbytes/crbytes.go b/vendor/github.com/cockroachdb/crlib/crbytes/crbytes.go new file mode 100644 index 0000000..303c36c --- /dev/null +++ b/vendor/github.com/cockroachdb/crlib/crbytes/crbytes.go @@ -0,0 +1,46 @@ +// Copyright 2024 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the License. + +package crbytes + +import ( + "fmt" + "unsafe" +) + +// AllocAligned allocates a new byte slice of length n, ensuring the address of +// the beginning of the slice is word aligned. Go does not guarantee that a +// simple make([]byte, n) is aligned. +func AllocAligned(n int) []byte { + if n == 0 { + return nil + } + a := make([]uint64, (n+7)/8) + b := unsafe.Slice((*byte)(unsafe.Pointer(&a[0])), n) + + // Verify alignment. + ptr := uintptr(unsafe.Pointer(&b[0])) + if ptr&7 != 0 { + panic(fmt.Sprintf("allocated []uint64 slice not 8-aligned: pointer %p", &b[0])) + } + return b +} + +// CopyAligned copies the provided byte slice into an aligned byte slice of the +// same length. +func CopyAligned(s []byte) []byte { + dst := AllocAligned(len(s)) + copy(dst, s) + return dst +} diff --git a/vendor/github.com/cockroachdb/crlib/crstrings/utils.go b/vendor/github.com/cockroachdb/crlib/crstrings/utils.go new file mode 100644 index 0000000..1e90d7d --- /dev/null +++ b/vendor/github.com/cockroachdb/crlib/crstrings/utils.go @@ -0,0 +1,99 @@ +// Copyright 2024 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package crstrings + +import ( + "fmt" + "slices" + "strings" +) + +// JoinStringers concatenates the string representations of the given +// fmt.Stringer implementations. 
+func JoinStringers[T fmt.Stringer](delim string, args ...T) string { + switch len(args) { + case 0: + return "" + case 1: + return args[0].String() + } + elems := make([]string, len(args)) + for i := range args { + elems[i] = args[i].String() + } + return strings.Join(elems, delim) +} + +// MapAndJoin converts each argument to a string using the given function and +// joins the strings with the given delimiter. +func MapAndJoin[T any](fn func(T) string, delim string, args ...T) string { + switch len(args) { + case 0: + return "" + case 1: + return fn(args[0]) + } + elems := make([]string, len(args)) + for i := range args { + elems[i] = fn(args[i]) + } + return strings.Join(elems, delim) +} + +// If returns the given value if the flag is true, otherwise an empty string. +func If(flag bool, trueValue string) string { + return IfElse(flag, trueValue, "") +} + +// IfElse returns the value that matches the value of the flag. +func IfElse(flag bool, trueValue, falseValue string) string { + if flag { + return trueValue + } + return falseValue +} + +// WithSep prints the strings a and b with the given separator in-between, +// unless one of the strings is empty (in which case the other string is +// returned). +func WithSep(a string, separator string, b string) string { + if a == "" { + return b + } + if b == "" { + return a + } + return strings.Join([]string{a, b}, separator) +} + +// FilterEmpty removes empty strings from the given slice. +func FilterEmpty(elems []string) []string { + return slices.DeleteFunc(elems, func(s string) bool { + return s == "" + }) +} + +// Lines breaks up the given string into lines. +func Lines(s string) []string { + // Remove any trailing newline (to avoid getting an extraneous empty line at + // the end). + s = strings.TrimSuffix(s, "\n") + if s == "" { + // In this case, Split returns a slice with a single empty string (which is + // not what we want). 
+ return nil + } + return strings.Split(s, "\n") +} diff --git a/vendor/github.com/cockroachdb/crlib/crsync/typed_atomic.go b/vendor/github.com/cockroachdb/crlib/crsync/typed_atomic.go new file mode 100644 index 0000000..9f753a3 --- /dev/null +++ b/vendor/github.com/cockroachdb/crlib/crsync/typed_atomic.go @@ -0,0 +1,40 @@ +// Copyright 2024 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package crsync + +import "sync/atomic" + +// TypedAtomicInt64 is a thin wrapper around atomic.Int64 that provides type +// safety. +type TypedAtomicInt64[T ~int64] struct { + v atomic.Int64 +} + +// Load atomically loads and returns the value stored in x. +func (x *TypedAtomicInt64[T]) Load() T { return T(x.v.Load()) } + +// Store atomically stores val into x. +func (x *TypedAtomicInt64[T]) Store(val T) { x.v.Store(int64(val)) } + +// Swap atomically stores new into x and returns the previous value. +func (x *TypedAtomicInt64[T]) Swap(new T) (old T) { return T(x.v.Swap(int64(new))) } + +// CompareAndSwap executes the compare-and-swap operation for x. +func (x *TypedAtomicInt64[T]) CompareAndSwap(old, new T) (swapped bool) { + return x.v.CompareAndSwap(int64(old), int64(new)) +} + +// Add atomically adds delta to x and returns the new value. 
+func (x *TypedAtomicInt64[T]) Add(delta T) (new T) { return T(x.v.Add(int64(delta))) } diff --git a/vendor/github.com/cockroachdb/crlib/crtime/monotonic.go b/vendor/github.com/cockroachdb/crlib/crtime/monotonic.go new file mode 100644 index 0000000..5cd2c7f --- /dev/null +++ b/vendor/github.com/cockroachdb/crlib/crtime/monotonic.go @@ -0,0 +1,59 @@ +// Copyright 2024 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package crtime + +import ( + "time" + + "github.com/cockroachdb/crlib/crsync" +) + +// Mono represents a moment in time in terms of a monotonic clock. Its value is +// the duration since the start of the process. +// +// Note that if the system doesn't support a monotonic clock, the wall clock is +// used. +type Mono time.Duration + +// NowMono returns a moment in time in terms of a monotonic clock. It is faster +// than time.Now which also consults the wall clock. +func NowMono() Mono { + // Note: time.Since reads only the monotonic clock (if it is available). + return Mono(time.Since(startTime)) +} + +// Sub returns the duration that elapsed between two moments. +func (m Mono) Sub(other Mono) time.Duration { + return time.Duration(m - other) +} + +// Elapsed returns the duration that elapsed since m. +func (m Mono) Elapsed() time.Duration { + return time.Duration(NowMono() - m) +} + +// MonoFromTime converts a time.Time to a Mono value. If the time has a +// monotonic component, it is used. 
+func MonoFromTime(t time.Time) Mono { + return Mono(t.Sub(startTime)) +} + +// AtomicMono provides atomic access to a Mono value. +type AtomicMono = crsync.TypedAtomicInt64[Mono] + +// We use startTime as a reference point against which we can call +// time.Since(). This solution is suggested by the Go runtime code: +// https://github.com/golang/go/blob/889abb17e125bb0f5d8de61bb80ef15fbe2a130d/src/runtime/time_nofake.go#L19 +var startTime = time.Now() diff --git a/vendor/github.com/cockroachdb/crlib/fifo/README.md b/vendor/github.com/cockroachdb/crlib/fifo/README.md new file mode 100644 index 0000000..fd90977 --- /dev/null +++ b/vendor/github.com/cockroachdb/crlib/fifo/README.md @@ -0,0 +1,13 @@ +## Go facilities for FIFO queueing + +This library contains several optimized facilities related to FIFO queueing and +rate limiting. + + - [Queue](https://github.com/cockroachdb/crlib/blob/main/fifo/queue.go) implements an + allocation efficient FIFO queue. + + - [Semaphore](https://github.com/cockroachdb/crlib/blob/main/fifo/semaphore.go) + implements a weighted, dynamically reconfigurable semaphore which respects + context cancellation. + +TODO(radu): add rate limiter. diff --git a/vendor/github.com/cockroachdb/fifo/queue.go b/vendor/github.com/cockroachdb/crlib/fifo/queue.go similarity index 93% rename from vendor/github.com/cockroachdb/fifo/queue.go rename to vendor/github.com/cockroachdb/crlib/fifo/queue.go index 73c3912..109e3c5 100644 --- a/vendor/github.com/cockroachdb/fifo/queue.go +++ b/vendor/github.com/cockroachdb/crlib/fifo/queue.go @@ -14,7 +14,11 @@ package fifo -import "sync" +import ( + "sync" + + "github.com/cockroachdb/crlib/internal/invariants" +) // Queue implements an allocation efficient FIFO queue. It is not safe for // concurrent access. @@ -27,7 +31,8 @@ import "sync" // // The queue is implemented as a linked list of nodes, where each node is a // small ring buffer. 
The nodes are allocated using a sync.Pool (a single pool -// is created for any given type and is used for all queues of that type). +// should be created for any given type and is used for all queues of that +// type). type Queue[T any] struct { len int head, tail *queueNode[T] @@ -100,7 +105,7 @@ type QueueBackingPool[T any] struct { pool sync.Pool } -// MakeQueueBackingPool makes a queue backing pool. It is intented to be used to +// MakeQueueBackingPool makes a queue backing pool. It is intended to be used to // initialize a singleton (global) variable. A single pool can and should be // used by all queues of that type. func MakeQueueBackingPool[T any]() QueueBackingPool[T] { @@ -136,7 +141,7 @@ func (qn *queueNode[T]) IsFull() bool { } func (qn *queueNode[T]) PushBack(t T) *T { - if invariants && qn.len >= queueNodeSize { + if invariants.Enabled && qn.len >= queueNodeSize { panic("cannot push back into a full node") } i := (qn.head + qn.len) % queueNodeSize @@ -150,7 +155,7 @@ func (qn *queueNode[T]) PeekFront() *T { } func (qn *queueNode[T]) PopFront() T { - if invariants && qn.len == 0 { + if invariants.Enabled && qn.len == 0 { panic("cannot pop from empty queue") } t := qn.buf[qn.head] diff --git a/vendor/github.com/cockroachdb/fifo/semaphore.go b/vendor/github.com/cockroachdb/crlib/fifo/semaphore.go similarity index 96% rename from vendor/github.com/cockroachdb/fifo/semaphore.go rename to vendor/github.com/cockroachdb/crlib/fifo/semaphore.go index 2d2a222..5bf90d1 100644 --- a/vendor/github.com/cockroachdb/fifo/semaphore.go +++ b/vendor/github.com/cockroachdb/crlib/fifo/semaphore.go @@ -19,6 +19,8 @@ import ( "errors" "fmt" "sync" + + "github.com/cockroachdb/crlib/internal/invariants" ) // Semaphore implements a weighted, dynamically reconfigurable semaphore which @@ -62,6 +64,8 @@ func NewSemaphore(capacity int64) *Semaphore { var semaQueuePool = MakeQueueBackingPool[semaWaiter]() +// ErrRequestExceedsCapacity is returned when an Acquire requests more 
than the +// current capacity of the semaphore. var ErrRequestExceedsCapacity = errors.New("request exceeds semaphore capacity") // TryAcquire attempts to acquire n units from the semaphore without waiting. On @@ -211,7 +215,7 @@ func (s *Semaphore) processWaitersLocked() { case w.c == nil: // Request was canceled, we can just clean it up. s.mu.numCanceled-- - if invariants && s.mu.numCanceled < 0 { + if invariants.Enabled && s.mu.numCanceled < 0 { panic("negative numCanceled") } diff --git a/vendor/github.com/cockroachdb/fifo/invariants_off.go b/vendor/github.com/cockroachdb/crlib/internal/invariants/invariants_off.go similarity index 79% rename from vendor/github.com/cockroachdb/fifo/invariants_off.go rename to vendor/github.com/cockroachdb/crlib/internal/invariants/invariants_off.go index e1a73df..65fd976 100644 --- a/vendor/github.com/cockroachdb/fifo/invariants_off.go +++ b/vendor/github.com/cockroachdb/crlib/internal/invariants/invariants_off.go @@ -12,10 +12,9 @@ // implied. See the License for the specific language governing // permissions and limitations under the License. -//go:build !fifo_invariants +//go:build !crlib_invariants -package fifo +package invariants -// invariants is false if we were not built with the "fifo_invariants" build -// tag. -const invariants = false +// Enabled is false if we were not built with the "crlib_invariants" build tag. +const Enabled = false diff --git a/vendor/github.com/cockroachdb/fifo/invariants_on.go b/vendor/github.com/cockroachdb/crlib/internal/invariants/invariants_on.go similarity index 80% rename from vendor/github.com/cockroachdb/fifo/invariants_on.go rename to vendor/github.com/cockroachdb/crlib/internal/invariants/invariants_on.go index c166a43..1d9f22e 100644 --- a/vendor/github.com/cockroachdb/fifo/invariants_on.go +++ b/vendor/github.com/cockroachdb/crlib/internal/invariants/invariants_on.go @@ -12,9 +12,9 @@ // implied. 
See the License for the specific language governing // permissions and limitations under the License. -//go:build fifo_invariants +//go:build crlib_invariants -package fifo +package invariants -// invariants is true if we were built with the "fifo_invariants" build tag. -const invariants = true +// Enabled is true if we were built with the "crlib_invariants" build tag. +const Enabled = true diff --git a/vendor/github.com/cockroachdb/pebble/cleaner.go b/vendor/github.com/cockroachdb/pebble/cleaner.go deleted file mode 100644 index f9fa43b..0000000 --- a/vendor/github.com/cockroachdb/pebble/cleaner.go +++ /dev/null @@ -1,295 +0,0 @@ -// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package pebble - -import ( - "context" - "runtime/pprof" - "sync" - "time" - - "github.com/cockroachdb/errors/oserror" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/tokenbucket" -) - -// Cleaner exports the base.Cleaner type. -type Cleaner = base.Cleaner - -// DeleteCleaner exports the base.DeleteCleaner type. -type DeleteCleaner = base.DeleteCleaner - -// ArchiveCleaner exports the base.ArchiveCleaner type. -type ArchiveCleaner = base.ArchiveCleaner - -type cleanupManager struct { - opts *Options - objProvider objstorage.Provider - onTableDeleteFn func(fileSize uint64) - deletePacer *deletionPacer - - // jobsCh is used as the cleanup job queue. - jobsCh chan *cleanupJob - // waitGroup is used to wait for the background goroutine to exit. - waitGroup sync.WaitGroup - - mu struct { - sync.Mutex - // totalJobs is the total number of enqueued jobs (completed or in progress). 
- totalJobs int - completedJobs int - completedJobsCond sync.Cond - jobsQueueWarningIssued bool - } -} - -// We can queue this many jobs before we have to block EnqueueJob. -const jobsQueueDepth = 1000 - -// obsoleteFile holds information about a file that needs to be deleted soon. -type obsoleteFile struct { - dir string - fileNum base.DiskFileNum - fileType fileType - fileSize uint64 -} - -type cleanupJob struct { - jobID int - obsoleteFiles []obsoleteFile -} - -// openCleanupManager creates a cleanupManager and starts its background goroutine. -// The cleanupManager must be Close()d. -func openCleanupManager( - opts *Options, - objProvider objstorage.Provider, - onTableDeleteFn func(fileSize uint64), - getDeletePacerInfo func() deletionPacerInfo, -) *cleanupManager { - cm := &cleanupManager{ - opts: opts, - objProvider: objProvider, - onTableDeleteFn: onTableDeleteFn, - deletePacer: newDeletionPacer(time.Now(), int64(opts.TargetByteDeletionRate), getDeletePacerInfo), - jobsCh: make(chan *cleanupJob, jobsQueueDepth), - } - cm.mu.completedJobsCond.L = &cm.mu.Mutex - cm.waitGroup.Add(1) - - go func() { - pprof.Do(context.Background(), gcLabels, func(context.Context) { - cm.mainLoop() - }) - }() - - return cm -} - -// Close stops the background goroutine, waiting until all queued jobs are completed. -// Delete pacing is disabled for the remaining jobs. -func (cm *cleanupManager) Close() { - close(cm.jobsCh) - cm.waitGroup.Wait() -} - -// EnqueueJob adds a cleanup job to the manager's queue. -func (cm *cleanupManager) EnqueueJob(jobID int, obsoleteFiles []obsoleteFile) { - job := &cleanupJob{ - jobID: jobID, - obsoleteFiles: obsoleteFiles, - } - - // Report deleted bytes to the pacer, which can use this data to potentially - // increase the deletion rate to keep up. We want to do this at enqueue time - // rather than when we get to the job, otherwise the reported bytes will be - // subject to the throttling rate which defeats the purpose. 
- var pacingBytes uint64 - for _, of := range obsoleteFiles { - if cm.needsPacing(of.fileType, of.fileNum) { - pacingBytes += of.fileSize - } - } - if pacingBytes > 0 { - cm.deletePacer.ReportDeletion(time.Now(), pacingBytes) - } - - cm.mu.Lock() - cm.mu.totalJobs++ - cm.maybeLogLocked() - cm.mu.Unlock() - - if invariants.Enabled && len(cm.jobsCh) >= cap(cm.jobsCh)-2 { - panic("cleanup jobs queue full") - } - - cm.jobsCh <- job -} - -// Wait until the completion of all jobs that were already queued. -// -// Does not wait for jobs that are enqueued during the call. -// -// Note that DB.mu should not be held while calling this method; the background -// goroutine needs to acquire DB.mu to update deleted table metrics. -func (cm *cleanupManager) Wait() { - cm.mu.Lock() - defer cm.mu.Unlock() - n := cm.mu.totalJobs - for cm.mu.completedJobs < n { - cm.mu.completedJobsCond.Wait() - } -} - -// mainLoop runs the manager's background goroutine. -func (cm *cleanupManager) mainLoop() { - defer cm.waitGroup.Done() - - var tb tokenbucket.TokenBucket - // Use a token bucket with 1 token / second refill rate and 1 token burst. 
- tb.Init(1.0, 1.0) - for job := range cm.jobsCh { - for _, of := range job.obsoleteFiles { - if of.fileType != fileTypeTable { - path := base.MakeFilepath(cm.opts.FS, of.dir, of.fileType, of.fileNum) - cm.deleteObsoleteFile(of.fileType, job.jobID, path, of.fileNum, of.fileSize) - } else { - cm.maybePace(&tb, of.fileType, of.fileNum, of.fileSize) - cm.onTableDeleteFn(of.fileSize) - cm.deleteObsoleteObject(fileTypeTable, job.jobID, of.fileNum) - } - } - cm.mu.Lock() - cm.mu.completedJobs++ - cm.mu.completedJobsCond.Broadcast() - cm.maybeLogLocked() - cm.mu.Unlock() - } -} - -func (cm *cleanupManager) needsPacing(fileType base.FileType, fileNum base.DiskFileNum) bool { - if fileType != fileTypeTable { - return false - } - meta, err := cm.objProvider.Lookup(fileType, fileNum) - if err != nil { - // The object was already removed from the provider; we won't actually - // delete anything, so we don't need to pace. - return false - } - // Don't throttle deletion of remote objects. - return !meta.IsRemote() -} - -// maybePace sleeps before deleting an object if appropriate. It is always -// called from the background goroutine. -func (cm *cleanupManager) maybePace( - tb *tokenbucket.TokenBucket, fileType base.FileType, fileNum base.DiskFileNum, fileSize uint64, -) { - if !cm.needsPacing(fileType, fileNum) { - return - } - - tokens := cm.deletePacer.PacingDelay(time.Now(), fileSize) - if tokens == 0.0 { - // The token bucket might be in debt; it could make us wait even for 0 - // tokens. We don't want that if the pacer decided throttling should be - // disabled. - return - } - // Wait for tokens. We use a token bucket instead of sleeping outright because - // the token bucket accumulates up to one second of unused tokens. - for { - ok, d := tb.TryToFulfill(tokenbucket.Tokens(tokens)) - if ok { - break - } - time.Sleep(d) - } -} - -// deleteObsoleteFile deletes a (non-object) file that is no longer needed. 
-func (cm *cleanupManager) deleteObsoleteFile( - fileType fileType, jobID int, path string, fileNum base.DiskFileNum, fileSize uint64, -) { - // TODO(peter): need to handle this error, probably by re-adding the - // file that couldn't be deleted to one of the obsolete slices map. - err := cm.opts.Cleaner.Clean(cm.opts.FS, fileType, path) - if oserror.IsNotExist(err) { - return - } - - switch fileType { - case fileTypeLog: - cm.opts.EventListener.WALDeleted(WALDeleteInfo{ - JobID: jobID, - Path: path, - FileNum: fileNum.FileNum(), - Err: err, - }) - case fileTypeManifest: - cm.opts.EventListener.ManifestDeleted(ManifestDeleteInfo{ - JobID: jobID, - Path: path, - FileNum: fileNum.FileNum(), - Err: err, - }) - case fileTypeTable: - panic("invalid deletion of object file") - } -} - -func (cm *cleanupManager) deleteObsoleteObject( - fileType fileType, jobID int, fileNum base.DiskFileNum, -) { - if fileType != fileTypeTable { - panic("not an object") - } - - var path string - meta, err := cm.objProvider.Lookup(fileType, fileNum) - if err != nil { - path = "" - } else { - path = cm.objProvider.Path(meta) - err = cm.objProvider.Remove(fileType, fileNum) - } - if cm.objProvider.IsNotExistError(err) { - return - } - - switch fileType { - case fileTypeTable: - cm.opts.EventListener.TableDeleted(TableDeleteInfo{ - JobID: jobID, - Path: path, - FileNum: fileNum.FileNum(), - Err: err, - }) - } -} - -// maybeLogLocked issues a log if the job queue gets 75% full and issues a log -// when the job queue gets back to less than 10% full. -// -// Must be called with cm.mu locked. 
-func (cm *cleanupManager) maybeLogLocked() { - const highThreshold = jobsQueueDepth * 3 / 4 - const lowThreshold = jobsQueueDepth / 10 - - jobsInQueue := cm.mu.totalJobs - cm.mu.completedJobs - - if !cm.mu.jobsQueueWarningIssued && jobsInQueue > highThreshold { - cm.mu.jobsQueueWarningIssued = true - cm.opts.Logger.Infof("cleanup falling behind; job queue has over %d jobs", highThreshold) - } - - if cm.mu.jobsQueueWarningIssued && jobsInQueue < lowThreshold { - cm.mu.jobsQueueWarningIssued = false - cm.opts.Logger.Infof("cleanup back to normal; job queue has under %d jobs", lowThreshold) - } -} diff --git a/vendor/github.com/cockroachdb/pebble/compaction.go b/vendor/github.com/cockroachdb/pebble/compaction.go deleted file mode 100644 index cffdccf..0000000 --- a/vendor/github.com/cockroachdb/pebble/compaction.go +++ /dev/null @@ -1,3942 +0,0 @@ -// Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. 
- -package pebble - -import ( - "bytes" - "context" - "fmt" - "io" - "math" - "runtime/pprof" - "sort" - "sync/atomic" - "time" - - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invalidating" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" - "github.com/cockroachdb/pebble/internal/private" - "github.com/cockroachdb/pebble/internal/rangedel" - "github.com/cockroachdb/pebble/internal/rangekey" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing" - "github.com/cockroachdb/pebble/objstorage/remote" - "github.com/cockroachdb/pebble/sstable" - "github.com/cockroachdb/pebble/vfs" - "golang.org/x/exp/constraints" -) - -var errEmptyTable = errors.New("pebble: empty table") - -// ErrCancelledCompaction is returned if a compaction is cancelled by a -// concurrent excise or ingest-split operation. -var ErrCancelledCompaction = errors.New("pebble: compaction cancelled by a concurrent operation, will retry compaction") - -var compactLabels = pprof.Labels("pebble", "compact") -var flushLabels = pprof.Labels("pebble", "flush") -var gcLabels = pprof.Labels("pebble", "gc") - -// getInternalWriterProperties accesses a private variable (in the -// internal/private package) initialized by the sstable Writer. This indirection -// is necessary to ensure non-Pebble users constructing sstables for ingestion -// are unable to set internal-only properties. -var getInternalWriterProperties = private.SSTableInternalProperties.(func(*sstable.Writer) *sstable.Properties) - -// expandedCompactionByteSizeLimit is the maximum number of bytes in all -// compacted files. We avoid expanding the lower level file set of a compaction -// if it would make the total compaction cover more than this many bytes. 
-func expandedCompactionByteSizeLimit(opts *Options, level int, availBytes uint64) uint64 { - v := uint64(25 * opts.Level(level).TargetFileSize) - - // Never expand a compaction beyond half the available capacity, divided - // by the maximum number of concurrent compactions. Each of the concurrent - // compactions may expand up to this limit, so this attempts to limit - // compactions to half of available disk space. Note that this will not - // prevent compaction picking from pursuing compactions that are larger - // than this threshold before expansion. - diskMax := (availBytes / 2) / uint64(opts.MaxConcurrentCompactions()) - if v > diskMax { - v = diskMax - } - return v -} - -// maxGrandparentOverlapBytes is the maximum bytes of overlap with level+1 -// before we stop building a single file in a level-1 to level compaction. -func maxGrandparentOverlapBytes(opts *Options, level int) uint64 { - return uint64(10 * opts.Level(level).TargetFileSize) -} - -// maxReadCompactionBytes is used to prevent read compactions which -// are too wide. -func maxReadCompactionBytes(opts *Options, level int) uint64 { - return uint64(10 * opts.Level(level).TargetFileSize) -} - -// noCloseIter wraps around a FragmentIterator, intercepting and eliding -// calls to Close. It is used during compaction to ensure that rangeDelIters -// are not closed prematurely. -type noCloseIter struct { - keyspan.FragmentIterator -} - -func (i noCloseIter) Close() error { - return nil -} - -type compactionLevel struct { - level int - files manifest.LevelSlice - // l0SublevelInfo contains information about L0 sublevels being compacted. - // It's only set for the start level of a compaction starting out of L0 and - // is nil for all other compactions. 
- l0SublevelInfo []sublevelInfo -} - -func (cl compactionLevel) Clone() compactionLevel { - newCL := compactionLevel{ - level: cl.level, - files: cl.files.Reslice(func(start, end *manifest.LevelIterator) {}), - } - return newCL -} -func (cl compactionLevel) String() string { - return fmt.Sprintf(`Level %d, Files %s`, cl.level, cl.files) -} - -// Return output from compactionOutputSplitters. See comment on -// compactionOutputSplitter.shouldSplitBefore() on how this value is used. -type maybeSplit int - -const ( - noSplit maybeSplit = iota - splitNow -) - -// String implements the Stringer interface. -func (c maybeSplit) String() string { - if c == noSplit { - return "no-split" - } - return "split-now" -} - -// compactionOutputSplitter is an interface for encapsulating logic around -// switching the output of a compaction to a new output file. Additional -// constraints around switching compaction outputs that are specific to that -// compaction type (eg. flush splits) are implemented in -// compactionOutputSplitters that compose other child compactionOutputSplitters. -type compactionOutputSplitter interface { - // shouldSplitBefore returns whether we should split outputs before the - // specified "current key". The return value is splitNow or noSplit. - // splitNow means a split is advised before the specified key, and noSplit - // means no split is advised. If shouldSplitBefore(a) advises a split then - // shouldSplitBefore(b) should also advise a split given b >= a, until - // onNewOutput is called. - shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit - // onNewOutput updates internal splitter state when the compaction switches - // to a new sstable, and returns the next limit for the new output which - // would get used to truncate range tombstones if the compaction iterator - // runs out of keys. The limit returned MUST be > key according to the - // compaction's comparator. 
The specified key is the first key in the new - // output, or nil if this sstable will only contain range tombstones already - // in the fragmenter. - onNewOutput(key []byte) []byte -} - -// fileSizeSplitter is a compactionOutputSplitter that enforces target file -// sizes. This splitter splits to a new output file when the estimated file size -// is 0.5x-2x the target file size. If there are overlapping grandparent files, -// this splitter will attempt to split at a grandparent boundary. For example, -// consider the example where a compaction wrote 'd' to the current output file, -// and the next key has a user key 'g': -// -// previous key next key -// | | -// | | -// +---------------|----+ +--|----------+ -// grandparents: | 000006 | | | | 000007 | -// +---------------|----+ +--|----------+ -// a b d e f g i -// -// Splitting the output file F before 'g' will ensure that the current output -// file F does not overlap the grandparent file 000007. Aligning sstable -// boundaries like this can significantly reduce write amplification, since a -// subsequent compaction of F into the grandparent level will avoid needlessly -// rewriting any keys within 000007 that do not overlap F's bounds. Consider the -// following compaction: -// -// +----------------------+ -// input | | -// level +----------------------+ -// \/ -// +---------------+ +---------------+ -// output |XXXXXXX| | | |XXXXXXXX| -// level +---------------+ +---------------+ -// -// The input-level file overlaps two files in the output level, but only -// partially. The beginning of the first output-level file and the end of the -// second output-level file will be rewritten verbatim. This write I/O is -// "wasted" in the sense that no merging is being performed. -// -// To prevent the above waste, this splitter attempts to split output files -// before the start key of grandparent files. 
It still strives to write output -// files of approximately the target file size, by constraining this splitting -// at grandparent points to apply only if the current output's file size is -// about the right order of magnitude. -// -// Note that, unlike most other splitters, this splitter does not guarantee that -// it will advise splits only at user key change boundaries. -type fileSizeSplitter struct { - frontier frontier - targetFileSize uint64 - atGrandparentBoundary bool - boundariesObserved uint64 - nextGrandparent *fileMetadata - grandparents manifest.LevelIterator -} - -func newFileSizeSplitter( - f *frontiers, targetFileSize uint64, grandparents manifest.LevelIterator, -) *fileSizeSplitter { - s := &fileSizeSplitter{targetFileSize: targetFileSize} - s.nextGrandparent = grandparents.First() - s.grandparents = grandparents - if s.nextGrandparent != nil { - s.frontier.Init(f, s.nextGrandparent.Smallest.UserKey, s.reached) - } - return s -} - -func (f *fileSizeSplitter) reached(nextKey []byte) []byte { - f.atGrandparentBoundary = true - f.boundariesObserved++ - // NB: f.grandparents is a bounded iterator, constrained to the compaction - // key range. - f.nextGrandparent = f.grandparents.Next() - if f.nextGrandparent == nil { - return nil - } - // TODO(jackson): Should we also split before or immediately after - // grandparents' largest keys? Splitting before the start boundary prevents - // overlap with the grandparent. Also splitting after the end boundary may - // increase the probability of move compactions. - return f.nextGrandparent.Smallest.UserKey -} - -func (f *fileSizeSplitter) shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit { - atGrandparentBoundary := f.atGrandparentBoundary - - // Clear f.atGrandparentBoundary unconditionally. - // - // This is a bit subtle. 
Even if do decide to split, it's possible that a - // higher-level splitter will ignore our request (eg, because we're between - // two internal keys with the same user key). In this case, the next call to - // shouldSplitBefore will find atGrandparentBoundary=false. This is - // desirable, because in this case we would've already written the earlier - // key with the same user key to the output file. The current output file is - // already doomed to overlap the grandparent whose bound triggered - // atGrandparentBoundary=true. We should continue on, waiting for the next - // grandparent boundary. - f.atGrandparentBoundary = false - - // If the key is a range tombstone, the EstimatedSize may not grow right - // away when a range tombstone is added to the fragmenter: It's dependent on - // whether or not the this new range deletion will start a new fragment. - // Range deletions are rare, so we choose to simply not split yet. - // TODO(jackson): Reconsider this, and consider range keys too as a part of - // #2321. - if key.Kind() == InternalKeyKindRangeDelete || tw == nil { - return noSplit - } - - estSize := tw.EstimatedSize() - switch { - case estSize < f.targetFileSize/2: - // The estimated file size is less than half the target file size. Don't - // split it, even if currently aligned with a grandparent file because - // it's too small. - return noSplit - case estSize >= 2*f.targetFileSize: - // The estimated file size is double the target file size. Split it even - // if we were not aligned with a grandparent file boundary to avoid - // excessively exceeding the target file size. - return splitNow - case !atGrandparentBoundary: - // Don't split if we're not at a grandparent, except if we've exhausted - // all the grandparents overlapping this compaction's key range. Then we - // may want to split purely based on file size. - if f.nextGrandparent == nil { - // There are no more grandparents. 
Optimize for the target file size - // and split as soon as we hit the target file size. - if estSize >= f.targetFileSize { - return splitNow - } - } - return noSplit - default: - // INVARIANT: atGrandparentBoundary - // INVARIANT: targetSize/2 < estSize < 2*targetSize - // - // The estimated file size is close enough to the target file size that - // we should consider splitting. - // - // Determine whether to split now based on how many grandparent - // boundaries we have already observed while building this output file. - // The intuition here is that if the grandparent level is dense in this - // part of the keyspace, we're likely to continue to have more - // opportunities to split this file aligned with a grandparent. If this - // is the first grandparent boundary observed, we split immediately - // (we're already at ≥50% the target file size). Otherwise, each - // overlapping grandparent we've observed increases the minimum file - // size by 5% of the target file size, up to at most 90% of the target - // file size. - // - // TODO(jackson): The particular thresholds are somewhat unprincipled. - // This is the same heuristic as RocksDB implements. Is there are more - // principled formulation that can, further reduce w-amp, produce files - // closer to the target file size, or is more understandable? - - // NB: Subtract 1 from `boundariesObserved` to account for the current - // boundary we're considering splitting at. `reached` will have - // incremented it at the same time it set `atGrandparentBoundary`. 
- minimumPctOfTargetSize := 50 + 5*minUint64(f.boundariesObserved-1, 8) - if estSize < (minimumPctOfTargetSize*f.targetFileSize)/100 { - return noSplit - } - return splitNow - } -} - -func minUint64(a, b uint64) uint64 { - if b < a { - a = b - } - return a -} - -func (f *fileSizeSplitter) onNewOutput(key []byte) []byte { - f.boundariesObserved = 0 - return nil -} - -func newLimitFuncSplitter(f *frontiers, limitFunc func(userKey []byte) []byte) *limitFuncSplitter { - s := &limitFuncSplitter{limitFunc: limitFunc} - s.frontier.Init(f, nil, s.reached) - return s -} - -type limitFuncSplitter struct { - frontier frontier - limitFunc func(userKey []byte) []byte - split maybeSplit -} - -func (lf *limitFuncSplitter) shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit { - return lf.split -} - -func (lf *limitFuncSplitter) reached(nextKey []byte) []byte { - lf.split = splitNow - return nil -} - -func (lf *limitFuncSplitter) onNewOutput(key []byte) []byte { - lf.split = noSplit - if key != nil { - // TODO(jackson): For some users, like L0 flush splits, there's no need - // to binary search over all the flush splits every time. The next split - // point must be ahead of the previous flush split point. - limit := lf.limitFunc(key) - lf.frontier.Update(limit) - return limit - } - lf.frontier.Update(nil) - return nil -} - -// splitterGroup is a compactionOutputSplitter that splits whenever one of its -// child splitters advises a compaction split. 
-type splitterGroup struct { - cmp Compare - splitters []compactionOutputSplitter -} - -func (a *splitterGroup) shouldSplitBefore( - key *InternalKey, tw *sstable.Writer, -) (suggestion maybeSplit) { - for _, splitter := range a.splitters { - if splitter.shouldSplitBefore(key, tw) == splitNow { - return splitNow - } - } - return noSplit -} - -func (a *splitterGroup) onNewOutput(key []byte) []byte { - var earliestLimit []byte - for _, splitter := range a.splitters { - limit := splitter.onNewOutput(key) - if limit == nil { - continue - } - if earliestLimit == nil || a.cmp(limit, earliestLimit) < 0 { - earliestLimit = limit - } - } - return earliestLimit -} - -// userKeyChangeSplitter is a compactionOutputSplitter that takes in a child -// splitter, and splits when 1) that child splitter has advised a split, and 2) -// the compaction output is at the boundary between two user keys (also -// the boundary between atomic compaction units). Use this splitter to wrap -// any splitters that don't guarantee user key splits (i.e. splitters that make -// their determination in ways other than comparing the current key against a -// limit key.) If a wrapped splitter advises a split, it must continue -// to advise a split until a new output. -type userKeyChangeSplitter struct { - cmp Compare - splitter compactionOutputSplitter - unsafePrevUserKey func() []byte -} - -func (u *userKeyChangeSplitter) shouldSplitBefore(key *InternalKey, tw *sstable.Writer) maybeSplit { - // NB: The userKeyChangeSplitter only needs to suffer a key comparison if - // the wrapped splitter requests a split. - // - // We could implement this splitter using frontiers: When the inner splitter - // requests a split before key `k`, we'd update a frontier to be - // ImmediateSuccessor(k). Then on the next key greater than >k, the - // frontier's `reached` func would be called and we'd return splitNow. 
- // This doesn't really save work since duplicate user keys are rare, and it - // requires us to materialize the ImmediateSuccessor key. It also prevents - // us from splitting on the same key that the inner splitter requested a - // split for—instead we need to wait until the next key. The current - // implementation uses `unsafePrevUserKey` to gain access to the previous - // key which allows it to immediately respect the inner splitter if - // possible. - if split := u.splitter.shouldSplitBefore(key, tw); split != splitNow { - return split - } - if u.cmp(key.UserKey, u.unsafePrevUserKey()) > 0 { - return splitNow - } - return noSplit -} - -func (u *userKeyChangeSplitter) onNewOutput(key []byte) []byte { - return u.splitter.onNewOutput(key) -} - -// compactionWritable is a objstorage.Writable wrapper that, on every write, -// updates a metric in `versions` on bytes written by in-progress compactions so -// far. It also increments a per-compaction `written` int. -type compactionWritable struct { - objstorage.Writable - - versions *versionSet - written *int64 -} - -// Write is part of the objstorage.Writable interface. -func (c *compactionWritable) Write(p []byte) error { - if err := c.Writable.Write(p); err != nil { - return err - } - - *c.written += int64(len(p)) - c.versions.incrementCompactionBytes(int64(len(p))) - return nil -} - -type compactionKind int - -const ( - compactionKindDefault compactionKind = iota - compactionKindFlush - // compactionKindMove denotes a move compaction where the input file is - // retained and linked in a new level without being obsoleted. - compactionKindMove - // compactionKindCopy denotes a copy compaction where the input file is - // copied byte-by-byte into a new file with a new FileNum in the output level. 
- compactionKindCopy - compactionKindDeleteOnly - compactionKindElisionOnly - compactionKindRead - compactionKindRewrite - compactionKindIngestedFlushable -) - -func (k compactionKind) String() string { - switch k { - case compactionKindDefault: - return "default" - case compactionKindFlush: - return "flush" - case compactionKindMove: - return "move" - case compactionKindDeleteOnly: - return "delete-only" - case compactionKindElisionOnly: - return "elision-only" - case compactionKindRead: - return "read" - case compactionKindRewrite: - return "rewrite" - case compactionKindIngestedFlushable: - return "ingested-flushable" - case compactionKindCopy: - return "copy" - } - return "?" -} - -// rangeKeyCompactionTransform is used to transform range key spans as part of the -// keyspan.MergingIter. As part of this transformation step, we can elide range -// keys in the last snapshot stripe, as well as coalesce range keys within -// snapshot stripes. -func rangeKeyCompactionTransform( - eq base.Equal, snapshots []uint64, elideRangeKey func(start, end []byte) bool, -) keyspan.Transformer { - return keyspan.TransformerFunc(func(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error { - elideInLastStripe := func(keys []keyspan.Key) []keyspan.Key { - // Unsets and deletes in the last snapshot stripe can be elided. - k := 0 - for j := range keys { - if elideRangeKey(s.Start, s.End) && - (keys[j].Kind() == InternalKeyKindRangeKeyUnset || keys[j].Kind() == InternalKeyKindRangeKeyDelete) { - continue - } - keys[k] = keys[j] - k++ - } - keys = keys[:k] - return keys - } - // snapshots are in ascending order, while s.keys are in descending seqnum - // order. Partition s.keys by snapshot stripes, and call rangekey.Coalesce - // on each partition. 
- dst.Start = s.Start - dst.End = s.End - dst.Keys = dst.Keys[:0] - i, j := len(snapshots)-1, 0 - usedLen := 0 - for i >= 0 { - start := j - for j < len(s.Keys) && !base.Visible(s.Keys[j].SeqNum(), snapshots[i], base.InternalKeySeqNumMax) { - // Include j in current partition. - j++ - } - if j > start { - keysDst := dst.Keys[usedLen:cap(dst.Keys)] - if err := rangekey.Coalesce(cmp, eq, s.Keys[start:j], &keysDst); err != nil { - return err - } - if j == len(s.Keys) { - // This is the last snapshot stripe. Unsets and deletes can be elided. - keysDst = elideInLastStripe(keysDst) - } - usedLen += len(keysDst) - dst.Keys = append(dst.Keys, keysDst...) - } - i-- - } - if j < len(s.Keys) { - keysDst := dst.Keys[usedLen:cap(dst.Keys)] - if err := rangekey.Coalesce(cmp, eq, s.Keys[j:], &keysDst); err != nil { - return err - } - keysDst = elideInLastStripe(keysDst) - usedLen += len(keysDst) - dst.Keys = append(dst.Keys, keysDst...) - } - return nil - }) -} - -// compaction is a table compaction from one level to the next, starting from a -// given version. -type compaction struct { - // cancel is a bool that can be used by other goroutines to signal a compaction - // to cancel, such as if a conflicting excise operation raced it to manifest - // application. Only holders of the manifest lock will write to this atomic. - cancel atomic.Bool - - kind compactionKind - cmp Compare - equal Equal - comparer *base.Comparer - formatKey base.FormatKey - logger Logger - version *version - stats base.InternalIteratorStats - beganAt time.Time - // versionEditApplied is set to true when a compaction has completed and the - // resulting version has been installed (if successful), but the compaction - // goroutine is still cleaning up (eg, deleting obsolete files). - versionEditApplied bool - bufferPool sstable.BufferPool - - // startLevel is the level that is being compacted. Inputs from startLevel - // and outputLevel will be merged to produce a set of outputLevel files. 
- startLevel *compactionLevel - - // outputLevel is the level that files are being produced in. outputLevel is - // equal to startLevel+1 except when: - // - if startLevel is 0, the output level equals compactionPicker.baseLevel(). - // - in multilevel compaction, the output level is the lowest level involved in - // the compaction - // A compaction's outputLevel is nil for delete-only compactions. - outputLevel *compactionLevel - - // extraLevels point to additional levels in between the input and output - // levels that get compacted in multilevel compactions - extraLevels []*compactionLevel - - inputs []compactionLevel - - // maxOutputFileSize is the maximum size of an individual table created - // during compaction. - maxOutputFileSize uint64 - // maxOverlapBytes is the maximum number of bytes of overlap allowed for a - // single output table with the tables in the grandparent level. - maxOverlapBytes uint64 - // disableSpanElision disables elision of range tombstones and range keys. Used - // by tests to allow range tombstones or range keys to be added to tables where - // they would otherwise be elided. - disableSpanElision bool - - // flushing contains the flushables (aka memtables) that are being flushed. - flushing flushableList - // bytesIterated contains the number of bytes that have been flushed/compacted. - bytesIterated uint64 - // bytesWritten contains the number of bytes that have been written to outputs. - bytesWritten int64 - - // The boundaries of the input data. - smallest InternalKey - largest InternalKey - - // The range deletion tombstone fragmenter. Adds range tombstones as they are - // returned from `compactionIter` and fragments them for output to files. - // Referenced by `compactionIter` which uses it to check whether keys are deleted. - rangeDelFrag keyspan.Fragmenter - // The range key fragmenter. Similar to rangeDelFrag in that it gets range - // keys from the compaction iter and fragments them for output to files. 
- rangeKeyFrag keyspan.Fragmenter - // The range deletion tombstone iterator, that merges and fragments - // tombstones across levels. This iterator is included within the compaction - // input iterator as a single level. - // TODO(jackson): Remove this when the refactor of FragmentIterator, - // InterleavingIterator, etc is complete. - rangeDelIter keyspan.InternalIteratorShim - // rangeKeyInterleaving is the interleaving iter for range keys. - rangeKeyInterleaving keyspan.InterleavingIter - - // A list of objects to close when the compaction finishes. Used by input - // iteration to keep rangeDelIters open for the lifetime of the compaction, - // and only close them when the compaction finishes. - closers []io.Closer - - // grandparents are the tables in level+2 that overlap with the files being - // compacted. Used to determine output table boundaries. Do not assume that the actual files - // in the grandparent when this compaction finishes will be the same. - grandparents manifest.LevelSlice - - // Boundaries at which flushes to L0 should be split. Determined by - // L0Sublevels. If nil, flushes aren't split. - l0Limits [][]byte - - // List of disjoint inuse key ranges the compaction overlaps with in - // grandparent and lower levels. See setupInuseKeyRanges() for the - // construction. Used by elideTombstone() and elideRangeTombstone() to - // determine if keys affected by a tombstone possibly exist at a lower level. - inuseKeyRanges []manifest.UserKeyRange - // inuseEntireRange is set if the above inuse key ranges wholly contain the - // compaction's key range. This allows compactions in higher levels to often - // elide key comparisons. - inuseEntireRange bool - elideTombstoneIndex int - - // allowedZeroSeqNum is true if seqnums can be zeroed if there are no - // snapshots requiring them to be kept. This determination is made by - // looking for an sstable which overlaps the bounds of the compaction at a - // lower level in the LSM during runCompaction. 
- allowedZeroSeqNum bool - - metrics map[int]*LevelMetrics - - pickerMetrics compactionPickerMetrics -} - -func (c *compaction) makeInfo(jobID int) CompactionInfo { - info := CompactionInfo{ - JobID: jobID, - Reason: c.kind.String(), - Input: make([]LevelInfo, 0, len(c.inputs)), - Annotations: []string{}, - } - for _, cl := range c.inputs { - inputInfo := LevelInfo{Level: cl.level, Tables: nil} - iter := cl.files.Iter() - for m := iter.First(); m != nil; m = iter.Next() { - inputInfo.Tables = append(inputInfo.Tables, m.TableInfo()) - } - info.Input = append(info.Input, inputInfo) - } - if c.outputLevel != nil { - info.Output.Level = c.outputLevel.level - - // If there are no inputs from the output level (eg, a move - // compaction), add an empty LevelInfo to info.Input. - if len(c.inputs) > 0 && c.inputs[len(c.inputs)-1].level != c.outputLevel.level { - info.Input = append(info.Input, LevelInfo{Level: c.outputLevel.level}) - } - } else { - // For a delete-only compaction, set the output level to L6. The - // output level is not meaningful here, but complicating the - // info.Output interface with a pointer doesn't seem worth the - // semantic distinction. 
- info.Output.Level = numLevels - 1 - } - - for i, score := range c.pickerMetrics.scores { - info.Input[i].Score = score - } - info.SingleLevelOverlappingRatio = c.pickerMetrics.singleLevelOverlappingRatio - info.MultiLevelOverlappingRatio = c.pickerMetrics.multiLevelOverlappingRatio - if len(info.Input) > 2 { - info.Annotations = append(info.Annotations, "multilevel") - } - return info -} - -func newCompaction( - pc *pickedCompaction, opts *Options, beganAt time.Time, provider objstorage.Provider, -) *compaction { - c := &compaction{ - kind: compactionKindDefault, - cmp: pc.cmp, - equal: opts.equal(), - comparer: opts.Comparer, - formatKey: opts.Comparer.FormatKey, - inputs: pc.inputs, - smallest: pc.smallest, - largest: pc.largest, - logger: opts.Logger, - version: pc.version, - beganAt: beganAt, - maxOutputFileSize: pc.maxOutputFileSize, - maxOverlapBytes: pc.maxOverlapBytes, - pickerMetrics: pc.pickerMetrics, - } - c.startLevel = &c.inputs[0] - if pc.startLevel.l0SublevelInfo != nil { - c.startLevel.l0SublevelInfo = pc.startLevel.l0SublevelInfo - } - c.outputLevel = &c.inputs[1] - - if len(pc.extraLevels) > 0 { - c.extraLevels = pc.extraLevels - c.outputLevel = &c.inputs[len(c.inputs)-1] - } - // Compute the set of outputLevel+1 files that overlap this compaction (these - // are the grandparent sstables). - if c.outputLevel.level+1 < numLevels { - c.grandparents = c.version.Overlaps(c.outputLevel.level+1, c.cmp, - c.smallest.UserKey, c.largest.UserKey, c.largest.IsExclusiveSentinel()) - } - c.setupInuseKeyRanges() - c.kind = pc.kind - - if c.kind == compactionKindDefault && c.outputLevel.files.Empty() && !c.hasExtraLevelData() && - c.startLevel.files.Len() == 1 && c.grandparents.SizeSum() <= c.maxOverlapBytes { - // This compaction can be converted into a move or copy from one level - // to the next. We avoid such a move if there is lots of overlapping - // grandparent data. 
Otherwise, the move could create a parent file - // that will require a very expensive merge later on. - iter := c.startLevel.files.Iter() - meta := iter.First() - isRemote := false - // We should always be passed a provider, except in some unit tests. - if provider != nil { - objMeta, err := provider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum) - if err != nil { - panic(errors.Wrapf(err, "cannot lookup table %s in provider", meta.FileBacking.DiskFileNum)) - } - isRemote = objMeta.IsRemote() - } - // Avoid a trivial move or copy if all of these are true, as rewriting a - // new file is better: - // - // 1) The source file is a virtual sstable - // 2) The existing file `meta` is on non-remote storage - // 3) The output level prefers shared storage - mustCopy := !isRemote && remote.ShouldCreateShared(opts.Experimental.CreateOnShared, c.outputLevel.level) - if mustCopy { - // If the source is virtual, it's best to just rewrite the file as all - // conditions in the above comment are met. - if !meta.Virtual { - c.kind = compactionKindCopy - } - } else { - c.kind = compactionKindMove - } - } - return c -} - -func newDeleteOnlyCompaction( - opts *Options, cur *version, inputs []compactionLevel, beganAt time.Time, -) *compaction { - c := &compaction{ - kind: compactionKindDeleteOnly, - cmp: opts.Comparer.Compare, - equal: opts.equal(), - comparer: opts.Comparer, - formatKey: opts.Comparer.FormatKey, - logger: opts.Logger, - version: cur, - beganAt: beganAt, - inputs: inputs, - } - - // Set c.smallest, c.largest. - files := make([]manifest.LevelIterator, 0, len(inputs)) - for _, in := range inputs { - files = append(files, in.files.Iter()) - } - c.smallest, c.largest = manifest.KeyRange(opts.Comparer.Compare, files...) - return c -} - -func adjustGrandparentOverlapBytesForFlush(c *compaction, flushingBytes uint64) { - // Heuristic to place a lower bound on compaction output file size - // caused by Lbase. 
Prior to this heuristic we have observed an L0 in - // production with 310K files of which 290K files were < 10KB in size. - // Our hypothesis is that it was caused by L1 having 2600 files and - // ~10GB, such that each flush got split into many tiny files due to - // overlapping with most of the files in Lbase. - // - // The computation below is general in that it accounts - // for flushing different volumes of data (e.g. we may be flushing - // many memtables). For illustration, we consider the typical - // example of flushing a 64MB memtable. So 12.8MB output, - // based on the compression guess below. If the compressed bytes - // guess is an over-estimate we will end up with smaller files, - // and if an under-estimate we will end up with larger files. - // With a 2MB target file size, 7 files. We are willing to accept - // 4x the number of files, if it results in better write amplification - // when later compacting to Lbase, i.e., ~450KB files (target file - // size / 4). - // - // Note that this is a pessimistic heuristic in that - // fileCountUpperBoundDueToGrandparents could be far from the actual - // number of files produced due to the grandparent limits. For - // example, in the extreme, consider a flush that overlaps with 1000 - // files in Lbase f0...f999, and the initially calculated value of - // maxOverlapBytes will cause splits at f10, f20,..., f990, which - // means an upper bound file count of 100 files. Say the input bytes - // in the flush are such that acceptableFileCount=10. We will fatten - // up maxOverlapBytes by 10x to ensure that the upper bound file count - // drops to 10. However, it is possible that in practice, even without - // this change, we would have produced no more than 10 files, and that - // this change makes the files unnecessarily wide. Say the input bytes - // are distributed such that 10% are in f0...f9, 10% in f10...f19, ... - // 10% in f80...f89 and 10% in f990...f999. 
The original value of - // maxOverlapBytes would have actually produced only 10 sstables. But - // by increasing maxOverlapBytes by 10x, we may produce 1 sstable that - // spans f0...f89, i.e., a much wider sstable than necessary. - // - // We could produce a tighter estimate of - // fileCountUpperBoundDueToGrandparents if we had knowledge of the key - // distribution of the flush. The 4x multiplier mentioned earlier is - // a way to try to compensate for this pessimism. - // - // TODO(sumeer): we don't have compression info for the data being - // flushed, but it is likely that existing files that overlap with - // this flush in Lbase are representative wrt compression ratio. We - // could store the uncompressed size in FileMetadata and estimate - // the compression ratio. - const approxCompressionRatio = 0.2 - approxOutputBytes := approxCompressionRatio * float64(flushingBytes) - approxNumFilesBasedOnTargetSize := - int(math.Ceil(approxOutputBytes / float64(c.maxOutputFileSize))) - acceptableFileCount := float64(4 * approxNumFilesBasedOnTargetSize) - // The byte calculation is linear in numGrandparentFiles, but we will - // incur this linear cost in findGrandparentLimit too, so we are also - // willing to pay it now. We could approximate this cheaply by using - // the mean file size of Lbase. 
- grandparentFileBytes := c.grandparents.SizeSum() - fileCountUpperBoundDueToGrandparents := - float64(grandparentFileBytes) / float64(c.maxOverlapBytes) - if fileCountUpperBoundDueToGrandparents > acceptableFileCount { - c.maxOverlapBytes = uint64( - float64(c.maxOverlapBytes) * - (fileCountUpperBoundDueToGrandparents / acceptableFileCount)) - } -} - -func newFlush( - opts *Options, cur *version, baseLevel int, flushing flushableList, beganAt time.Time, -) *compaction { - c := &compaction{ - kind: compactionKindFlush, - cmp: opts.Comparer.Compare, - equal: opts.equal(), - comparer: opts.Comparer, - formatKey: opts.Comparer.FormatKey, - logger: opts.Logger, - version: cur, - beganAt: beganAt, - inputs: []compactionLevel{{level: -1}, {level: 0}}, - maxOutputFileSize: math.MaxUint64, - maxOverlapBytes: math.MaxUint64, - flushing: flushing, - } - c.startLevel = &c.inputs[0] - c.outputLevel = &c.inputs[1] - - if len(flushing) > 0 { - if _, ok := flushing[0].flushable.(*ingestedFlushable); ok { - if len(flushing) != 1 { - panic("pebble: ingestedFlushable must be flushed one at a time.") - } - c.kind = compactionKindIngestedFlushable - return c - } - } - - // Make sure there's no ingestedFlushable after the first flushable in the - // list. 
- for _, f := range flushing { - if _, ok := f.flushable.(*ingestedFlushable); ok { - panic("pebble: flushing shouldn't contain ingestedFlushable flushable") - } - } - - if cur.L0Sublevels != nil { - c.l0Limits = cur.L0Sublevels.FlushSplitKeys() - } - - smallestSet, largestSet := false, false - updatePointBounds := func(iter internalIterator) { - if key, _ := iter.First(); key != nil { - if !smallestSet || - base.InternalCompare(c.cmp, c.smallest, *key) > 0 { - smallestSet = true - c.smallest = key.Clone() - } - } - if key, _ := iter.Last(); key != nil { - if !largestSet || - base.InternalCompare(c.cmp, c.largest, *key) < 0 { - largestSet = true - c.largest = key.Clone() - } - } - } - - updateRangeBounds := func(iter keyspan.FragmentIterator) { - // File bounds require s != nil && !s.Empty(). We only need to check for - // s != nil here, as the memtable's FragmentIterator would never surface - // empty spans. - if s := iter.First(); s != nil { - if key := s.SmallestKey(); !smallestSet || - base.InternalCompare(c.cmp, c.smallest, key) > 0 { - smallestSet = true - c.smallest = key.Clone() - } - } - if s := iter.Last(); s != nil { - if key := s.LargestKey(); !largestSet || - base.InternalCompare(c.cmp, c.largest, key) < 0 { - largestSet = true - c.largest = key.Clone() - } - } - } - - var flushingBytes uint64 - for i := range flushing { - f := flushing[i] - updatePointBounds(f.newIter(nil)) - if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil { - updateRangeBounds(rangeDelIter) - } - if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil { - updateRangeBounds(rangeKeyIter) - } - flushingBytes += f.inuseBytes() - } - - if opts.FlushSplitBytes > 0 { - c.maxOutputFileSize = uint64(opts.Level(0).TargetFileSize) - c.maxOverlapBytes = maxGrandparentOverlapBytes(opts, 0) - c.grandparents = c.version.Overlaps(baseLevel, c.cmp, c.smallest.UserKey, - c.largest.UserKey, c.largest.IsExclusiveSentinel()) - adjustGrandparentOverlapBytesForFlush(c, flushingBytes) 
- } - - c.setupInuseKeyRanges() - return c -} - -func (c *compaction) hasExtraLevelData() bool { - if len(c.extraLevels) == 0 { - // not a multi level compaction - return false - } else if c.extraLevels[0].files.Empty() { - // a multi level compaction without data in the intermediate input level; - // e.g. for a multi level compaction with levels 4,5, and 6, this could - // occur if there is no files to compact in 5, or in 5 and 6 (i.e. a move). - return false - } - return true -} - -func (c *compaction) setupInuseKeyRanges() { - level := c.outputLevel.level + 1 - if c.outputLevel.level == 0 { - level = 0 - } - // calculateInuseKeyRanges will return a series of sorted spans. Overlapping - // or abutting spans have already been merged. - c.inuseKeyRanges = calculateInuseKeyRanges( - c.version, c.cmp, level, numLevels-1, c.smallest.UserKey, c.largest.UserKey, - ) - // Check if there's a single in-use span that encompasses the entire key - // range of the compaction. This is an optimization to avoid key comparisons - // against inuseKeyRanges during the compaction when every key within the - // compaction overlaps with an in-use span. - if len(c.inuseKeyRanges) > 0 { - c.inuseEntireRange = c.cmp(c.inuseKeyRanges[0].Start, c.smallest.UserKey) <= 0 && - c.cmp(c.inuseKeyRanges[0].End, c.largest.UserKey) >= 0 - } -} - -func calculateInuseKeyRanges( - v *version, cmp base.Compare, level, maxLevel int, smallest, largest []byte, -) []manifest.UserKeyRange { - // Use two slices, alternating which one is input and which one is output - // as we descend the LSM. - var input, output []manifest.UserKeyRange - - // L0 requires special treatment, since sstables within L0 may overlap. - // We use the L0 Sublevels structure to efficiently calculate the merged - // in-use key ranges. 
- if level == 0 { - output = v.L0Sublevels.InUseKeyRanges(smallest, largest) - level++ - } - - for ; level <= maxLevel; level++ { - // NB: We always treat `largest` as inclusive for simplicity, because - // there's little consequence to calculating slightly broader in-use key - // ranges. - overlaps := v.Overlaps(level, cmp, smallest, largest, false /* exclusiveEnd */) - iter := overlaps.Iter() - - // We may already have in-use key ranges from higher levels. Iterate - // through both our accumulated in-use key ranges and this level's - // files, merging the two. - // - // Tables higher within the LSM have broader key spaces. We use this - // when possible to seek past a level's files that are contained by - // our current accumulated in-use key ranges. This helps avoid - // per-sstable work during flushes or compactions in high levels which - // overlap the majority of the LSM's sstables. - input, output = output, input - output = output[:0] - - var currFile *fileMetadata - var currAccum *manifest.UserKeyRange - if len(input) > 0 { - currAccum, input = &input[0], input[1:] - } - - // If we have an accumulated key range and its start is ≤ smallest, - // we can seek to the accumulated range's end. Otherwise, we need to - // start at the first overlapping file within the level. - if currAccum != nil && cmp(currAccum.Start, smallest) <= 0 { - currFile = seekGT(&iter, cmp, currAccum.End) - } else { - currFile = iter.First() - } - - for currFile != nil || currAccum != nil { - // If we've exhausted either the files in the level or the - // accumulated key ranges, we just need to append the one we have. - // If we have both a currFile and a currAccum, they either overlap - // or they're disjoint. If they're disjoint, we append whichever - // one sorts first and move on to the next file or range. If they - // overlap, we merge them into currAccum and proceed to the next - // file. 
- switch { - case currAccum == nil || (currFile != nil && cmp(currFile.Largest.UserKey, currAccum.Start) < 0): - // This file is strictly before the current accumulated range, - // or there are no more accumulated ranges. - output = append(output, manifest.UserKeyRange{ - Start: currFile.Smallest.UserKey, - End: currFile.Largest.UserKey, - }) - currFile = iter.Next() - case currFile == nil || (currAccum != nil && cmp(currAccum.End, currFile.Smallest.UserKey) < 0): - // The current accumulated key range is strictly before the - // current file, or there are no more files. - output = append(output, *currAccum) - currAccum = nil - if len(input) > 0 { - currAccum, input = &input[0], input[1:] - } - default: - // The current accumulated range and the current file overlap. - // Adjust the accumulated range to be the union. - if cmp(currFile.Smallest.UserKey, currAccum.Start) < 0 { - currAccum.Start = currFile.Smallest.UserKey - } - if cmp(currFile.Largest.UserKey, currAccum.End) > 0 { - currAccum.End = currFile.Largest.UserKey - } - - // Extending `currAccum`'s end boundary may have caused it to - // overlap with `input` key ranges that we haven't processed - // yet. Merge any such key ranges. - for len(input) > 0 && cmp(input[0].Start, currAccum.End) <= 0 { - if cmp(input[0].End, currAccum.End) > 0 { - currAccum.End = input[0].End - } - input = input[1:] - } - // Seek the level iterator past our current accumulated end. - currFile = seekGT(&iter, cmp, currAccum.End) - } - } - } - return output -} - -func seekGT(iter *manifest.LevelIterator, cmp base.Compare, key []byte) *manifest.FileMetadata { - f := iter.SeekGE(cmp, key) - for f != nil && cmp(f.Largest.UserKey, key) == 0 { - f = iter.Next() - } - return f -} - -// findGrandparentLimit takes the start user key for a table and returns the -// user key to which that table can extend without excessively overlapping -// the grandparent level. 
If no limit is needed considering the grandparent -// files, this function returns nil. This is done in order to prevent a table -// at level N from overlapping too much data at level N+1. We want to avoid -// such large overlaps because they translate into large compactions. The -// current heuristic stops output of a table if the addition of another key -// would cause the table to overlap more than 10x the target file size at -// level N. See maxGrandparentOverlapBytes. -func (c *compaction) findGrandparentLimit(start []byte) []byte { - iter := c.grandparents.Iter() - var overlappedBytes uint64 - var greater bool - for f := iter.SeekGE(c.cmp, start); f != nil; f = iter.Next() { - overlappedBytes += f.Size - // To ensure forward progress we always return a larger user - // key than where we started. See comments above clients of - // this function for how this is used. - greater = greater || c.cmp(f.Smallest.UserKey, start) > 0 - if !greater { - continue - } - - // We return the smallest bound of a sstable rather than the - // largest because the smallest is always inclusive, and limits - // are used exlusively when truncating range tombstones. If we - // truncated an output to the largest key while there's a - // pending tombstone, the next output file would also overlap - // the same grandparent f. - if overlappedBytes > c.maxOverlapBytes { - return f.Smallest.UserKey - } - } - return nil -} - -// findL0Limit takes the start key for a table and returns the user key to which -// that table can be extended without hitting the next l0Limit. Having flushed -// sstables "bridging across" an l0Limit could lead to increased L0 -> LBase -// compaction sizes as well as elevated read amplification. 
-func (c *compaction) findL0Limit(start []byte) []byte { - if c.startLevel.level > -1 || c.outputLevel.level != 0 || len(c.l0Limits) == 0 { - return nil - } - index := sort.Search(len(c.l0Limits), func(i int) bool { - return c.cmp(c.l0Limits[i], start) > 0 - }) - if index < len(c.l0Limits) { - return c.l0Limits[index] - } - return nil -} - -// errorOnUserKeyOverlap returns an error if the last two written sstables in -// this compaction have revisions of the same user key present in both sstables, -// when it shouldn't (eg. when splitting flushes). -func (c *compaction) errorOnUserKeyOverlap(ve *versionEdit) error { - if n := len(ve.NewFiles); n > 1 { - meta := ve.NewFiles[n-1].Meta - prevMeta := ve.NewFiles[n-2].Meta - if !prevMeta.Largest.IsExclusiveSentinel() && - c.cmp(prevMeta.Largest.UserKey, meta.Smallest.UserKey) >= 0 { - return errors.Errorf("pebble: compaction split user key across two sstables: %s in %s and %s", - prevMeta.Largest.Pretty(c.formatKey), - prevMeta.FileNum, - meta.FileNum) - } - } - return nil -} - -// allowZeroSeqNum returns true if seqnum's can be zeroed if there are no -// snapshots requiring them to be kept. It performs this determination by -// looking for an sstable which overlaps the bounds of the compaction at a -// lower level in the LSM. -func (c *compaction) allowZeroSeqNum() bool { - return c.elideRangeTombstone(c.smallest.UserKey, c.largest.UserKey) -} - -// elideTombstone returns true if it is ok to elide a tombstone for the -// specified key. A return value of true guarantees that there are no key/value -// pairs at c.level+2 or higher that possibly contain the specified user -// key. The keys in multiple invocations to elideTombstone must be supplied in -// order. 
func (c *compaction) elideTombstone(key []byte) bool {
	if c.inuseEntireRange || len(c.flushing) != 0 {
		return false
	}

	// Because keys across calls arrive in order, resume scanning the sorted
	// inuseKeyRanges from where the previous call stopped
	// (c.elideTombstoneIndex) rather than from the beginning.
	for ; c.elideTombstoneIndex < len(c.inuseKeyRanges); c.elideTombstoneIndex++ {
		r := &c.inuseKeyRanges[c.elideTombstoneIndex]
		if c.cmp(key, r.End) <= 0 {
			if c.cmp(key, r.Start) >= 0 {
				// key lies within an in-use range: the tombstone may still
				// shadow data below, so it cannot be elided.
				return false
			}
			break
		}
	}
	return true
}

// elideRangeTombstone returns true if it is ok to elide the specified range
// tombstone. A return value of true guarantees that there are no key/value
// pairs at c.outputLevel.level+1 or higher that possibly overlap the specified
// tombstone.
func (c *compaction) elideRangeTombstone(start, end []byte) bool {
	// Disable range tombstone elision if the testing knob for that is enabled,
	// or if we are flushing memtables. The latter requirement is due to
	// inuseKeyRanges not accounting for key ranges in other memtables that are
	// being flushed in the same compaction. It's possible for a range tombstone
	// in one memtable to overlap keys in a preceding memtable in c.flushing.
	//
	// This function is also used in setting allowZeroSeqNum, so disabling
	// elision of range tombstones also disables zeroing of SeqNums.
	//
	// TODO(peter): we disable zeroing of seqnums during flushing to match
	// RocksDB behavior and to avoid generating overlapping sstables during
	// DB.replayWAL. When replaying WAL files at startup, we flush after each
	// WAL is replayed building up a single version edit that is
	// applied. Because we don't apply the version edit after each flush, this
	// code doesn't know that L0 contains files and zeroing of seqnums should
	// be disabled. That is fixable, but it seems safer to just match the
	// RocksDB behavior for now.
	if c.disableSpanElision || len(c.flushing) != 0 {
		return false
	}

	// Binary search for any in-use key range overlapping [start, end]. The
	// tombstone is elidable iff no such range exists (lower >= upper).
	lower := sort.Search(len(c.inuseKeyRanges), func(i int) bool {
		return c.cmp(c.inuseKeyRanges[i].End, start) >= 0
	})
	upper := sort.Search(len(c.inuseKeyRanges), func(i int) bool {
		return c.cmp(c.inuseKeyRanges[i].Start, end) > 0
	})
	return lower >= upper
}

// elideRangeKey returns true if it is ok to elide the specified range key. A
// return value of true guarantees that there are no key/value pairs at
// c.outputLevel.level+1 or higher that possibly overlap the specified range key.
func (c *compaction) elideRangeKey(start, end []byte) bool {
	// TODO(bilal): Track inuseKeyRanges separately for the range keyspace as
	// opposed to the point keyspace. Once that is done, elideRangeTombstone
	// can just check in the point keyspace, and this function can check for
	// inuseKeyRanges in the range keyspace.
	return c.elideRangeTombstone(start, end)
}

// newInputIter returns an iterator over all the input tables in a compaction.
func (c *compaction) newInputIter(
	newIters tableNewIters, newRangeKeyIter keyspan.TableNewSpanIter, snapshots []uint64,
) (_ internalIterator, retErr error) {
	// Validate the ordering of compaction input files for defense in depth.
	// TODO(jackson): Some of the CheckOrdering calls may be adapted to pass
	// ProhibitSplitUserKeys if we thread the active format major version in. Or
	// if we remove support for earlier FMVs, we can remove the parameter
	// altogether.
	if len(c.flushing) == 0 {
		if c.startLevel.level >= 0 {
			err := manifest.CheckOrdering(c.cmp, c.formatKey,
				manifest.Level(c.startLevel.level), c.startLevel.files.Iter(),
				manifest.AllowSplitUserKeys)
			if err != nil {
				return nil, err
			}
		}
		err := manifest.CheckOrdering(c.cmp, c.formatKey,
			manifest.Level(c.outputLevel.level), c.outputLevel.files.Iter(),
			manifest.AllowSplitUserKeys)
		if err != nil {
			return nil, err
		}
		if c.startLevel.level == 0 {
			if c.startLevel.l0SublevelInfo == nil {
				panic("l0SublevelInfo not created for compaction out of L0")
			}
			for _, info := range c.startLevel.l0SublevelInfo {
				err := manifest.CheckOrdering(c.cmp, c.formatKey,
					info.sublevel, info.Iter(),
					// NB: L0 sublevels have never allowed split user keys.
					manifest.ProhibitSplitUserKeys)
				if err != nil {
					return nil, err
				}
			}
		}
		if len(c.extraLevels) > 0 {
			if len(c.extraLevels) > 1 {
				panic("n>2 multi level compaction not implemented yet")
			}
			interLevel := c.extraLevels[0]
			err := manifest.CheckOrdering(c.cmp, c.formatKey,
				manifest.Level(interLevel.level), interLevel.files.Iter(),
				manifest.AllowSplitUserKeys)
			if err != nil {
				return nil, err
			}
		}
	}

	// There are three classes of keys that a compaction needs to process: point
	// keys, range deletion tombstones and range keys. Collect all iterators for
	// all these classes of keys from all the levels. We'll aggregate them
	// together farther below.
	//
	// numInputLevels is an approximation of the number of iterator levels. Due
	// to idiosyncrasies in iterator construction, we may (rarely) exceed this
	// initial capacity.
	numInputLevels := max[int](len(c.flushing), len(c.inputs))
	iters := make([]internalIterator, 0, numInputLevels)
	rangeDelIters := make([]keyspan.FragmentIterator, 0, numInputLevels)
	rangeKeyIters := make([]keyspan.FragmentIterator, 0, numInputLevels)

	// If construction of the iterator inputs fails, ensure that we close all
	// the constituent iterators.
	defer func() {
		if retErr != nil {
			for _, iter := range iters {
				if iter != nil {
					iter.Close()
				}
			}
			for _, rangeDelIter := range rangeDelIters {
				rangeDelIter.Close()
			}
		}
	}()
	iterOpts := IterOptions{logger: c.logger}

	// Populate iters, rangeDelIters and rangeKeyIters with the appropriate
	// constituent iterators. This depends on whether this is a flush or a
	// compaction.
	if len(c.flushing) != 0 {
		// If flushing, we need to build the input iterators over the memtables
		// stored in c.flushing.
		for i := range c.flushing {
			f := c.flushing[i]
			iters = append(iters, f.newFlushIter(nil, &c.bytesIterated))
			rangeDelIter := f.newRangeDelIter(nil)
			if rangeDelIter != nil {
				rangeDelIters = append(rangeDelIters, rangeDelIter)
			}
			if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil {
				rangeKeyIters = append(rangeKeyIters, rangeKeyIter)
			}
		}
	} else {
		// addItersForLevel appends the point, rangedel and range-key iterators
		// for a single input level.
		addItersForLevel := func(level *compactionLevel, l manifest.Level) error {
			// Add a *levelIter for point iterators. Because we don't call
			// initRangeDel, the levelIter will close and forget the range
			// deletion iterator when it steps on to a new file. Surfacing range
			// deletions to compactions are handled below.
			iters = append(iters, newLevelIter(iterOpts, c.comparer, newIters,
				level.files.Iter(), l, internalIterOpts{
					bytesIterated: &c.bytesIterated,
					bufferPool:    &c.bufferPool,
				}))
			// TODO(jackson): Use keyspan.LevelIter to avoid loading all the range
			// deletions into memory upfront. (See #2015, which reverted this.)
			// There will be no user keys that are split between sstables
			// within a level in Cockroach 23.1, which unblocks this optimization.

			// Add the range deletion iterator for each file as an independent level
			// in mergingIter, as opposed to making a levelIter out of those. This
			// is safer as levelIter expects all keys coming from underlying
			// iterators to be in order. Due to compaction / tombstone writing
			// logic in finishOutput(), it is possible for range tombstones to not
			// be strictly ordered across all files in one level.
			//
			// Consider this example from the metamorphic tests (also repeated in
			// finishOutput()), consisting of three L3 files with their bounds
			// specified in square brackets next to the file name:
			//
			// ./000240.sst [tmgc#391,MERGE-tmgc#391,MERGE]
			// tmgc#391,MERGE [786e627a]
			// tmgc-udkatvs#331,RANGEDEL
			//
			// ./000241.sst [tmgc#384,MERGE-tmgc#384,MERGE]
			// tmgc#384,MERGE [666c7070]
			// tmgc-tvsalezade#383,RANGEDEL
			// tmgc-tvsalezade#331,RANGEDEL
			//
			// ./000242.sst [tmgc#383,RANGEDEL-tvsalezade#72057594037927935,RANGEDEL]
			// tmgc-tvsalezade#383,RANGEDEL
			// tmgc#375,SET [72646c78766965616c72776865676e79]
			// tmgc-tvsalezade#356,RANGEDEL
			//
			// Here, the range tombstone in 000240.sst falls "after" one in
			// 000241.sst, despite 000240.sst being ordered "before" 000241.sst for
			// levelIter's purposes. While each file is still consistent before its
			// bounds, it's safer to have all rangedel iterators be visible to
			// mergingIter.
			iter := level.files.Iter()
			for f := iter.First(); f != nil; f = iter.Next() {
				rangeDelIter, closer, err := c.newRangeDelIter(newIters, iter.Take(), nil, l, &c.bytesIterated)
				if err != nil {
					// The error will already be annotated with the BackingFileNum, so
					// we annotate it with the FileNum.
					return errors.Wrapf(err, "pebble: could not open table %s", errors.Safe(f.FileNum))
				}
				if rangeDelIter == nil {
					continue
				}
				rangeDelIters = append(rangeDelIters, rangeDelIter)
				c.closers = append(c.closers, closer)
			}

			// Check if this level has any range keys.
			hasRangeKeys := false
			for f := iter.First(); f != nil; f = iter.Next() {
				if f.HasRangeKeys {
					hasRangeKeys = true
					break
				}
			}
			if hasRangeKeys {
				li := &keyspan.LevelIter{}
				newRangeKeyIterWrapper := func(file *manifest.FileMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) {
					iter, err := newRangeKeyIter(file, iterOptions)
					if err != nil {
						return nil, err
					} else if iter == nil {
						return emptyKeyspanIter, nil
					}
					// Ensure that the range key iter is not closed until the compaction is
					// finished. This is necessary because range key processing
					// requires the range keys to be held in memory for up to the
					// lifetime of the compaction.
					c.closers = append(c.closers, iter)
					iter = noCloseIter{iter}

					// We do not need to truncate range keys to sstable boundaries, or
					// only read within the file's atomic compaction units, unlike with
					// range tombstones. This is because range keys were added after we
					// stopped splitting user keys across sstables, so all the range keys
					// in this sstable must wholly lie within the file's bounds.
					return iter, err
				}
				li.Init(keyspan.SpanIterOptions{}, c.cmp, newRangeKeyIterWrapper, level.files.Iter(), l, manifest.KeyTypeRange)
				rangeKeyIters = append(rangeKeyIters, li)
			}
			return nil
		}

		for i := range c.inputs {
			// If the level is annotated with l0SublevelInfo, expand it into one
			// level per sublevel.
			// TODO(jackson): Perform this expansion even earlier when we pick the
			// compaction?
- if len(c.inputs[i].l0SublevelInfo) > 0 { - for _, info := range c.startLevel.l0SublevelInfo { - sublevelCompactionLevel := &compactionLevel{0, info.LevelSlice, nil} - if err := addItersForLevel(sublevelCompactionLevel, info.sublevel); err != nil { - return nil, err - } - } - continue - } - if err := addItersForLevel(&c.inputs[i], manifest.Level(c.inputs[i].level)); err != nil { - return nil, err - } - } - } - - // In normal operation, levelIter iterates over the point operations in a - // level, and initializes a rangeDelIter pointer for the range deletions in - // each table. During compaction, we want to iterate over the merged view of - // point operations and range deletions. In order to do this we create one - // levelIter per level to iterate over the point operations, and collect up - // all the range deletion files. - // - // The range deletion levels are first combined with a keyspan.MergingIter - // (currently wrapped by a keyspan.InternalIteratorShim to satisfy the - // internal iterator interface). The resulting merged rangedel iterator is - // then included with the point levels in a single mergingIter. - // - // Combine all the rangedel iterators using a keyspan.MergingIterator and a - // InternalIteratorShim so that the range deletions may be interleaved in - // the compaction input. - // TODO(jackson): Replace the InternalIteratorShim with an interleaving - // iterator. - if len(rangeDelIters) > 0 { - c.rangeDelIter.Init(c.cmp, rangeDelIters...) - iters = append(iters, &c.rangeDelIter) - } - - // If there's only one constituent point iterator, we can avoid the overhead - // of a *mergingIter. This is possible, for example, when performing a flush - // of a single memtable. Otherwise, combine all the iterators into a merging - // iter. - iter := iters[0] - if len(iters) > 0 { - iter = newMergingIter(c.logger, &c.stats, c.cmp, nil, iters...) 
	}
	// If there are range key iterators, we need to combine them using
	// keyspan.MergingIter, and then interleave them among the points.
	if len(rangeKeyIters) > 0 {
		mi := &keyspan.MergingIter{}
		mi.Init(c.cmp, rangeKeyCompactionTransform(c.equal, snapshots, c.elideRangeKey), new(keyspan.MergingBuffers), rangeKeyIters...)
		di := &keyspan.DefragmentingIter{}
		di.Init(c.comparer, mi, keyspan.DefragmentInternal, keyspan.StaticDefragmentReducer, new(keyspan.DefragmentingBuffers))
		c.rangeKeyInterleaving.Init(c.comparer, iter, di, keyspan.InterleavingIterOpts{})
		iter = &c.rangeKeyInterleaving
	}
	return iter, nil
}

// newRangeDelIter opens the sstable f and returns an iterator over its range
// deletions truncated to the file's atomic compaction unit, together with an
// io.Closer that releases the underlying iterator. Both returns are nil when
// the file contains no range deletions.
//
// NB: the bytesIterated parameter is currently unused; c.bytesIterated is
// passed to newIters directly.
func (c *compaction) newRangeDelIter(
	newIters tableNewIters,
	f manifest.LevelFile,
	_ *IterOptions,
	l manifest.Level,
	bytesIterated *uint64,
) (keyspan.FragmentIterator, io.Closer, error) {
	iter, rangeDelIter, err := newIters(context.Background(), f.FileMetadata,
		&IterOptions{level: l}, internalIterOpts{
			bytesIterated: &c.bytesIterated,
			bufferPool:    &c.bufferPool,
		})
	if err != nil {
		return nil, nil, err
	}
	// TODO(peter): It is mildly wasteful to open the point iterator only to
	// immediately close it. One way to solve this would be to add new
	// methods to tableCache for creating point and range-deletion iterators
	// independently. We'd only want to use those methods here,
	// though. Doesn't seem worth the hassle in the near term.
	if err = iter.Close(); err != nil {
		if rangeDelIter != nil {
			err = errors.CombineErrors(err, rangeDelIter.Close())
		}
		return nil, nil, err
	}
	if rangeDelIter == nil {
		// The file doesn't contain any range deletions.
		return nil, nil, nil
	}

	// Ensure that rangeDelIter is not closed until the compaction is
	// finished. This is necessary because range tombstone processing
	// requires the range tombstones to be held in memory for up to the
	// lifetime of the compaction.
	closer := rangeDelIter
	rangeDelIter = noCloseIter{rangeDelIter}

	// Truncate the range tombstones returned by the iterator to the
	// upper bound of the atomic compaction unit of the file. We want to
	// truncate the range tombstone to the bounds of the file, but files
	// with split user keys pose an obstacle: The file's largest bound
	// is inclusive whereas the range tombstone's end is exclusive.
	//
	// Consider the example:
	//
	// 000001:[b-f#200] range del [c,k)
	// 000002:[f#190-g#inf] range del [c,k)
	// 000003:[g#500-i#3]
	//
	// Files 000001 and 000002 contain the untruncated range tombstones
	// [c,k). While the keyspace covered by 000003 was at one point
	// deleted by the tombstone [c,k), the tombstone may have already
	// been compacted away and the file does not contain an untruncated
	// range tombstone. We want to bound 000001's tombstone to the file
	// bounds, but it's not possible to encode a range tombstone with an
	// end boundary within a user key (eg, between sequence numbers
	// f#200 and f#190). Instead, we expand 000001 to its atomic
	// compaction unit (000001 and 000002) and truncate the tombstone to
	// g#inf.
	//
	// NB: We must not use the atomic compaction unit of the entire
	// compaction, because the [c,k) tombstone contained in the file
	// 000001 ≥ g. If 000001, 000002 and 000003 are all included in the
	// same compaction, the compaction's atomic compaction unit includes
	// 000003. However 000003's keys must not be covered by 000001's
	// untruncated range tombstone.
	//
	// Note that we need do this truncation at read time in order to
	// handle sstables generated by RocksDB and earlier versions of
	// Pebble which do not truncate range tombstones to atomic
	// compaction unit boundaries at write time.
	//
	// The current Pebble compaction logic DOES truncate tombstones to
	// atomic unit boundaries at compaction time too.
	atomicUnit, _ := expandToAtomicUnit(c.cmp, f.Slice(), true /* disableIsCompacting */)
	lowerBound, upperBound := manifest.KeyRange(c.cmp, atomicUnit.Iter())
	// Range deletion tombstones are often written to sstables
	// untruncated on the end key side. However, they are still only
	// valid within a given file's bounds. The logic for writing range
	// tombstones to an output file sometimes has an incomplete view
	// of range tombstones outside the file's internal key bounds. Skip
	// any range tombstones completely outside file bounds.
	rangeDelIter = keyspan.Truncate(
		c.cmp, rangeDelIter, lowerBound.UserKey, upperBound.UserKey,
		&f.Smallest, &f.Largest, false, /* panicOnUpperTruncate */
	)
	return rangeDelIter, closer, nil
}

// String returns a multi-line, human-readable description of the compaction's
// input files, one line per input level ("flush" for memtable flushes).
func (c *compaction) String() string {
	if len(c.flushing) != 0 {
		return "flush\n"
	}

	var buf bytes.Buffer
	for level := c.startLevel.level; level <= c.outputLevel.level; level++ {
		i := level - c.startLevel.level
		fmt.Fprintf(&buf, "%d:", level)
		iter := c.inputs[i].files.Iter()
		for f := iter.First(); f != nil; f = iter.Next() {
			fmt.Fprintf(&buf, " %s:%s-%s", f.FileNum, f.Smallest, f.Largest)
		}
		fmt.Fprintf(&buf, "\n")
	}
	return buf.String()
}

// manualCompaction tracks the state of a manually requested compaction over
// the key range [start, end] from level to outputLevel.
type manualCompaction struct {
	// Count of the retries either due to too many concurrent compactions, or a
	// concurrent compaction to overlapping levels.
	retries     int
	level       int
	outputLevel int
	// done receives the compaction's terminal error (or nil) on completion.
	done  chan error
	start []byte
	end   []byte
	split bool
}

// readCompaction describes a compaction of a single file scheduled in
// response to reads.
type readCompaction struct {
	level int
	// [start, end] key ranges are used for de-duping.
	start []byte
	end   []byte

	// The file associated with the compaction.
	// If the file no longer belongs in the same
	// level, then we skip the compaction.
	fileNum base.FileNum
}

// addInProgressCompaction registers c in d.mu.compact.inProgress and marks
// every input file as compacting. For compactions involving L0 it also updates
// the L0Sublevels state. It is fatal for an input file to already be marked
// compacting.
func (d *DB) addInProgressCompaction(c *compaction) {
	d.mu.compact.inProgress[c] = struct{}{}
	var isBase, isIntraL0 bool
	for _, cl := range c.inputs {
		iter := cl.files.Iter()
		for f := iter.First(); f != nil; f = iter.Next() {
			if f.IsCompacting() {
				d.opts.Logger.Fatalf("L%d->L%d: %s already being compacted", c.startLevel.level, c.outputLevel.level, f.FileNum)
			}
			f.SetCompactionState(manifest.CompactionStateCompacting)
			if c.startLevel != nil && c.outputLevel != nil && c.startLevel.level == 0 {
				// L0 is involved: classify as intra-L0 (L0->L0) or base
				// (L0->LBase) for the sublevel bookkeeping below.
				if c.outputLevel.level == 0 {
					f.IsIntraL0Compacting = true
					isIntraL0 = true
				} else {
					isBase = true
				}
			}
		}
	}

	if (isIntraL0 || isBase) && c.version.L0Sublevels != nil {
		l0Inputs := []manifest.LevelSlice{c.startLevel.files}
		if isIntraL0 {
			l0Inputs = append(l0Inputs, c.outputLevel.files)
		}
		if err := c.version.L0Sublevels.UpdateStateForStartedCompaction(l0Inputs, isBase); err != nil {
			d.opts.Logger.Fatalf("could not update state for compaction: %s", err)
		}
	}
}

// Removes compaction markers from files in a compaction. The rollback parameter
// indicates whether the compaction state should be rolled back to its original
// state in the case of an unsuccessful compaction.
//
// DB.mu must be held when calling this method, however this method can drop and
// re-acquire that mutex. All writes to the manifest for this compaction should
// have completed by this point.
func (d *DB) clearCompactingState(c *compaction, rollback bool) {
	c.versionEditApplied = true
	for _, cl := range c.inputs {
		iter := cl.files.Iter()
		for f := iter.First(); f != nil; f = iter.Next() {
			if !f.IsCompacting() {
				d.opts.Logger.Fatalf("L%d->L%d: %s not being compacted", c.startLevel.level, c.outputLevel.level, f.FileNum)
			}
			if !rollback {
				// On success all compactions other than move-compactions transition the
				// file into the Compacted state. Move-compacted files become eligible
				// for compaction again and transition back to NotCompacting.
				if c.kind != compactionKindMove {
					f.SetCompactionState(manifest.CompactionStateCompacted)
				} else {
					f.SetCompactionState(manifest.CompactionStateNotCompacting)
				}
			} else {
				// Else, on rollback, all input files unconditionally transition back to
				// NotCompacting.
				f.SetCompactionState(manifest.CompactionStateNotCompacting)
			}
			f.IsIntraL0Compacting = false
		}
	}
	l0InProgress := inProgressL0Compactions(d.getInProgressCompactionInfoLocked(c))
	func() {
		// InitCompactingFileInfo requires that no other manifest writes be
		// happening in parallel with it, i.e. we're not in the midst of installing
		// another version. Otherwise, it's possible that we've created another
		// L0Sublevels instance, but not added it to the versions list, causing
		// all the indices in FileMetadata to be inaccurate. To ensure this,
		// grab the manifest lock.
		d.mu.versions.logLock()
		defer d.mu.versions.logUnlock()
		d.mu.versions.currentVersion().L0Sublevels.InitCompactingFileInfo(l0InProgress)
	}()
}

// calculateDiskAvailableBytes returns the free disk space reported by the
// filesystem, caching the result in d.diskAvailBytes. If GetDiskUsage fails
// with an unexpected error, the last cached value is returned.
func (d *DB) calculateDiskAvailableBytes() uint64 {
	if space, err := d.opts.FS.GetDiskUsage(d.dirname); err == nil {
		d.diskAvailBytes.Store(space.AvailBytes)
		return space.AvailBytes
	} else if !errors.Is(err, vfs.ErrUnsupported) {
		d.opts.EventListener.BackgroundError(err)
	}
	return d.diskAvailBytes.Load()
}

// getDeletionPacerInfo gathers the free, obsolete and live byte counts used to
// pace deletions of obsolete files.
func (d *DB) getDeletionPacerInfo() deletionPacerInfo {
	var pacerInfo deletionPacerInfo
	// Call GetDiskUsage after every file deletion. This may seem inefficient,
	// but in practice this was observed to take constant time, regardless of
	// volume size used, at least on linux with ext4 and zfs. All invocations
	// take 10 microseconds or less.
	pacerInfo.freeBytes = d.calculateDiskAvailableBytes()
	d.mu.Lock()
	pacerInfo.obsoleteBytes = d.mu.versions.metrics.Table.ObsoleteSize
	pacerInfo.liveBytes = uint64(d.mu.versions.metrics.Total().Size)
	d.mu.Unlock()
	return pacerInfo
}

// onObsoleteTableDelete is called to update metrics when an sstable is deleted.
func (d *DB) onObsoleteTableDelete(fileSize uint64) {
	d.mu.Lock()
	d.mu.versions.metrics.Table.ObsoleteCount--
	d.mu.versions.metrics.Table.ObsoleteSize -= fileSize
	d.mu.Unlock()
}

// maybeScheduleFlush schedules a flush if necessary.
//
// d.mu must be held when calling this.
func (d *DB) maybeScheduleFlush() {
	// Nothing to do if a flush is already running, the DB is closed or
	// read-only, or there are no immutable memtables queued.
	if d.mu.compact.flushing || d.closed.Load() != nil || d.opts.ReadOnly {
		return
	}
	if len(d.mu.mem.queue) <= 1 {
		return
	}

	if !d.passedFlushThreshold() {
		return
	}

	d.mu.compact.flushing = true
	go d.flush()
}

// passedFlushThreshold reports whether enough flush-ready immutable memtable
// data is queued to justify starting a flush.
func (d *DB) passedFlushThreshold() bool {
	var n int
	var size uint64
	// Sum the sizes of the leading run of flush-ready immutable memtables
	// (everything in the queue except the mutable memtable at the end).
	for ; n < len(d.mu.mem.queue)-1; n++ {
		if !d.mu.mem.queue[n].readyForFlush() {
			break
		}
		if d.mu.mem.queue[n].flushForced {
			// A flush was forced. Pretend the memtable size is the configured
			// size. See minFlushSize below.
			size += d.opts.MemTableSize
		} else {
			size += d.mu.mem.queue[n].totalBytes()
		}
	}
	if n == 0 {
		// None of the immutable memtables are ready for flushing.
		return false
	}

	// Only flush once the sum of the queued memtable sizes exceeds half the
	// configured memtable size. This prevents flushing of memtables at startup
	// while we're undergoing the ramp period on the memtable size. See
	// DB.newMemTable().
	minFlushSize := d.opts.MemTableSize / 2
	return size >= minFlushSize
}

// maybeScheduleDelayedFlush arranges for the memtable tbl to be force-flushed
// within dur, unless it is flushed (or the DB closed) sooner. An earlier
// pending deadline takes precedence.
func (d *DB) maybeScheduleDelayedFlush(tbl *memTable, dur time.Duration) {
	var mem *flushableEntry
	for _, m := range d.mu.mem.queue {
		if m.flushable == tbl {
			mem = m
			break
		}
	}
	if mem == nil || mem.flushForced {
		return
	}
	deadline := d.timeNow().Add(dur)
	if !mem.delayedFlushForcedAt.IsZero() && deadline.After(mem.delayedFlushForcedAt) {
		// Already scheduled to flush sooner than within `dur`.
		return
	}
	mem.delayedFlushForcedAt = deadline
	go func() {
		timer := time.NewTimer(dur)
		defer timer.Stop()

		select {
		case <-d.closedCh:
			return
		case <-mem.flushed:
			return
		case <-timer.C:
			d.commit.mu.Lock()
			defer d.commit.mu.Unlock()
			d.mu.Lock()
			defer d.mu.Unlock()

			// NB: The timer may fire concurrently with a call to Close.  If a
			// Close call beat us to acquiring d.mu, d.closed holds ErrClosed,
			// and it's too late to flush anything. Otherwise, the Close call
			// will block on locking d.mu until we've finished scheduling the
			// flush and set `d.mu.compact.flushing` to true. Close will wait
			// for the current flush to complete.
			if d.closed.Load() != nil {
				return
			}

			if d.mu.mem.mutable == tbl {
				d.makeRoomForWrite(nil)
			} else {
				mem.flushForced = true
			}
			d.maybeScheduleFlush()
		}
	}()
}

// flush runs in its own goroutine: it performs the flush work, records flush
// throughput metrics, and schedules any follow-up flush or compaction.
func (d *DB) flush() {
	pprof.Do(context.Background(), flushLabels, func(context.Context) {
		flushingWorkStart := time.Now()
		d.mu.Lock()
		defer d.mu.Unlock()
		idleDuration := flushingWorkStart.Sub(d.mu.compact.noOngoingFlushStartTime)
		var bytesFlushed uint64
		var err error
		if bytesFlushed, err = d.flush1(); err != nil {
			// TODO(peter): count consecutive flush errors and backoff.
			d.opts.EventListener.BackgroundError(err)
		}
		d.mu.compact.flushing = false
		d.mu.compact.noOngoingFlushStartTime = time.Now()
		workDuration := d.mu.compact.noOngoingFlushStartTime.Sub(flushingWorkStart)
		d.mu.compact.flushWriteThroughput.Bytes += int64(bytesFlushed)
		d.mu.compact.flushWriteThroughput.WorkDuration += workDuration
		d.mu.compact.flushWriteThroughput.IdleDuration += idleDuration
		// More flush work may have arrived while we were flushing, so schedule
		// another flush if needed.
		d.maybeScheduleFlush()
		// The flush may have produced too many files in a level, so schedule a
		// compaction if needed.
		d.maybeScheduleCompaction()
		d.mu.compact.cond.Broadcast()
	})
}

// runIngestFlush is used to generate a flush version edit for sstables which
// were ingested as flushables. Both DB.mu and the manifest lock must be held
// while runIngestFlush is called.
func (d *DB) runIngestFlush(c *compaction) (*manifest.VersionEdit, error) {
	if len(c.flushing) != 1 {
		panic("pebble: ingestedFlushable must be flushed one at a time.")
	}

	// Construct the VersionEdit, levelMetrics etc.
	c.metrics = make(map[int]*LevelMetrics, numLevels)
	// Finding the target level for ingestion must use the latest version
	// after the logLock has been acquired.
	c.version = d.mu.versions.currentVersion()

	baseLevel := d.mu.versions.picker.getBaseLevel()
	iterOpts := IterOptions{logger: d.opts.Logger}
	ve := &versionEdit{}
	var level int
	var err error
	var fileToSplit *fileMetadata
	var ingestSplitFiles []ingestSplitFile
	// Place each ingested sstable into its target level, recording any
	// existing files that must be split (excised) to make room.
	for _, file := range c.flushing[0].flushable.(*ingestedFlushable).files {
		suggestSplit := d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit() &&
			d.FormatMajorVersion() >= FormatVirtualSSTables
		level, fileToSplit, err = ingestTargetLevel(
			d.newIters, d.tableNewRangeKeyIter, iterOpts, d.opts.Comparer,
			c.version, baseLevel, d.mu.compact.inProgress, file.FileMetadata,
			suggestSplit,
		)
		if err != nil {
			return nil, err
		}
		ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: file.FileMetadata})
		if fileToSplit != nil {
			ingestSplitFiles = append(ingestSplitFiles, ingestSplitFile{
				ingestFile: file.FileMetadata,
				splitFile:  fileToSplit,
				level:      level,
			})
		}
		levelMetrics := c.metrics[level]
		if levelMetrics == nil {
			levelMetrics = &LevelMetrics{}
			c.metrics[level] = levelMetrics
		}
		levelMetrics.BytesIngested += file.Size
		levelMetrics.TablesIngested++
	}

	// updateLevelMetricsOnExcise adjusts the level metrics when the file m at
	// the given level is replaced by the files in added.
	updateLevelMetricsOnExcise := func(m *fileMetadata, level int, added []newFileEntry) {
		levelMetrics := c.metrics[level]
		if levelMetrics == nil {
			levelMetrics = &LevelMetrics{}
			c.metrics[level] = levelMetrics
		}
		levelMetrics.NumFiles--
		levelMetrics.Size -= int64(m.Size)
		for i := range added {
			levelMetrics.NumFiles++
			levelMetrics.Size += int64(added[i].Meta.Size)
		}
	}

	if len(ingestSplitFiles) > 0 {
		ve.DeletedFiles = make(map[manifest.DeletedFileEntry]*manifest.FileMetadata)
		replacedFiles := make(map[base.FileNum][]newFileEntry)
		if err := d.ingestSplit(ve, updateLevelMetricsOnExcise, ingestSplitFiles, replacedFiles); err != nil {
			return nil, err
		}
	}

	return ve, nil
}

// flush runs a compaction that copies the immutable memtables
from memory to -// disk. -// -// d.mu must be held when calling this, but the mutex may be dropped and -// re-acquired during the course of this method. -func (d *DB) flush1() (bytesFlushed uint64, err error) { - // NB: The flushable queue can contain flushables of type ingestedFlushable. - // The sstables in ingestedFlushable.files must be placed into the appropriate - // level in the lsm. Let's say the flushable queue contains a prefix of - // regular immutable memtables, then an ingestedFlushable, and then the - // mutable memtable. When the flush of the ingestedFlushable is performed, - // it needs an updated view of the lsm. That is, the prefix of immutable - // memtables must have already been flushed. Similarly, if there are two - // contiguous ingestedFlushables in the queue, then the first flushable must - // be flushed, so that the second flushable can see an updated view of the - // lsm. - // - // Given the above, we restrict flushes to either some prefix of regular - // memtables, or a single flushable of type ingestedFlushable. The DB.flush - // function will call DB.maybeScheduleFlush again, so a new flush to finish - // the remaining flush work should be scheduled right away. - // - // NB: Large batches placed in the flushable queue share the WAL with the - // previous memtable in the queue. We must ensure the property that both the - // large batch and the memtable with which it shares a WAL are flushed - // together. The property ensures that the minimum unflushed log number - // isn't incremented incorrectly. Since a flushableBatch.readyToFlush always - // returns true, and since the large batch will always be placed right after - // the memtable with which it shares a WAL, the property is naturally - // ensured. The large batch will always be placed after the memtable with - // which it shares a WAL because we ensure it in DB.commitWrite by holding - // the commitPipeline.mu and then holding DB.mu. 
As an extra defensive - // measure, if we try to flush the memtable without also flushing the - // flushable batch in the same flush, since the memtable and flushableBatch - // have the same logNum, the logNum invariant check below will trigger. - var n, inputs int - var inputBytes uint64 - var ingest bool - for ; n < len(d.mu.mem.queue)-1; n++ { - if f, ok := d.mu.mem.queue[n].flushable.(*ingestedFlushable); ok { - if n == 0 { - // The first flushable is of type ingestedFlushable. Since these - // must be flushed individually, we perform a flush for just - // this. - if !f.readyForFlush() { - // This check is almost unnecessary, but we guard against it - // just in case this invariant changes in the future. - panic("pebble: ingestedFlushable should always be ready to flush.") - } - // By setting n = 1, we ensure that the first flushable(n == 0) - // is scheduled for a flush. The number of tables added is equal to the - // number of files in the ingest operation. - n = 1 - inputs = len(f.files) - ingest = true - break - } else { - // There was some prefix of flushables which weren't of type - // ingestedFlushable. So, perform a flush for those. - break - } - } - if !d.mu.mem.queue[n].readyForFlush() { - break - } - inputBytes += d.mu.mem.queue[n].inuseBytes() - } - if n == 0 { - // None of the immutable memtables are ready for flushing. - return 0, nil - } - if !ingest { - // Flushes of memtables add the prefix of n memtables from the flushable - // queue. - inputs = n - } - - // Require that every memtable being flushed has a log number less than the - // new minimum unflushed log number. 
- minUnflushedLogNum := d.mu.mem.queue[n].logNum - if !d.opts.DisableWAL { - for i := 0; i < n; i++ { - if logNum := d.mu.mem.queue[i].logNum; logNum >= minUnflushedLogNum { - panic(errors.AssertionFailedf("logNum invariant violated: flushing %d items; %d:type=%T,logNum=%d; %d:type=%T,logNum=%d", - n, - i, d.mu.mem.queue[i].flushable, logNum, - n, d.mu.mem.queue[n].flushable, minUnflushedLogNum)) - } - } - } - - c := newFlush(d.opts, d.mu.versions.currentVersion(), - d.mu.versions.picker.getBaseLevel(), d.mu.mem.queue[:n], d.timeNow()) - d.addInProgressCompaction(c) - - jobID := d.mu.nextJobID - d.mu.nextJobID++ - d.opts.EventListener.FlushBegin(FlushInfo{ - JobID: jobID, - Input: inputs, - InputBytes: inputBytes, - Ingest: ingest, - }) - startTime := d.timeNow() - - var ve *manifest.VersionEdit - var pendingOutputs []physicalMeta - var stats compactStats - // To determine the target level of the files in the ingestedFlushable, we - // need to acquire the logLock, and not release it for that duration. Since, - // we need to acquire the logLock below to perform the logAndApply step - // anyway, we create the VersionEdit for ingestedFlushable outside of - // runCompaction. For all other flush cases, we construct the VersionEdit - // inside runCompaction. - if c.kind != compactionKindIngestedFlushable { - ve, pendingOutputs, stats, err = d.runCompaction(jobID, c) - } - - // Acquire logLock. This will be released either on an error, by way of - // logUnlock, or through a call to logAndApply if there is no error. 
- d.mu.versions.logLock() - - if c.kind == compactionKindIngestedFlushable { - ve, err = d.runIngestFlush(c) - } - - info := FlushInfo{ - JobID: jobID, - Input: inputs, - InputBytes: inputBytes, - Duration: d.timeNow().Sub(startTime), - Done: true, - Ingest: ingest, - Err: err, - } - if err == nil { - for i := range ve.NewFiles { - e := &ve.NewFiles[i] - info.Output = append(info.Output, e.Meta.TableInfo()) - // Ingested tables are not necessarily flushed to L0. Record the level of - // each ingested file explicitly. - if ingest { - info.IngestLevels = append(info.IngestLevels, e.Level) - } - } - if len(ve.NewFiles) == 0 { - info.Err = errEmptyTable - } - - // The flush succeeded or it produced an empty sstable. In either case we - // want to bump the minimum unflushed log number to the log number of the - // oldest unflushed memtable. - ve.MinUnflushedLogNum = minUnflushedLogNum - if c.kind != compactionKindIngestedFlushable { - metrics := c.metrics[0] - if d.opts.DisableWAL { - // If the WAL is disabled, every flushable has a zero [logSize], - // resulting in zero bytes in. Instead, use the number of bytes we - // flushed as the BytesIn. This ensures we get a reasonable w-amp - // calculation even when the WAL is disabled. - metrics.BytesIn = metrics.BytesFlushed - } else { - for i := 0; i < n; i++ { - metrics.BytesIn += d.mu.mem.queue[i].logSize - } - } - } else if len(ve.DeletedFiles) > 0 { - // c.kind == compactionKindIngestedFlushable && we have deleted files due - // to ingest-time splits. - // - // Iterate through all other compactions, and check if their inputs have - // been replaced due to an ingest-time split. In that case, cancel the - // compaction. 
- for c2 := range d.mu.compact.inProgress { - for i := range c2.inputs { - iter := c2.inputs[i].files.Iter() - for f := iter.First(); f != nil; f = iter.Next() { - if _, ok := ve.DeletedFiles[deletedFileEntry{FileNum: f.FileNum, Level: c2.inputs[i].level}]; ok { - c2.cancel.Store(true) - break - } - } - } - } - } - err = d.mu.versions.logAndApply(jobID, ve, c.metrics, false, /* forceRotation */ - func() []compactionInfo { return d.getInProgressCompactionInfoLocked(c) }) - if err != nil { - info.Err = err - // TODO(peter): untested. - for _, f := range pendingOutputs { - // Note that the FileBacking for the file metadata might not have - // been set yet. So, we directly use the FileNum. Since these - // files were generated as compaction outputs, these must be - // physical files on disk. This property might not hold once - // https://github.com/cockroachdb/pebble/issues/389 is - // implemented if #389 creates virtual sstables as output files. - d.mu.versions.obsoleteTables = append( - d.mu.versions.obsoleteTables, - fileInfo{f.FileNum.DiskFileNum(), f.Size}, - ) - } - d.mu.versions.updateObsoleteTableMetricsLocked() - } - } else { - // We won't be performing the logAndApply step because of the error, - // so logUnlock. - d.mu.versions.logUnlock() - } - - bytesFlushed = c.bytesIterated - - // If err != nil, then the flush will be retried, and we will recalculate - // these metrics. 
- if err == nil { - d.mu.snapshots.cumulativePinnedCount += stats.cumulativePinnedKeys - d.mu.snapshots.cumulativePinnedSize += stats.cumulativePinnedSize - d.mu.versions.metrics.Keys.MissizedTombstonesCount += stats.countMissizedDels - d.maybeUpdateDeleteCompactionHints(c) - } - - d.clearCompactingState(c, err != nil) - delete(d.mu.compact.inProgress, c) - d.mu.versions.incrementCompactions(c.kind, c.extraLevels, c.pickerMetrics) - - var flushed flushableList - if err == nil { - flushed = d.mu.mem.queue[:n] - d.mu.mem.queue = d.mu.mem.queue[n:] - d.updateReadStateLocked(d.opts.DebugCheck) - d.updateTableStatsLocked(ve.NewFiles) - if ingest { - d.mu.versions.metrics.Flush.AsIngestCount++ - for _, l := range c.metrics { - d.mu.versions.metrics.Flush.AsIngestBytes += l.BytesIngested - d.mu.versions.metrics.Flush.AsIngestTableCount += l.TablesIngested - } - } - - // Update if any eventually file-only snapshots have now transitioned to - // being file-only. - earliestUnflushedSeqNum := d.getEarliestUnflushedSeqNumLocked() - currentVersion := d.mu.versions.currentVersion() - for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; { - if s.efos == nil { - s = s.next - continue - } - if base.Visible(earliestUnflushedSeqNum, s.efos.seqNum, InternalKeySeqNumMax) { - s = s.next - continue - } - if s.efos.excised.Load() { - // If a concurrent excise has happened that overlaps with one of the key - // ranges this snapshot is interested in, this EFOS cannot transition to - // a file-only snapshot as keys in that range could now be deleted. Move - // onto the next snapshot. - s = s.next - continue - } - currentVersion.Ref() - - // NB: s.efos.transitionToFileOnlySnapshot could close s, in which - // case s.next would be nil. Save it before calling it. - next := s.next - _ = s.efos.transitionToFileOnlySnapshot(currentVersion) - s = next - } - } - // Signal FlushEnd after installing the new readState. 
This helps for unit - // tests that use the callback to trigger a read using an iterator with - // IterOptions.OnlyReadGuaranteedDurable. - info.TotalDuration = d.timeNow().Sub(startTime) - d.opts.EventListener.FlushEnd(info) - - // The order of these operations matters here for ease of testing. - // Removing the reader reference first allows tests to be guaranteed that - // the memtable reservation has been released by the time a synchronous - // flush returns. readerUnrefLocked may also produce obsolete files so the - // call to deleteObsoleteFiles must happen after it. - for i := range flushed { - flushed[i].readerUnrefLocked(true) - } - - d.deleteObsoleteFiles(jobID) - - // Mark all the memtables we flushed as flushed. - for i := range flushed { - close(flushed[i].flushed) - } - - return bytesFlushed, err -} - -// maybeScheduleCompactionAsync should be used when -// we want to possibly schedule a compaction, but don't -// want to eat the cost of running maybeScheduleCompaction. -// This method should be launched in a separate goroutine. -// d.mu must not be held when this is called. -func (d *DB) maybeScheduleCompactionAsync() { - defer d.compactionSchedulers.Done() - - d.mu.Lock() - d.maybeScheduleCompaction() - d.mu.Unlock() -} - -// maybeScheduleCompaction schedules a compaction if necessary. -// -// d.mu must be held when calling this. -func (d *DB) maybeScheduleCompaction() { - d.maybeScheduleCompactionPicker(pickAuto) -} - -func pickAuto(picker compactionPicker, env compactionEnv) *pickedCompaction { - return picker.pickAuto(env) -} - -func pickElisionOnly(picker compactionPicker, env compactionEnv) *pickedCompaction { - return picker.pickElisionOnlyCompaction(env) -} - -// maybeScheduleCompactionPicker schedules a compaction if necessary, -// calling `pickFunc` to pick automatic compactions. -// -// d.mu must be held when calling this. 
-func (d *DB) maybeScheduleCompactionPicker( - pickFunc func(compactionPicker, compactionEnv) *pickedCompaction, -) { - if d.closed.Load() != nil || d.opts.ReadOnly { - return - } - maxConcurrentCompactions := d.opts.MaxConcurrentCompactions() - if d.mu.compact.compactingCount >= maxConcurrentCompactions { - if len(d.mu.compact.manual) > 0 { - // Inability to run head blocks later manual compactions. - d.mu.compact.manual[0].retries++ - } - return - } - - // Compaction picking needs a coherent view of a Version. In particular, we - // need to exlude concurrent ingestions from making a decision on which level - // to ingest into that conflicts with our compaction - // decision. versionSet.logLock provides the necessary mutual exclusion. - d.mu.versions.logLock() - defer d.mu.versions.logUnlock() - - // Check for the closed flag again, in case the DB was closed while we were - // waiting for logLock(). - if d.closed.Load() != nil { - return - } - - env := compactionEnv{ - diskAvailBytes: d.diskAvailBytes.Load(), - earliestSnapshotSeqNum: d.mu.snapshots.earliest(), - earliestUnflushedSeqNum: d.getEarliestUnflushedSeqNumLocked(), - } - - // Check for delete-only compactions first, because they're expected to be - // cheap and reduce future compaction work. 
- if !d.opts.private.disableDeleteOnlyCompactions && - len(d.mu.compact.deletionHints) > 0 && - d.mu.compact.compactingCount < maxConcurrentCompactions && - !d.opts.DisableAutomaticCompactions { - v := d.mu.versions.currentVersion() - snapshots := d.mu.snapshots.toSlice() - inputs, unresolvedHints := checkDeleteCompactionHints(d.cmp, v, d.mu.compact.deletionHints, snapshots) - d.mu.compact.deletionHints = unresolvedHints - - if len(inputs) > 0 { - c := newDeleteOnlyCompaction(d.opts, v, inputs, d.timeNow()) - d.mu.compact.compactingCount++ - d.addInProgressCompaction(c) - go d.compact(c, nil) - } - } - - for len(d.mu.compact.manual) > 0 && d.mu.compact.compactingCount < maxConcurrentCompactions { - v := d.mu.versions.currentVersion() - manual := d.mu.compact.manual[0] - env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil) - pc, retryLater := pickManualCompaction(v, d.opts, env, d.mu.versions.picker.getBaseLevel(), manual) - if pc != nil { - c := newCompaction(pc, d.opts, d.timeNow(), d.ObjProvider()) - d.mu.compact.manual = d.mu.compact.manual[1:] - d.mu.compact.compactingCount++ - d.addInProgressCompaction(c) - go d.compact(c, manual.done) - } else if !retryLater { - // Noop - d.mu.compact.manual = d.mu.compact.manual[1:] - manual.done <- nil - } else { - // Inability to run head blocks later manual compactions. 
- manual.retries++ - break - } - } - - for !d.opts.DisableAutomaticCompactions && d.mu.compact.compactingCount < maxConcurrentCompactions { - env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil) - env.readCompactionEnv = readCompactionEnv{ - readCompactions: &d.mu.compact.readCompactions, - flushing: d.mu.compact.flushing || d.passedFlushThreshold(), - rescheduleReadCompaction: &d.mu.compact.rescheduleReadCompaction, - } - pc := pickFunc(d.mu.versions.picker, env) - if pc == nil { - break - } - c := newCompaction(pc, d.opts, d.timeNow(), d.ObjProvider()) - d.mu.compact.compactingCount++ - d.addInProgressCompaction(c) - go d.compact(c, nil) - } -} - -// deleteCompactionHintType indicates whether the deleteCompactionHint was -// generated from a span containing a range del (point key only), a range key -// delete (range key only), or both a point and range key. -type deleteCompactionHintType uint8 - -const ( - // NOTE: While these are primarily used as enumeration types, they are also - // used for some bitwise operations. Care should be taken when updating. - deleteCompactionHintTypeUnknown deleteCompactionHintType = iota - deleteCompactionHintTypePointKeyOnly - deleteCompactionHintTypeRangeKeyOnly - deleteCompactionHintTypePointAndRangeKey -) - -// String implements fmt.Stringer. -func (h deleteCompactionHintType) String() string { - switch h { - case deleteCompactionHintTypeUnknown: - return "unknown" - case deleteCompactionHintTypePointKeyOnly: - return "point-key-only" - case deleteCompactionHintTypeRangeKeyOnly: - return "range-key-only" - case deleteCompactionHintTypePointAndRangeKey: - return "point-and-range-key" - default: - panic(fmt.Sprintf("unknown hint type: %d", h)) - } -} - -// compactionHintFromKeys returns a deleteCompactionHintType given a slice of -// keyspan.Keys. 
-func compactionHintFromKeys(keys []keyspan.Key) deleteCompactionHintType { - var hintType deleteCompactionHintType - for _, k := range keys { - switch k.Kind() { - case base.InternalKeyKindRangeDelete: - hintType |= deleteCompactionHintTypePointKeyOnly - case base.InternalKeyKindRangeKeyDelete: - hintType |= deleteCompactionHintTypeRangeKeyOnly - default: - panic(fmt.Sprintf("unsupported key kind: %s", k.Kind())) - } - } - return hintType -} - -// A deleteCompactionHint records a user key and sequence number span that has been -// deleted by a range tombstone. A hint is recorded if at least one sstable -// falls completely within both the user key and sequence number spans. -// Once the tombstones and the observed completely-contained sstables fall -// into the same snapshot stripe, a delete-only compaction may delete any -// sstables within the range. -type deleteCompactionHint struct { - // The type of key span that generated this hint (point key, range key, or - // both). - hintType deleteCompactionHintType - // start and end are user keys specifying a key range [start, end) of - // deleted keys. - start []byte - end []byte - // The level of the file containing the range tombstone(s) when the hint - // was created. Only lower levels need to be searched for files that may - // be deleted. - tombstoneLevel int - // The file containing the range tombstone(s) that created the hint. - tombstoneFile *fileMetadata - // The smallest and largest sequence numbers of the abutting tombstones - // merged to form this hint. All of a tables' keys must be less than the - // tombstone smallest sequence number to be deleted. All of a tables' - // sequence numbers must fall into the same snapshot stripe as the - // tombstone largest sequence number to be deleted. - tombstoneLargestSeqNum uint64 - tombstoneSmallestSeqNum uint64 - // The smallest sequence number of a sstable that was found to be covered - // by this hint. 
The hint cannot be resolved until this sequence number is - // in the same snapshot stripe as the largest tombstone sequence number. - // This is set when a hint is created, so the LSM may look different and - // notably no longer contain the sstable that contained the key at this - // sequence number. - fileSmallestSeqNum uint64 -} - -func (h deleteCompactionHint) String() string { - return fmt.Sprintf( - "L%d.%s %s-%s seqnums(tombstone=%d-%d, file-smallest=%d, type=%s)", - h.tombstoneLevel, h.tombstoneFile.FileNum, h.start, h.end, - h.tombstoneSmallestSeqNum, h.tombstoneLargestSeqNum, h.fileSmallestSeqNum, - h.hintType, - ) -} - -func (h *deleteCompactionHint) canDelete(cmp Compare, m *fileMetadata, snapshots []uint64) bool { - // The file can only be deleted if all of its keys are older than the - // earliest tombstone aggregated into the hint. - if m.LargestSeqNum >= h.tombstoneSmallestSeqNum || m.SmallestSeqNum < h.fileSmallestSeqNum { - return false - } - - // The file's oldest key must be in the same snapshot stripe as the - // newest tombstone. NB: We already checked the hint's sequence numbers, - // but this file's oldest sequence number might be lower than the hint's - // smallest sequence number despite the file falling within the key range - // if this file was constructed after the hint by a compaction. - ti, _ := snapshotIndex(h.tombstoneLargestSeqNum, snapshots) - fi, _ := snapshotIndex(m.SmallestSeqNum, snapshots) - if ti != fi { - return false - } - - switch h.hintType { - case deleteCompactionHintTypePointKeyOnly: - // A hint generated by a range del span cannot delete tables that contain - // range keys. - if m.HasRangeKeys { - return false - } - case deleteCompactionHintTypeRangeKeyOnly: - // A hint generated by a range key del span cannot delete tables that - // contain point keys. 
- if m.HasPointKeys { - return false - } - case deleteCompactionHintTypePointAndRangeKey: - // A hint from a span that contains both range dels *and* range keys can - // only be deleted if both bounds fall within the hint. The next check takes - // care of this. - default: - panic(fmt.Sprintf("pebble: unknown delete compaction hint type: %d", h.hintType)) - } - - // The file's keys must be completely contained within the hint range. - return cmp(h.start, m.Smallest.UserKey) <= 0 && cmp(m.Largest.UserKey, h.end) < 0 -} - -func (d *DB) maybeUpdateDeleteCompactionHints(c *compaction) { - // Compactions that zero sequence numbers can interfere with compaction - // deletion hints. Deletion hints apply to tables containing keys older - // than a threshold. If a key more recent than the threshold is zeroed in - // a compaction, a delete-only compaction may mistake it as meeting the - // threshold and drop a table containing live data. - // - // To avoid this scenario, compactions that zero sequence numbers remove - // any conflicting deletion hints. A deletion hint is conflicting if both - // of the following conditions apply: - // * its key space overlaps with the compaction - // * at least one of its inputs contains a key as recent as one of the - // hint's tombstones. - // - if !c.allowedZeroSeqNum { - return - } - - updatedHints := d.mu.compact.deletionHints[:0] - for _, h := range d.mu.compact.deletionHints { - // If the compaction's key space is disjoint from the hint's key - // space, the zeroing of sequence numbers won't affect the hint. Keep - // the hint. - keysDisjoint := d.cmp(h.end, c.smallest.UserKey) < 0 || d.cmp(h.start, c.largest.UserKey) > 0 - if keysDisjoint { - updatedHints = append(updatedHints, h) - continue - } - - // All of the compaction's inputs must be older than the hint's - // tombstones. 
- inputsOlder := true - for _, in := range c.inputs { - iter := in.files.Iter() - for f := iter.First(); f != nil; f = iter.Next() { - inputsOlder = inputsOlder && f.LargestSeqNum < h.tombstoneSmallestSeqNum - } - } - if inputsOlder { - updatedHints = append(updatedHints, h) - continue - } - - // Drop h, because the compaction c may have zeroed sequence numbers - // of keys more recent than some of h's tombstones. - } - d.mu.compact.deletionHints = updatedHints -} - -func checkDeleteCompactionHints( - cmp Compare, v *version, hints []deleteCompactionHint, snapshots []uint64, -) ([]compactionLevel, []deleteCompactionHint) { - var files map[*fileMetadata]bool - var byLevel [numLevels][]*fileMetadata - - unresolvedHints := hints[:0] - for _, h := range hints { - // Check each compaction hint to see if it's resolvable. Resolvable - // hints are removed and trigger a delete-only compaction if any files - // in the current LSM still meet their criteria. Unresolvable hints - // are saved and don't trigger a delete-only compaction. - // - // When a compaction hint is created, the sequence numbers of the - // range tombstones and the covered file with the oldest key are - // recorded. The largest tombstone sequence number and the smallest - // file sequence number must be in the same snapshot stripe for the - // hint to be resolved. The below graphic models a compaction hint - // covering the keyspace [b, r). The hint completely contains two - // files, 000002 and 000003. The file 000003 contains the lowest - // covered sequence number at #90. The tombstone b.RANGEDEL.230:h has - // the highest tombstone sequence number incorporated into the hint. - // The hint may be resolved only once the snapshots at #100, #180 and - // #210 are all closed. File 000001 is not included within the hint - // because it extends beyond the range tombstones in user key space. 
- // - // 250 - // - // |-b...230:h-| - // _____________________________________________________ snapshot #210 - // 200 |--h.RANGEDEL.200:r--| - // - // _____________________________________________________ snapshot #180 - // - // 150 +--------+ - // +---------+ | 000003 | - // | 000002 | | | - // +_________+ | | - // 100_____________________|________|___________________ snapshot #100 - // +--------+ - // _____________________________________________________ snapshot #70 - // +---------------+ - // 50 | 000001 | - // | | - // +---------------+ - // ______________________________________________________________ - // a b c d e f g h i j k l m n o p q r s t u v w x y z - - ti, _ := snapshotIndex(h.tombstoneLargestSeqNum, snapshots) - fi, _ := snapshotIndex(h.fileSmallestSeqNum, snapshots) - if ti != fi { - // Cannot resolve yet. - unresolvedHints = append(unresolvedHints, h) - continue - } - - // The hint h will be resolved and dropped, regardless of whether - // there are any tables that can be deleted. - for l := h.tombstoneLevel + 1; l < numLevels; l++ { - overlaps := v.Overlaps(l, cmp, h.start, h.end, true /* exclusiveEnd */) - iter := overlaps.Iter() - for m := iter.First(); m != nil; m = iter.Next() { - if m.IsCompacting() || !h.canDelete(cmp, m, snapshots) || files[m] { - continue - } - if files == nil { - // Construct files lazily, assuming most calls will not - // produce delete-only compactions. - files = make(map[*fileMetadata]bool) - } - files[m] = true - byLevel[l] = append(byLevel[l], m) - } - } - } - - var compactLevels []compactionLevel - for l, files := range byLevel { - if len(files) == 0 { - continue - } - compactLevels = append(compactLevels, compactionLevel{ - level: l, - files: manifest.NewLevelSliceKeySorted(cmp, files), - }) - } - return compactLevels, unresolvedHints -} - -// compact runs one compaction and maybe schedules another call to compact. 
-func (d *DB) compact(c *compaction, errChannel chan error) { - pprof.Do(context.Background(), compactLabels, func(context.Context) { - d.mu.Lock() - defer d.mu.Unlock() - if err := d.compact1(c, errChannel); err != nil { - // TODO(peter): count consecutive compaction errors and backoff. - d.opts.EventListener.BackgroundError(err) - } - d.mu.compact.compactingCount-- - delete(d.mu.compact.inProgress, c) - // Add this compaction's duration to the cumulative duration. NB: This - // must be atomic with the above removal of c from - // d.mu.compact.InProgress to ensure Metrics.Compact.Duration does not - // miss or double count a completing compaction's duration. - d.mu.compact.duration += d.timeNow().Sub(c.beganAt) - - // The previous compaction may have produced too many files in a - // level, so reschedule another compaction if needed. - d.maybeScheduleCompaction() - d.mu.compact.cond.Broadcast() - }) -} - -// compact1 runs one compaction. -// -// d.mu must be held when calling this, but the mutex may be dropped and -// re-acquired during the course of this method. -func (d *DB) compact1(c *compaction, errChannel chan error) (err error) { - if errChannel != nil { - defer func() { - errChannel <- err - }() - } - - jobID := d.mu.nextJobID - d.mu.nextJobID++ - info := c.makeInfo(jobID) - d.opts.EventListener.CompactionBegin(info) - startTime := d.timeNow() - - ve, pendingOutputs, stats, err := d.runCompaction(jobID, c) - - info.Duration = d.timeNow().Sub(startTime) - if err == nil { - err = func() error { - var err error - d.mu.versions.logLock() - // Check if this compaction had a conflicting operation (eg. a d.excise()) - // that necessitates it restarting from scratch. Note that since we hold - // the manifest lock, we don't expect this bool to change its value - // as only the holder of the manifest lock will ever write to it. - if c.cancel.Load() { - err = firstError(err, ErrCancelledCompaction) - } - if err != nil { - // logAndApply calls logUnlock. 
If we didn't call it, we need to call - // logUnlock ourselves. - d.mu.versions.logUnlock() - return err - } - return d.mu.versions.logAndApply(jobID, ve, c.metrics, false /* forceRotation */, func() []compactionInfo { - return d.getInProgressCompactionInfoLocked(c) - }) - }() - if err != nil { - // TODO(peter): untested. - for _, f := range pendingOutputs { - // Note that the FileBacking for the file metadata might not have - // been set yet. So, we directly use the FileNum. Since these - // files were generated as compaction outputs, these must be - // physical files on disk. This property might not hold once - // https://github.com/cockroachdb/pebble/issues/389 is - // implemented if #389 creates virtual sstables as output files. - d.mu.versions.obsoleteTables = append( - d.mu.versions.obsoleteTables, - fileInfo{f.FileNum.DiskFileNum(), f.Size}, - ) - } - d.mu.versions.updateObsoleteTableMetricsLocked() - } - } - - info.Done = true - info.Err = err - if err == nil { - for i := range ve.NewFiles { - e := &ve.NewFiles[i] - info.Output.Tables = append(info.Output.Tables, e.Meta.TableInfo()) - } - d.mu.snapshots.cumulativePinnedCount += stats.cumulativePinnedKeys - d.mu.snapshots.cumulativePinnedSize += stats.cumulativePinnedSize - d.mu.versions.metrics.Keys.MissizedTombstonesCount += stats.countMissizedDels - d.maybeUpdateDeleteCompactionHints(c) - } - - // NB: clearing compacting state must occur before updating the read state; - // L0Sublevels initialization depends on it. 
- d.clearCompactingState(c, err != nil) - d.mu.versions.incrementCompactions(c.kind, c.extraLevels, c.pickerMetrics) - d.mu.versions.incrementCompactionBytes(-c.bytesWritten) - - info.TotalDuration = d.timeNow().Sub(c.beganAt) - d.opts.EventListener.CompactionEnd(info) - - // Update the read state before deleting obsolete files because the - // read-state update will cause the previous version to be unref'd and if - // there are no references obsolete tables will be added to the obsolete - // table list. - if err == nil { - d.updateReadStateLocked(d.opts.DebugCheck) - d.updateTableStatsLocked(ve.NewFiles) - } - d.deleteObsoleteFiles(jobID) - - return err -} - -type compactStats struct { - cumulativePinnedKeys uint64 - cumulativePinnedSize uint64 - countMissizedDels uint64 -} - -// runCopyCompaction runs a copy compaction where a new FileNum is created that -// is a byte-for-byte copy of the input file. This is used in lieu of a move -// compaction when a file is being moved across the local/remote storage -// boundary. -// -// d.mu must be held when calling this method. -func (d *DB) runCopyCompaction( - jobID int, - c *compaction, - meta *fileMetadata, - objMeta objstorage.ObjectMetadata, - versionEdit *versionEdit, -) (ve *versionEdit, pendingOutputs []physicalMeta, retErr error) { - ve = versionEdit - if objMeta.IsRemote() || !remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, c.outputLevel.level) { - panic("pebble: scheduled a copy compaction that is not actually moving files to shared storage") - } - // Note that based on logic in the compaction picker, we're guaranteed - // meta.Virtual is false. - if meta.Virtual { - panic(errors.AssertionFailedf("cannot do a copy compaction of a virtual sstable across local/remote storage")) - } - // We are in the relatively more complex case where we need to copy this - // file to remote/shared storage. Drop the db mutex while we do the - // copy. 
- // - // To ease up cleanup of the local file and tracking of refs, we create - // a new FileNum. This has the potential of making the block cache less - // effective, however. - metaCopy := new(fileMetadata) - *metaCopy = fileMetadata{ - Size: meta.Size, - CreationTime: meta.CreationTime, - SmallestSeqNum: meta.SmallestSeqNum, - LargestSeqNum: meta.LargestSeqNum, - Stats: meta.Stats, - Virtual: meta.Virtual, - } - if meta.HasPointKeys { - metaCopy.ExtendPointKeyBounds(c.cmp, meta.SmallestPointKey, meta.LargestPointKey) - } - if meta.HasRangeKeys { - metaCopy.ExtendRangeKeyBounds(c.cmp, meta.SmallestRangeKey, meta.LargestRangeKey) - } - metaCopy.FileNum = d.mu.versions.getNextFileNum() - metaCopy.InitPhysicalBacking() - c.metrics = map[int]*LevelMetrics{ - c.outputLevel.level: { - BytesIn: meta.Size, - BytesCompacted: meta.Size, - TablesCompacted: 1, - }, - } - pendingOutputs = append(pendingOutputs, metaCopy.PhysicalMeta()) - - d.mu.Unlock() - defer d.mu.Lock() - _, err := d.objProvider.LinkOrCopyFromLocal(context.TODO(), d.opts.FS, - d.objProvider.Path(objMeta), fileTypeTable, metaCopy.FileBacking.DiskFileNum, - objstorage.CreateOptions{PreferSharedStorage: true}) - if err != nil { - return ve, pendingOutputs, err - } - ve.NewFiles[0].Meta = metaCopy - - if err := d.objProvider.Sync(); err != nil { - return nil, pendingOutputs, err - } - return ve, pendingOutputs, nil -} - -// runCompactions runs a compaction that produces new on-disk tables from -// memtables or old on-disk tables. -// -// d.mu must be held when calling this, but the mutex may be dropped and -// re-acquired during the course of this method. -func (d *DB) runCompaction( - jobID int, c *compaction, -) (ve *versionEdit, pendingOutputs []physicalMeta, stats compactStats, retErr error) { - // As a sanity check, confirm that the smallest / largest keys for new and - // deleted files in the new versionEdit pass a validation function before - // returning the edit. 
- defer func() { - // If we're handling a panic, don't expect the version edit to validate. - if r := recover(); r != nil { - panic(r) - } else if ve != nil { - err := validateVersionEdit(ve, d.opts.Experimental.KeyValidationFunc, d.opts.Comparer.FormatKey) - if err != nil { - d.opts.Logger.Fatalf("pebble: version edit validation failed: %s", err) - } - } - }() - - // Check for a delete-only compaction. This can occur when wide range - // tombstones completely contain sstables. - if c.kind == compactionKindDeleteOnly { - c.metrics = make(map[int]*LevelMetrics, len(c.inputs)) - ve := &versionEdit{ - DeletedFiles: map[deletedFileEntry]*fileMetadata{}, - } - for _, cl := range c.inputs { - levelMetrics := &LevelMetrics{} - iter := cl.files.Iter() - for f := iter.First(); f != nil; f = iter.Next() { - ve.DeletedFiles[deletedFileEntry{ - Level: cl.level, - FileNum: f.FileNum, - }] = f - } - c.metrics[cl.level] = levelMetrics - } - return ve, nil, stats, nil - } - - if c.kind == compactionKindIngestedFlushable { - panic("pebble: runCompaction cannot handle compactionKindIngestedFlushable.") - } - - // Check for a move or copy of one table from one level to the next. We avoid - // such a move if there is lots of overlapping grandparent data. Otherwise, - // the move could create a parent file that will require a very expensive - // merge later on. 
- if c.kind == compactionKindMove || c.kind == compactionKindCopy { - iter := c.startLevel.files.Iter() - meta := iter.First() - if invariants.Enabled { - if iter.Next() != nil { - panic("got more than one file for a move or copy compaction") - } - } - objMeta, err := d.objProvider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum) - if err != nil { - return ve, pendingOutputs, stats, err - } - c.metrics = map[int]*LevelMetrics{ - c.outputLevel.level: { - BytesMoved: meta.Size, - TablesMoved: 1, - }, - } - ve := &versionEdit{ - DeletedFiles: map[deletedFileEntry]*fileMetadata{ - {Level: c.startLevel.level, FileNum: meta.FileNum}: meta, - }, - NewFiles: []newFileEntry{ - {Level: c.outputLevel.level, Meta: meta}, - }, - } - if c.kind == compactionKindCopy { - ve, pendingOutputs, retErr = d.runCopyCompaction(jobID, c, meta, objMeta, ve) - if retErr != nil { - return ve, pendingOutputs, stats, retErr - } - } - return ve, nil, stats, nil - } - - defer func() { - if retErr != nil { - pendingOutputs = nil - } - }() - - snapshots := d.mu.snapshots.toSlice() - formatVers := d.FormatMajorVersion() - - if c.flushing == nil { - // Before dropping the db mutex, grab a ref to the current version. This - // prevents any concurrent excises from deleting files that this compaction - // needs to read/maintain a reference to. - // - // Note that unlike user iterators, compactionIter does not maintain a ref - // of the version or read state. - vers := d.mu.versions.currentVersion() - vers.Ref() - defer vers.UnrefLocked() - } - - if c.cancel.Load() { - return ve, nil, stats, ErrCancelledCompaction - } - - // Release the d.mu lock while doing I/O. - // Note the unusual order: Unlock and then Lock. - d.mu.Unlock() - defer d.mu.Lock() - - // Compactions use a pool of buffers to read blocks, avoiding polluting the - // block cache with blocks that will not be read again. We initialize the - // buffer pool with a size 12. 
This initial size does not need to be - // accurate, because the pool will grow to accommodate the maximum number of - // blocks allocated at a given time over the course of the compaction. But - // choosing a size larger than that working set avoids any additional - // allocations to grow the size of the pool over the course of iteration. - // - // Justification for initial size 12: In a two-level compaction, at any - // given moment we'll have 2 index blocks in-use and 2 data blocks in-use. - // Additionally, when decoding a compressed block, we'll temporarily - // allocate 1 additional block to hold the compressed buffer. In the worst - // case that all input sstables have two-level index blocks (+2), value - // blocks (+2), range deletion blocks (+n) and range key blocks (+n), we'll - // additionally require 2n+4 blocks where n is the number of input sstables. - // Range deletion and range key blocks are relatively rare, and the cost of - // an additional allocation or two over the course of the compaction is - // considered to be okay. A larger initial size would cause the pool to hold - // on to more memory, even when it's not in-use because the pool will - // recycle buffers up to the current capacity of the pool. The memory use of - // a 12-buffer pool is expected to be within reason, even if all the buffers - // grow to the typical size of an index block (256 KiB) which would - // translate to 3 MiB per compaction. 
- c.bufferPool.Init(12) - defer c.bufferPool.Release() - - iiter, err := c.newInputIter(d.newIters, d.tableNewRangeKeyIter, snapshots) - if err != nil { - return nil, pendingOutputs, stats, err - } - c.allowedZeroSeqNum = c.allowZeroSeqNum() - iiter = invalidating.MaybeWrapIfInvariants(iiter) - iter := newCompactionIter(c.cmp, c.equal, c.formatKey, d.merge, iiter, snapshots, - &c.rangeDelFrag, &c.rangeKeyFrag, c.allowedZeroSeqNum, c.elideTombstone, - c.elideRangeTombstone, d.opts.Experimental.IneffectualSingleDeleteCallback, - d.opts.Experimental.SingleDeleteInvariantViolationCallback, - d.FormatMajorVersion()) - - var ( - createdFiles []base.DiskFileNum - tw *sstable.Writer - pinnedKeySize uint64 - pinnedValueSize uint64 - pinnedCount uint64 - ) - defer func() { - if iter != nil { - retErr = firstError(retErr, iter.Close()) - } - if tw != nil { - retErr = firstError(retErr, tw.Close()) - } - if retErr != nil { - for _, fileNum := range createdFiles { - _ = d.objProvider.Remove(fileTypeTable, fileNum) - } - } - for _, closer := range c.closers { - retErr = firstError(retErr, closer.Close()) - } - }() - - ve = &versionEdit{ - DeletedFiles: map[deletedFileEntry]*fileMetadata{}, - } - - startLevelBytes := c.startLevel.files.SizeSum() - outputMetrics := &LevelMetrics{ - BytesIn: startLevelBytes, - BytesRead: c.outputLevel.files.SizeSum(), - } - if len(c.extraLevels) > 0 { - outputMetrics.BytesIn += c.extraLevels[0].files.SizeSum() - } - outputMetrics.BytesRead += outputMetrics.BytesIn - - c.metrics = map[int]*LevelMetrics{ - c.outputLevel.level: outputMetrics, - } - if len(c.flushing) == 0 && c.metrics[c.startLevel.level] == nil { - c.metrics[c.startLevel.level] = &LevelMetrics{} - } - if len(c.extraLevels) > 0 { - c.metrics[c.extraLevels[0].level] = &LevelMetrics{} - outputMetrics.MultiLevel.BytesInTop = startLevelBytes - outputMetrics.MultiLevel.BytesIn = outputMetrics.BytesIn - outputMetrics.MultiLevel.BytesRead = outputMetrics.BytesRead - } - - // The table is 
typically written at the maximum allowable format implied by - // the current format major version of the DB. - tableFormat := formatVers.MaxTableFormat() - - // In format major versions with maximum table formats of Pebblev3, value - // blocks were conditional on an experimental setting. In format major - // versions with maximum table formats of Pebblev4 and higher, value blocks - // are always enabled. - if tableFormat == sstable.TableFormatPebblev3 && - (d.opts.Experimental.EnableValueBlocks == nil || !d.opts.Experimental.EnableValueBlocks()) { - tableFormat = sstable.TableFormatPebblev2 - } - - writerOpts := d.opts.MakeWriterOptions(c.outputLevel.level, tableFormat) - if formatVers < FormatBlockPropertyCollector { - // Cannot yet write block properties. - writerOpts.BlockPropertyCollectors = nil - } - - // prevPointKey is a sstable.WriterOption that provides access to - // the last point key written to a writer's sstable. When a new - // output begins in newOutput, prevPointKey is updated to point to - // the new output's sstable.Writer. This allows the compaction loop - // to access the last written point key without requiring the - // compaction loop to make a copy of each key ahead of time. Users - // must be careful, because the byte slice returned by UnsafeKey - // points directly into the Writer's block buffer. - var prevPointKey sstable.PreviousPointKeyOpt - var cpuWorkHandle CPUWorkHandle - defer func() { - if cpuWorkHandle != nil { - d.opts.Experimental.CPUWorkPermissionGranter.CPUWorkDone(cpuWorkHandle) - } - }() - - newOutput := func() error { - // Check if we've been cancelled by a concurrent operation. 
- if c.cancel.Load() { - return ErrCancelledCompaction - } - fileMeta := &fileMetadata{} - d.mu.Lock() - fileNum := d.mu.versions.getNextFileNum() - fileMeta.FileNum = fileNum - pendingOutputs = append(pendingOutputs, fileMeta.PhysicalMeta()) - d.mu.Unlock() - - ctx := context.TODO() - if objiotracing.Enabled { - ctx = objiotracing.WithLevel(ctx, c.outputLevel.level) - switch c.kind { - case compactionKindFlush: - ctx = objiotracing.WithReason(ctx, objiotracing.ForFlush) - case compactionKindIngestedFlushable: - ctx = objiotracing.WithReason(ctx, objiotracing.ForIngestion) - default: - ctx = objiotracing.WithReason(ctx, objiotracing.ForCompaction) - } - } - // Prefer shared storage if present. - createOpts := objstorage.CreateOptions{ - PreferSharedStorage: remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, c.outputLevel.level), - } - writable, objMeta, err := d.objProvider.Create(ctx, fileTypeTable, fileNum.DiskFileNum(), createOpts) - if err != nil { - return err - } - - reason := "flushing" - if c.flushing == nil { - reason = "compacting" - } - d.opts.EventListener.TableCreated(TableCreateInfo{ - JobID: jobID, - Reason: reason, - Path: d.objProvider.Path(objMeta), - FileNum: fileNum, - }) - if c.kind != compactionKindFlush { - writable = &compactionWritable{ - Writable: writable, - versions: d.mu.versions, - written: &c.bytesWritten, - } - } - createdFiles = append(createdFiles, fileNum.DiskFileNum()) - cacheOpts := private.SSTableCacheOpts(d.cacheID, fileNum.DiskFileNum()).(sstable.WriterOption) - - const MaxFileWriteAdditionalCPUTime = time.Millisecond * 100 - cpuWorkHandle = d.opts.Experimental.CPUWorkPermissionGranter.GetPermission( - MaxFileWriteAdditionalCPUTime, - ) - writerOpts.Parallelism = - d.opts.Experimental.MaxWriterConcurrency > 0 && - (cpuWorkHandle.Permitted() || d.opts.Experimental.ForceWriterParallelism) - - tw = sstable.NewWriter(writable, writerOpts, cacheOpts, &prevPointKey) - - fileMeta.CreationTime = time.Now().Unix() - 
ve.NewFiles = append(ve.NewFiles, newFileEntry{ - Level: c.outputLevel.level, - Meta: fileMeta, - }) - return nil - } - - // splitL0Outputs is true during flushes and intra-L0 compactions with flush - // splits enabled. - splitL0Outputs := c.outputLevel.level == 0 && d.opts.FlushSplitBytes > 0 - - // finishOutput is called with the a user key up to which all tombstones - // should be flushed. Typically, this is the first key of the next - // sstable or an empty key if this output is the final sstable. - finishOutput := func(splitKey []byte) error { - // If we haven't output any point records to the sstable (tw == nil) then the - // sstable will only contain range tombstones and/or range keys. The smallest - // key in the sstable will be the start key of the first range tombstone or - // range key added. We need to ensure that this start key is distinct from - // the splitKey passed to finishOutput (if set), otherwise we would generate - // an sstable where the largest key is smaller than the smallest key due to - // how the largest key boundary is set below. NB: It is permissible for the - // range tombstone / range key start key to be the empty string. - // - // TODO: It is unfortunate that we have to do this check here rather than - // when we decide to finish the sstable in the runCompaction loop. A better - // structure currently eludes us. - if tw == nil { - startKey := c.rangeDelFrag.Start() - if len(iter.tombstones) > 0 { - startKey = iter.tombstones[0].Start - } - if startKey == nil { - startKey = c.rangeKeyFrag.Start() - if len(iter.rangeKeys) > 0 { - startKey = iter.rangeKeys[0].Start - } - } - if splitKey != nil && d.cmp(startKey, splitKey) == 0 { - return nil - } - } - - // NB: clone the key because the data can be held on to by the call to - // compactionIter.Tombstones via keyspan.Fragmenter.FlushTo, and by the - // WriterMetadata.LargestRangeDel.UserKey. - splitKey = append([]byte(nil), splitKey...) 
- for _, v := range iter.Tombstones(splitKey) { - if tw == nil { - if err := newOutput(); err != nil { - return err - } - } - // The tombstone being added could be completely outside the - // eventual bounds of the sstable. Consider this example (bounds - // in square brackets next to table filename): - // - // ./000240.sst [tmgc#391,MERGE-tmgc#391,MERGE] - // tmgc#391,MERGE [786e627a] - // tmgc-udkatvs#331,RANGEDEL - // - // ./000241.sst [tmgc#384,MERGE-tmgc#384,MERGE] - // tmgc#384,MERGE [666c7070] - // tmgc-tvsalezade#383,RANGEDEL - // tmgc-tvsalezade#331,RANGEDEL - // - // ./000242.sst [tmgc#383,RANGEDEL-tvsalezade#72057594037927935,RANGEDEL] - // tmgc-tvsalezade#383,RANGEDEL - // tmgc#375,SET [72646c78766965616c72776865676e79] - // tmgc-tvsalezade#356,RANGEDEL - // - // Note that both of the top two SSTables have range tombstones - // that start after the file's end keys. Since the file bound - // computation happens well after all range tombstones have been - // added to the writer, eliding out-of-file range tombstones based - // on sequence number at this stage is difficult, and necessitates - // read-time logic to ignore range tombstones outside file bounds. - if err := rangedel.Encode(&v, tw.Add); err != nil { - return err - } - } - for _, v := range iter.RangeKeys(splitKey) { - // Same logic as for range tombstones, except added using tw.AddRangeKey. - if tw == nil { - if err := newOutput(); err != nil { - return err - } - } - if err := rangekey.Encode(&v, tw.AddRangeKey); err != nil { - return err - } - } - - if tw == nil { - return nil - } - { - // Set internal sstable properties. - p := getInternalWriterProperties(tw) - // Set the external sst version to 0. This is what RocksDB expects for - // db-internal sstables; otherwise, it could apply a global sequence number. - p.ExternalFormatVersion = 0 - // Set the snapshot pinned totals. 
- p.SnapshotPinnedKeys = pinnedCount - p.SnapshotPinnedKeySize = pinnedKeySize - p.SnapshotPinnedValueSize = pinnedValueSize - stats.cumulativePinnedKeys += pinnedCount - stats.cumulativePinnedSize += pinnedKeySize + pinnedValueSize - pinnedCount = 0 - pinnedKeySize = 0 - pinnedValueSize = 0 - } - if err := tw.Close(); err != nil { - tw = nil - return err - } - d.opts.Experimental.CPUWorkPermissionGranter.CPUWorkDone(cpuWorkHandle) - cpuWorkHandle = nil - writerMeta, err := tw.Metadata() - if err != nil { - tw = nil - return err - } - tw = nil - meta := ve.NewFiles[len(ve.NewFiles)-1].Meta - meta.Size = writerMeta.Size - meta.SmallestSeqNum = writerMeta.SmallestSeqNum - meta.LargestSeqNum = writerMeta.LargestSeqNum - meta.InitPhysicalBacking() - - // If the file didn't contain any range deletions, we can fill its - // table stats now, avoiding unnecessarily loading the table later. - maybeSetStatsFromProperties( - meta.PhysicalMeta(), &writerMeta.Properties, - ) - - if c.flushing == nil { - outputMetrics.TablesCompacted++ - outputMetrics.BytesCompacted += meta.Size - } else { - outputMetrics.TablesFlushed++ - outputMetrics.BytesFlushed += meta.Size - } - outputMetrics.Size += int64(meta.Size) - outputMetrics.NumFiles++ - outputMetrics.Additional.BytesWrittenDataBlocks += writerMeta.Properties.DataSize - outputMetrics.Additional.BytesWrittenValueBlocks += writerMeta.Properties.ValueBlocksSize - - if n := len(ve.NewFiles); n > 1 { - // This is not the first output file. Ensure the sstable boundaries - // are nonoverlapping. 
- prevMeta := ve.NewFiles[n-2].Meta - if writerMeta.SmallestRangeDel.UserKey != nil { - c := d.cmp(writerMeta.SmallestRangeDel.UserKey, prevMeta.Largest.UserKey) - if c < 0 { - return errors.Errorf( - "pebble: smallest range tombstone start key is less than previous sstable largest key: %s < %s", - writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey), - prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey)) - } else if c == 0 && !prevMeta.Largest.IsExclusiveSentinel() { - // The user key portion of the range boundary start key is - // equal to the previous table's largest key user key, and - // the previous table's largest key is not exclusive. This - // violates the invariant that tables are key-space - // partitioned. - return errors.Errorf( - "pebble: invariant violation: previous sstable largest key %s, current sstable smallest rangedel: %s", - prevMeta.Largest.Pretty(d.opts.Comparer.FormatKey), - writerMeta.SmallestRangeDel.Pretty(d.opts.Comparer.FormatKey), - ) - } - } - } - - // Verify that all range deletions outputted to the sstable are - // truncated to split key. - if splitKey != nil && writerMeta.LargestRangeDel.UserKey != nil && - d.cmp(writerMeta.LargestRangeDel.UserKey, splitKey) > 0 { - return errors.Errorf( - "pebble: invariant violation: rangedel largest key %q extends beyond split key %q", - writerMeta.LargestRangeDel.Pretty(d.opts.Comparer.FormatKey), - d.opts.Comparer.FormatKey(splitKey), - ) - } - - if writerMeta.HasPointKeys { - meta.ExtendPointKeyBounds(d.cmp, writerMeta.SmallestPoint, writerMeta.LargestPoint) - } - if writerMeta.HasRangeDelKeys { - meta.ExtendPointKeyBounds(d.cmp, writerMeta.SmallestRangeDel, writerMeta.LargestRangeDel) - } - if writerMeta.HasRangeKeys { - meta.ExtendRangeKeyBounds(d.cmp, writerMeta.SmallestRangeKey, writerMeta.LargestRangeKey) - } - - // Verify that the sstable bounds fall within the compaction input - // bounds. 
This is a sanity check that we don't have a logic error - // elsewhere that causes the sstable bounds to accidentally expand past the - // compaction input bounds as doing so could lead to various badness such - // as keys being deleted by a range tombstone incorrectly. - if c.smallest.UserKey != nil { - switch v := d.cmp(meta.Smallest.UserKey, c.smallest.UserKey); { - case v >= 0: - // Nothing to do. - case v < 0: - return errors.Errorf("pebble: compaction output grew beyond bounds of input: %s < %s", - meta.Smallest.Pretty(d.opts.Comparer.FormatKey), - c.smallest.Pretty(d.opts.Comparer.FormatKey)) - } - } - if c.largest.UserKey != nil { - switch v := d.cmp(meta.Largest.UserKey, c.largest.UserKey); { - case v <= 0: - // Nothing to do. - case v > 0: - return errors.Errorf("pebble: compaction output grew beyond bounds of input: %s > %s", - meta.Largest.Pretty(d.opts.Comparer.FormatKey), - c.largest.Pretty(d.opts.Comparer.FormatKey)) - } - } - // Verify that we never split different revisions of the same user key - // across two different sstables. - if err := c.errorOnUserKeyOverlap(ve); err != nil { - return err - } - if err := meta.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil { - return err - } - return nil - } - - // Build a compactionOutputSplitter that contains all logic to determine - // whether the compaction loop should stop writing to one output sstable and - // switch to a new one. Some splitters can wrap other splitters, and the - // splitterGroup can be composed of multiple splitters. In this case, we - // start off with splitters for file sizes, grandparent limits, and (for L0 - // splits) L0 limits, before wrapping them in an splitterGroup. - sizeSplitter := newFileSizeSplitter(&iter.frontiers, c.maxOutputFileSize, c.grandparents.Iter()) - unsafePrevUserKey := func() []byte { - // Return the largest point key written to tw or the start of - // the current range deletion in the fragmenter, whichever is - // greater. 
- prevPoint := prevPointKey.UnsafeKey() - if c.cmp(prevPoint.UserKey, c.rangeDelFrag.Start()) > 0 { - return prevPoint.UserKey - } - return c.rangeDelFrag.Start() - } - outputSplitters := []compactionOutputSplitter{ - // We do not split the same user key across different sstables within - // one flush or compaction. The fileSizeSplitter may request a split in - // the middle of a user key, so the userKeyChangeSplitter ensures we are - // at a user key change boundary when doing a split. - &userKeyChangeSplitter{ - cmp: c.cmp, - splitter: sizeSplitter, - unsafePrevUserKey: unsafePrevUserKey, - }, - newLimitFuncSplitter(&iter.frontiers, c.findGrandparentLimit), - } - if splitL0Outputs { - outputSplitters = append(outputSplitters, newLimitFuncSplitter(&iter.frontiers, c.findL0Limit)) - } - splitter := &splitterGroup{cmp: c.cmp, splitters: outputSplitters} - - // Each outer loop iteration produces one output file. An iteration that - // produces a file containing point keys (and optionally range tombstones) - // guarantees that the input iterator advanced. An iteration that produces - // a file containing only range tombstones guarantees the limit passed to - // `finishOutput()` advanced to a strictly greater user key corresponding - // to a grandparent file largest key, or nil. Taken together, these - // progress guarantees ensure that eventually the input iterator will be - // exhausted and the range tombstone fragments will all be flushed. - for key, val := iter.First(); key != nil || !c.rangeDelFrag.Empty() || !c.rangeKeyFrag.Empty(); { - var firstKey []byte - if key != nil { - firstKey = key.UserKey - } else if startKey := c.rangeDelFrag.Start(); startKey != nil { - // Pass the start key of the first pending tombstone to find the - // next limit. All pending tombstones have the same start key. 
We - // use this as opposed to the end key of the last written sstable to - // effectively handle cases like these: - // - // a.SET.3 - // (lf.limit at b) - // d.RANGEDEL.4:f - // - // In this case, the partition after b has only range deletions, so - // if we were to find the limit after the last written key at the - // split point (key a), we'd get the limit b again, and - // finishOutput() would not advance any further because the next - // range tombstone to write does not start until after the L0 split - // point. - firstKey = startKey - } - splitterSuggestion := splitter.onNewOutput(firstKey) - - // Each inner loop iteration processes one key from the input iterator. - for ; key != nil; key, val = iter.Next() { - if split := splitter.shouldSplitBefore(key, tw); split == splitNow { - break - } - - switch key.Kind() { - case InternalKeyKindRangeDelete: - // Range tombstones are handled specially. They are fragmented, - // and they're not written until later during `finishOutput()`. - // We add them to the `Fragmenter` now to make them visible to - // `compactionIter` so covered keys in the same snapshot stripe - // can be elided. - - // The interleaved range deletion might only be one of many with - // these bounds. Some fragmenting is performed ahead of time by - // keyspan.MergingIter. - if s := c.rangeDelIter.Span(); !s.Empty() { - // The memory management here is subtle. Range deletions - // blocks do NOT use prefix compression, which ensures that - // range deletion spans' memory is available as long we keep - // the iterator open. However, the keyspan.MergingIter that - // merges spans across levels only guarantees the lifetime - // of the [start, end) bounds until the next positioning - // method is called. - // - // Additionally, the Span.Keys slice is owned by the the - // range deletion iterator stack, and it may be overwritten - // when we advance. - // - // Clone the Keys slice and the start and end keys. 
- // - // TODO(jackson): Avoid the clone by removing c.rangeDelFrag - // and performing explicit truncation of the pending - // rangedel span as necessary. - clone := keyspan.Span{ - Start: iter.cloneKey(s.Start), - End: iter.cloneKey(s.End), - Keys: make([]keyspan.Key, len(s.Keys)), - } - copy(clone.Keys, s.Keys) - c.rangeDelFrag.Add(clone) - } - continue - case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: - // Range keys are handled in the same way as range tombstones, except - // with a dedicated fragmenter. - if s := c.rangeKeyInterleaving.Span(); !s.Empty() { - clone := keyspan.Span{ - Start: iter.cloneKey(s.Start), - End: iter.cloneKey(s.End), - Keys: make([]keyspan.Key, len(s.Keys)), - } - // Since the keys' Suffix and Value fields are not deep cloned, the - // underlying blockIter must be kept open for the lifetime of the - // compaction. - copy(clone.Keys, s.Keys) - c.rangeKeyFrag.Add(clone) - } - continue - } - if tw == nil { - if err := newOutput(); err != nil { - return nil, pendingOutputs, stats, err - } - } - if err := tw.AddWithForceObsolete(*key, val, iter.forceObsoleteDueToRangeDel); err != nil { - return nil, pendingOutputs, stats, err - } - if iter.snapshotPinned { - // The kv pair we just added to the sstable was only surfaced by - // the compaction iterator because an open snapshot prevented - // its elision. Increment the stats. - pinnedCount++ - pinnedKeySize += uint64(len(key.UserKey)) + base.InternalTrailerLen - pinnedValueSize += uint64(len(val)) - } - } - - // A splitter requested a split, and we're ready to finish the output. - // We need to choose the key at which to split any pending range - // tombstones. There are two options: - // 1. splitterSuggestion — The key suggested by the splitter. This key - // is guaranteed to be greater than the last key written to the - // current output. - // 2. key.UserKey — the first key of the next sstable output. 
This user - // key is also guaranteed to be greater than the last user key - // written to the current output (see userKeyChangeSplitter). - // - // Use whichever is smaller. Using the smaller of the two limits - // overlap with grandparents. Consider the case where the - // grandparent limit is calculated to be 'b', key is 'x', and - // there exist many sstables between 'b' and 'x'. If the range - // deletion fragmenter has a pending tombstone [a,x), splitting - // at 'x' would cause the output table to overlap many - // grandparents well beyond the calculated grandparent limit - // 'b'. Splitting at the smaller `splitterSuggestion` avoids - // this unbounded overlap with grandparent tables. - splitKey := splitterSuggestion - if key != nil && (splitKey == nil || c.cmp(splitKey, key.UserKey) > 0) { - splitKey = key.UserKey - } - if err := finishOutput(splitKey); err != nil { - return nil, pendingOutputs, stats, err - } - } - - for _, cl := range c.inputs { - iter := cl.files.Iter() - for f := iter.First(); f != nil; f = iter.Next() { - ve.DeletedFiles[deletedFileEntry{ - Level: cl.level, - FileNum: f.FileNum, - }] = f - } - } - - // The compaction iterator keeps track of a count of the number of DELSIZED - // keys that encoded an incorrect size. Propagate it up as a part of - // compactStats. - stats.countMissizedDels = iter.stats.countMissizedDels - - if err := d.objProvider.Sync(); err != nil { - return nil, pendingOutputs, stats, err - } - - // Refresh the disk available statistic whenever a compaction/flush - // completes, before re-acquiring the mutex. - _ = d.calculateDiskAvailableBytes() - - return ve, pendingOutputs, stats, nil -} - -// validateVersionEdit validates that start and end keys across new and deleted -// files in a versionEdit pass the given validation function. 
-func validateVersionEdit( - ve *versionEdit, validateFn func([]byte) error, format base.FormatKey, -) error { - validateMetaFn := func(f *manifest.FileMetadata) error { - for _, key := range []InternalKey{f.Smallest, f.Largest} { - if err := validateFn(key.UserKey); err != nil { - return errors.Wrapf(err, "key=%q; file=%s", format(key.UserKey), f) - } - } - return nil - } - - // Validate both new and deleted files. - for _, f := range ve.NewFiles { - if err := validateMetaFn(f.Meta); err != nil { - return err - } - } - for _, m := range ve.DeletedFiles { - if err := validateMetaFn(m); err != nil { - return err - } - } - - return nil -} - -// scanObsoleteFiles scans the filesystem for files that are no longer needed -// and adds those to the internal lists of obsolete files. Note that the files -// are not actually deleted by this method. A subsequent call to -// deleteObsoleteFiles must be performed. Must be not be called concurrently -// with compactions and flushes. db.mu must be held when calling this function. -func (d *DB) scanObsoleteFiles(list []string) { - // Disable automatic compactions temporarily to avoid concurrent compactions / - // flushes from interfering. The original value is restored on completion. - disabledPrev := d.opts.DisableAutomaticCompactions - defer func() { - d.opts.DisableAutomaticCompactions = disabledPrev - }() - d.opts.DisableAutomaticCompactions = true - - // Wait for any ongoing compaction to complete before continuing. - for d.mu.compact.compactingCount > 0 || d.mu.compact.flushing { - d.mu.compact.cond.Wait() - } - - liveFileNums := make(map[base.DiskFileNum]struct{}) - d.mu.versions.addLiveFileNums(liveFileNums) - // Protect against files which are only referred to by the ingestedFlushable - // from being deleted. These are added to the flushable queue on WAL replay - // during read only mode and aren't part of the Version. 
Note that if - // !d.opts.ReadOnly, then all flushables of type ingestedFlushable have - // already been flushed. - for _, fEntry := range d.mu.mem.queue { - if f, ok := fEntry.flushable.(*ingestedFlushable); ok { - for _, file := range f.files { - liveFileNums[file.FileBacking.DiskFileNum] = struct{}{} - } - } - } - - minUnflushedLogNum := d.mu.versions.minUnflushedLogNum - manifestFileNum := d.mu.versions.manifestFileNum - - var obsoleteLogs []fileInfo - var obsoleteTables []fileInfo - var obsoleteManifests []fileInfo - var obsoleteOptions []fileInfo - - for _, filename := range list { - fileType, diskFileNum, ok := base.ParseFilename(d.opts.FS, filename) - if !ok { - continue - } - switch fileType { - case fileTypeLog: - if diskFileNum.FileNum() >= minUnflushedLogNum { - continue - } - fi := fileInfo{fileNum: diskFileNum} - if stat, err := d.opts.FS.Stat(filename); err == nil { - fi.fileSize = uint64(stat.Size()) - } - obsoleteLogs = append(obsoleteLogs, fi) - case fileTypeManifest: - if diskFileNum.FileNum() >= manifestFileNum { - continue - } - fi := fileInfo{fileNum: diskFileNum} - if stat, err := d.opts.FS.Stat(filename); err == nil { - fi.fileSize = uint64(stat.Size()) - } - obsoleteManifests = append(obsoleteManifests, fi) - case fileTypeOptions: - if diskFileNum.FileNum() >= d.optionsFileNum.FileNum() { - continue - } - fi := fileInfo{fileNum: diskFileNum} - if stat, err := d.opts.FS.Stat(filename); err == nil { - fi.fileSize = uint64(stat.Size()) - } - obsoleteOptions = append(obsoleteOptions, fi) - case fileTypeTable: - // Objects are handled through the objstorage provider below. - default: - // Don't delete files we don't know about. 
- } - } - - objects := d.objProvider.List() - for _, obj := range objects { - switch obj.FileType { - case fileTypeTable: - if _, ok := liveFileNums[obj.DiskFileNum]; ok { - continue - } - fileInfo := fileInfo{ - fileNum: obj.DiskFileNum, - } - if size, err := d.objProvider.Size(obj); err == nil { - fileInfo.fileSize = uint64(size) - } - obsoleteTables = append(obsoleteTables, fileInfo) - - default: - // Ignore object types we don't know about. - } - } - - d.mu.log.queue = merge(d.mu.log.queue, obsoleteLogs) - d.mu.versions.metrics.WAL.Files = int64(len(d.mu.log.queue)) - d.mu.versions.obsoleteTables = mergeFileInfo(d.mu.versions.obsoleteTables, obsoleteTables) - d.mu.versions.updateObsoleteTableMetricsLocked() - d.mu.versions.obsoleteManifests = merge(d.mu.versions.obsoleteManifests, obsoleteManifests) - d.mu.versions.obsoleteOptions = merge(d.mu.versions.obsoleteOptions, obsoleteOptions) -} - -// disableFileDeletions disables file deletions and then waits for any -// in-progress deletion to finish. The caller is required to call -// enableFileDeletions in order to enable file deletions again. It is ok for -// multiple callers to disable file deletions simultaneously, though they must -// all invoke enableFileDeletions in order for file deletions to be re-enabled -// (there is an internal reference count on file deletion disablement). -// -// d.mu must be held when calling this method. -func (d *DB) disableFileDeletions() { - d.mu.disableFileDeletions++ - d.mu.Unlock() - defer d.mu.Lock() - d.cleanupManager.Wait() -} - -// enableFileDeletions enables previously disabled file deletions. A cleanup job -// is queued if necessary. -// -// d.mu must be held when calling this method. 
-func (d *DB) enableFileDeletions() { - if d.mu.disableFileDeletions <= 0 { - panic("pebble: file deletion disablement invariant violated") - } - d.mu.disableFileDeletions-- - if d.mu.disableFileDeletions > 0 { - return - } - jobID := d.mu.nextJobID - d.mu.nextJobID++ - d.deleteObsoleteFiles(jobID) -} - -type fileInfo struct { - fileNum base.DiskFileNum - fileSize uint64 -} - -// deleteObsoleteFiles enqueues a cleanup job to the cleanup manager, if necessary. -// -// d.mu must be held when calling this. The function will release and re-aquire the mutex. -// -// Does nothing if file deletions are disabled (see disableFileDeletions). A -// cleanup job will be scheduled when file deletions are re-enabled. -func (d *DB) deleteObsoleteFiles(jobID int) { - if d.mu.disableFileDeletions > 0 { - return - } - - var obsoleteLogs []fileInfo - for i := range d.mu.log.queue { - // NB: d.mu.versions.minUnflushedLogNum is the log number of the earliest - // log that has not had its contents flushed to an sstable. We can recycle - // the prefix of d.mu.log.queue with log numbers less than - // minUnflushedLogNum. - if d.mu.log.queue[i].fileNum.FileNum() >= d.mu.versions.minUnflushedLogNum { - obsoleteLogs = d.mu.log.queue[:i] - d.mu.log.queue = d.mu.log.queue[i:] - d.mu.versions.metrics.WAL.Files -= int64(len(obsoleteLogs)) - break - } - } - - obsoleteTables := append([]fileInfo(nil), d.mu.versions.obsoleteTables...) - d.mu.versions.obsoleteTables = nil - - for _, tbl := range obsoleteTables { - delete(d.mu.versions.zombieTables, tbl.fileNum) - } - - // Sort the manifests cause we want to delete some contiguous prefix - // of the older manifests. 
- sort.Slice(d.mu.versions.obsoleteManifests, func(i, j int) bool { - return d.mu.versions.obsoleteManifests[i].fileNum.FileNum() < - d.mu.versions.obsoleteManifests[j].fileNum.FileNum() - }) - - var obsoleteManifests []fileInfo - manifestsToDelete := len(d.mu.versions.obsoleteManifests) - d.opts.NumPrevManifest - if manifestsToDelete > 0 { - obsoleteManifests = d.mu.versions.obsoleteManifests[:manifestsToDelete] - d.mu.versions.obsoleteManifests = d.mu.versions.obsoleteManifests[manifestsToDelete:] - if len(d.mu.versions.obsoleteManifests) == 0 { - d.mu.versions.obsoleteManifests = nil - } - } - - obsoleteOptions := d.mu.versions.obsoleteOptions - d.mu.versions.obsoleteOptions = nil - - // Release d.mu while preparing the cleanup job and possibly waiting. - // Note the unusual order: Unlock and then Lock. - d.mu.Unlock() - defer d.mu.Lock() - - files := [4]struct { - fileType fileType - obsolete []fileInfo - }{ - {fileTypeLog, obsoleteLogs}, - {fileTypeTable, obsoleteTables}, - {fileTypeManifest, obsoleteManifests}, - {fileTypeOptions, obsoleteOptions}, - } - _, noRecycle := d.opts.Cleaner.(base.NeedsFileContents) - filesToDelete := make([]obsoleteFile, 0, len(obsoleteLogs)+len(obsoleteTables)+len(obsoleteManifests)+len(obsoleteOptions)) - for _, f := range files { - // We sort to make the order of deletions deterministic, which is nice for - // tests. 
- sort.Slice(f.obsolete, func(i, j int) bool { - return f.obsolete[i].fileNum.FileNum() < f.obsolete[j].fileNum.FileNum() - }) - for _, fi := range f.obsolete { - dir := d.dirname - switch f.fileType { - case fileTypeLog: - if !noRecycle && d.logRecycler.add(fi) { - continue - } - dir = d.walDirname - case fileTypeTable: - d.tableCache.evict(fi.fileNum) - } - - filesToDelete = append(filesToDelete, obsoleteFile{ - dir: dir, - fileNum: fi.fileNum, - fileType: f.fileType, - fileSize: fi.fileSize, - }) - } - } - if len(filesToDelete) > 0 { - d.cleanupManager.EnqueueJob(jobID, filesToDelete) - } - if d.opts.private.testingAlwaysWaitForCleanup { - d.cleanupManager.Wait() - } -} - -func (d *DB) maybeScheduleObsoleteTableDeletion() { - d.mu.Lock() - defer d.mu.Unlock() - d.maybeScheduleObsoleteTableDeletionLocked() -} - -func (d *DB) maybeScheduleObsoleteTableDeletionLocked() { - if len(d.mu.versions.obsoleteTables) > 0 { - jobID := d.mu.nextJobID - d.mu.nextJobID++ - d.deleteObsoleteFiles(jobID) - } -} - -func merge(a, b []fileInfo) []fileInfo { - if len(b) == 0 { - return a - } - - a = append(a, b...) - sort.Slice(a, func(i, j int) bool { - return a[i].fileNum.FileNum() < a[j].fileNum.FileNum() - }) - - n := 0 - for i := 0; i < len(a); i++ { - if n == 0 || a[i].fileNum != a[n-1].fileNum { - a[n] = a[i] - n++ - } - } - return a[:n] -} - -func mergeFileInfo(a, b []fileInfo) []fileInfo { - if len(b) == 0 { - return a - } - - a = append(a, b...) 
- sort.Slice(a, func(i, j int) bool { - return a[i].fileNum.FileNum() < a[j].fileNum.FileNum() - }) - - n := 0 - for i := 0; i < len(a); i++ { - if n == 0 || a[i].fileNum != a[n-1].fileNum { - a[n] = a[i] - n++ - } - } - return a[:n] -} - -func max[I constraints.Ordered](a, b I) I { - if b > a { - return b - } - return a -} diff --git a/vendor/github.com/cockroachdb/pebble/compaction_iter.go b/vendor/github.com/cockroachdb/pebble/compaction_iter.go deleted file mode 100644 index 299dbfc..0000000 --- a/vendor/github.com/cockroachdb/pebble/compaction_iter.go +++ /dev/null @@ -1,1658 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package pebble - -import ( - "bytes" - "encoding/binary" - "fmt" - "io" - "sort" - "strconv" - - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/bytealloc" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/rangekey" - "github.com/cockroachdb/redact" -) - -// compactionIter provides a forward-only iterator that encapsulates the logic -// for collapsing entries during compaction. It wraps an internal iterator and -// collapses entries that are no longer necessary because they are shadowed by -// newer entries. The simplest example of this is when the internal iterator -// contains two keys: a.PUT.2 and a.PUT.1. Instead of returning both entries, -// compactionIter collapses the second entry because it is no longer -// necessary. The high-level structure for compactionIter is to iterate over -// its internal iterator and output 1 entry for every user-key. There are four -// complications to this story. -// -// 1. Eliding Deletion Tombstones -// -// Consider the entries a.DEL.2 and a.PUT.1. These entries collapse to -// a.DEL.2. Do we have to output the entry a.DEL.2? 
Only if a.DEL.2 possibly -// shadows an entry at a lower level. If we're compacting to the base-level in -// the LSM tree then a.DEL.2 is definitely not shadowing an entry at a lower -// level and can be elided. -// -// We can do slightly better than only eliding deletion tombstones at the base -// level by observing that we can elide a deletion tombstone if there are no -// sstables that contain the entry's key. This check is performed by -// elideTombstone. -// -// 2. Merges -// -// The MERGE operation merges the value for an entry with the existing value -// for an entry. The logical value of an entry can be composed of a series of -// merge operations. When compactionIter sees a MERGE, it scans forward in its -// internal iterator collapsing MERGE operations for the same key until it -// encounters a SET or DELETE operation. For example, the keys a.MERGE.4, -// a.MERGE.3, a.MERGE.2 will be collapsed to a.MERGE.4 and the values will be -// merged using the specified Merger. -// -// An interesting case here occurs when MERGE is combined with SET. Consider -// the entries a.MERGE.3 and a.SET.2. The collapsed key will be a.SET.3. The -// reason that the kind is changed to SET is because the SET operation acts as -// a barrier preventing further merging. This can be seen better in the -// scenario a.MERGE.3, a.SET.2, a.MERGE.1. The entry a.MERGE.1 may be at lower -// (older) level and not involved in the compaction. If the compaction of -// a.MERGE.3 and a.SET.2 produced a.MERGE.3, a subsequent compaction with -// a.MERGE.1 would merge the values together incorrectly. -// -// 3. Snapshots -// -// Snapshots are lightweight point-in-time views of the DB state. At its core, -// a snapshot is a sequence number along with a guarantee from Pebble that it -// will maintain the view of the database at that sequence number. Part of this -// guarantee is relatively straightforward to achieve. 
When reading from the -// database Pebble will ignore sequence numbers that are larger than the -// snapshot sequence number. The primary complexity with snapshots occurs -// during compaction: the collapsing of entries that are shadowed by newer -// entries is at odds with the guarantee that Pebble will maintain the view of -// the database at the snapshot sequence number. Rather than collapsing entries -// up to the next user key, compactionIter can only collapse entries up to the -// next snapshot boundary. That is, every snapshot boundary potentially causes -// another entry for the same user-key to be emitted. Another way to view this -// is that snapshots define stripes and entries are collapsed within stripes, -// but not across stripes. Consider the following scenario: -// -// a.PUT.9 -// a.DEL.8 -// a.PUT.7 -// a.DEL.6 -// a.PUT.5 -// -// In the absence of snapshots these entries would be collapsed to -// a.PUT.9. What if there is a snapshot at sequence number 7? The entries can -// be divided into two stripes and collapsed within the stripes: -// -// a.PUT.9 a.PUT.9 -// a.DEL.8 ---> -// a.PUT.7 -// -- -- -// a.DEL.6 ---> a.DEL.6 -// a.PUT.5 -// -// All of the rules described earlier still apply, but they are confined to -// operate within a snapshot stripe. Snapshots only affect compaction when the -// snapshot sequence number lies within the range of sequence numbers being -// compacted. In the above example, a snapshot at sequence number 10 or at -// sequence number 5 would not have any effect. -// -// 4. Range Deletions -// -// Range deletions provide the ability to delete all of the keys (and values) -// in a contiguous range. Range deletions are stored indexed by their start -// key. The end key of the range is stored in the value. In order to support -// lookup of the range deletions which overlap with a particular key, the range -// deletion tombstones need to be fragmented whenever they overlap. 
This -// fragmentation is performed by keyspan.Fragmenter. The fragments are then -// subject to the rules for snapshots. For example, consider the two range -// tombstones [a,e)#1 and [c,g)#2: -// -// 2: c-------g -// 1: a-------e -// -// These tombstones will be fragmented into: -// -// 2: c---e---g -// 1: a---c---e -// -// Do we output the fragment [c,e)#1? Since it is covered by [c-e]#2 the answer -// depends on whether it is in a new snapshot stripe. -// -// In addition to the fragmentation of range tombstones, compaction also needs -// to take the range tombstones into consideration when outputting normal -// keys. Just as with point deletions, a range deletion covering an entry can -// cause the entry to be elided. -// -// A note on the stability of keys and values. -// -// The stability guarantees of keys and values returned by the iterator tree -// that backs a compactionIter is nuanced and care must be taken when -// referencing any returned items. -// -// Keys and values returned by exported functions (i.e. First, Next, etc.) have -// lifetimes that fall into two categories: -// -// Lifetime valid for duration of compaction. Range deletion keys and values are -// stable for the duration of the compaction, due to way in which a -// compactionIter is typically constructed (i.e. via (*compaction).newInputIter, -// which wraps the iterator over the range deletion block in a noCloseIter, -// preventing the release of the backing memory until the compaction is -// finished). -// -// Lifetime limited to duration of sstable block liveness. Point keys (SET, DEL, -// etc.) and values must be cloned / copied following the return from the -// exported function, and before a subsequent call to Next advances the iterator -// and mutates the contents of the returned key and value. 
-type compactionIter struct { - equal Equal - merge Merge - iter internalIterator - err error - // `key.UserKey` is set to `keyBuf` caused by saving `i.iterKey.UserKey` - // and `key.Trailer` is set to `i.iterKey.Trailer`. This is the - // case on return from all public methods -- these methods return `key`. - // Additionally, it is the internal state when the code is moving to the - // next key so it can determine whether the user key has changed from - // the previous key. - key InternalKey - // keyTrailer is updated when `i.key` is updated and holds the key's - // original trailer (eg, before any sequence-number zeroing or changes to - // key kind). - keyTrailer uint64 - value []byte - valueCloser io.Closer - // Temporary buffer used for storing the previous user key in order to - // determine when iteration has advanced to a new user key and thus a new - // snapshot stripe. - keyBuf []byte - // Temporary buffer used for storing the previous value, which may be an - // unsafe, i.iter-owned slice that could be altered when the iterator is - // advanced. - valueBuf []byte - // Is the current entry valid? - valid bool - iterKey *InternalKey - iterValue []byte - iterStripeChange stripeChangeType - // `skip` indicates whether the remaining skippable entries in the current - // snapshot stripe should be skipped or processed. An example of a non- - // skippable entry is a range tombstone as we need to return it from the - // `compactionIter`, even if a key covering its start key has already been - // seen in the same stripe. `skip` has no effect when `pos == iterPosNext`. - // - // TODO(jackson): If we use keyspan.InterleavingIter for range deletions, - // like we do for range keys, the only remaining 'non-skippable' key is - // the invalid key. We should be able to simplify this logic and remove this - // field. - skip bool - // `pos` indicates the iterator position at the top of `Next()`. 
Its type's - // (`iterPos`) values take on the following meanings in the context of - // `compactionIter`. - // - // - `iterPosCur`: the iterator is at the last key returned. - // - `iterPosNext`: the iterator has already been advanced to the next - // candidate key. For example, this happens when processing merge operands, - // where we advance the iterator all the way into the next stripe or next - // user key to ensure we've seen all mergeable operands. - // - `iterPosPrev`: this is invalid as compactionIter is forward-only. - pos iterPos - // `snapshotPinned` indicates whether the last point key returned by the - // compaction iterator was only returned because an open snapshot prevents - // its elision. This field only applies to point keys, and not to range - // deletions or range keys. - // - // For MERGE, it is possible that doing the merge is interrupted even when - // the next point key is in the same stripe. This can happen if the loop in - // mergeNext gets interrupted by sameStripeNonSkippable. - // sameStripeNonSkippable occurs due to RANGEDELs that sort before - // SET/MERGE/DEL with the same seqnum, so the RANGEDEL does not necessarily - // delete the subsequent SET/MERGE/DEL keys. - snapshotPinned bool - // forceObsoleteDueToRangeDel is set to true in a subset of the cases that - // snapshotPinned is true. This value is true when the point is obsolete due - // to a RANGEDEL but could not be deleted due to a snapshot. - // - // NB: it may seem that the additional cases that snapshotPinned captures - // are harmless in that they can also be used to mark a point as obsolete - // (it is merely a duplication of some logic that happens in - // Writer.AddWithForceObsolete), but that is not quite accurate as of this - // writing -- snapshotPinned originated in stats collection and for a - // sequence MERGE, SET, where the MERGE cannot merge with the (older) SET - // due to a snapshot, the snapshotPinned value for the SET is true. 
- // - // TODO(sumeer,jackson): improve the logic of snapshotPinned and reconsider - // whether we need forceObsoleteDueToRangeDel. - forceObsoleteDueToRangeDel bool - // The index of the snapshot for the current key within the snapshots slice. - curSnapshotIdx int - curSnapshotSeqNum uint64 - // The snapshot sequence numbers that need to be maintained. These sequence - // numbers define the snapshot stripes (see the Snapshots description - // above). The sequence numbers are in ascending order. - snapshots []uint64 - // frontiers holds a heap of user keys that affect compaction behavior when - // they're exceeded. Before a new key is returned, the compaction iterator - // advances the frontier, notifying any code that subscribed to be notified - // when a key was reached. The primary use today is within the - // implementation of compactionOutputSplitters in compaction.go. Many of - // these splitters wait for the compaction iterator to call Advance(k) when - // it's returning a new key. If the key that they're waiting for is - // surpassed, these splitters update internal state recording that they - // should request a compaction split next time they're asked in - // [shouldSplitBefore]. - frontiers frontiers - // Reference to the range deletion tombstone fragmenter (e.g., - // `compaction.rangeDelFrag`). - rangeDelFrag *keyspan.Fragmenter - rangeKeyFrag *keyspan.Fragmenter - // The fragmented tombstones. - tombstones []keyspan.Span - // The fragmented range keys. - rangeKeys []keyspan.Span - // Byte allocator for the tombstone keys. - alloc bytealloc.A - allowZeroSeqNum bool - elideTombstone func(key []byte) bool - elideRangeTombstone func(start, end []byte) bool - ineffectualSingleDeleteCallback func(userKey []byte) - singleDeleteInvariantViolationCallback func(userKey []byte) - // The on-disk format major version. This informs the types of keys that - // may be written to disk during a compaction. 
- formatVersion FormatMajorVersion - stats struct { - // count of DELSIZED keys that were missized. - countMissizedDels uint64 - } -} - -func newCompactionIter( - cmp Compare, - equal Equal, - formatKey base.FormatKey, - merge Merge, - iter internalIterator, - snapshots []uint64, - rangeDelFrag *keyspan.Fragmenter, - rangeKeyFrag *keyspan.Fragmenter, - allowZeroSeqNum bool, - elideTombstone func(key []byte) bool, - elideRangeTombstone func(start, end []byte) bool, - ineffectualSingleDeleteCallback func(userKey []byte), - singleDeleteInvariantViolationCallback func(userKey []byte), - formatVersion FormatMajorVersion, -) *compactionIter { - i := &compactionIter{ - equal: equal, - merge: merge, - iter: iter, - snapshots: snapshots, - frontiers: frontiers{cmp: cmp}, - rangeDelFrag: rangeDelFrag, - rangeKeyFrag: rangeKeyFrag, - allowZeroSeqNum: allowZeroSeqNum, - elideTombstone: elideTombstone, - elideRangeTombstone: elideRangeTombstone, - ineffectualSingleDeleteCallback: ineffectualSingleDeleteCallback, - singleDeleteInvariantViolationCallback: singleDeleteInvariantViolationCallback, - formatVersion: formatVersion, - } - i.rangeDelFrag.Cmp = cmp - i.rangeDelFrag.Format = formatKey - i.rangeDelFrag.Emit = i.emitRangeDelChunk - i.rangeKeyFrag.Cmp = cmp - i.rangeKeyFrag.Format = formatKey - i.rangeKeyFrag.Emit = i.emitRangeKeyChunk - return i -} - -func (i *compactionIter) First() (*InternalKey, []byte) { - if i.err != nil { - return nil, nil - } - var iterValue LazyValue - i.iterKey, iterValue = i.iter.First() - i.iterValue, _, i.err = iterValue.Value(nil) - if i.err != nil { - return nil, nil - } - if i.iterKey != nil { - i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(i.iterKey.SeqNum(), i.snapshots) - } - i.pos = iterPosNext - i.iterStripeChange = newStripeNewKey - return i.Next() -} - -func (i *compactionIter) Next() (*InternalKey, []byte) { - if i.err != nil { - return nil, nil - } - - // Close the closer for the current value if one was open. 
- if i.closeValueCloser() != nil { - return nil, nil - } - - // Prior to this call to `Next()` we are in one of four situations with - // respect to `iterKey` and related state: - // - // - `!skip && pos == iterPosNext`: `iterKey` is already at the next key. - // - `!skip && pos == iterPosCurForward`: We are at the key that has been returned. - // To move forward we advance by one key, even if that lands us in the same - // snapshot stripe. - // - `skip && pos == iterPosCurForward`: We are at the key that has been returned. - // To move forward we skip skippable entries in the stripe. - // - `skip && pos == iterPosNext && i.iterStripeChange == sameStripeNonSkippable`: - // This case may occur when skipping within a snapshot stripe and we - // encounter either: - // a) an invalid key kind; The previous call will have returned - // whatever key it was processing and deferred handling of the - // invalid key to this invocation of Next(). We're responsible for - // ignoring skip=true and falling into the invalid key kind case - // down below. - // b) an interleaved range delete; This is a wart of the current code - // structure. While skipping within a snapshot stripe, a range - // delete interleaved at its start key and sequence number - // interrupts the sequence of point keys. After we return the range - // delete to the caller, we need to pick up skipping at where we - // left off, so we preserve skip=true. - // TODO(jackson): This last case is confusing and can be removed if we - // interleave range deletions at the maximal sequence number using the - // keyspan interleaving iterator. This is the treatment given to range - // keys today. 
- if i.pos == iterPosCurForward { - if i.skip { - i.skipInStripe() - } else { - i.nextInStripe() - } - } else if i.skip { - if i.iterStripeChange != sameStripeNonSkippable { - panic(errors.AssertionFailedf("compaction iterator has skip=true, but iterator is at iterPosNext")) - } - } - - i.pos = iterPosCurForward - i.valid = false - - for i.iterKey != nil { - // If we entered a new snapshot stripe with the same key, any key we - // return on this iteration is only returned because the open snapshot - // prevented it from being elided or merged with the key returned for - // the previous stripe. Mark it as pinned so that the compaction loop - // can correctly populate output tables' pinned statistics. We might - // also set snapshotPinned=true down below if we observe that the key is - // deleted by a range deletion in a higher stripe or that this key is a - // tombstone that could be elided if only it were in the last snapshot - // stripe. - i.snapshotPinned = i.iterStripeChange == newStripeSameKey - - if i.iterKey.Kind() == InternalKeyKindRangeDelete || rangekey.IsRangeKey(i.iterKey.Kind()) { - // Return the span so the compaction can use it for file truncation and add - // it to the relevant fragmenter. We do not set `skip` to true before - // returning as there may be a forthcoming point key with the same user key - // and sequence number. Such a point key must be visible (i.e., not skipped - // over) since we promise point keys are not deleted by range tombstones at - // the same sequence number. - // - // Although, note that `skip` may already be true before reaching here - // due to an earlier key in the stripe. Then it is fine to leave it set - // to true, as the earlier key must have had a higher sequence number. - // - // NOTE: there is a subtle invariant violation here in that calling - // saveKey and returning a reference to the temporary slice violates - // the stability guarantee for range deletion keys. 
A potential - // mediation could return the original iterKey and iterValue - // directly, as the backing memory is guaranteed to be stable until - // the compaction completes. The violation here is only minor in - // that the caller immediately clones the range deletion InternalKey - // when passing the key to the deletion fragmenter (see the - // call-site in compaction.go). - // TODO(travers): address this violation by removing the call to - // saveKey and instead return the original iterKey and iterValue. - // This goes against the comment on i.key in the struct, and - // therefore warrants some investigation. - i.saveKey() - // TODO(jackson): Handle tracking pinned statistics for range keys - // and range deletions. This would require updating - // emitRangeDelChunk and rangeKeyCompactionTransform to update - // statistics when they apply their own snapshot striping logic. - i.snapshotPinned = false - i.value = i.iterValue - i.valid = true - return &i.key, i.value - } - - // TODO(sumeer): we could avoid calling Covers if i.iterStripeChange == - // sameStripeSameKey since that check has already been done in - // nextInStripeHelper. However, we also need to handle the case of - // CoversInvisibly below. - if cover := i.rangeDelFrag.Covers(*i.iterKey, i.curSnapshotSeqNum); cover == keyspan.CoversVisibly { - // A pending range deletion deletes this key. Skip it. - i.saveKey() - i.skipInStripe() - continue - } else if cover == keyspan.CoversInvisibly { - // i.iterKey would be deleted by a range deletion if there weren't - // any open snapshots. Mark it as pinned. - // - // NB: there are multiple places in this file where we call - // i.rangeDelFrag.Covers and this is the only one where we are writing - // to i.snapshotPinned. Those other cases occur in mergeNext where the - // caller is deciding whether the value should be merged or not, and the - // key is in the same snapshot stripe. Hence, snapshotPinned is by - // definition false in those cases. 
- i.snapshotPinned = true - i.forceObsoleteDueToRangeDel = true - } else { - i.forceObsoleteDueToRangeDel = false - } - - switch i.iterKey.Kind() { - case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: - if i.elideTombstone(i.iterKey.UserKey) { - if i.curSnapshotIdx == 0 { - // If we're at the last snapshot stripe and the tombstone - // can be elided skip skippable keys in the same stripe. - i.saveKey() - if i.key.Kind() == InternalKeyKindSingleDelete { - i.skipDueToSingleDeleteElision() - } else { - i.skipInStripe() - if !i.skip && i.iterStripeChange != newStripeNewKey { - panic(errors.AssertionFailedf("pebble: skipInStripe in last stripe disabled skip without advancing to new key")) - } - } - if i.iterStripeChange == newStripeSameKey { - panic(errors.AssertionFailedf("pebble: skipInStripe in last stripe found a new stripe within the same key")) - } - continue - } else { - // We're not at the last snapshot stripe, so the tombstone - // can NOT yet be elided. Mark it as pinned, so that it's - // included in table statistics appropriately. - i.snapshotPinned = true - } - } - - switch i.iterKey.Kind() { - case InternalKeyKindDelete: - i.saveKey() - i.value = i.iterValue - i.valid = true - i.skip = true - return &i.key, i.value - - case InternalKeyKindDeleteSized: - // We may skip subsequent keys because of this tombstone. Scan - // ahead to see just how much data this tombstone drops and if - // the tombstone's value should be updated accordingly. 
- return i.deleteSizedNext() - - case InternalKeyKindSingleDelete: - if i.singleDeleteNext() { - return &i.key, i.value - } else if i.err != nil { - return nil, nil - } - continue - - default: - panic(errors.AssertionFailedf( - "unexpected kind %s", redact.SafeString(i.iterKey.Kind().String()))) - } - - case InternalKeyKindSet, InternalKeyKindSetWithDelete: - // The key we emit for this entry is a function of the current key - // kind, and whether this entry is followed by a DEL/SINGLEDEL - // entry. setNext() does the work to move the iterator forward, - // preserving the original value, and potentially mutating the key - // kind. - i.setNext() - if i.err != nil { - return nil, nil - } - return &i.key, i.value - - case InternalKeyKindMerge: - // Record the snapshot index before mergeNext as merging - // advances the iterator, adjusting curSnapshotIdx. - origSnapshotIdx := i.curSnapshotIdx - var valueMerger ValueMerger - valueMerger, i.err = i.merge(i.iterKey.UserKey, i.iterValue) - var change stripeChangeType - if i.err == nil { - change = i.mergeNext(valueMerger) - } - var needDelete bool - if i.err == nil { - // includesBase is true whenever we've transformed the MERGE record - // into a SET. - var includesBase bool - switch i.key.Kind() { - case InternalKeyKindSet, InternalKeyKindSetWithDelete: - includesBase = true - case InternalKeyKindMerge: - default: - panic(errors.AssertionFailedf( - "unexpected kind %s", redact.SafeString(i.key.Kind().String()))) - } - i.value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, includesBase) - } - if i.err == nil { - if needDelete { - i.valid = false - if i.closeValueCloser() != nil { - return nil, nil - } - continue - } - // A non-skippable entry does not necessarily cover later merge - // operands, so we must not zero the current merge result's seqnum. - // - // For example, suppose the forthcoming two keys are a range - // tombstone, `[a, b)#3`, and a merge operand, `a#3`. 
Recall that - // range tombstones do not cover point keys at the same seqnum, so - // `a#3` is not deleted. The range tombstone will be seen first due - // to its larger value type. Since it is a non-skippable key, the - // current merge will not include `a#3`. If we zeroed the current - // merge result's seqnum, then it would conflict with the upcoming - // merge including `a#3`, whose seqnum will also be zeroed. - if change != sameStripeNonSkippable { - i.maybeZeroSeqnum(origSnapshotIdx) - } - return &i.key, i.value - } - if i.err != nil { - i.valid = false - // TODO(sumeer): why is MarkCorruptionError only being called for - // MERGE? - i.err = base.MarkCorruptionError(i.err) - } - return nil, nil - - default: - i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind())) - i.valid = false - return nil, nil - } - } - - return nil, nil -} - -func (i *compactionIter) closeValueCloser() error { - if i.valueCloser == nil { - return nil - } - - i.err = i.valueCloser.Close() - i.valueCloser = nil - if i.err != nil { - i.valid = false - } - return i.err -} - -// snapshotIndex returns the index of the first sequence number in snapshots -// which is greater than or equal to seq. -func snapshotIndex(seq uint64, snapshots []uint64) (int, uint64) { - index := sort.Search(len(snapshots), func(i int) bool { - return snapshots[i] > seq - }) - if index >= len(snapshots) { - return index, InternalKeySeqNumMax - } - return index, snapshots[index] -} - -// skipInStripe skips over skippable keys in the same stripe and user key. It -// may set i.err, in which case i.iterKey will be nil. -func (i *compactionIter) skipInStripe() { - i.skip = true - // TODO(sumeer): we can avoid the overhead of calling i.rangeDelFrag.Covers, - // in this case of nextInStripe, since we are skipping all of them anyway. - for i.nextInStripe() == sameStripeSkippable { - if i.err != nil { - panic(i.err) - } - } - // Reset skip if we landed outside the original stripe. 
Otherwise, we landed - // in the same stripe on a non-skippable key. In that case we should preserve - // `i.skip == true` such that later keys in the stripe will continue to be - // skipped. - if i.iterStripeChange == newStripeNewKey || i.iterStripeChange == newStripeSameKey { - i.skip = false - } -} - -func (i *compactionIter) iterNext() bool { - var iterValue LazyValue - i.iterKey, iterValue = i.iter.Next() - i.iterValue, _, i.err = iterValue.Value(nil) - if i.err != nil { - i.iterKey = nil - } - return i.iterKey != nil -} - -// stripeChangeType indicates how the snapshot stripe changed relative to the -// previous key. If no change, it also indicates whether the current entry is -// skippable. If the snapshot stripe changed, it also indicates whether the new -// stripe was entered because the iterator progressed onto an entirely new key -// or entered a new stripe within the same key. -type stripeChangeType int - -const ( - newStripeNewKey stripeChangeType = iota - newStripeSameKey - sameStripeSkippable - sameStripeNonSkippable -) - -// nextInStripe advances the iterator and returns one of the above const ints -// indicating how its state changed. -// -// All sameStripeSkippable keys that are covered by a RANGEDEL will be skipped -// and not returned. -// -// Calls to nextInStripe must be preceded by a call to saveKey to retain a -// temporary reference to the original key, so that forward iteration can -// proceed with a reference to the original key. Care should be taken to avoid -// overwriting or mutating the saved key or value before they have been returned -// to the caller of the exported function (i.e. the caller of Next, First, etc.) -// -// nextInStripe may set i.err, in which case the return value will be -// newStripeNewKey, and i.iterKey will be nil. 
-func (i *compactionIter) nextInStripe() stripeChangeType { - i.iterStripeChange = i.nextInStripeHelper() - return i.iterStripeChange -} - -// nextInStripeHelper is an internal helper for nextInStripe; callers should use -// nextInStripe and not call nextInStripeHelper. -func (i *compactionIter) nextInStripeHelper() stripeChangeType { - origSnapshotIdx := i.curSnapshotIdx - for { - if !i.iterNext() { - return newStripeNewKey - } - key := i.iterKey - - if !i.equal(i.key.UserKey, key.UserKey) { - i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots) - return newStripeNewKey - } - - // If i.key and key have the same user key, then - // 1. i.key must not have had a zero sequence number (or it would've be the last - // key with its user key). - // 2. i.key must have a strictly larger sequence number - // There's an exception in that either key may be a range delete. Range - // deletes may share a sequence number with a point key if the keys were - // ingested together. Range keys may also share the sequence number if they - // were ingested, but range keys are interleaved into the compaction - // iterator's input iterator at the maximal sequence number so their - // original sequence number will not be observed here. - if prevSeqNum := base.SeqNumFromTrailer(i.keyTrailer); (prevSeqNum == 0 || prevSeqNum <= key.SeqNum()) && - i.key.Kind() != InternalKeyKindRangeDelete && key.Kind() != InternalKeyKindRangeDelete { - prevKey := i.key - prevKey.Trailer = i.keyTrailer - panic(errors.AssertionFailedf("pebble: invariant violation: %s and %s out of order", prevKey, key)) - } - - i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots) - switch key.Kind() { - case InternalKeyKindRangeDelete: - // Range tombstones need to be exposed by the compactionIter to the upper level - // `compaction` object, so return them regardless of whether they are in the same - // snapshot stripe. 
- if i.curSnapshotIdx == origSnapshotIdx { - return sameStripeNonSkippable - } - return newStripeSameKey - case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: - // Range keys are interleaved at the max sequence number for a given user - // key, so we should not see any more range keys in this stripe. - panic("unreachable") - case InternalKeyKindInvalid: - if i.curSnapshotIdx == origSnapshotIdx { - return sameStripeNonSkippable - } - return newStripeSameKey - case InternalKeyKindDelete, InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindSingleDelete, - InternalKeyKindSetWithDelete, InternalKeyKindDeleteSized: - // Fall through - default: - i.iterKey = nil - i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind())) - i.valid = false - return newStripeNewKey - } - if i.curSnapshotIdx == origSnapshotIdx { - // Same snapshot. - if i.rangeDelFrag.Covers(*i.iterKey, i.curSnapshotSeqNum) == keyspan.CoversVisibly { - continue - } - return sameStripeSkippable - } - return newStripeSameKey - } -} - -func (i *compactionIter) setNext() { - // Save the current key. - i.saveKey() - i.value = i.iterValue - i.valid = true - i.maybeZeroSeqnum(i.curSnapshotIdx) - - // There are two cases where we can early return and skip the remaining - // records in the stripe: - // - If the DB does not SETWITHDEL. - // - If this key is already a SETWITHDEL. - if i.formatVersion < FormatSetWithDelete || - i.iterKey.Kind() == InternalKeyKindSetWithDelete { - i.skip = true - return - } - - // We are iterating forward. Save the current value. - i.valueBuf = append(i.valueBuf[:0], i.iterValue...) - i.value = i.valueBuf - - // Else, we continue to loop through entries in the stripe looking for a - // DEL. Note that we may stop *before* encountering a DEL, if one exists. - // - // NB: nextInStripe will skip sameStripeSkippable keys that are visibly - // covered by a RANGEDEL. 
This can include DELs -- this is fine since such - // DELs don't need to be combined with SET to make SETWITHDEL. - for { - switch i.nextInStripe() { - case newStripeNewKey, newStripeSameKey: - i.pos = iterPosNext - return - case sameStripeNonSkippable: - i.pos = iterPosNext - // We iterated onto a key that we cannot skip. We can - // conservatively transform the original SET into a SETWITHDEL - // as an indication that there *may* still be a DEL/SINGLEDEL - // under this SET, even if we did not actually encounter one. - // - // This is safe to do, as: - // - // - in the case that there *is not* actually a DEL/SINGLEDEL - // under this entry, any SINGLEDEL above this now-transformed - // SETWITHDEL will become a DEL when the two encounter in a - // compaction. The DEL will eventually be elided in a - // subsequent compaction. The cost for ensuring correctness is - // that this entry is kept around for an additional compaction - // cycle(s). - // - // - in the case there *is* indeed a DEL/SINGLEDEL under us - // (but in a different stripe or sstable), then we will have - // already done the work to transform the SET into a - // SETWITHDEL, and we will skip any additional iteration when - // this entry is encountered again in a subsequent compaction. - // - // Ideally, this codepath would be smart enough to handle the - // case of SET <- RANGEDEL <- ... <- DEL/SINGLEDEL <- .... - // This requires preserving any RANGEDEL entries we encounter - // along the way, then emitting the original (possibly - // transformed) key, followed by the RANGEDELs. This requires - // a sizable refactoring of the existing code, as nextInStripe - // currently returns a sameStripeNonSkippable when it - // encounters a RANGEDEL. - // TODO(travers): optimize to handle the RANGEDEL case if it - // turns out to be a performance problem. 
- i.key.SetKind(InternalKeyKindSetWithDelete) - - // By setting i.skip=true, we are saying that after the - // non-skippable key is emitted (which is likely a RANGEDEL), - // the remaining point keys that share the same user key as this - // saved key should be skipped. - i.skip = true - return - case sameStripeSkippable: - // We're still in the same stripe. If this is a - // DEL/SINGLEDEL/DELSIZED, we stop looking and emit a SETWITHDEL. - // Subsequent keys are eligible for skipping. - switch i.iterKey.Kind() { - case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: - i.key.SetKind(InternalKeyKindSetWithDelete) - i.skip = true - return - case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindSetWithDelete: - // Do nothing - default: - i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind())) - i.valid = false - } - default: - panic("pebble: unexpected stripeChangeType: " + strconv.Itoa(int(i.iterStripeChange))) - } - } -} - -func (i *compactionIter) mergeNext(valueMerger ValueMerger) stripeChangeType { - // Save the current key. - i.saveKey() - i.valid = true - - // Loop looking for older values in the current snapshot stripe and merge - // them. - for { - if i.nextInStripe() != sameStripeSkippable { - i.pos = iterPosNext - return i.iterStripeChange - } - if i.err != nil { - panic(i.err) - } - // NB: MERGE#10+RANGEDEL#9 stays a MERGE, since nextInStripe skips - // sameStripeSkippable keys that are visibly covered by a RANGEDEL. There - // may be MERGE#7 that is invisibly covered and will be preserved, but - // there is no risk that MERGE#10 and MERGE#7 will get merged in the - // future as the RANGEDEL still exists and will be used in user-facing - // reads that see MERGE#10, and will also eventually cause MERGE#7 to be - // deleted in a compaction. 
- key := i.iterKey - switch key.Kind() { - case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: - // We've hit a deletion tombstone. Return everything up to this point and - // then skip entries until the next snapshot stripe. We change the kind - // of the result key to a Set so that it shadows keys in lower - // levels. That is, MERGE+DEL -> SETWITHDEL. - // - // We do the same for SingleDelete since SingleDelete is only - // permitted (with deterministic behavior) for keys that have been - // set once since the last SingleDelete/Delete, so everything - // older is acceptable to shadow. Note that this is slightly - // different from singleDeleteNext() which implements stricter - // semantics in terms of applying the SingleDelete to the single - // next Set. But those stricter semantics are not observable to - // the end-user since Iterator interprets SingleDelete as Delete. - // We could do something more complicated here and consume only a - // single Set, and then merge in any following Sets, but that is - // complicated wrt code and unnecessary given the narrow permitted - // use of SingleDelete. - i.key.SetKind(InternalKeyKindSetWithDelete) - i.skip = true - return sameStripeSkippable - - case InternalKeyKindSet, InternalKeyKindSetWithDelete: - // We've hit a Set or SetWithDel value. Merge with the existing - // value and return. We change the kind of the resulting key to a - // Set so that it shadows keys in lower levels. That is: - // MERGE + (SET*) -> SET. - i.err = valueMerger.MergeOlder(i.iterValue) - if i.err != nil { - i.valid = false - return sameStripeSkippable - } - i.key.SetKind(InternalKeyKindSet) - i.skip = true - return sameStripeSkippable - - case InternalKeyKindMerge: - // We've hit another Merge value. Merge with the existing value and - // continue looping. 
- i.err = valueMerger.MergeOlder(i.iterValue) - if i.err != nil { - i.valid = false - return sameStripeSkippable - } - - default: - i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind())) - i.valid = false - return sameStripeSkippable - } - } -} - -// singleDeleteNext processes a SingleDelete point tombstone. A SingleDelete, or -// SINGLEDEL, is unique in that it deletes exactly 1 internal key. It's a -// performance optimization when the client knows a user key has not been -// overwritten, allowing the elision of the tombstone earlier, avoiding write -// amplification. -// -// singleDeleteNext returns a boolean indicating whether or not the caller -// should yield the SingleDelete key to the consumer of the compactionIter. If -// singleDeleteNext returns false, the caller may consume/elide the -// SingleDelete. -func (i *compactionIter) singleDeleteNext() bool { - // Save the current key. - i.saveKey() - i.value = i.iterValue - i.valid = true - - // Loop until finds a key to be passed to the next level. - for { - // If we find a key that can't be skipped, return true so that the - // caller yields the SingleDelete to the caller. - if i.nextInStripe() != sameStripeSkippable { - // This defers additional error checking regarding single delete - // invariants to the compaction where the keys with the same user key as - // the single delete are in the same stripe. - i.pos = iterPosNext - return i.err == nil - } - if i.err != nil { - panic(i.err) - } - // INVARIANT: sameStripeSkippable. - key := i.iterKey - kind := key.Kind() - switch kind { - case InternalKeyKindDelete, InternalKeyKindSetWithDelete, InternalKeyKindDeleteSized: - if (kind == InternalKeyKindDelete || kind == InternalKeyKindDeleteSized) && - i.ineffectualSingleDeleteCallback != nil { - i.ineffectualSingleDeleteCallback(i.key.UserKey) - } - // We've hit a Delete, DeleteSized, SetWithDelete, transform - // the SingleDelete into a full Delete. 
- i.key.SetKind(InternalKeyKindDelete) - i.skip = true - return true - - case InternalKeyKindSet, InternalKeyKindMerge: - // This SingleDelete deletes the Set/Merge, and we can now elide the - // SingleDel as well. We advance past the Set and return false to - // indicate to the main compaction loop that we should NOT yield the - // current SingleDel key to the compaction loop. - // - // NB: singleDeleteNext was called with i.pos == iterPosCurForward, and - // after the call to nextInStripe, we are still at iterPosCurForward, - // since we are at the key after the Set/Merge that was single deleted. - change := i.nextInStripe() - switch change { - case sameStripeSkippable, newStripeSameKey: - // On the same user key. - nextKind := i.iterKey.Kind() - switch nextKind { - case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge: - if i.singleDeleteInvariantViolationCallback != nil { - // sameStripeSkippable keys returned by nextInStripe() are already - // known to not be covered by a RANGEDEL, so it is an invariant - // violation. The rare case is newStripeSameKey, where it is a - // violation if not covered by a RANGEDEL. - if change == sameStripeSkippable || - i.rangeDelFrag.Covers(*i.iterKey, i.curSnapshotSeqNum) == keyspan.NoCover { - i.singleDeleteInvariantViolationCallback(i.key.UserKey) - } - } - case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete, - InternalKeyKindRangeDelete: - default: - panic(errors.AssertionFailedf( - "unexpected internal key kind: %d", errors.Safe(i.iterKey.Kind()))) - } - case sameStripeNonSkippable: - // No ability to check whether there is another Set/Merge below with - // the same user key. - // - // TODO(sumeer): once range deletions are interleaved at the maximal - // sequence number, this case will go away. - case newStripeNewKey: - default: - panic("unreachable") - } - i.valid = false - return false - - case InternalKeyKindSingleDelete: - // Two single deletes met in a compaction. 
The first single delete is - // ineffectual. - if i.ineffectualSingleDeleteCallback != nil { - i.ineffectualSingleDeleteCallback(i.key.UserKey) - } - // Continue to apply the second single delete. - continue - - default: - i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind())) - i.valid = false - return false - } - } -} - -// skipDueToSingleDeleteElision is called when the SingleDelete is being -// elided because it is in the final snapshot stripe and there are no keys -// with the same user key in lower levels in the LSM (below the files in this -// compaction). -// -// TODO(sumeer): the only difference between singleDeleteNext and -// skipDueToSingleDeleteElision is the fact that the caller knows it will be -// eliding the single delete in the latter case. There are some similar things -// happening in both implementations. My first attempt at combining them into -// a single method was hard to comprehend. Try again. -func (i *compactionIter) skipDueToSingleDeleteElision() { - for { - stripeChange := i.nextInStripe() - if i.err != nil { - panic(i.err) - } - switch stripeChange { - case newStripeNewKey: - // The single delete is only now being elided, meaning it did not elide - // any keys earlier in its descent down the LSM. We stepped onto a new - // user key, meaning that even now at its moment of elision, it still - // hasn't elided any other keys. The single delete was ineffectual (a - // no-op). - if i.ineffectualSingleDeleteCallback != nil { - i.ineffectualSingleDeleteCallback(i.key.UserKey) - } - i.skip = false - return - case newStripeSameKey: - // This should be impossible. If we're eliding a single delete, we - // determined that the tombstone is in the final snapshot stripe, but we - // stepped into a new stripe of the same key. - panic(errors.AssertionFailedf("eliding single delete followed by same key in new stripe")) - case sameStripeNonSkippable: - // There's a key that we cannot skip. 
There are two possible cases: - // a. The key is invalid. This is an error. - // b. The key is a range deletion. - // The second case may also be an ineffectual single delete. However, it - // is possible that there is a SET that is at the same seqnum as the - // RANGEDEL, and so is not deleted by that RANGEDEL, and will be deleted - // by this single delete. So we cannot be certain that this is an - // ineffectual single delete. - // - // TODO(sumeer): the existing todo to interleave range deletions at the - // maximal sequence number will allow us to address this ambiguity. - // - // TODO(sumeer): by setting skip to true, the compactionIter is making a - // single delete stronger (like a del), which will hide bugs in the use of - // single delete. - i.skip = true - return - case sameStripeSkippable: - kind := i.iterKey.Kind() - switch kind { - case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete: - if i.ineffectualSingleDeleteCallback != nil { - i.ineffectualSingleDeleteCallback(i.key.UserKey) - } - switch kind { - case InternalKeyKindDelete, InternalKeyKindDeleteSized: - i.skipInStripe() - return - case InternalKeyKindSingleDelete: - // Repeat the same with this SingleDelete. We don't want to simply - // call skipInStripe(), since it increases the strength of the - // SingleDel, which hides bugs in the use of single delete. - continue - default: - panic(errors.AssertionFailedf( - "unexpected internal key kind: %d", errors.Safe(i.iterKey.Kind()))) - } - case InternalKeyKindSetWithDelete: - // The SingleDelete should behave like a Delete. - i.skipInStripe() - return - case InternalKeyKindSet, InternalKeyKindMerge: - // This SingleDelete deletes the Set/Merge, and we are eliding the - // SingleDel as well. Step to the next key (this is not deleted by the - // SingleDelete). 
- // - // NB: skipDueToSingleDeleteElision was called with i.pos == - // iterPosCurForward, and after the call to nextInStripe, we are still - // at iterPosCurForward, since we are at the key after the Set/Merge - // that was single deleted. - change := i.nextInStripe() - if i.err != nil { - panic(i.err) - } - switch change { - case newStripeSameKey: - panic(errors.AssertionFailedf("eliding single delete followed by same key in new stripe")) - case newStripeNewKey: - case sameStripeSkippable: - // On the same key. - nextKind := i.iterKey.Kind() - switch nextKind { - case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge: - if i.singleDeleteInvariantViolationCallback != nil { - i.singleDeleteInvariantViolationCallback(i.key.UserKey) - } - case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete, - InternalKeyKindRangeDelete: - default: - panic(errors.AssertionFailedf( - "unexpected internal key kind: %d", errors.Safe(i.iterKey.Kind()))) - } - case sameStripeNonSkippable: - // No ability to check whether there is another Set/Merge below with - // the same user key. - // - // TODO(sumeer): once range deletions are interleaved at the maximal - // sequence number, this case will go away. - default: - panic("unreachable") - } - // Whether in same stripe or new stripe, this key is not consumed by - // the SingleDelete. - i.skip = false - return - default: - panic(errors.AssertionFailedf( - "unexpected internal key kind: %d", errors.Safe(i.iterKey.Kind()))) - } - default: - panic("unreachable") - } - } -} - -// deleteSizedNext processes a DELSIZED point tombstone. Unlike ordinary DELs, -// these tombstones carry a value that's a varint indicating the size of the -// entry (len(key)+len(value)) that the tombstone is expected to delete. -// -// When a deleteSizedNext is encountered, we skip ahead to see which keys, if -// any, are elided as a result of the tombstone. 
-func (i *compactionIter) deleteSizedNext() (*base.InternalKey, []byte) { - i.saveKey() - i.valid = true - i.skip = true - - // The DELSIZED tombstone may have no value at all. This happens when the - // tombstone has already deleted the key that the user originally predicted. - // In this case, we still peek forward in case there's another DELSIZED key - // with a lower sequence number, in which case we'll adopt its value. - if len(i.iterValue) == 0 { - i.value = i.valueBuf[:0] - } else { - i.valueBuf = append(i.valueBuf[:0], i.iterValue...) - i.value = i.valueBuf - } - - // Loop through all the keys within this stripe that are skippable. - i.pos = iterPosNext - for i.nextInStripe() == sameStripeSkippable { - if i.err != nil { - panic(i.err) - } - switch i.iterKey.Kind() { - case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete: - // We encountered a tombstone (DEL, or DELSIZED) that's deleted by - // the original DELSIZED tombstone. This can happen in two cases: - // - // (1) These tombstones were intended to delete two distinct values, - // and this DELSIZED has already dropped the relevant key. For - // example: - // - // a.DELSIZED.9 a.SET.7 a.DELSIZED.5 a.SET.4 - // - // If a.DELSIZED.9 has already deleted a.SET.7, its size has - // already been zeroed out. In this case, we want to adopt the - // value of the DELSIZED with the lower sequence number, in - // case the a.SET.4 key has not yet been elided. - // - // (2) This DELSIZED was missized. The user thought they were - // deleting a key with this user key, but this user key had - // already been deleted. - // - // We can differentiate these two cases by examining the length of - // the DELSIZED's value. A DELSIZED's value holds the size of both - // the user key and value that it intends to delete. For any user - // key with a length > 0, a DELSIZED that has not deleted a key must - // have a value with a length > 0. 
- // - // We treat both cases the same functionally, adopting the identity - // of the lower-sequence numbered tombstone. However in the second - // case, we also increment the stat counting missized tombstones. - if len(i.value) > 0 { - // The original DELSIZED key was missized. The key that the user - // thought they were deleting does not exist. - i.stats.countMissizedDels++ - } - i.valueBuf = append(i.valueBuf[:0], i.iterValue...) - i.value = i.valueBuf - if i.iterKey.Kind() != InternalKeyKindDeleteSized { - // Convert the DELSIZED to a DEL—The DEL/SINGLEDEL we're eliding - // may not have deleted the key(s) it was intended to yet. The - // ordinary DEL compaction heuristics are better suited at that, - // plus we don't want to count it as a missized DEL. We early - // exit in this case, after skipping the remainder of the - // snapshot stripe. - i.key.SetKind(InternalKeyKindDelete) - // NB: We skipInStripe now, rather than returning leaving - // i.skip=true and returning early, because Next() requires - // that i.skip=true only if i.iterPos = iterPosCurForward. - // - // Ignore any error caused by skipInStripe since it does not affect - // the key/value being returned here, and the next call to Next() will - // expose it. - i.skipInStripe() - return &i.key, i.value - } - // Continue, in case we uncover another DELSIZED or a key this - // DELSIZED deletes. - - case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindSetWithDelete: - // If the DELSIZED is value-less, it already deleted the key that it - // was intended to delete. This is possible with a sequence like: - // - // DELSIZED.8 SET.7 SET.3 - // - // The DELSIZED only describes the size of the SET.7, which in this - // case has already been elided. We don't count it as a missizing, - // instead converting the DELSIZED to a DEL. Skip the remainder of - // the snapshot stripe and return. 
- if len(i.value) == 0 { - i.key.SetKind(InternalKeyKindDelete) - // NB: We skipInStripe now, rather than returning leaving - // i.skip=true and returning early, because Next() requires - // that i.skip=true only if i.iterPos = iterPosCurForward. - // - // Ignore any error caused by skipInStripe since it does not affect - // the key/value being returned here, and the next call to Next() will - // expose it. - i.skipInStripe() - return &i.key, i.value - } - // The deleted key is not a DEL, DELSIZED, and the DELSIZED in i.key - // has a positive size. - expectedSize, n := binary.Uvarint(i.value) - if n != len(i.value) { - i.err = base.CorruptionErrorf("DELSIZED holds invalid value: %x", errors.Safe(i.value)) - i.valid = false - return nil, nil - } - elidedSize := uint64(len(i.iterKey.UserKey)) + uint64(len(i.iterValue)) - if elidedSize != expectedSize { - // The original DELSIZED key was missized. It's unclear what to - // do. The user-provided size was wrong, so it's unlikely to be - // accurate or meaningful. We could: - // - // 1. return the DELSIZED with the original user-provided size unmodified - // 2. return the DELZIZED with a zeroed size to reflect that a key was - // elided, even if it wasn't the anticipated size. - // 3. subtract the elided size from the estimate and re-encode. - // 4. convert the DELSIZED into a value-less DEL, so that - // ordinary DEL heuristics apply. - // - // We opt for (4) under the rationale that we can't rely on the - // user-provided size for accuracy, so ordinary DEL heuristics - // are safer. - i.stats.countMissizedDels++ - i.key.SetKind(InternalKeyKindDelete) - i.value = i.valueBuf[:0] - // NB: We skipInStripe now, rather than returning leaving - // i.skip=true and returning early, because Next() requires - // that i.skip=true only if i.iterPos = iterPosCurForward. - // - // Ignore any error caused by skipInStripe since it does not affect - // the key/value being returned here, and the next call to Next() will - // expose it. 
- i.skipInStripe() - return &i.key, i.value - } - // NB: We remove the value regardless of whether the key was sized - // appropriately. The size encoded is 'consumed' the first time it - // meets a key that it deletes. - i.value = i.valueBuf[:0] - - default: - i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind())) - i.valid = false - return nil, nil - } - } - // Reset skip if we landed outside the original stripe. Otherwise, we landed - // in the same stripe on a non-skippable key. In that case we should preserve - // `i.skip == true` such that later keys in the stripe will continue to be - // skipped. - if i.iterStripeChange == newStripeNewKey || i.iterStripeChange == newStripeSameKey { - i.skip = false - } - if i.err != nil { - return nil, nil - } - return &i.key, i.value -} - -func (i *compactionIter) saveKey() { - i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...) - i.key.UserKey = i.keyBuf - i.key.Trailer = i.iterKey.Trailer - i.keyTrailer = i.iterKey.Trailer - i.frontiers.Advance(i.key.UserKey) -} - -func (i *compactionIter) cloneKey(key []byte) []byte { - i.alloc, key = i.alloc.Copy(key) - return key -} - -func (i *compactionIter) Key() InternalKey { - return i.key -} - -func (i *compactionIter) Value() []byte { - return i.value -} - -func (i *compactionIter) Valid() bool { - return i.valid -} - -func (i *compactionIter) Error() error { - return i.err -} - -func (i *compactionIter) Close() error { - err := i.iter.Close() - if i.err == nil { - i.err = err - } - - // Close the closer for the current value if one was open. - if i.valueCloser != nil { - i.err = firstError(i.err, i.valueCloser.Close()) - i.valueCloser = nil - } - - return i.err -} - -// Tombstones returns a list of pending range tombstones in the fragmenter -// up to the specified key, or all pending range tombstones if key = nil. 
-func (i *compactionIter) Tombstones(key []byte) []keyspan.Span { - if key == nil { - i.rangeDelFrag.Finish() - } else { - // The specified end key is exclusive; no versions of the specified - // user key (including range tombstones covering that key) should - // be flushed yet. - i.rangeDelFrag.TruncateAndFlushTo(key) - } - tombstones := i.tombstones - i.tombstones = nil - return tombstones -} - -// RangeKeys returns a list of pending fragmented range keys up to the specified -// key, or all pending range keys if key = nil. -func (i *compactionIter) RangeKeys(key []byte) []keyspan.Span { - if key == nil { - i.rangeKeyFrag.Finish() - } else { - // The specified end key is exclusive; no versions of the specified - // user key (including range tombstones covering that key) should - // be flushed yet. - i.rangeKeyFrag.TruncateAndFlushTo(key) - } - rangeKeys := i.rangeKeys - i.rangeKeys = nil - return rangeKeys -} - -func (i *compactionIter) emitRangeDelChunk(fragmented keyspan.Span) { - // Apply the snapshot stripe rules, keeping only the latest tombstone for - // each snapshot stripe. - currentIdx := -1 - keys := fragmented.Keys[:0] - for _, k := range fragmented.Keys { - idx, _ := snapshotIndex(k.SeqNum(), i.snapshots) - if currentIdx == idx { - continue - } - if idx == 0 && i.elideRangeTombstone(fragmented.Start, fragmented.End) { - // This is the last snapshot stripe and the range tombstone - // can be elided. - break - } - - keys = append(keys, k) - if idx == 0 { - // This is the last snapshot stripe. - break - } - currentIdx = idx - } - if len(keys) > 0 { - i.tombstones = append(i.tombstones, keyspan.Span{ - Start: fragmented.Start, - End: fragmented.End, - Keys: keys, - }) - } -} - -func (i *compactionIter) emitRangeKeyChunk(fragmented keyspan.Span) { - // Elision of snapshot stripes happens in rangeKeyCompactionTransform, so no need to - // do that here. 
- if len(fragmented.Keys) > 0 { - i.rangeKeys = append(i.rangeKeys, fragmented) - } -} - -// maybeZeroSeqnum attempts to set the seqnum for the current key to 0. Doing -// so improves compression and enables an optimization during forward iteration -// to skip some key comparisons. The seqnum for an entry can be zeroed if the -// entry is on the bottom snapshot stripe and on the bottom level of the LSM. -func (i *compactionIter) maybeZeroSeqnum(snapshotIdx int) { - if !i.allowZeroSeqNum { - // TODO(peter): allowZeroSeqNum applies to the entire compaction. We could - // make the determination on a key by key basis, similar to what is done - // for elideTombstone. Need to add a benchmark for compactionIter to verify - // that isn't too expensive. - return - } - if snapshotIdx > 0 { - // This is not the last snapshot - return - } - i.key.SetSeqNum(base.SeqNumZero) -} - -// A frontier is used to monitor a compaction's progression across the user -// keyspace. -// -// A frontier hold a user key boundary that it's concerned with in its `key` -// field. If/when the compaction iterator returns an InternalKey with a user key -// _k_ such that k ≥ frontier.key, the compaction iterator invokes the -// frontier's `reached` function, passing _k_ as its argument. -// -// The `reached` function returns a new value to use as the key. If `reached` -// returns nil, the frontier is forgotten and its `reached` method will not be -// invoked again, unless the user calls [Update] to set a new key. -// -// A frontier's key may be updated outside the context of a `reached` -// invocation at any time, through its Update method. -type frontier struct { - // container points to the containing *frontiers that was passed to Init - // when the frontier was initialized. - container *frontiers - - // key holds the frontier's current key. If nil, this frontier is inactive - // and its reached func will not be invoked. 
The value of this key may only - // be updated by the `frontiers` type, or the Update method. - key []byte - - // reached is invoked to inform a frontier that its key has been reached. - // It's invoked with the user key that reached the limit. The `key` argument - // is guaranteed to be ≥ the frontier's key. - // - // After reached is invoked, the frontier's key is updated to the return - // value of `reached`. Note bene, the frontier is permitted to update its - // key to a user key ≤ the argument `key`. - // - // If a frontier is set to key k1, and reached(k2) is invoked (k2 ≥ k1), the - // frontier will receive reached(k2) calls until it returns nil or a key - // `k3` such that k2 < k3. This property is useful for frontiers that use - // `reached` invocations to drive iteration through collections of keys that - // may contain multiple keys that are both < k2 and ≥ k1. - reached func(key []byte) (next []byte) -} - -// Init initializes the frontier with the provided key and reached callback. -// The frontier is attached to the provided *frontiers and the provided reached -// func will be invoked when the *frontiers is advanced to a key ≥ this -// frontier's key. -func (f *frontier) Init( - frontiers *frontiers, initialKey []byte, reached func(key []byte) (next []byte), -) { - *f = frontier{ - container: frontiers, - key: initialKey, - reached: reached, - } - if initialKey != nil { - f.container.push(f) - } -} - -// String implements fmt.Stringer. -func (f *frontier) String() string { - return string(f.key) -} - -// Update replaces the existing frontier's key with the provided key. The -// frontier's reached func will be invoked when the new key is reached. -func (f *frontier) Update(key []byte) { - c := f.container - prevKeyIsNil := f.key == nil - f.key = key - if prevKeyIsNil { - if key != nil { - c.push(f) - } - return - } - - // Find the frontier within the heap (it must exist within the heap because - // f.key was != nil). 
If the frontier key is now nil, remove it from the - // heap. Otherwise, fix up its position. - for i := 0; i < len(c.items); i++ { - if c.items[i] == f { - if key != nil { - c.fix(i) - } else { - n := c.len() - 1 - c.swap(i, n) - c.down(i, n) - c.items = c.items[:n] - } - return - } - } - panic("unreachable") -} - -// frontiers is used to track progression of a task (eg, compaction) across the -// keyspace. Clients that want to be informed when the task advances to a key ≥ -// some frontier may register a frontier, providing a callback. The task calls -// `Advance(k)` with each user key encountered, which invokes the `reached` func -// on all tracked frontiers with `key`s ≤ k. -// -// Internally, frontiers is implemented as a simple heap. -type frontiers struct { - cmp Compare - items []*frontier -} - -// String implements fmt.Stringer. -func (f *frontiers) String() string { - var buf bytes.Buffer - for i := 0; i < len(f.items); i++ { - if i > 0 { - fmt.Fprint(&buf, ", ") - } - fmt.Fprintf(&buf, "%s: %q", f.items[i], f.items[i].key) - } - return buf.String() -} - -// Advance notifies all member frontiers with keys ≤ k. -func (f *frontiers) Advance(k []byte) { - for len(f.items) > 0 && f.cmp(k, f.items[0].key) >= 0 { - // This frontier has been reached. Invoke the closure and update with - // the next frontier. - f.items[0].key = f.items[0].reached(k) - if f.items[0].key == nil { - // This was the final frontier that this user was concerned with. - // Remove it from the heap. - f.pop() - } else { - // Fix up the heap root. - f.fix(0) - } - } -} - -func (f *frontiers) len() int { - return len(f.items) -} - -func (f *frontiers) less(i, j int) bool { - return f.cmp(f.items[i].key, f.items[j].key) < 0 -} - -func (f *frontiers) swap(i, j int) { - f.items[i], f.items[j] = f.items[j], f.items[i] -} - -// fix, up and down are copied from the go stdlib. 
- -func (f *frontiers) fix(i int) { - if !f.down(i, f.len()) { - f.up(i) - } -} - -func (f *frontiers) push(ff *frontier) { - n := len(f.items) - f.items = append(f.items, ff) - f.up(n) -} - -func (f *frontiers) pop() *frontier { - n := f.len() - 1 - f.swap(0, n) - f.down(0, n) - item := f.items[n] - f.items = f.items[:n] - return item -} - -func (f *frontiers) up(j int) { - for { - i := (j - 1) / 2 // parent - if i == j || !f.less(j, i) { - break - } - f.swap(i, j) - j = i - } -} - -func (f *frontiers) down(i0, n int) bool { - i := i0 - for { - j1 := 2*i + 1 - if j1 >= n || j1 < 0 { // j1 < 0 after int overflow - break - } - j := j1 // left child - if j2 := j1 + 1; j2 < n && f.less(j2, j1) { - j = j2 // = 2*i + 2 // right child - } - if !f.less(j, i) { - break - } - f.swap(i, j) - i = j - } - return i > i0 -} diff --git a/vendor/github.com/cockroachdb/pebble/compaction_picker.go b/vendor/github.com/cockroachdb/pebble/compaction_picker.go deleted file mode 100644 index f35d475..0000000 --- a/vendor/github.com/cockroachdb/pebble/compaction_picker.go +++ /dev/null @@ -1,2068 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package pebble - -import ( - "bytes" - "fmt" - "math" - "sort" - "strings" - - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/humanize" - "github.com/cockroachdb/pebble/internal/manifest" -) - -// The minimum count for an intra-L0 compaction. This matches the RocksDB -// heuristic. -const minIntraL0Count = 4 - -type compactionEnv struct { - // diskAvailBytes holds a statistic on the number of bytes available on - // disk, as reported by the filesystem. It's used to be more restrictive in - // expanding compactions if available disk space is limited. 
- // - // The cached value (d.diskAvailBytes) is updated whenever a file is deleted - // and whenever a compaction or flush completes. Since file removal is the - // primary means of reclaiming space, there is a rough bound on the - // statistic's staleness when available bytes is growing. Compactions and - // flushes are longer, slower operations and provide a much looser bound - // when available bytes is decreasing. - diskAvailBytes uint64 - earliestUnflushedSeqNum uint64 - earliestSnapshotSeqNum uint64 - inProgressCompactions []compactionInfo - readCompactionEnv readCompactionEnv -} - -type compactionPicker interface { - getScores([]compactionInfo) [numLevels]float64 - getBaseLevel() int - estimatedCompactionDebt(l0ExtraSize uint64) uint64 - pickAuto(env compactionEnv) (pc *pickedCompaction) - pickElisionOnlyCompaction(env compactionEnv) (pc *pickedCompaction) - pickRewriteCompaction(env compactionEnv) (pc *pickedCompaction) - pickReadTriggeredCompaction(env compactionEnv) (pc *pickedCompaction) - forceBaseLevel1() -} - -// readCompactionEnv is used to hold data required to perform read compactions -type readCompactionEnv struct { - rescheduleReadCompaction *bool - readCompactions *readCompactionQueue - flushing bool -} - -// Information about in-progress compactions provided to the compaction picker. -// These are used to constrain the new compactions that will be picked. -type compactionInfo struct { - // versionEditApplied is true if this compaction's version edit has already - // been committed. The compaction may still be in-progress deleting newly - // obsolete files. 
- versionEditApplied bool - inputs []compactionLevel - outputLevel int - smallest InternalKey - largest InternalKey -} - -func (info compactionInfo) String() string { - var buf bytes.Buffer - var largest int - for i, in := range info.inputs { - if i > 0 { - fmt.Fprintf(&buf, " -> ") - } - fmt.Fprintf(&buf, "L%d", in.level) - in.files.Each(func(m *fileMetadata) { - fmt.Fprintf(&buf, " %s", m.FileNum) - }) - if largest < in.level { - largest = in.level - } - } - if largest != info.outputLevel || len(info.inputs) == 1 { - fmt.Fprintf(&buf, " -> L%d", info.outputLevel) - } - return buf.String() -} - -type sortCompactionLevelsByPriority []candidateLevelInfo - -func (s sortCompactionLevelsByPriority) Len() int { - return len(s) -} - -// A level should be picked for compaction if the compensatedScoreRatio is >= the -// compactionScoreThreshold. -const compactionScoreThreshold = 1 - -// Less should return true if s[i] must be placed earlier than s[j] in the final -// sorted list. The candidateLevelInfo for the level placed earlier is more likely -// to be picked for a compaction. -func (s sortCompactionLevelsByPriority) Less(i, j int) bool { - iShouldCompact := s[i].compensatedScoreRatio >= compactionScoreThreshold - jShouldCompact := s[j].compensatedScoreRatio >= compactionScoreThreshold - // Ordering is defined as decreasing on (shouldCompact, uncompensatedScoreRatio) - // where shouldCompact is 1 for true and 0 for false. - if iShouldCompact && !jShouldCompact { - return true - } - if !iShouldCompact && jShouldCompact { - return false - } - - if s[i].uncompensatedScoreRatio != s[j].uncompensatedScoreRatio { - return s[i].uncompensatedScoreRatio > s[j].uncompensatedScoreRatio - } - return s[i].level < s[j].level -} - -func (s sortCompactionLevelsByPriority) Swap(i, j int) { - s[i], s[j] = s[j], s[i] -} - -// sublevelInfo is used to tag a LevelSlice for an L0 sublevel with the -// sublevel. 
-type sublevelInfo struct { - manifest.LevelSlice - sublevel manifest.Level -} - -func (cl sublevelInfo) Clone() sublevelInfo { - return sublevelInfo{ - sublevel: cl.sublevel, - LevelSlice: cl.LevelSlice.Reslice(func(start, end *manifest.LevelIterator) {}), - } -} -func (cl sublevelInfo) String() string { - return fmt.Sprintf(`Sublevel %s; Levels %s`, cl.sublevel, cl.LevelSlice) -} - -// generateSublevelInfo will generate the level slices for each of the sublevels -// from the level slice for all of L0. -func generateSublevelInfo(cmp base.Compare, levelFiles manifest.LevelSlice) []sublevelInfo { - sublevelMap := make(map[uint64][]*fileMetadata) - it := levelFiles.Iter() - for f := it.First(); f != nil; f = it.Next() { - sublevelMap[uint64(f.SubLevel)] = append(sublevelMap[uint64(f.SubLevel)], f) - } - - var sublevels []int - for level := range sublevelMap { - sublevels = append(sublevels, int(level)) - } - sort.Ints(sublevels) - - var levelSlices []sublevelInfo - for _, sublevel := range sublevels { - metas := sublevelMap[uint64(sublevel)] - levelSlices = append( - levelSlices, - sublevelInfo{ - manifest.NewLevelSliceKeySorted(cmp, metas), - manifest.L0Sublevel(sublevel), - }, - ) - } - return levelSlices -} - -// compactionPickerMetrics holds metrics related to the compaction picking process -type compactionPickerMetrics struct { - // scores contains the compensatedScoreRatio from the candidateLevelInfo. - scores []float64 - singleLevelOverlappingRatio float64 - multiLevelOverlappingRatio float64 -} - -// pickedCompaction contains information about a compaction that has already -// been chosen, and is being constructed. Compaction construction info lives in -// this struct, and is copied over into the compaction struct when that's -// created. -type pickedCompaction struct { - cmp Compare - // score of the chosen compaction. This is the same as the - // compensatedScoreRatio in the candidateLevelInfo. - score float64 - // kind indicates the kind of compaction. 
- kind compactionKind - // startLevel is the level that is being compacted. Inputs from startLevel - // and outputLevel will be merged to produce a set of outputLevel files. - startLevel *compactionLevel - // outputLevel is the level that files are being produced in. outputLevel is - // equal to startLevel+1 except when: - // - if startLevel is 0, the output level equals compactionPicker.baseLevel(). - // - in multilevel compaction, the output level is the lowest level involved in - // the compaction - outputLevel *compactionLevel - // extraLevels contain additional levels in between the input and output - // levels that get compacted in multi level compactions - extraLevels []*compactionLevel - inputs []compactionLevel - // LBase at the time of compaction picking. - baseLevel int - // L0-specific compaction info. Set to a non-nil value for all compactions - // where startLevel == 0 that were generated by L0Sublevels. - lcf *manifest.L0CompactionFiles - // maxOutputFileSize is the maximum size of an individual table created - // during compaction. - maxOutputFileSize uint64 - // maxOverlapBytes is the maximum number of bytes of overlap allowed for a - // single output table with the tables in the grandparent level. - maxOverlapBytes uint64 - // maxReadCompactionBytes is the maximum bytes a read compaction is allowed to - // overlap in its output level with. If the overlap is greater than - // maxReadCompaction bytes, then we don't proceed with the compaction. - maxReadCompactionBytes uint64 - // The boundaries of the input data. 
- smallest InternalKey - largest InternalKey - version *version - pickerMetrics compactionPickerMetrics -} - -func defaultOutputLevel(startLevel, baseLevel int) int { - outputLevel := startLevel + 1 - if startLevel == 0 { - outputLevel = baseLevel - } - if outputLevel >= numLevels-1 { - outputLevel = numLevels - 1 - } - return outputLevel -} - -func newPickedCompaction( - opts *Options, cur *version, startLevel, outputLevel, baseLevel int, -) *pickedCompaction { - if startLevel > 0 && startLevel < baseLevel { - panic(fmt.Sprintf("invalid compaction: start level %d should not be empty (base level %d)", - startLevel, baseLevel)) - } - - adjustedLevel := adjustedOutputLevel(outputLevel, baseLevel) - pc := &pickedCompaction{ - cmp: opts.Comparer.Compare, - version: cur, - baseLevel: baseLevel, - inputs: []compactionLevel{{level: startLevel}, {level: outputLevel}}, - maxOutputFileSize: uint64(opts.Level(adjustedLevel).TargetFileSize), - maxOverlapBytes: maxGrandparentOverlapBytes(opts, adjustedLevel), - maxReadCompactionBytes: maxReadCompactionBytes(opts, adjustedLevel), - } - pc.startLevel = &pc.inputs[0] - pc.outputLevel = &pc.inputs[1] - return pc -} - -// adjustedOutputLevel is the output level used for the purpose of -// determining the target output file size, overlap bytes, and expanded -// bytes, taking into account the base level. -func adjustedOutputLevel(outputLevel int, baseLevel int) int { - adjustedOutputLevel := outputLevel - if adjustedOutputLevel > 0 { - // Output level is in the range [baseLevel, numLevels]. For the purpose of - // determining the target output file size, overlap bytes, and expanded - // bytes, we want to adjust the range to [1,numLevels]. 
- adjustedOutputLevel = 1 + outputLevel - baseLevel - } - return adjustedOutputLevel -} - -func newPickedCompactionFromL0( - lcf *manifest.L0CompactionFiles, opts *Options, vers *version, baseLevel int, isBase bool, -) *pickedCompaction { - outputLevel := baseLevel - if !isBase { - outputLevel = 0 // Intra L0 - } - - pc := newPickedCompaction(opts, vers, 0, outputLevel, baseLevel) - pc.lcf = lcf - pc.outputLevel.level = outputLevel - - // Manually build the compaction as opposed to calling - // pickAutoHelper. This is because L0Sublevels has already added - // any overlapping L0 SSTables that need to be added, and - // because compactions built by L0SSTables do not necessarily - // pick contiguous sequences of files in pc.version.Levels[0]. - files := make([]*manifest.FileMetadata, 0, len(lcf.Files)) - iter := vers.Levels[0].Iter() - for f := iter.First(); f != nil; f = iter.Next() { - if lcf.FilesIncluded[f.L0Index] { - files = append(files, f) - } - } - pc.startLevel.files = manifest.NewLevelSliceSeqSorted(files) - return pc -} - -func (pc *pickedCompaction) String() string { - var builder strings.Builder - builder.WriteString(fmt.Sprintf(`Score=%f, `, pc.score)) - builder.WriteString(fmt.Sprintf(`Kind=%s, `, pc.kind)) - builder.WriteString(fmt.Sprintf(`AdjustedOutputLevel=%d, `, adjustedOutputLevel(pc.outputLevel.level, pc.baseLevel))) - builder.WriteString(fmt.Sprintf(`maxOutputFileSize=%d, `, pc.maxOutputFileSize)) - builder.WriteString(fmt.Sprintf(`maxReadCompactionBytes=%d, `, pc.maxReadCompactionBytes)) - builder.WriteString(fmt.Sprintf(`smallest=%s, `, pc.smallest)) - builder.WriteString(fmt.Sprintf(`largest=%s, `, pc.largest)) - builder.WriteString(fmt.Sprintf(`version=%s, `, pc.version)) - builder.WriteString(fmt.Sprintf(`inputs=%s, `, pc.inputs)) - builder.WriteString(fmt.Sprintf(`startlevel=%s, `, pc.startLevel)) - builder.WriteString(fmt.Sprintf(`outputLevel=%s, `, pc.outputLevel)) - builder.WriteString(fmt.Sprintf(`extraLevels=%s, `, pc.extraLevels)) 
- builder.WriteString(fmt.Sprintf(`l0SublevelInfo=%s, `, pc.startLevel.l0SublevelInfo)) - builder.WriteString(fmt.Sprintf(`lcf=%s`, pc.lcf)) - return builder.String() -} - -// Clone creates a deep copy of the pickedCompaction -func (pc *pickedCompaction) clone() *pickedCompaction { - - // Quickly copy over fields that do not require special deep copy care, and - // set all fields that will require a deep copy to nil. - newPC := &pickedCompaction{ - cmp: pc.cmp, - score: pc.score, - kind: pc.kind, - baseLevel: pc.baseLevel, - maxOutputFileSize: pc.maxOutputFileSize, - maxOverlapBytes: pc.maxOverlapBytes, - maxReadCompactionBytes: pc.maxReadCompactionBytes, - smallest: pc.smallest.Clone(), - largest: pc.largest.Clone(), - - // TODO(msbutler): properly clone picker metrics - pickerMetrics: pc.pickerMetrics, - - // Both copies see the same manifest, therefore, it's ok for them to se - // share the same pc. version. - version: pc.version, - } - - newPC.inputs = make([]compactionLevel, len(pc.inputs)) - newPC.extraLevels = make([]*compactionLevel, 0, len(pc.extraLevels)) - for i := range pc.inputs { - newPC.inputs[i] = pc.inputs[i].Clone() - if i == 0 { - newPC.startLevel = &newPC.inputs[i] - } else if i == len(pc.inputs)-1 { - newPC.outputLevel = &newPC.inputs[i] - } else { - newPC.extraLevels = append(newPC.extraLevels, &newPC.inputs[i]) - } - } - - if len(pc.startLevel.l0SublevelInfo) > 0 { - newPC.startLevel.l0SublevelInfo = make([]sublevelInfo, len(pc.startLevel.l0SublevelInfo)) - for i := range pc.startLevel.l0SublevelInfo { - newPC.startLevel.l0SublevelInfo[i] = pc.startLevel.l0SublevelInfo[i].Clone() - } - } - if pc.lcf != nil { - newPC.lcf = pc.lcf.Clone() - } - return newPC -} - -// maybeExpandedBounds is a helper function for setupInputs which ensures the -// pickedCompaction's smallest and largest internal keys are updated iff -// the candidate keys expand the key span. 
This avoids a bug for multi-level -// compactions: during the second call to setupInputs, the picked compaction's -// smallest and largest keys should not decrease the key span. -func (pc *pickedCompaction) maybeExpandBounds(smallest InternalKey, largest InternalKey) { - emptyKey := InternalKey{} - if base.InternalCompare(pc.cmp, smallest, emptyKey) == 0 { - if base.InternalCompare(pc.cmp, largest, emptyKey) != 0 { - panic("either both candidate keys are empty or neither are empty") - } - return - } - if base.InternalCompare(pc.cmp, pc.smallest, emptyKey) == 0 { - if base.InternalCompare(pc.cmp, pc.largest, emptyKey) != 0 { - panic("either both pc keys are empty or neither are empty") - } - pc.smallest = smallest - pc.largest = largest - return - } - if base.InternalCompare(pc.cmp, pc.smallest, smallest) >= 0 { - pc.smallest = smallest - } - if base.InternalCompare(pc.cmp, pc.largest, largest) <= 0 { - pc.largest = largest - } -} - -// setupInputs returns true if a compaction has been set up. It returns false if -// a concurrent compaction is occurring on the start or output level files. -func (pc *pickedCompaction) setupInputs( - opts *Options, diskAvailBytes uint64, startLevel *compactionLevel, -) bool { - // maxExpandedBytes is the maximum size of an expanded compaction. If - // growing a compaction results in a larger size, the original compaction - // is used instead. - maxExpandedBytes := expandedCompactionByteSizeLimit( - opts, adjustedOutputLevel(pc.outputLevel.level, pc.baseLevel), diskAvailBytes, - ) - - // Expand the initial inputs to a clean cut. - var isCompacting bool - startLevel.files, isCompacting = expandToAtomicUnit(pc.cmp, startLevel.files, false /* disableIsCompacting */) - if isCompacting { - return false - } - pc.maybeExpandBounds(manifest.KeyRange(pc.cmp, startLevel.files.Iter())) - - // Determine the sstables in the output level which overlap with the input - // sstables, and then expand those tables to a clean cut. 
No need to do - // this for intra-L0 compactions; outputLevel.files is left empty for those. - if startLevel.level != pc.outputLevel.level { - pc.outputLevel.files = pc.version.Overlaps(pc.outputLevel.level, pc.cmp, pc.smallest.UserKey, - pc.largest.UserKey, pc.largest.IsExclusiveSentinel()) - pc.outputLevel.files, isCompacting = expandToAtomicUnit(pc.cmp, pc.outputLevel.files, - false /* disableIsCompacting */) - if isCompacting { - return false - } - pc.maybeExpandBounds(manifest.KeyRange(pc.cmp, - startLevel.files.Iter(), pc.outputLevel.files.Iter())) - } - - // Grow the sstables in startLevel.level as long as it doesn't affect the number - // of sstables included from pc.outputLevel.level. - if pc.lcf != nil && startLevel.level == 0 && pc.outputLevel.level != 0 { - // Call the L0-specific compaction extension method. Similar logic as - // pc.grow. Additional L0 files are optionally added to the compaction at - // this step. Note that the bounds passed in are not the bounds of the - // compaction, but rather the smallest and largest internal keys that - // the compaction cannot include from L0 without pulling in more Lbase - // files. Consider this example: - // - // L0: c-d e+f g-h - // Lbase: a-b e+f i-j - // a b c d e f g h i j - // - // The e-f files have already been chosen in the compaction. As pulling - // in more LBase files is undesirable, the logic below will pass in - // smallest = b and largest = i to ExtendL0ForBaseCompactionTo, which - // will expand the compaction to include c-d and g-h from L0. The - // bounds passed in are exclusive; the compaction cannot be expanded - // to include files that "touch" it. 
- smallestBaseKey := base.InvalidInternalKey - largestBaseKey := base.InvalidInternalKey - if pc.outputLevel.files.Empty() { - baseIter := pc.version.Levels[pc.outputLevel.level].Iter() - if sm := baseIter.SeekLT(pc.cmp, pc.smallest.UserKey); sm != nil { - smallestBaseKey = sm.Largest - } - if la := baseIter.SeekGE(pc.cmp, pc.largest.UserKey); la != nil { - largestBaseKey = la.Smallest - } - } else { - // NB: We use Reslice to access the underlying level's files, but - // we discard the returned slice. The pc.outputLevel.files slice - // is not modified. - _ = pc.outputLevel.files.Reslice(func(start, end *manifest.LevelIterator) { - if sm := start.Prev(); sm != nil { - smallestBaseKey = sm.Largest - } - if la := end.Next(); la != nil { - largestBaseKey = la.Smallest - } - }) - } - oldLcf := pc.lcf.Clone() - if pc.version.L0Sublevels.ExtendL0ForBaseCompactionTo(smallestBaseKey, largestBaseKey, pc.lcf) { - var newStartLevelFiles []*fileMetadata - iter := pc.version.Levels[0].Iter() - var sizeSum uint64 - for j, f := 0, iter.First(); f != nil; j, f = j+1, iter.Next() { - if pc.lcf.FilesIncluded[f.L0Index] { - newStartLevelFiles = append(newStartLevelFiles, f) - sizeSum += f.Size - } - } - if sizeSum+pc.outputLevel.files.SizeSum() < maxExpandedBytes { - startLevel.files = manifest.NewLevelSliceSeqSorted(newStartLevelFiles) - pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, - startLevel.files.Iter(), pc.outputLevel.files.Iter()) - } else { - *pc.lcf = *oldLcf - } - } - } else if pc.grow(pc.smallest, pc.largest, maxExpandedBytes, startLevel) { - pc.maybeExpandBounds(manifest.KeyRange(pc.cmp, - startLevel.files.Iter(), pc.outputLevel.files.Iter())) - } - - if pc.startLevel.level == 0 { - // We don't change the input files for the compaction beyond this point. 
- pc.startLevel.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files) - } - - return true -} - -// grow grows the number of inputs at c.level without changing the number of -// c.level+1 files in the compaction, and returns whether the inputs grew. sm -// and la are the smallest and largest InternalKeys in all of the inputs. -func (pc *pickedCompaction) grow( - sm, la InternalKey, maxExpandedBytes uint64, startLevel *compactionLevel, -) bool { - if pc.outputLevel.files.Empty() { - return false - } - grow0 := pc.version.Overlaps(startLevel.level, pc.cmp, sm.UserKey, - la.UserKey, la.IsExclusiveSentinel()) - grow0, isCompacting := expandToAtomicUnit(pc.cmp, grow0, false /* disableIsCompacting */) - if isCompacting { - return false - } - if grow0.Len() <= startLevel.files.Len() { - return false - } - if grow0.SizeSum()+pc.outputLevel.files.SizeSum() >= maxExpandedBytes { - return false - } - // We need to include the outputLevel iter because without it, in a multiLevel scenario, - // sm1 and la1 could shift the output level keyspace when pc.outputLevel.files is set to grow1. - sm1, la1 := manifest.KeyRange(pc.cmp, grow0.Iter(), pc.outputLevel.files.Iter()) - grow1 := pc.version.Overlaps(pc.outputLevel.level, pc.cmp, sm1.UserKey, - la1.UserKey, la1.IsExclusiveSentinel()) - grow1, isCompacting = expandToAtomicUnit(pc.cmp, grow1, false /* disableIsCompacting */) - if isCompacting { - return false - } - if grow1.Len() != pc.outputLevel.files.Len() { - return false - } - startLevel.files = grow0 - pc.outputLevel.files = grow1 - return true -} - -func (pc *pickedCompaction) compactionSize() uint64 { - var bytesToCompact uint64 - for i := range pc.inputs { - bytesToCompact += pc.inputs[i].files.SizeSum() - } - return bytesToCompact -} - -// setupMultiLevelCandidated returns true if it successfully added another level -// to the compaction. 
-func (pc *pickedCompaction) setupMultiLevelCandidate(opts *Options, diskAvailBytes uint64) bool { - pc.inputs = append(pc.inputs, compactionLevel{level: pc.outputLevel.level + 1}) - - // Recalibrate startLevel and outputLevel: - // - startLevel and outputLevel pointers may be obsolete after appending to pc.inputs. - // - push outputLevel to extraLevels and move the new level to outputLevel - pc.startLevel = &pc.inputs[0] - pc.extraLevels = []*compactionLevel{&pc.inputs[1]} - pc.outputLevel = &pc.inputs[2] - return pc.setupInputs(opts, diskAvailBytes, pc.extraLevels[len(pc.extraLevels)-1]) -} - -// expandToAtomicUnit expands the provided level slice within its level both -// forwards and backwards to its "atomic compaction unit" boundaries, if -// necessary. -// -// While picking compaction inputs, this is required to maintain the invariant -// that the versions of keys at level+1 are older than the versions of keys at -// level. Tables are added to the right of the current slice tables such that -// the rightmost table has a "clean cut". A clean cut is either a change in -// user keys, or when the largest key in the left sstable is a range tombstone -// sentinel key (InternalKeyRangeDeleteSentinel). -// -// In addition to maintaining the seqnum invariant, expandToAtomicUnit is used -// to provide clean boundaries for range tombstone truncation during -// compaction. In order to achieve these clean boundaries, expandToAtomicUnit -// needs to find a "clean cut" on the left edge of the compaction as well. -// This is necessary in order for "atomic compaction units" to always be -// compacted as a unit. Failure to do this leads to a subtle bug with -// truncation of range tombstones to atomic compaction unit boundaries. -// Consider the scenario: -// -// L3: -// 12:[a#2,15-b#1,1] -// 13:[b#0,15-d#72057594037927935,15] -// -// These sstables contain a range tombstone [a-d)#2 which spans the two -// sstables. The two sstables need to always be kept together. 
Compacting -// sstable 13 independently of sstable 12 would result in: -// -// L3: -// 12:[a#2,15-b#1,1] -// L4: -// 14:[b#0,15-d#72057594037927935,15] -// -// This state is still ok, but when sstable 12 is next compacted, its range -// tombstones will be truncated at "b" (the largest key in its atomic -// compaction unit). In the scenario here, that could result in b#1 becoming -// visible when it should be deleted. -// -// isCompacting is returned true for any atomic units that contain files that -// have in-progress compactions, i.e. FileMetadata.Compacting == true. If -// disableIsCompacting is true, isCompacting always returns false. This helps -// avoid spurious races from being detected when this method is used outside -// of compaction picking code. -// -// TODO(jackson): Compactions and flushes no longer split a user key between two -// sstables. We could perform a migration, re-compacting any sstables with split -// user keys, which would allow us to remove atomic compaction unit expansion -// code. -func expandToAtomicUnit( - cmp Compare, inputs manifest.LevelSlice, disableIsCompacting bool, -) (slice manifest.LevelSlice, isCompacting bool) { - // NB: Inputs for L0 can't be expanded and *version.Overlaps guarantees - // that we get a 'clean cut.' For L0, Overlaps will return a slice without - // access to the rest of the L0 files, so it's OK to try to reslice. - if inputs.Empty() { - // Nothing to expand. - return inputs, false - } - - // TODO(jackson): Update to avoid use of LevelIterator.Current(). The - // Reslice interface will require some tweaking, because we currently rely - // on Reslice having already positioned the LevelIterator appropriately. 
- - inputs = inputs.Reslice(func(start, end *manifest.LevelIterator) { - iter := start.Clone() - iter.Prev() - for cur, prev := start.Current(), iter.Current(); prev != nil; cur, prev = start.Prev(), iter.Prev() { - if cur.IsCompacting() { - isCompacting = true - } - if cmp(prev.Largest.UserKey, cur.Smallest.UserKey) < 0 { - break - } - if prev.Largest.IsExclusiveSentinel() { - // The table prev has a largest key indicating that the user key - // prev.largest.UserKey doesn't actually exist in the table. - break - } - // prev.Largest.UserKey == cur.Smallest.UserKey, so we need to - // include prev in the compaction. - } - - iter = end.Clone() - iter.Next() - for cur, next := end.Current(), iter.Current(); next != nil; cur, next = end.Next(), iter.Next() { - if cur.IsCompacting() { - isCompacting = true - } - if cmp(cur.Largest.UserKey, next.Smallest.UserKey) < 0 { - break - } - if cur.Largest.IsExclusiveSentinel() { - // The table cur has a largest key indicating that the user key - // cur.largest.UserKey doesn't actually exist in the table. - break - } - // cur.Largest.UserKey == next.Smallest.UserKey, so we need to - // include next in the compaction. - } - }) - inputIter := inputs.Iter() - isCompacting = !disableIsCompacting && - (isCompacting || inputIter.First().IsCompacting() || inputIter.Last().IsCompacting()) - return inputs, isCompacting -} - -func newCompactionPicker( - v *version, opts *Options, inProgressCompactions []compactionInfo, -) compactionPicker { - p := &compactionPickerByScore{ - opts: opts, - vers: v, - } - p.initLevelMaxBytes(inProgressCompactions) - return p -} - -// Information about a candidate compaction level that has been identified by -// the compaction picker. -type candidateLevelInfo struct { - // The compensatedScore of the level after adjusting according to the other - // levels' sizes. For L0, the compensatedScoreRatio is equivalent to the - // uncompensatedScoreRatio as we don't account for level size compensation in - // L0. 
- compensatedScoreRatio float64 - // The score of the level after accounting for level size compensation before - // adjusting according to other levels' sizes. For L0, the compensatedScore - // is equivalent to the uncompensatedScore as we don't account for level - // size compensation in L0. - compensatedScore float64 - // The score of the level to be compacted, calculated using uncompensated file - // sizes and without any adjustments. - uncompensatedScore float64 - // uncompensatedScoreRatio is the uncompensatedScore adjusted according to - // the other levels' sizes. - uncompensatedScoreRatio float64 - level int - // The level to compact to. - outputLevel int - // The file in level that will be compacted. Additional files may be - // picked by the compaction, and a pickedCompaction created for the - // compaction. - file manifest.LevelFile -} - -func (c *candidateLevelInfo) shouldCompact() bool { - return c.compensatedScoreRatio >= compactionScoreThreshold -} - -func fileCompensation(f *fileMetadata) uint64 { - return uint64(f.Stats.PointDeletionsBytesEstimate) + f.Stats.RangeDeletionsBytesEstimate -} - -// compensatedSize returns f's file size, inflated according to compaction -// priorities. -func compensatedSize(f *fileMetadata) uint64 { - // Add in the estimate of disk space that may be reclaimed by compacting the - // file's tombstones. - return f.Size + fileCompensation(f) -} - -// compensatedSizeAnnotator implements manifest.Annotator, annotating B-Tree -// nodes with the sum of the files' compensated sizes. Its annotation type is -// a *uint64. Compensated sizes may change once a table's stats are loaded -// asynchronously, so its values are marked as cacheable only if a file's -// stats have been loaded. 
-type compensatedSizeAnnotator struct { -} - -var _ manifest.Annotator = compensatedSizeAnnotator{} - -func (a compensatedSizeAnnotator) Zero(dst interface{}) interface{} { - if dst == nil { - return new(uint64) - } - v := dst.(*uint64) - *v = 0 - return v -} - -func (a compensatedSizeAnnotator) Accumulate( - f *fileMetadata, dst interface{}, -) (v interface{}, cacheOK bool) { - vptr := dst.(*uint64) - *vptr = *vptr + compensatedSize(f) - return vptr, f.StatsValid() -} - -func (a compensatedSizeAnnotator) Merge(src interface{}, dst interface{}) interface{} { - srcV := src.(*uint64) - dstV := dst.(*uint64) - *dstV = *dstV + *srcV - return dstV -} - -// totalCompensatedSize computes the compensated size over a file metadata -// iterator. Note that this function is linear in the files available to the -// iterator. Use the compensatedSizeAnnotator if querying the total -// compensated size of a level. -func totalCompensatedSize(iter manifest.LevelIterator) uint64 { - var sz uint64 - for f := iter.First(); f != nil; f = iter.Next() { - sz += compensatedSize(f) - } - return sz -} - -// compactionPickerByScore holds the state and logic for picking a compaction. A -// compaction picker is associated with a single version. A new compaction -// picker is created and initialized every time a new version is installed. -type compactionPickerByScore struct { - opts *Options - vers *version - // The level to target for L0 compactions. Levels L1 to baseLevel must be - // empty. - baseLevel int - // levelMaxBytes holds the dynamically adjusted max bytes setting for each - // level. 
- levelMaxBytes [numLevels]int64 -} - -var _ compactionPicker = &compactionPickerByScore{} - -func (p *compactionPickerByScore) getScores(inProgress []compactionInfo) [numLevels]float64 { - var scores [numLevels]float64 - for _, info := range p.calculateLevelScores(inProgress) { - scores[info.level] = info.compensatedScoreRatio - } - return scores -} - -func (p *compactionPickerByScore) getBaseLevel() int { - if p == nil { - return 1 - } - return p.baseLevel -} - -// estimatedCompactionDebt estimates the number of bytes which need to be -// compacted before the LSM tree becomes stable. -func (p *compactionPickerByScore) estimatedCompactionDebt(l0ExtraSize uint64) uint64 { - if p == nil { - return 0 - } - - // We assume that all the bytes in L0 need to be compacted to Lbase. This is - // unlike the RocksDB logic that figures out whether L0 needs compaction. - bytesAddedToNextLevel := l0ExtraSize + p.vers.Levels[0].Size() - lbaseSize := p.vers.Levels[p.baseLevel].Size() - - var compactionDebt uint64 - if bytesAddedToNextLevel > 0 && lbaseSize > 0 { - // We only incur compaction debt if both L0 and Lbase contain data. If L0 - // is empty, no compaction is necessary. If Lbase is empty, a move-based - // compaction from L0 would occur. - compactionDebt += bytesAddedToNextLevel + lbaseSize - } - - // loop invariant: At the beginning of the loop, bytesAddedToNextLevel is the - // bytes added to `level` in the loop. - for level := p.baseLevel; level < numLevels-1; level++ { - levelSize := p.vers.Levels[level].Size() + bytesAddedToNextLevel - nextLevelSize := p.vers.Levels[level+1].Size() - if levelSize > uint64(p.levelMaxBytes[level]) { - bytesAddedToNextLevel = levelSize - uint64(p.levelMaxBytes[level]) - if nextLevelSize > 0 { - // We only incur compaction debt if the next level contains data. If the - // next level is empty, a move-based compaction would be used. 
- levelRatio := float64(nextLevelSize) / float64(levelSize) - // The current level contributes bytesAddedToNextLevel to compactions. - // The next level contributes levelRatio * bytesAddedToNextLevel. - compactionDebt += uint64(float64(bytesAddedToNextLevel) * (levelRatio + 1)) - } - } else { - // We're not moving any bytes to the next level. - bytesAddedToNextLevel = 0 - } - } - return compactionDebt -} - -func (p *compactionPickerByScore) initLevelMaxBytes(inProgressCompactions []compactionInfo) { - // The levelMaxBytes calculations here differ from RocksDB in two ways: - // - // 1. The use of dbSize vs maxLevelSize. RocksDB uses the size of the maximum - // level in L1-L6, rather than determining the size of the bottom level - // based on the total amount of data in the dB. The RocksDB calculation is - // problematic if L0 contains a significant fraction of data, or if the - // level sizes are roughly equal and thus there is a significant fraction - // of data outside of the largest level. - // - // 2. Not adjusting the size of Lbase based on L0. RocksDB computes - // baseBytesMax as the maximum of the configured LBaseMaxBytes and the - // size of L0. This is problematic because baseBytesMax is used to compute - // the max size of lower levels. A very large baseBytesMax will result in - // an overly large value for the size of lower levels which will caused - // those levels not to be compacted even when they should be - // compacted. This often results in "inverted" LSM shapes where Ln is - // larger than Ln+1. - - // Determine the first non-empty level and the total DB size. 
- firstNonEmptyLevel := -1 - var dbSize uint64 - for level := 1; level < numLevels; level++ { - if p.vers.Levels[level].Size() > 0 { - if firstNonEmptyLevel == -1 { - firstNonEmptyLevel = level - } - dbSize += p.vers.Levels[level].Size() - } - } - for _, c := range inProgressCompactions { - if c.outputLevel == 0 || c.outputLevel == -1 { - continue - } - if c.inputs[0].level == 0 && (firstNonEmptyLevel == -1 || c.outputLevel < firstNonEmptyLevel) { - firstNonEmptyLevel = c.outputLevel - } - } - - // Initialize the max-bytes setting for each level to "infinity" which will - // disallow compaction for that level. We'll fill in the actual value below - // for levels we want to allow compactions from. - for level := 0; level < numLevels; level++ { - p.levelMaxBytes[level] = math.MaxInt64 - } - - if dbSize == 0 { - // No levels for L1 and up contain any data. Target L0 compactions for the - // last level or to the level to which there is an ongoing L0 compaction. - p.baseLevel = numLevels - 1 - if firstNonEmptyLevel >= 0 { - p.baseLevel = firstNonEmptyLevel - } - return - } - - dbSize += p.vers.Levels[0].Size() - bottomLevelSize := dbSize - dbSize/uint64(p.opts.Experimental.LevelMultiplier) - - curLevelSize := bottomLevelSize - for level := numLevels - 2; level >= firstNonEmptyLevel; level-- { - curLevelSize = uint64(float64(curLevelSize) / float64(p.opts.Experimental.LevelMultiplier)) - } - - // Compute base level (where L0 data is compacted to). 
- baseBytesMax := uint64(p.opts.LBaseMaxBytes) - p.baseLevel = firstNonEmptyLevel - for p.baseLevel > 1 && curLevelSize > baseBytesMax { - p.baseLevel-- - curLevelSize = uint64(float64(curLevelSize) / float64(p.opts.Experimental.LevelMultiplier)) - } - - smoothedLevelMultiplier := 1.0 - if p.baseLevel < numLevels-1 { - smoothedLevelMultiplier = math.Pow( - float64(bottomLevelSize)/float64(baseBytesMax), - 1.0/float64(numLevels-p.baseLevel-1)) - } - - levelSize := float64(baseBytesMax) - for level := p.baseLevel; level < numLevels; level++ { - if level > p.baseLevel && levelSize > 0 { - levelSize *= smoothedLevelMultiplier - } - // Round the result since test cases use small target level sizes, which - // can be impacted by floating-point imprecision + integer truncation. - roundedLevelSize := math.Round(levelSize) - if roundedLevelSize > float64(math.MaxInt64) { - p.levelMaxBytes[level] = math.MaxInt64 - } else { - p.levelMaxBytes[level] = int64(roundedLevelSize) - } - } -} - -type levelSizeAdjust struct { - incomingActualBytes uint64 - outgoingActualBytes uint64 - outgoingCompensatedBytes uint64 -} - -func (a levelSizeAdjust) compensated() uint64 { - return a.incomingActualBytes - a.outgoingCompensatedBytes -} - -func (a levelSizeAdjust) actual() uint64 { - return a.incomingActualBytes - a.outgoingActualBytes -} - -func calculateSizeAdjust(inProgressCompactions []compactionInfo) [numLevels]levelSizeAdjust { - // Compute size adjustments for each level based on the in-progress - // compactions. We sum the file sizes of all files leaving and entering each - // level in in-progress compactions. For outgoing files, we also sum a - // separate sum of 'compensated file sizes', which are inflated according - // to deletion estimates. - // - // When we adjust a level's size according to these values during score - // calculation, we subtract the compensated size of start level inputs to - // account for the fact that score calculation uses compensated sizes. 
- // - // Since compensated file sizes may be compensated because they reclaim - // space from the output level's files, we only add the real file size to - // the output level. - // - // This is slightly different from RocksDB's behavior, which simply elides - // compacting files from the level size calculation. - var sizeAdjust [numLevels]levelSizeAdjust - for i := range inProgressCompactions { - c := &inProgressCompactions[i] - // If this compaction's version edit has already been applied, there's - // no need to adjust: The LSM we'll examine will already reflect the - // new LSM state. - if c.versionEditApplied { - continue - } - - for _, input := range c.inputs { - actualSize := input.files.SizeSum() - compensatedSize := totalCompensatedSize(input.files.Iter()) - - if input.level != c.outputLevel { - sizeAdjust[input.level].outgoingCompensatedBytes += compensatedSize - sizeAdjust[input.level].outgoingActualBytes += actualSize - if c.outputLevel != -1 { - sizeAdjust[c.outputLevel].incomingActualBytes += actualSize - } - } - } - } - return sizeAdjust -} - -func levelCompensatedSize(lm manifest.LevelMetadata) uint64 { - return *lm.Annotation(compensatedSizeAnnotator{}).(*uint64) -} - -func (p *compactionPickerByScore) calculateLevelScores( - inProgressCompactions []compactionInfo, -) [numLevels]candidateLevelInfo { - var scores [numLevels]candidateLevelInfo - for i := range scores { - scores[i].level = i - scores[i].outputLevel = i + 1 - } - l0UncompensatedScore := calculateL0UncompensatedScore(p.vers, p.opts, inProgressCompactions) - scores[0] = candidateLevelInfo{ - outputLevel: p.baseLevel, - uncompensatedScore: l0UncompensatedScore, - compensatedScore: l0UncompensatedScore, /* No level size compensation for L0 */ - } - sizeAdjust := calculateSizeAdjust(inProgressCompactions) - for level := 1; level < numLevels; level++ { - compensatedLevelSize := levelCompensatedSize(p.vers.Levels[level]) + sizeAdjust[level].compensated() - scores[level].compensatedScore = 
float64(compensatedLevelSize) / float64(p.levelMaxBytes[level]) - scores[level].uncompensatedScore = float64(p.vers.Levels[level].Size()+sizeAdjust[level].actual()) / float64(p.levelMaxBytes[level]) - } - - // Adjust each level's {compensated, uncompensated}Score by the uncompensatedScore - // of the next level to get a {compensated, uncompensated}ScoreRatio. If the - // next level has a high uncompensatedScore, and is thus a priority for compaction, - // this reduces the priority for compacting the current level. If the next level - // has a low uncompensatedScore (i.e. it is below its target size), this increases - // the priority for compacting the current level. - // - // The effect of this adjustment is to help prioritize compactions in lower - // levels. The following example shows the compensatedScoreRatio and the - // compensatedScore. In this scenario, L0 has 68 sublevels. L3 (a.k.a. Lbase) - // is significantly above its target size. The original score prioritizes - // compactions from those two levels, but doing so ends up causing a future - // problem: data piles up in the higher levels, starving L5->L6 compactions, - // and to a lesser degree starving L4->L5 compactions. - // - // Note that in the example shown there is no level size compensation so the - // compensatedScore and the uncompensatedScore is the same for each level. - // - // compensatedScoreRatio compensatedScore uncompensatedScore size max-size - // L0 3.2 68.0 68.0 2.2 G - - // L3 3.2 21.1 21.1 1.3 G 64 M - // L4 3.4 6.7 6.7 3.1 G 467 M - // L5 3.4 2.0 2.0 6.6 G 3.3 G - // L6 0.6 0.6 0.6 14 G 24 G - var prevLevel int - for level := p.baseLevel; level < numLevels; level++ { - // The compensated scores, and uncompensated scores will be turned into - // ratios as they're adjusted according to other levels' sizes. 
- scores[prevLevel].compensatedScoreRatio = scores[prevLevel].compensatedScore - scores[prevLevel].uncompensatedScoreRatio = scores[prevLevel].uncompensatedScore - - // Avoid absurdly large scores by placing a floor on the score that we'll - // adjust a level by. The value of 0.01 was chosen somewhat arbitrarily. - const minScore = 0.01 - if scores[prevLevel].compensatedScoreRatio >= compactionScoreThreshold { - if scores[level].uncompensatedScore >= minScore { - scores[prevLevel].compensatedScoreRatio /= scores[level].uncompensatedScore - } else { - scores[prevLevel].compensatedScoreRatio /= minScore - } - } - if scores[prevLevel].uncompensatedScoreRatio >= compactionScoreThreshold { - if scores[level].uncompensatedScore >= minScore { - scores[prevLevel].uncompensatedScoreRatio /= scores[level].uncompensatedScore - } else { - scores[prevLevel].uncompensatedScoreRatio /= minScore - } - } - prevLevel = level - } - // Set the score ratios for the lowest level. - // INVARIANT: prevLevel == numLevels-1 - scores[prevLevel].compensatedScoreRatio = scores[prevLevel].compensatedScore - scores[prevLevel].uncompensatedScoreRatio = scores[prevLevel].uncompensatedScore - - sort.Sort(sortCompactionLevelsByPriority(scores[:])) - return scores -} - -// calculateL0UncompensatedScore calculates a float score representing the -// relative priority of compacting L0. Level L0 is special in that files within -// L0 may overlap one another, so a different set of heuristics that take into -// account read amplification apply. -func calculateL0UncompensatedScore( - vers *version, opts *Options, inProgressCompactions []compactionInfo, -) float64 { - // Use the sublevel count to calculate the score. The base vs intra-L0 - // compaction determination happens in pickAuto, not here. 
- score := float64(2*vers.L0Sublevels.MaxDepthAfterOngoingCompactions()) / - float64(opts.L0CompactionThreshold) - - // Also calculate a score based on the file count but use it only if it - // produces a higher score than the sublevel-based one. This heuristic is - // designed to accommodate cases where L0 is accumulating non-overlapping - // files in L0. Letting too many non-overlapping files accumulate in few - // sublevels is undesirable, because: - // 1) we can produce a massive backlog to compact once files do overlap. - // 2) constructing L0 sublevels has a runtime that grows superlinearly with - // the number of files in L0 and must be done while holding D.mu. - noncompactingFiles := vers.Levels[0].Len() - for _, c := range inProgressCompactions { - for _, cl := range c.inputs { - if cl.level == 0 { - noncompactingFiles -= cl.files.Len() - } - } - } - fileScore := float64(noncompactingFiles) / float64(opts.L0CompactionFileThreshold) - if score < fileScore { - score = fileScore - } - return score -} - -// pickCompactionSeedFile picks a file from `level` in the `vers` to build a -// compaction around. Currently, this function implements a heuristic similar to -// RocksDB's kMinOverlappingRatio, seeking to minimize write amplification. This -// function is linear with respect to the number of files in `level` and -// `outputLevel`. -func pickCompactionSeedFile( - vers *version, opts *Options, level, outputLevel int, earliestSnapshotSeqNum uint64, -) (manifest.LevelFile, bool) { - // Select the file within the level to compact. We want to minimize write - // amplification, but also ensure that deletes are propagated to the - // bottom level in a timely fashion so as to reclaim disk space. A table's - // smallest sequence number provides a measure of its age. The ratio of - // overlapping-bytes / table-size gives an indication of write - // amplification (a smaller ratio is preferrable). 
- // - // The current heuristic is based off the the RocksDB kMinOverlappingRatio - // heuristic. It chooses the file with the minimum overlapping ratio with - // the target level, which minimizes write amplification. - // - // It uses a "compensated size" for the denominator, which is the file - // size but artificially inflated by an estimate of the space that may be - // reclaimed through compaction. Currently, we only compensate for range - // deletions and only with a rough estimate of the reclaimable bytes. This - // differs from RocksDB which only compensates for point tombstones and - // only if they exceed the number of non-deletion entries in table. - // - // TODO(peter): For concurrent compactions, we may want to try harder to - // pick a seed file whose resulting compaction bounds do not overlap with - // an in-progress compaction. - - cmp := opts.Comparer.Compare - startIter := vers.Levels[level].Iter() - outputIter := vers.Levels[outputLevel].Iter() - - var file manifest.LevelFile - smallestRatio := uint64(math.MaxUint64) - - outputFile := outputIter.First() - - for f := startIter.First(); f != nil; f = startIter.Next() { - var overlappingBytes uint64 - compacting := f.IsCompacting() - if compacting { - // Move on if this file is already being compacted. We'll likely - // still need to move past the overlapping output files regardless, - // but in cases where all start-level files are compacting we won't. - continue - } - - // Trim any output-level files smaller than f. 
- for outputFile != nil && sstableKeyCompare(cmp, outputFile.Largest, f.Smallest) < 0 { - outputFile = outputIter.Next() - } - - for outputFile != nil && sstableKeyCompare(cmp, outputFile.Smallest, f.Largest) <= 0 && !compacting { - overlappingBytes += outputFile.Size - compacting = compacting || outputFile.IsCompacting() - - // For files in the bottommost level of the LSM, the - // Stats.RangeDeletionsBytesEstimate field is set to the estimate - // of bytes /within/ the file itself that may be dropped by - // recompacting the file. These bytes from obsolete keys would not - // need to be rewritten if we compacted `f` into `outputFile`, so - // they don't contribute to write amplification. Subtracting them - // out of the overlapping bytes helps prioritize these compactions - // that are cheaper than their file sizes suggest. - if outputLevel == numLevels-1 && outputFile.LargestSeqNum < earliestSnapshotSeqNum { - overlappingBytes -= outputFile.Stats.RangeDeletionsBytesEstimate - } - - // If the file in the next level extends beyond f's largest key, - // break out and don't advance outputIter because f's successor - // might also overlap. - // - // Note, we stop as soon as we encounter an output-level file with a - // largest key beyond the input-level file's largest bound. We - // perform a simple user key comparison here using sstableKeyCompare - // which handles the potential for exclusive largest key bounds. - // There's some subtlety when the bounds are equal (eg, equal and - // inclusive, or equal and exclusive). Current Pebble doesn't split - // user keys across sstables within a level (and in format versions - // FormatSplitUserKeysMarkedCompacted and later we guarantee no - // split user keys exist within the entire LSM). In that case, we're - // assured that neither the input level nor the output level's next - // file shares the same user key, so compaction expansion will not - // include them in any compaction compacting `f`. 
- // - // NB: If we /did/ allow split user keys, or we're running on an - // old database with an earlier format major version where there are - // existing split user keys, this logic would be incorrect. Consider - // L1: [a#120,a#100] [a#80,a#60] - // L2: [a#55,a#45] [a#35,a#25] [a#15,a#5] - // While considering the first file in L1, [a#120,a#100], we'd skip - // past all of the files in L2. When considering the second file in - // L1, we'd improperly conclude that the second file overlaps - // nothing in the second level and is cheap to compact, when in - // reality we'd need to expand the compaction to include all 5 - // files. - if sstableKeyCompare(cmp, outputFile.Largest, f.Largest) > 0 { - break - } - outputFile = outputIter.Next() - } - - // If the input level file or one of the overlapping files is - // compacting, we're not going to be able to compact this file - // anyways, so skip it. - if compacting { - continue - } - - compSz := compensatedSize(f) - scaledRatio := overlappingBytes * 1024 / compSz - if scaledRatio < smallestRatio { - smallestRatio = scaledRatio - file = startIter.Take() - } - } - return file, file.FileMetadata != nil -} - -// pickAuto picks the best compaction, if any. -// -// On each call, pickAuto computes per-level size adjustments based on -// in-progress compactions, and computes a per-level score. The levels are -// iterated over in decreasing score order trying to find a valid compaction -// anchored at that level. -// -// If a score-based compaction cannot be found, pickAuto falls back to looking -// for an elision-only compaction to remove obsolete keys. -func (p *compactionPickerByScore) pickAuto(env compactionEnv) (pc *pickedCompaction) { - // Compaction concurrency is controlled by L0 read-amp. We allow one - // additional compaction per L0CompactionConcurrency sublevels, as well as - // one additional compaction per CompactionDebtConcurrency bytes of - // compaction debt. 
Compaction concurrency is tied to L0 sublevels as that - // signal is independent of the database size. We tack on the compaction - // debt as a second signal to prevent compaction concurrency from dropping - // significantly right after a base compaction finishes, and before those - // bytes have been compacted further down the LSM. - if n := len(env.inProgressCompactions); n > 0 { - l0ReadAmp := p.vers.L0Sublevels.MaxDepthAfterOngoingCompactions() - compactionDebt := p.estimatedCompactionDebt(0) - ccSignal1 := n * p.opts.Experimental.L0CompactionConcurrency - ccSignal2 := uint64(n) * p.opts.Experimental.CompactionDebtConcurrency - if l0ReadAmp < ccSignal1 && compactionDebt < ccSignal2 { - return nil - } - } - - scores := p.calculateLevelScores(env.inProgressCompactions) - - // TODO(bananabrick): Either remove, or change this into an event sent to the - // EventListener. - logCompaction := func(pc *pickedCompaction) { - var buf bytes.Buffer - for i := 0; i < numLevels; i++ { - if i != 0 && i < p.baseLevel { - continue - } - - var info *candidateLevelInfo - for j := range scores { - if scores[j].level == i { - info = &scores[j] - break - } - } - - marker := " " - if pc.startLevel.level == info.level { - marker = "*" - } - fmt.Fprintf(&buf, " %sL%d: %5.1f %5.1f %5.1f %5.1f %8s %8s", - marker, info.level, info.compensatedScoreRatio, info.compensatedScore, - info.uncompensatedScoreRatio, info.uncompensatedScore, - humanize.Bytes.Int64(int64(totalCompensatedSize( - p.vers.Levels[info.level].Iter(), - ))), - humanize.Bytes.Int64(p.levelMaxBytes[info.level]), - ) - - count := 0 - for i := range env.inProgressCompactions { - c := &env.inProgressCompactions[i] - if c.inputs[0].level != info.level { - continue - } - count++ - if count == 1 { - fmt.Fprintf(&buf, " [") - } else { - fmt.Fprintf(&buf, " ") - } - fmt.Fprintf(&buf, "L%d->L%d", c.inputs[0].level, c.outputLevel) - } - if count > 0 { - fmt.Fprintf(&buf, "]") - } - fmt.Fprintf(&buf, "\n") - } - 
p.opts.Logger.Infof("pickAuto: L%d->L%d\n%s", - pc.startLevel.level, pc.outputLevel.level, buf.String()) - } - - // Check for a score-based compaction. candidateLevelInfos are first sorted - // by whether they should be compacted, so if we find a level which shouldn't - // be compacted, we can break early. - for i := range scores { - info := &scores[i] - if !info.shouldCompact() { - break - } - if info.level == numLevels-1 { - continue - } - - if info.level == 0 { - pc = pickL0(env, p.opts, p.vers, p.baseLevel) - // Fail-safe to protect against compacting the same sstable - // concurrently. - if pc != nil && !inputRangeAlreadyCompacting(env, pc) { - p.addScoresToPickedCompactionMetrics(pc, scores) - pc.score = info.compensatedScoreRatio - // TODO(bananabrick): Create an EventListener for logCompaction. - if false { - logCompaction(pc) - } - return pc - } - continue - } - - // info.level > 0 - var ok bool - info.file, ok = pickCompactionSeedFile(p.vers, p.opts, info.level, info.outputLevel, env.earliestSnapshotSeqNum) - if !ok { - continue - } - - pc := pickAutoLPositive(env, p.opts, p.vers, *info, p.baseLevel, p.levelMaxBytes) - // Fail-safe to protect against compacting the same sstable concurrently. - if pc != nil && !inputRangeAlreadyCompacting(env, pc) { - p.addScoresToPickedCompactionMetrics(pc, scores) - pc.score = info.compensatedScoreRatio - // TODO(bananabrick): Create an EventListener for logCompaction. - if false { - logCompaction(pc) - } - return pc - } - } - - // Check for L6 files with tombstones that may be elided. These files may - // exist if a snapshot prevented the elision of a tombstone or because of - // a move compaction. These are low-priority compactions because they - // don't help us keep up with writes, just reclaim disk space. 
- if pc := p.pickElisionOnlyCompaction(env); pc != nil { - return pc - } - - if pc := p.pickReadTriggeredCompaction(env); pc != nil { - return pc - } - - // NB: This should only be run if a read compaction wasn't - // scheduled. - // - // We won't be scheduling a read compaction right now, and in - // read heavy workloads, compactions won't be scheduled frequently - // because flushes aren't frequent. So we need to signal to the - // iterator to schedule a compaction when it adds compactions to - // the read compaction queue. - // - // We need the nil check here because without it, we have some - // tests which don't set that variable fail. Since there's a - // chance that one of those tests wouldn't want extra compactions - // to be scheduled, I added this check here, instead of - // setting rescheduleReadCompaction in those tests. - if env.readCompactionEnv.rescheduleReadCompaction != nil { - *env.readCompactionEnv.rescheduleReadCompaction = true - } - - // At the lowest possible compaction-picking priority, look for files marked - // for compaction. Pebble will mark files for compaction if they have atomic - // compaction units that span multiple files. While current Pebble code does - // not construct such sstables, RocksDB and earlier versions of Pebble may - // have created them. These split user keys form sets of files that must be - // compacted together for correctness (referred to as "atomic compaction - // units" within the code). Rewrite them in-place. - // - // It's also possible that a file may have been marked for compaction by - // even earlier versions of Pebble code, since FileMetadata's - // MarkedForCompaction field is persisted in the manifest. That's okay. We - // previously would've ignored the designation, whereas now we'll re-compact - // the file in place. 
- if p.vers.Stats.MarkedForCompaction > 0 { - if pc := p.pickRewriteCompaction(env); pc != nil { - return pc - } - } - - return nil -} - -func (p *compactionPickerByScore) addScoresToPickedCompactionMetrics( - pc *pickedCompaction, candInfo [numLevels]candidateLevelInfo, -) { - - // candInfo is sorted by score, not by compaction level. - infoByLevel := [numLevels]candidateLevelInfo{} - for i := range candInfo { - level := candInfo[i].level - infoByLevel[level] = candInfo[i] - } - // Gather the compaction scores for the levels participating in the compaction. - pc.pickerMetrics.scores = make([]float64, len(pc.inputs)) - inputIdx := 0 - for i := range infoByLevel { - if pc.inputs[inputIdx].level == infoByLevel[i].level { - pc.pickerMetrics.scores[inputIdx] = infoByLevel[i].compensatedScoreRatio - inputIdx++ - } - if inputIdx == len(pc.inputs) { - break - } - } -} - -// elisionOnlyAnnotator implements the manifest.Annotator interface, -// annotating B-Tree nodes with the *fileMetadata of a file meeting the -// obsolete keys criteria for an elision-only compaction within the subtree. -// If multiple files meet the criteria, it chooses whichever file has the -// lowest LargestSeqNum. The lowest LargestSeqNum file will be the first -// eligible for an elision-only compaction once snapshots less than or equal -// to its LargestSeqNum are closed. -type elisionOnlyAnnotator struct{} - -var _ manifest.Annotator = elisionOnlyAnnotator{} - -func (a elisionOnlyAnnotator) Zero(interface{}) interface{} { - return nil -} - -func (a elisionOnlyAnnotator) Accumulate(f *fileMetadata, dst interface{}) (interface{}, bool) { - if f.IsCompacting() { - return dst, true - } - if !f.StatsValid() { - return dst, false - } - // Bottommost files are large and not worthwhile to compact just - // to remove a few tombstones. Consider a file ineligible if its - // own range deletions delete less than 10% of its data and its - // deletion tombstones make up less than 10% of its entries. 
- // - // TODO(jackson): This does not account for duplicate user keys - // which may be collapsed. Ideally, we would have 'obsolete keys' - // statistics that would include tombstones, the keys that are - // dropped by tombstones and duplicated user keys. See #847. - // - // Note that tables that contain exclusively range keys (i.e. no point keys, - // `NumEntries` and `RangeDeletionsBytesEstimate` are both zero) are excluded - // from elision-only compactions. - // TODO(travers): Consider an alternative heuristic for elision of range-keys. - if f.Stats.RangeDeletionsBytesEstimate*10 < f.Size && - f.Stats.NumDeletions*10 <= f.Stats.NumEntries { - return dst, true - } - if dst == nil { - return f, true - } else if dstV := dst.(*fileMetadata); dstV.LargestSeqNum > f.LargestSeqNum { - return f, true - } - return dst, true -} - -func (a elisionOnlyAnnotator) Merge(v interface{}, accum interface{}) interface{} { - if v == nil { - return accum - } - // If we haven't accumulated an eligible file yet, or f's LargestSeqNum is - // less than the accumulated file's, use f. - if accum == nil { - return v - } - f := v.(*fileMetadata) - accumV := accum.(*fileMetadata) - if accumV == nil || accumV.LargestSeqNum > f.LargestSeqNum { - return f - } - return accumV -} - -// markedForCompactionAnnotator implements the manifest.Annotator interface, -// annotating B-Tree nodes with the *fileMetadata of a file that is marked for -// compaction within the subtree. If multiple files meet the criteria, it -// chooses whichever file has the lowest LargestSeqNum. -type markedForCompactionAnnotator struct{} - -var _ manifest.Annotator = markedForCompactionAnnotator{} - -func (a markedForCompactionAnnotator) Zero(interface{}) interface{} { - return nil -} - -func (a markedForCompactionAnnotator) Accumulate( - f *fileMetadata, dst interface{}, -) (interface{}, bool) { - if !f.MarkedForCompaction { - // Not marked for compaction; return dst. 
- return dst, true - } - return markedMergeHelper(f, dst) -} - -func (a markedForCompactionAnnotator) Merge(v interface{}, accum interface{}) interface{} { - if v == nil { - return accum - } - accum, _ = markedMergeHelper(v.(*fileMetadata), accum) - return accum -} - -// REQUIRES: f is non-nil, and f.MarkedForCompaction=true. -func markedMergeHelper(f *fileMetadata, dst interface{}) (interface{}, bool) { - if dst == nil { - return f, true - } else if dstV := dst.(*fileMetadata); dstV.LargestSeqNum > f.LargestSeqNum { - return f, true - } - return dst, true -} - -// pickElisionOnlyCompaction looks for compactions of sstables in the -// bottommost level containing obsolete records that may now be dropped. -func (p *compactionPickerByScore) pickElisionOnlyCompaction( - env compactionEnv, -) (pc *pickedCompaction) { - if p.opts.private.disableElisionOnlyCompactions { - return nil - } - v := p.vers.Levels[numLevels-1].Annotation(elisionOnlyAnnotator{}) - if v == nil { - return nil - } - candidate := v.(*fileMetadata) - if candidate.IsCompacting() || candidate.LargestSeqNum >= env.earliestSnapshotSeqNum { - return nil - } - lf := p.vers.Levels[numLevels-1].Find(p.opts.Comparer.Compare, candidate) - if lf == nil { - panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1)) - } - - // Construct a picked compaction of the elision candidate's atomic - // compaction unit. - pc = newPickedCompaction(p.opts, p.vers, numLevels-1, numLevels-1, p.baseLevel) - pc.kind = compactionKindElisionOnly - var isCompacting bool - pc.startLevel.files, isCompacting = expandToAtomicUnit(p.opts.Comparer.Compare, lf.Slice(), false /* disableIsCompacting */) - if isCompacting { - return nil - } - pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter()) - // Fail-safe to protect against compacting the same sstable concurrently. 
- if !inputRangeAlreadyCompacting(env, pc) { - return pc - } - return nil -} - -// pickRewriteCompaction attempts to construct a compaction that -// rewrites a file marked for compaction. pickRewriteCompaction will -// pull in adjacent files in the file's atomic compaction unit if -// necessary. A rewrite compaction outputs files to the same level as -// the input level. -func (p *compactionPickerByScore) pickRewriteCompaction(env compactionEnv) (pc *pickedCompaction) { - for l := numLevels - 1; l >= 0; l-- { - v := p.vers.Levels[l].Annotation(markedForCompactionAnnotator{}) - if v == nil { - // Try the next level. - continue - } - candidate := v.(*fileMetadata) - if candidate.IsCompacting() { - // Try the next level. - continue - } - lf := p.vers.Levels[l].Find(p.opts.Comparer.Compare, candidate) - if lf == nil { - panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1)) - } - - inputs := lf.Slice() - // L0 files generated by a flush have never been split such that - // adjacent files can contain the same user key. So we do not need to - // rewrite an atomic compaction unit for L0. Note that there is nothing - // preventing two different flushes from producing files that are - // non-overlapping from an InternalKey perspective, but span the same - // user key. However, such files cannot be in the same L0 sublevel, - // since each sublevel requires non-overlapping user keys (unlike other - // levels). - if l > 0 { - // Find this file's atomic compaction unit. This is only relevant - // for levels L1+. - var isCompacting bool - inputs, isCompacting = expandToAtomicUnit( - p.opts.Comparer.Compare, - inputs, - false, /* disableIsCompacting */ - ) - if isCompacting { - // Try the next level. 
- continue - } - } - - pc = newPickedCompaction(p.opts, p.vers, l, l, p.baseLevel) - pc.outputLevel.level = l - pc.kind = compactionKindRewrite - pc.startLevel.files = inputs - pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter()) - - // Fail-safe to protect against compacting the same sstable concurrently. - if !inputRangeAlreadyCompacting(env, pc) { - if pc.startLevel.level == 0 { - pc.startLevel.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files) - } - return pc - } - } - return nil -} - -// pickAutoLPositive picks an automatic compaction for the candidate -// file in a positive-numbered level. This function must not be used for -// L0. -func pickAutoLPositive( - env compactionEnv, - opts *Options, - vers *version, - cInfo candidateLevelInfo, - baseLevel int, - levelMaxBytes [7]int64, -) (pc *pickedCompaction) { - if cInfo.level == 0 { - panic("pebble: pickAutoLPositive called for L0") - } - - pc = newPickedCompaction(opts, vers, cInfo.level, defaultOutputLevel(cInfo.level, baseLevel), baseLevel) - if pc.outputLevel.level != cInfo.outputLevel { - panic("pebble: compaction picked unexpected output level") - } - pc.startLevel.files = cInfo.file.Slice() - // Files in level 0 may overlap each other, so pick up all overlapping ones. - if pc.startLevel.level == 0 { - cmp := opts.Comparer.Compare - smallest, largest := manifest.KeyRange(cmp, pc.startLevel.files.Iter()) - pc.startLevel.files = vers.Overlaps(0, cmp, smallest.UserKey, - largest.UserKey, largest.IsExclusiveSentinel()) - if pc.startLevel.files.Empty() { - panic("pebble: empty compaction") - } - } - - if !pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel) { - return nil - } - return pc.maybeAddLevel(opts, env.diskAvailBytes) -} - -// maybeAddLevel maybe adds a level to the picked compaction. 
-func (pc *pickedCompaction) maybeAddLevel(opts *Options, diskAvailBytes uint64) *pickedCompaction { - pc.pickerMetrics.singleLevelOverlappingRatio = pc.overlappingRatio() - if pc.outputLevel.level == numLevels-1 { - // Don't add a level if the current output level is in L6 - return pc - } - if !opts.Experimental.MultiLevelCompactionHeuristic.allowL0() && pc.startLevel.level == 0 { - return pc - } - if pc.compactionSize() > expandedCompactionByteSizeLimit( - opts, adjustedOutputLevel(pc.outputLevel.level, pc.baseLevel), diskAvailBytes) { - // Don't add a level if the current compaction exceeds the compaction size limit - return pc - } - return opts.Experimental.MultiLevelCompactionHeuristic.pick(pc, opts, diskAvailBytes) -} - -// MultiLevelHeuristic evaluates whether to add files from the next level into the compaction. -type MultiLevelHeuristic interface { - // Evaluate returns the preferred compaction. - pick(pc *pickedCompaction, opts *Options, diskAvailBytes uint64) *pickedCompaction - - // Returns if the heuristic allows L0 to be involved in ML compaction - allowL0() bool -} - -// NoMultiLevel will never add an additional level to the compaction. 
-type NoMultiLevel struct{} - -var _ MultiLevelHeuristic = (*NoMultiLevel)(nil) - -func (nml NoMultiLevel) pick( - pc *pickedCompaction, opts *Options, diskAvailBytes uint64, -) *pickedCompaction { - return pc -} - -func (nml NoMultiLevel) allowL0() bool { - return false -} - -func (pc *pickedCompaction) predictedWriteAmp() float64 { - var bytesToCompact uint64 - var higherLevelBytes uint64 - for i := range pc.inputs { - levelSize := pc.inputs[i].files.SizeSum() - bytesToCompact += levelSize - if i != len(pc.inputs)-1 { - higherLevelBytes += levelSize - } - } - return float64(bytesToCompact) / float64(higherLevelBytes) -} - -func (pc *pickedCompaction) overlappingRatio() float64 { - var higherLevelBytes uint64 - var lowestLevelBytes uint64 - for i := range pc.inputs { - levelSize := pc.inputs[i].files.SizeSum() - if i == len(pc.inputs)-1 { - lowestLevelBytes += levelSize - continue - } - higherLevelBytes += levelSize - } - return float64(lowestLevelBytes) / float64(higherLevelBytes) -} - -// WriteAmpHeuristic defines a multi level compaction heuristic which will add -// an additional level to the picked compaction if it reduces predicted write -// amp of the compaction + the addPropensity constant. -type WriteAmpHeuristic struct { - // addPropensity is a constant that affects the propensity to conduct multilevel - // compactions. If positive, a multilevel compaction may get picked even if - // the single level compaction has lower write amp, and vice versa. - AddPropensity float64 - - // AllowL0 if true, allow l0 to be involved in a ML compaction. - AllowL0 bool -} - -var _ MultiLevelHeuristic = (*WriteAmpHeuristic)(nil) - -// TODO(msbutler): microbenchmark the extent to which multilevel compaction -// picking slows down the compaction picking process. This should be as fast as -// possible since Compaction-picking holds d.mu, which prevents WAL rotations, -// in-progress flushes and compactions from completing, etc. 
Consider ways to -// deduplicate work, given that setupInputs has already been called. -func (wa WriteAmpHeuristic) pick( - pcOrig *pickedCompaction, opts *Options, diskAvailBytes uint64, -) *pickedCompaction { - pcMulti := pcOrig.clone() - if !pcMulti.setupMultiLevelCandidate(opts, diskAvailBytes) { - return pcOrig - } - picked := pcOrig - if pcMulti.predictedWriteAmp() <= pcOrig.predictedWriteAmp()+wa.AddPropensity { - picked = pcMulti - } - // Regardless of what compaction was picked, log the multilevelOverlapping ratio. - picked.pickerMetrics.multiLevelOverlappingRatio = pcMulti.overlappingRatio() - return picked -} - -func (wa WriteAmpHeuristic) allowL0() bool { - return wa.AllowL0 -} - -// Helper method to pick compactions originating from L0. Uses information about -// sublevels to generate a compaction. -func pickL0(env compactionEnv, opts *Options, vers *version, baseLevel int) (pc *pickedCompaction) { - // It is important to pass information about Lbase files to L0Sublevels - // so it can pick a compaction that does not conflict with an Lbase => Lbase+1 - // compaction. Without this, we observed reduced concurrency of L0=>Lbase - // compactions, and increasing read amplification in L0. - // - // TODO(bilal) Remove the minCompactionDepth parameter once fixing it at 1 - // has been shown to not cause a performance regression. - lcf, err := vers.L0Sublevels.PickBaseCompaction(1, vers.Levels[baseLevel].Slice()) - if err != nil { - opts.Logger.Infof("error when picking base compaction: %s", err) - return - } - if lcf != nil { - pc = newPickedCompactionFromL0(lcf, opts, vers, baseLevel, true) - pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel) - if pc.startLevel.files.Empty() { - opts.Logger.Fatalf("empty compaction chosen") - } - return pc.maybeAddLevel(opts, env.diskAvailBytes) - } - - // Couldn't choose a base compaction. Try choosing an intra-L0 - // compaction. 
Note that we pass in L0CompactionThreshold here as opposed to - // 1, since choosing a single sublevel intra-L0 compaction is - // counterproductive. - lcf, err = vers.L0Sublevels.PickIntraL0Compaction(env.earliestUnflushedSeqNum, minIntraL0Count) - if err != nil { - opts.Logger.Infof("error when picking intra-L0 compaction: %s", err) - return - } - if lcf != nil { - pc = newPickedCompactionFromL0(lcf, opts, vers, 0, false) - if !pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel) { - return nil - } - if pc.startLevel.files.Empty() { - opts.Logger.Fatalf("empty compaction chosen") - } - { - iter := pc.startLevel.files.Iter() - if iter.First() == nil || iter.Next() == nil { - // A single-file intra-L0 compaction is unproductive. - return nil - } - } - - pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter()) - } - return pc -} - -func pickManualCompaction( - vers *version, opts *Options, env compactionEnv, baseLevel int, manual *manualCompaction, -) (pc *pickedCompaction, retryLater bool) { - outputLevel := manual.level + 1 - if manual.level == 0 { - outputLevel = baseLevel - } else if manual.level < baseLevel { - // The start level for a compaction must be >= Lbase. A manual - // compaction could have been created adhering to that condition, and - // then an automatic compaction came in and compacted all of the - // sstables in Lbase to Lbase+1 which caused Lbase to change. Simply - // ignore this manual compaction as there is nothing to do (manual.level - // points to an empty level). - return nil, false - } - // This conflictsWithInProgress call is necessary for the manual compaction to - // be retried when it conflicts with an ongoing automatic compaction. Without - // it, the compaction is dropped due to pc.setupInputs returning false since - // the input/output range is already being compacted, and the manual - // compaction ends with a non-compacted LSM. 
- if conflictsWithInProgress(manual, outputLevel, env.inProgressCompactions, opts.Comparer.Compare) { - return nil, true - } - pc = newPickedCompaction(opts, vers, manual.level, defaultOutputLevel(manual.level, baseLevel), baseLevel) - manual.outputLevel = pc.outputLevel.level - pc.startLevel.files = vers.Overlaps(manual.level, opts.Comparer.Compare, manual.start, manual.end, false) - if pc.startLevel.files.Empty() { - // Nothing to do - return nil, false - } - if !pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel) { - // setupInputs returned false indicating there's a conflicting - // concurrent compaction. - return nil, true - } - if pc = pc.maybeAddLevel(opts, env.diskAvailBytes); pc == nil { - return nil, false - } - if pc.outputLevel.level != outputLevel { - if len(pc.extraLevels) > 0 { - // multilevel compactions relax this invariant - } else { - panic("pebble: compaction picked unexpected output level") - } - } - // Fail-safe to protect against compacting the same sstable concurrently. - if inputRangeAlreadyCompacting(env, pc) { - return nil, true - } - return pc, false -} - -func (p *compactionPickerByScore) pickReadTriggeredCompaction( - env compactionEnv, -) (pc *pickedCompaction) { - // If a flush is in-progress or expected to happen soon, it means more writes are taking place. We would - // soon be scheduling more write focussed compactions. In this case, skip read compactions as they are - // lower priority. 
- if env.readCompactionEnv.flushing || env.readCompactionEnv.readCompactions == nil { - return nil - } - for env.readCompactionEnv.readCompactions.size > 0 { - rc := env.readCompactionEnv.readCompactions.remove() - if pc = pickReadTriggeredCompactionHelper(p, rc, env); pc != nil { - break - } - } - return pc -} - -func pickReadTriggeredCompactionHelper( - p *compactionPickerByScore, rc *readCompaction, env compactionEnv, -) (pc *pickedCompaction) { - cmp := p.opts.Comparer.Compare - overlapSlice := p.vers.Overlaps(rc.level, cmp, rc.start, rc.end, false /* exclusiveEnd */) - if overlapSlice.Empty() { - // If there is no overlap, then the file with the key range - // must have been compacted away. So, we don't proceed to - // compact the same key range again. - return nil - } - - iter := overlapSlice.Iter() - var fileMatches bool - for f := iter.First(); f != nil; f = iter.Next() { - if f.FileNum == rc.fileNum { - fileMatches = true - break - } - } - if !fileMatches { - return nil - } - - pc = newPickedCompaction(p.opts, p.vers, rc.level, defaultOutputLevel(rc.level, p.baseLevel), p.baseLevel) - - pc.startLevel.files = overlapSlice - if !pc.setupInputs(p.opts, env.diskAvailBytes, pc.startLevel) { - return nil - } - if inputRangeAlreadyCompacting(env, pc) { - return nil - } - pc.kind = compactionKindRead - - // Prevent read compactions which are too wide. - outputOverlaps := pc.version.Overlaps( - pc.outputLevel.level, pc.cmp, pc.smallest.UserKey, - pc.largest.UserKey, pc.largest.IsExclusiveSentinel()) - if outputOverlaps.SizeSum() > pc.maxReadCompactionBytes { - return nil - } - - // Prevent compactions which start with a small seed file X, but overlap - // with over allowedCompactionWidth * X file sizes in the output layer. 
- const allowedCompactionWidth = 35 - if outputOverlaps.SizeSum() > overlapSlice.SizeSum()*allowedCompactionWidth { - return nil - } - - return pc -} - -func (p *compactionPickerByScore) forceBaseLevel1() { - p.baseLevel = 1 -} - -func inputRangeAlreadyCompacting(env compactionEnv, pc *pickedCompaction) bool { - for _, cl := range pc.inputs { - iter := cl.files.Iter() - for f := iter.First(); f != nil; f = iter.Next() { - if f.IsCompacting() { - return true - } - } - } - - // Look for active compactions outputting to the same region of the key - // space in the same output level. Two potential compactions may conflict - // without sharing input files if there are no files in the output level - // that overlap with the intersection of the compactions' key spaces. - // - // Consider an active L0->Lbase compaction compacting two L0 files one - // [a-f] and the other [t-z] into Lbase. - // - // L0 - // ↦ 000100 ↤ ↦ 000101 ↤ - // L1 - // ↦ 000004 ↤ - // a b c d e f g h i j k l m n o p q r s t u v w x y z - // - // If a new file 000102 [j-p] is flushed while the existing compaction is - // still ongoing, new file would not be in any compacting sublevel - // intervals and would not overlap with any Lbase files that are also - // compacting. However, this compaction cannot be picked because the - // compaction's output key space [j-p] would overlap the existing - // compaction's output key space [a-z]. 
- // - // L0 - // ↦ 000100* ↤ ↦ 000102 ↤ ↦ 000101* ↤ - // L1 - // ↦ 000004* ↤ - // a b c d e f g h i j k l m n o p q r s t u v w x y z - // - // * - currently compacting - if pc.outputLevel != nil && pc.outputLevel.level != 0 { - for _, c := range env.inProgressCompactions { - if pc.outputLevel.level != c.outputLevel { - continue - } - if base.InternalCompare(pc.cmp, c.largest, pc.smallest) < 0 || - base.InternalCompare(pc.cmp, c.smallest, pc.largest) > 0 { - continue - } - - // The picked compaction and the in-progress compaction c are - // outputting to the same region of the key space of the same - // level. - return true - } - } - return false -} - -// conflictsWithInProgress checks if there are any in-progress compactions with overlapping keyspace. -func conflictsWithInProgress( - manual *manualCompaction, outputLevel int, inProgressCompactions []compactionInfo, cmp Compare, -) bool { - for _, c := range inProgressCompactions { - if (c.outputLevel == manual.level || c.outputLevel == outputLevel) && - isUserKeysOverlapping(manual.start, manual.end, c.smallest.UserKey, c.largest.UserKey, cmp) { - return true - } - for _, in := range c.inputs { - if in.files.Empty() { - continue - } - iter := in.files.Iter() - smallest := iter.First().Smallest.UserKey - largest := iter.Last().Largest.UserKey - if (in.level == manual.level || in.level == outputLevel) && - isUserKeysOverlapping(manual.start, manual.end, smallest, largest, cmp) { - return true - } - } - } - return false -} - -func isUserKeysOverlapping(x1, x2, y1, y2 []byte, cmp Compare) bool { - return cmp(x1, y2) <= 0 && cmp(y1, x2) <= 0 -} diff --git a/vendor/github.com/cockroachdb/pebble/error_iter.go b/vendor/github.com/cockroachdb/pebble/error_iter.go deleted file mode 100644 index de0ed35..0000000 --- a/vendor/github.com/cockroachdb/pebble/error_iter.go +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package pebble - -import ( - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/keyspan" -) - -type errorIter struct { - err error -} - -// errorIter implements the base.InternalIterator interface. -var _ internalIterator = (*errorIter)(nil) - -func newErrorIter(err error) *errorIter { - return &errorIter{err: err} -} - -func (c *errorIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { - return nil, base.LazyValue{} -} - -func (c *errorIter) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - return nil, base.LazyValue{} -} - -func (c *errorIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { - return nil, base.LazyValue{} -} - -func (c *errorIter) First() (*InternalKey, base.LazyValue) { - return nil, base.LazyValue{} -} - -func (c *errorIter) Last() (*InternalKey, base.LazyValue) { - return nil, base.LazyValue{} -} - -func (c *errorIter) Next() (*InternalKey, base.LazyValue) { - return nil, base.LazyValue{} -} - -func (c *errorIter) Prev() (*InternalKey, base.LazyValue) { - return nil, base.LazyValue{} -} - -func (c *errorIter) NextPrefix([]byte) (*InternalKey, base.LazyValue) { - return nil, base.LazyValue{} -} - -func (c *errorIter) Error() error { - return c.err -} - -func (c *errorIter) Close() error { - return c.err -} - -func (c *errorIter) String() string { - return "error" -} - -func (c *errorIter) SetBounds(lower, upper []byte) {} - -type errorKeyspanIter struct { - err error -} - -// errorKeyspanIter implements the keyspan.FragmentIterator interface. 
-var _ keyspan.FragmentIterator = (*errorKeyspanIter)(nil) - -func newErrorKeyspanIter(err error) *errorKeyspanIter { - return &errorKeyspanIter{err: err} -} - -func (*errorKeyspanIter) SeekGE(key []byte) *keyspan.Span { return nil } -func (*errorKeyspanIter) SeekLT(key []byte) *keyspan.Span { return nil } -func (*errorKeyspanIter) First() *keyspan.Span { return nil } -func (*errorKeyspanIter) Last() *keyspan.Span { return nil } -func (*errorKeyspanIter) Next() *keyspan.Span { return nil } -func (*errorKeyspanIter) Prev() *keyspan.Span { return nil } -func (i *errorKeyspanIter) Error() error { return i.err } -func (i *errorKeyspanIter) Close() error { return i.err } -func (*errorKeyspanIter) String() string { return "error" } diff --git a/vendor/github.com/cockroachdb/pebble/event.go b/vendor/github.com/cockroachdb/pebble/event.go deleted file mode 100644 index d431d07..0000000 --- a/vendor/github.com/cockroachdb/pebble/event.go +++ /dev/null @@ -1,766 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package pebble - -import ( - "fmt" - "strings" - "time" - - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/humanize" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/manifest" - "github.com/cockroachdb/pebble/vfs" - "github.com/cockroachdb/redact" -) - -// TableInfo exports the manifest.TableInfo type. -type TableInfo = manifest.TableInfo - -func tablesTotalSize(tables []TableInfo) uint64 { - var size uint64 - for i := range tables { - size += tables[i].Size - } - return size -} - -func formatFileNums(tables []TableInfo) string { - var buf strings.Builder - for i := range tables { - if i > 0 { - buf.WriteString(" ") - } - buf.WriteString(tables[i].FileNum.String()) - } - return buf.String() -} - -// LevelInfo contains info pertaining to a particular level. 
-type LevelInfo struct { - Level int - Tables []TableInfo - Score float64 -} - -func (i LevelInfo) String() string { - return redact.StringWithoutMarkers(i) -} - -// SafeFormat implements redact.SafeFormatter. -func (i LevelInfo) SafeFormat(w redact.SafePrinter, _ rune) { - w.Printf("L%d [%s] (%s) Score=%.2f", - redact.Safe(i.Level), - redact.Safe(formatFileNums(i.Tables)), - redact.Safe(humanize.Bytes.Uint64(tablesTotalSize(i.Tables))), - redact.Safe(i.Score)) -} - -// CompactionInfo contains the info for a compaction event. -type CompactionInfo struct { - // JobID is the ID of the compaction job. - JobID int - // Reason is the reason for the compaction. - Reason string - // Input contains the input tables for the compaction organized by level. - Input []LevelInfo - // Output contains the output tables generated by the compaction. The output - // tables are empty for the compaction begin event. - Output LevelInfo - // Duration is the time spent compacting, including reading and writing - // sstables. - Duration time.Duration - // TotalDuration is the total wall-time duration of the compaction, - // including applying the compaction to the database. TotalDuration is - // always ≥ Duration. - TotalDuration time.Duration - Done bool - Err error - - SingleLevelOverlappingRatio float64 - MultiLevelOverlappingRatio float64 - - // Annotations specifies additional info to appear in a compaction's event log line - Annotations compactionAnnotations -} - -type compactionAnnotations []string - -// SafeFormat implements redact.SafeFormatter. -func (ca compactionAnnotations) SafeFormat(w redact.SafePrinter, _ rune) { - if len(ca) == 0 { - return - } - for i := range ca { - if i != 0 { - w.Print(" ") - } - w.Printf("%s", redact.SafeString(ca[i])) - } -} - -func (i CompactionInfo) String() string { - return redact.StringWithoutMarkers(i) -} - -// SafeFormat implements redact.SafeFormatter. 
-func (i CompactionInfo) SafeFormat(w redact.SafePrinter, _ rune) { - if i.Err != nil { - w.Printf("[JOB %d] compaction(%s) to L%d error: %s", - redact.Safe(i.JobID), redact.SafeString(i.Reason), redact.Safe(i.Output.Level), i.Err) - return - } - - if !i.Done { - w.Printf("[JOB %d] compacting(%s) ", - redact.Safe(i.JobID), - redact.SafeString(i.Reason)) - w.Printf("%s", i.Annotations) - w.Printf("%s; ", levelInfos(i.Input)) - w.Printf("OverlappingRatio: Single %.2f, Multi %.2f", i.SingleLevelOverlappingRatio, i.MultiLevelOverlappingRatio) - return - } - outputSize := tablesTotalSize(i.Output.Tables) - w.Printf("[JOB %d] compacted(%s) ", redact.Safe(i.JobID), redact.SafeString(i.Reason)) - w.Printf("%s", i.Annotations) - w.Print(levelInfos(i.Input)) - w.Printf(" -> L%d [%s] (%s), in %.1fs (%.1fs total), output rate %s/s", - redact.Safe(i.Output.Level), - redact.Safe(formatFileNums(i.Output.Tables)), - redact.Safe(humanize.Bytes.Uint64(outputSize)), - redact.Safe(i.Duration.Seconds()), - redact.Safe(i.TotalDuration.Seconds()), - redact.Safe(humanize.Bytes.Uint64(uint64(float64(outputSize)/i.Duration.Seconds())))) -} - -type levelInfos []LevelInfo - -func (i levelInfos) SafeFormat(w redact.SafePrinter, _ rune) { - for j, levelInfo := range i { - if j > 0 { - w.Printf(" + ") - } - w.Print(levelInfo) - } -} - -// DiskSlowInfo contains the info for a disk slowness event when writing to a -// file. -type DiskSlowInfo = vfs.DiskSlowInfo - -// FlushInfo contains the info for a flush event. -type FlushInfo struct { - // JobID is the ID of the flush job. - JobID int - // Reason is the reason for the flush. - Reason string - // Input contains the count of input memtables that were flushed. - Input int - // InputBytes contains the total in-memory size of the memtable(s) that were - // flushed. This size includes skiplist indexing data structures. - InputBytes uint64 - // Output contains the ouptut table generated by the flush. 
The output info - // is empty for the flush begin event. - Output []TableInfo - // Duration is the time spent flushing. This duration includes writing and - // syncing all of the flushed keys to sstables. - Duration time.Duration - // TotalDuration is the total wall-time duration of the flush, including - // applying the flush to the database. TotalDuration is always ≥ Duration. - TotalDuration time.Duration - // Ingest is set to true if the flush is handling tables that were added to - // the flushable queue via an ingestion operation. - Ingest bool - // IngestLevels are the output levels for each ingested table in the flush. - // This field is only populated when Ingest is true. - IngestLevels []int - Done bool - Err error -} - -func (i FlushInfo) String() string { - return redact.StringWithoutMarkers(i) -} - -// SafeFormat implements redact.SafeFormatter. -func (i FlushInfo) SafeFormat(w redact.SafePrinter, _ rune) { - if i.Err != nil { - w.Printf("[JOB %d] flush error: %s", redact.Safe(i.JobID), i.Err) - return - } - - plural := redact.SafeString("s") - if i.Input == 1 { - plural = "" - } - if !i.Done { - w.Printf("[JOB %d] ", redact.Safe(i.JobID)) - if !i.Ingest { - w.Printf("flushing %d memtable", redact.Safe(i.Input)) - w.SafeString(plural) - w.Printf(" (%s) to L0", redact.Safe(humanize.Bytes.Uint64(i.InputBytes))) - } else { - w.Printf("flushing %d ingested table%s", redact.Safe(i.Input), plural) - } - return - } - - outputSize := tablesTotalSize(i.Output) - if !i.Ingest { - if invariants.Enabled && len(i.IngestLevels) > 0 { - panic(errors.AssertionFailedf("pebble: expected len(IngestedLevels) == 0")) - } - w.Printf("[JOB %d] flushed %d memtable%s (%s) to L0 [%s] (%s), in %.1fs (%.1fs total), output rate %s/s", - redact.Safe(i.JobID), redact.Safe(i.Input), plural, - redact.Safe(humanize.Bytes.Uint64(i.InputBytes)), - redact.Safe(formatFileNums(i.Output)), - redact.Safe(humanize.Bytes.Uint64(outputSize)), - redact.Safe(i.Duration.Seconds()), - 
redact.Safe(i.TotalDuration.Seconds()), - redact.Safe(humanize.Bytes.Uint64(uint64(float64(outputSize)/i.Duration.Seconds())))) - } else { - if invariants.Enabled && len(i.IngestLevels) == 0 { - panic(errors.AssertionFailedf("pebble: expected len(IngestedLevels) > 0")) - } - w.Printf("[JOB %d] flushed %d ingested flushable%s", - redact.Safe(i.JobID), redact.Safe(len(i.Output)), plural) - for j, level := range i.IngestLevels { - file := i.Output[j] - if j > 0 { - w.Printf(" +") - } - w.Printf(" L%d:%s (%s)", level, file.FileNum, humanize.Bytes.Uint64(file.Size)) - } - w.Printf(" in %.1fs (%.1fs total), output rate %s/s", - redact.Safe(i.Duration.Seconds()), - redact.Safe(i.TotalDuration.Seconds()), - redact.Safe(humanize.Bytes.Uint64(uint64(float64(outputSize)/i.Duration.Seconds())))) - } -} - -// ManifestCreateInfo contains info about a manifest creation event. -type ManifestCreateInfo struct { - // JobID is the ID of the job the caused the manifest to be created. - JobID int - Path string - // The file number of the new Manifest. - FileNum FileNum - Err error -} - -func (i ManifestCreateInfo) String() string { - return redact.StringWithoutMarkers(i) -} - -// SafeFormat implements redact.SafeFormatter. -func (i ManifestCreateInfo) SafeFormat(w redact.SafePrinter, _ rune) { - if i.Err != nil { - w.Printf("[JOB %d] MANIFEST create error: %s", redact.Safe(i.JobID), i.Err) - return - } - w.Printf("[JOB %d] MANIFEST created %s", redact.Safe(i.JobID), i.FileNum) -} - -// ManifestDeleteInfo contains the info for a Manifest deletion event. -type ManifestDeleteInfo struct { - // JobID is the ID of the job the caused the Manifest to be deleted. - JobID int - Path string - FileNum FileNum - Err error -} - -func (i ManifestDeleteInfo) String() string { - return redact.StringWithoutMarkers(i) -} - -// SafeFormat implements redact.SafeFormatter. 
-func (i ManifestDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune) { - if i.Err != nil { - w.Printf("[JOB %d] MANIFEST delete error: %s", redact.Safe(i.JobID), i.Err) - return - } - w.Printf("[JOB %d] MANIFEST deleted %s", redact.Safe(i.JobID), i.FileNum) -} - -// TableCreateInfo contains the info for a table creation event. -type TableCreateInfo struct { - JobID int - // Reason is the reason for the table creation: "compacting", "flushing", or - // "ingesting". - Reason string - Path string - FileNum FileNum -} - -func (i TableCreateInfo) String() string { - return redact.StringWithoutMarkers(i) -} - -// SafeFormat implements redact.SafeFormatter. -func (i TableCreateInfo) SafeFormat(w redact.SafePrinter, _ rune) { - w.Printf("[JOB %d] %s: sstable created %s", - redact.Safe(i.JobID), redact.Safe(i.Reason), i.FileNum) -} - -// TableDeleteInfo contains the info for a table deletion event. -type TableDeleteInfo struct { - JobID int - Path string - FileNum FileNum - Err error -} - -func (i TableDeleteInfo) String() string { - return redact.StringWithoutMarkers(i) -} - -// SafeFormat implements redact.SafeFormatter. -func (i TableDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune) { - if i.Err != nil { - w.Printf("[JOB %d] sstable delete error %s: %s", - redact.Safe(i.JobID), i.FileNum, i.Err) - return - } - w.Printf("[JOB %d] sstable deleted %s", redact.Safe(i.JobID), i.FileNum) -} - -// TableIngestInfo contains the info for a table ingestion event. -type TableIngestInfo struct { - // JobID is the ID of the job the caused the table to be ingested. - JobID int - Tables []struct { - TableInfo - Level int - } - // GlobalSeqNum is the sequence number that was assigned to all entries in - // the ingested table. - GlobalSeqNum uint64 - // flushable indicates whether the ingested sstable was treated as a - // flushable. 
- flushable bool - Err error -} - -func (i TableIngestInfo) String() string { - return redact.StringWithoutMarkers(i) -} - -// SafeFormat implements redact.SafeFormatter. -func (i TableIngestInfo) SafeFormat(w redact.SafePrinter, _ rune) { - if i.Err != nil { - w.Printf("[JOB %d] ingest error: %s", redact.Safe(i.JobID), i.Err) - return - } - - if i.flushable { - w.Printf("[JOB %d] ingested as flushable", redact.Safe(i.JobID)) - } else { - w.Printf("[JOB %d] ingested", redact.Safe(i.JobID)) - } - - for j := range i.Tables { - t := &i.Tables[j] - if j > 0 { - w.Printf(",") - } - levelStr := "" - if !i.flushable { - levelStr = fmt.Sprintf("L%d:", t.Level) - } - w.Printf(" %s%s (%s)", redact.Safe(levelStr), t.FileNum, - redact.Safe(humanize.Bytes.Uint64(t.Size))) - } -} - -// TableStatsInfo contains the info for a table stats loaded event. -type TableStatsInfo struct { - // JobID is the ID of the job that finished loading the initial tables' - // stats. - JobID int -} - -func (i TableStatsInfo) String() string { - return redact.StringWithoutMarkers(i) -} - -// SafeFormat implements redact.SafeFormatter. -func (i TableStatsInfo) SafeFormat(w redact.SafePrinter, _ rune) { - w.Printf("[JOB %d] all initial table stats loaded", redact.Safe(i.JobID)) -} - -// TableValidatedInfo contains information on the result of a validation run -// on an sstable. -type TableValidatedInfo struct { - JobID int - Meta *fileMetadata -} - -func (i TableValidatedInfo) String() string { - return redact.StringWithoutMarkers(i) -} - -// SafeFormat implements redact.SafeFormatter. -func (i TableValidatedInfo) SafeFormat(w redact.SafePrinter, _ rune) { - w.Printf("[JOB %d] validated table: %s", redact.Safe(i.JobID), i.Meta) -} - -// WALCreateInfo contains info about a WAL creation event. -type WALCreateInfo struct { - // JobID is the ID of the job the caused the WAL to be created. - JobID int - Path string - // The file number of the new WAL. 
- FileNum FileNum - // The file number of a previous WAL which was recycled to create this - // one. Zero if recycling did not take place. - RecycledFileNum FileNum - Err error -} - -func (i WALCreateInfo) String() string { - return redact.StringWithoutMarkers(i) -} - -// SafeFormat implements redact.SafeFormatter. -func (i WALCreateInfo) SafeFormat(w redact.SafePrinter, _ rune) { - if i.Err != nil { - w.Printf("[JOB %d] WAL create error: %s", redact.Safe(i.JobID), i.Err) - return - } - - if i.RecycledFileNum == 0 { - w.Printf("[JOB %d] WAL created %s", redact.Safe(i.JobID), i.FileNum) - return - } - - w.Printf("[JOB %d] WAL created %s (recycled %s)", - redact.Safe(i.JobID), i.FileNum, i.RecycledFileNum) -} - -// WALDeleteInfo contains the info for a WAL deletion event. -type WALDeleteInfo struct { - // JobID is the ID of the job the caused the WAL to be deleted. - JobID int - Path string - FileNum FileNum - Err error -} - -func (i WALDeleteInfo) String() string { - return redact.StringWithoutMarkers(i) -} - -// SafeFormat implements redact.SafeFormatter. -func (i WALDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune) { - if i.Err != nil { - w.Printf("[JOB %d] WAL delete error: %s", redact.Safe(i.JobID), i.Err) - return - } - w.Printf("[JOB %d] WAL deleted %s", redact.Safe(i.JobID), i.FileNum) -} - -// WriteStallBeginInfo contains the info for a write stall begin event. -type WriteStallBeginInfo struct { - Reason string -} - -func (i WriteStallBeginInfo) String() string { - return redact.StringWithoutMarkers(i) -} - -// SafeFormat implements redact.SafeFormatter. -func (i WriteStallBeginInfo) SafeFormat(w redact.SafePrinter, _ rune) { - w.Printf("write stall beginning: %s", redact.Safe(i.Reason)) -} - -// EventListener contains a set of functions that will be invoked when various -// significant DB events occur. 
Note that the functions should not run for an -// excessive amount of time as they are invoked synchronously by the DB and may -// block continued DB work. For a similar reason it is advisable to not perform -// any synchronous calls back into the DB. -type EventListener struct { - // BackgroundError is invoked whenever an error occurs during a background - // operation such as flush or compaction. - BackgroundError func(error) - - // CompactionBegin is invoked after the inputs to a compaction have been - // determined, but before the compaction has produced any output. - CompactionBegin func(CompactionInfo) - - // CompactionEnd is invoked after a compaction has completed and the result - // has been installed. - CompactionEnd func(CompactionInfo) - - // DiskSlow is invoked after a disk write operation on a file created with a - // disk health checking vfs.FS (see vfs.DefaultWithDiskHealthChecks) is - // observed to exceed the specified disk slowness threshold duration. DiskSlow - // is called on a goroutine that is monitoring slowness/stuckness. The callee - // MUST return without doing any IO, or blocking on anything (like a mutex) - // that is waiting on IO. This is imperative in order to reliably monitor for - // slowness, since if this goroutine gets stuck, the monitoring will stop - // working. - DiskSlow func(DiskSlowInfo) - - // FlushBegin is invoked after the inputs to a flush have been determined, - // but before the flush has produced any output. - FlushBegin func(FlushInfo) - - // FlushEnd is invoked after a flush has complated and the result has been - // installed. - FlushEnd func(FlushInfo) - - // FormatUpgrade is invoked after the database's FormatMajorVersion - // is upgraded. - FormatUpgrade func(FormatMajorVersion) - - // ManifestCreated is invoked after a manifest has been created. - ManifestCreated func(ManifestCreateInfo) - - // ManifestDeleted is invoked after a manifest has been deleted. 
- ManifestDeleted func(ManifestDeleteInfo) - - // TableCreated is invoked when a table has been created. - TableCreated func(TableCreateInfo) - - // TableDeleted is invoked after a table has been deleted. - TableDeleted func(TableDeleteInfo) - - // TableIngested is invoked after an externally created table has been - // ingested via a call to DB.Ingest(). - TableIngested func(TableIngestInfo) - - // TableStatsLoaded is invoked at most once, when the table stats - // collector has loaded statistics for all tables that existed at Open. - TableStatsLoaded func(TableStatsInfo) - - // TableValidated is invoked after validation runs on an sstable. - TableValidated func(TableValidatedInfo) - - // WALCreated is invoked after a WAL has been created. - WALCreated func(WALCreateInfo) - - // WALDeleted is invoked after a WAL has been deleted. - WALDeleted func(WALDeleteInfo) - - // WriteStallBegin is invoked when writes are intentionally delayed. - WriteStallBegin func(WriteStallBeginInfo) - - // WriteStallEnd is invoked when delayed writes are released. - WriteStallEnd func() -} - -// EnsureDefaults ensures that background error events are logged to the -// specified logger if a handler for those events hasn't been otherwise -// specified. Ensure all handlers are non-nil so that we don't have to check -// for nil-ness before invoking. 
-func (l *EventListener) EnsureDefaults(logger Logger) { - if l.BackgroundError == nil { - if logger != nil { - l.BackgroundError = func(err error) { - logger.Infof("background error: %s", err) - } - } else { - l.BackgroundError = func(error) {} - } - } - if l.CompactionBegin == nil { - l.CompactionBegin = func(info CompactionInfo) {} - } - if l.CompactionEnd == nil { - l.CompactionEnd = func(info CompactionInfo) {} - } - if l.DiskSlow == nil { - l.DiskSlow = func(info DiskSlowInfo) {} - } - if l.FlushBegin == nil { - l.FlushBegin = func(info FlushInfo) {} - } - if l.FlushEnd == nil { - l.FlushEnd = func(info FlushInfo) {} - } - if l.FormatUpgrade == nil { - l.FormatUpgrade = func(v FormatMajorVersion) {} - } - if l.ManifestCreated == nil { - l.ManifestCreated = func(info ManifestCreateInfo) {} - } - if l.ManifestDeleted == nil { - l.ManifestDeleted = func(info ManifestDeleteInfo) {} - } - if l.TableCreated == nil { - l.TableCreated = func(info TableCreateInfo) {} - } - if l.TableDeleted == nil { - l.TableDeleted = func(info TableDeleteInfo) {} - } - if l.TableIngested == nil { - l.TableIngested = func(info TableIngestInfo) {} - } - if l.TableStatsLoaded == nil { - l.TableStatsLoaded = func(info TableStatsInfo) {} - } - if l.TableValidated == nil { - l.TableValidated = func(validated TableValidatedInfo) {} - } - if l.WALCreated == nil { - l.WALCreated = func(info WALCreateInfo) {} - } - if l.WALDeleted == nil { - l.WALDeleted = func(info WALDeleteInfo) {} - } - if l.WriteStallBegin == nil { - l.WriteStallBegin = func(info WriteStallBeginInfo) {} - } - if l.WriteStallEnd == nil { - l.WriteStallEnd = func() {} - } -} - -// MakeLoggingEventListener creates an EventListener that logs all events to the -// specified logger. 
-func MakeLoggingEventListener(logger Logger) EventListener { - if logger == nil { - logger = DefaultLogger - } - - return EventListener{ - BackgroundError: func(err error) { - logger.Infof("background error: %s", err) - }, - CompactionBegin: func(info CompactionInfo) { - logger.Infof("%s", info) - }, - CompactionEnd: func(info CompactionInfo) { - logger.Infof("%s", info) - }, - DiskSlow: func(info DiskSlowInfo) { - logger.Infof("%s", info) - }, - FlushBegin: func(info FlushInfo) { - logger.Infof("%s", info) - }, - FlushEnd: func(info FlushInfo) { - logger.Infof("%s", info) - }, - FormatUpgrade: func(v FormatMajorVersion) { - logger.Infof("upgraded to format version: %s", v) - }, - ManifestCreated: func(info ManifestCreateInfo) { - logger.Infof("%s", info) - }, - ManifestDeleted: func(info ManifestDeleteInfo) { - logger.Infof("%s", info) - }, - TableCreated: func(info TableCreateInfo) { - logger.Infof("%s", info) - }, - TableDeleted: func(info TableDeleteInfo) { - logger.Infof("%s", info) - }, - TableIngested: func(info TableIngestInfo) { - logger.Infof("%s", info) - }, - TableStatsLoaded: func(info TableStatsInfo) { - logger.Infof("%s", info) - }, - TableValidated: func(info TableValidatedInfo) { - logger.Infof("%s", info) - }, - WALCreated: func(info WALCreateInfo) { - logger.Infof("%s", info) - }, - WALDeleted: func(info WALDeleteInfo) { - logger.Infof("%s", info) - }, - WriteStallBegin: func(info WriteStallBeginInfo) { - logger.Infof("%s", info) - }, - WriteStallEnd: func() { - logger.Infof("write stall ending") - }, - } -} - -// TeeEventListener wraps two EventListeners, forwarding all events to both. 
-func TeeEventListener(a, b EventListener) EventListener { - a.EnsureDefaults(nil) - b.EnsureDefaults(nil) - return EventListener{ - BackgroundError: func(err error) { - a.BackgroundError(err) - b.BackgroundError(err) - }, - CompactionBegin: func(info CompactionInfo) { - a.CompactionBegin(info) - b.CompactionBegin(info) - }, - CompactionEnd: func(info CompactionInfo) { - a.CompactionEnd(info) - b.CompactionEnd(info) - }, - DiskSlow: func(info DiskSlowInfo) { - a.DiskSlow(info) - b.DiskSlow(info) - }, - FlushBegin: func(info FlushInfo) { - a.FlushBegin(info) - b.FlushBegin(info) - }, - FlushEnd: func(info FlushInfo) { - a.FlushEnd(info) - b.FlushEnd(info) - }, - FormatUpgrade: func(v FormatMajorVersion) { - a.FormatUpgrade(v) - b.FormatUpgrade(v) - }, - ManifestCreated: func(info ManifestCreateInfo) { - a.ManifestCreated(info) - b.ManifestCreated(info) - }, - ManifestDeleted: func(info ManifestDeleteInfo) { - a.ManifestDeleted(info) - b.ManifestDeleted(info) - }, - TableCreated: func(info TableCreateInfo) { - a.TableCreated(info) - b.TableCreated(info) - }, - TableDeleted: func(info TableDeleteInfo) { - a.TableDeleted(info) - b.TableDeleted(info) - }, - TableIngested: func(info TableIngestInfo) { - a.TableIngested(info) - b.TableIngested(info) - }, - TableStatsLoaded: func(info TableStatsInfo) { - a.TableStatsLoaded(info) - b.TableStatsLoaded(info) - }, - TableValidated: func(info TableValidatedInfo) { - a.TableValidated(info) - b.TableValidated(info) - }, - WALCreated: func(info WALCreateInfo) { - a.WALCreated(info) - b.WALCreated(info) - }, - WALDeleted: func(info WALDeleteInfo) { - a.WALDeleted(info) - b.WALDeleted(info) - }, - WriteStallBegin: func(info WriteStallBeginInfo) { - a.WriteStallBegin(info) - b.WriteStallBegin(info) - }, - WriteStallEnd: func() { - a.WriteStallEnd() - b.WriteStallEnd() - }, - } -} diff --git a/vendor/github.com/cockroachdb/pebble/external_iterator.go b/vendor/github.com/cockroachdb/pebble/external_iterator.go deleted file mode 100644 
index 1bd00c3..0000000 --- a/vendor/github.com/cockroachdb/pebble/external_iterator.go +++ /dev/null @@ -1,555 +0,0 @@ -// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package pebble - -import ( - "context" - "fmt" - "sort" - - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" - "github.com/cockroachdb/pebble/sstable" -) - -// ExternalIterOption provide an interface to specify open-time options to -// NewExternalIter. -type ExternalIterOption interface { - // iterApply is called on the iterator during opening in order to set internal - // parameters. - iterApply(*Iterator) - // readerOptions returns any reader options added by this iter option. - readerOptions() []sstable.ReaderOption -} - -type externalIterReaderOptions struct { - opts []sstable.ReaderOption -} - -func (e *externalIterReaderOptions) iterApply(iterator *Iterator) { - // Do nothing. -} - -func (e *externalIterReaderOptions) readerOptions() []sstable.ReaderOption { - return e.opts -} - -// ExternalIterReaderOptions returns an ExternalIterOption that specifies -// sstable.ReaderOptions to be applied on sstable readers in NewExternalIter. -func ExternalIterReaderOptions(opts ...sstable.ReaderOption) ExternalIterOption { - return &externalIterReaderOptions{opts: opts} -} - -// ExternalIterForwardOnly is an ExternalIterOption that specifies this iterator -// will only be used for forward positioning operations (First, SeekGE, Next). -// This could enable optimizations that take advantage of this invariant. -// Behaviour when a reverse positioning operation is done on an iterator -// opened with this option is unpredictable, though in most cases it should. 
-type ExternalIterForwardOnly struct{} - -func (e ExternalIterForwardOnly) iterApply(iter *Iterator) { - iter.forwardOnly = true -} - -func (e ExternalIterForwardOnly) readerOptions() []sstable.ReaderOption { - return nil -} - -// NewExternalIter takes an input 2d array of sstable files which may overlap -// across subarrays but not within a subarray (at least as far as points are -// concerned; range keys are allowed to overlap arbitrarily even within a -// subarray), and returns an Iterator over the merged contents of the sstables. -// Input sstables may contain point keys, range keys, range deletions, etc. The -// input files slice must be sorted in reverse chronological ordering. A key in a -// file at a lower index subarray will shadow a key with an identical user key -// contained within a file at a higher index subarray. Each subarray must be -// sorted in internal key order, where lower index files contain keys that sort -// left of files with higher indexes. -// -// Input sstables must only contain keys with the zero sequence number. -// -// Iterators constructed through NewExternalIter do not support all iterator -// options, including block-property and table filters. NewExternalIter errors -// if an incompatible option is set. -func NewExternalIter( - o *Options, - iterOpts *IterOptions, - files [][]sstable.ReadableFile, - extraOpts ...ExternalIterOption, -) (it *Iterator, err error) { - return NewExternalIterWithContext(context.Background(), o, iterOpts, files, extraOpts...) -} - -// NewExternalIterWithContext is like NewExternalIter, and additionally -// accepts a context for tracing. 
-func NewExternalIterWithContext( - ctx context.Context, - o *Options, - iterOpts *IterOptions, - files [][]sstable.ReadableFile, - extraOpts ...ExternalIterOption, -) (it *Iterator, err error) { - if iterOpts != nil { - if err := validateExternalIterOpts(iterOpts); err != nil { - return nil, err - } - } - - var readers [][]*sstable.Reader - - // Ensure we close all the opened readers if we error out. - defer func() { - if err != nil { - for i := range readers { - for j := range readers[i] { - _ = readers[i][j].Close() - } - } - } - }() - seqNumOffset := 0 - var extraReaderOpts []sstable.ReaderOption - for i := range extraOpts { - extraReaderOpts = append(extraReaderOpts, extraOpts[i].readerOptions()...) - } - for _, levelFiles := range files { - seqNumOffset += len(levelFiles) - } - for _, levelFiles := range files { - var subReaders []*sstable.Reader - seqNumOffset -= len(levelFiles) - subReaders, err = openExternalTables(o, levelFiles, seqNumOffset, o.MakeReaderOptions(), extraReaderOpts...) - readers = append(readers, subReaders) - } - if err != nil { - return nil, err - } - - buf := iterAllocPool.Get().(*iterAlloc) - dbi := &buf.dbi - *dbi = Iterator{ - ctx: ctx, - alloc: buf, - merge: o.Merger.Merge, - comparer: *o.Comparer, - readState: nil, - keyBuf: buf.keyBuf, - prefixOrFullSeekKey: buf.prefixOrFullSeekKey, - boundsBuf: buf.boundsBuf, - batch: nil, - // Add the readers to the Iterator so that Close closes them, and - // SetOptions can re-construct iterators from them. - externalReaders: readers, - newIters: func( - ctx context.Context, f *manifest.FileMetadata, opts *IterOptions, - internalOpts internalIterOpts) (internalIterator, keyspan.FragmentIterator, error) { - // NB: External iterators are currently constructed without any - // `levelIters`. newIters should never be called. 
When we support - // organizing multiple non-overlapping files into a single level - // (see TODO below), we'll need to adjust this tableNewIters - // implementation to open iterators by looking up f in a map - // of readers indexed by *fileMetadata. - panic("unreachable") - }, - seqNum: base.InternalKeySeqNumMax, - } - if iterOpts != nil { - dbi.opts = *iterOpts - dbi.processBounds(iterOpts.LowerBound, iterOpts.UpperBound) - } - for i := range extraOpts { - extraOpts[i].iterApply(dbi) - } - finishInitializingExternal(ctx, dbi) - return dbi, nil -} - -func validateExternalIterOpts(iterOpts *IterOptions) error { - switch { - case iterOpts.TableFilter != nil: - return errors.Errorf("pebble: external iterator: TableFilter unsupported") - case iterOpts.PointKeyFilters != nil: - return errors.Errorf("pebble: external iterator: PointKeyFilters unsupported") - case iterOpts.RangeKeyFilters != nil: - return errors.Errorf("pebble: external iterator: RangeKeyFilters unsupported") - case iterOpts.OnlyReadGuaranteedDurable: - return errors.Errorf("pebble: external iterator: OnlyReadGuaranteedDurable unsupported") - case iterOpts.UseL6Filters: - return errors.Errorf("pebble: external iterator: UseL6Filters unsupported") - } - return nil -} - -func createExternalPointIter(ctx context.Context, it *Iterator) (internalIterator, error) { - // TODO(jackson): In some instances we could generate fewer levels by using - // L0Sublevels code to organize nonoverlapping files into the same level. - // This would allow us to use levelIters and keep a smaller set of data and - // files in-memory. However, it would also require us to identify the bounds - // of all the files upfront. 
- - if !it.opts.pointKeys() { - return emptyIter, nil - } else if it.pointIter != nil { - return it.pointIter, nil - } - mlevels := it.alloc.mlevels[:0] - - if len(it.externalReaders) > cap(mlevels) { - mlevels = make([]mergingIterLevel, 0, len(it.externalReaders)) - } - for _, readers := range it.externalReaders { - var combinedIters []internalIterator - for _, r := range readers { - var ( - rangeDelIter keyspan.FragmentIterator - pointIter internalIterator - err error - ) - // We could set hideObsoletePoints=true, since we are reading at - // InternalKeySeqNumMax, but we don't bother since these sstables should - // not have obsolete points (so the performance optimization is - // unnecessary), and we don't want to bother constructing a - // BlockPropertiesFilterer that includes obsoleteKeyBlockPropertyFilter. - pointIter, err = r.NewIterWithBlockPropertyFiltersAndContextEtc( - ctx, it.opts.LowerBound, it.opts.UpperBound, nil, /* BlockPropertiesFilterer */ - false /* hideObsoletePoints */, false, /* useFilterBlock */ - &it.stats.InternalStats, sstable.TrivialReaderProvider{Reader: r}) - if err != nil { - return nil, err - } - rangeDelIter, err = r.NewRawRangeDelIter() - if err != nil { - return nil, err - } - if rangeDelIter == nil && pointIter != nil && it.forwardOnly { - // TODO(bilal): Consider implementing range key pausing in - // simpleLevelIter so we can reduce mergingIterLevels even more by - // sending all sstable iterators to combinedIters, not just those - // corresponding to sstables without range deletes. 
- combinedIters = append(combinedIters, pointIter) - continue - } - mlevels = append(mlevels, mergingIterLevel{ - iter: pointIter, - rangeDelIter: rangeDelIter, - }) - } - if len(combinedIters) == 1 { - mlevels = append(mlevels, mergingIterLevel{ - iter: combinedIters[0], - }) - } else if len(combinedIters) > 1 { - sli := &simpleLevelIter{ - cmp: it.cmp, - iters: combinedIters, - } - sli.init(it.opts) - mlevels = append(mlevels, mergingIterLevel{ - iter: sli, - rangeDelIter: nil, - }) - } - } - if len(mlevels) == 1 && mlevels[0].rangeDelIter == nil { - // Set closePointIterOnce to true. This is because we're bypassing the - // merging iter, which turns Close()s on it idempotent for any child - // iterators. The outer Iterator could call Close() on a point iter twice, - // which sstable iterators do not support (as they release themselves to - // a pool). - it.closePointIterOnce = true - return mlevels[0].iter, nil - } - - it.alloc.merging.init(&it.opts, &it.stats.InternalStats, it.comparer.Compare, it.comparer.Split, mlevels...) - it.alloc.merging.snapshot = base.InternalKeySeqNumMax - if len(mlevels) <= cap(it.alloc.levelsPositioned) { - it.alloc.merging.levelsPositioned = it.alloc.levelsPositioned[:len(mlevels)] - } - return &it.alloc.merging, nil -} - -func finishInitializingExternal(ctx context.Context, it *Iterator) { - pointIter, err := createExternalPointIter(ctx, it) - if err != nil { - it.pointIter = &errorIter{err: err} - } else { - it.pointIter = pointIter - } - it.iter = it.pointIter - - if it.opts.rangeKeys() { - it.rangeKeyMasking.init(it, it.comparer.Compare, it.comparer.Split) - var rangeKeyIters []keyspan.FragmentIterator - if it.rangeKey == nil { - // We could take advantage of the lack of overlaps in range keys within - // each slice in it.externalReaders, and generate keyspan.LevelIters - // out of those. 
However, since range keys are expected to be sparse to - // begin with, the performance gain might not be significant enough to - // warrant it. - // - // TODO(bilal): Explore adding a simpleRangeKeyLevelIter that does not - // operate on FileMetadatas (similar to simpleLevelIter), and implements - // this optimization. - for _, readers := range it.externalReaders { - for _, r := range readers { - if rki, err := r.NewRawRangeKeyIter(); err != nil { - rangeKeyIters = append(rangeKeyIters, &errorKeyspanIter{err: err}) - } else if rki != nil { - rangeKeyIters = append(rangeKeyIters, rki) - } - } - } - if len(rangeKeyIters) > 0 { - it.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState) - it.rangeKey.init(it.comparer.Compare, it.comparer.Split, &it.opts) - it.rangeKey.rangeKeyIter = it.rangeKey.iterConfig.Init( - &it.comparer, - base.InternalKeySeqNumMax, - it.opts.LowerBound, it.opts.UpperBound, - &it.hasPrefix, &it.prefixOrFullSeekKey, - false /* internalKeys */, &it.rangeKey.internal, - ) - for i := range rangeKeyIters { - it.rangeKey.iterConfig.AddLevel(rangeKeyIters[i]) - } - } - } - if it.rangeKey != nil { - it.rangeKey.iiter.Init(&it.comparer, it.iter, it.rangeKey.rangeKeyIter, - keyspan.InterleavingIterOpts{ - Mask: &it.rangeKeyMasking, - LowerBound: it.opts.LowerBound, - UpperBound: it.opts.UpperBound, - }) - it.iter = &it.rangeKey.iiter - } - } -} - -func openExternalTables( - o *Options, - files []sstable.ReadableFile, - seqNumOffset int, - readerOpts sstable.ReaderOptions, - extraReaderOpts ...sstable.ReaderOption, -) (readers []*sstable.Reader, err error) { - readers = make([]*sstable.Reader, 0, len(files)) - for i := range files { - readable, err := sstable.NewSimpleReadable(files[i]) - if err != nil { - return readers, err - } - r, err := sstable.NewReader(readable, readerOpts, extraReaderOpts...) - if err != nil { - return readers, err - } - // Use the index of the file in files as the sequence number for all of - // its keys. 
- r.Properties.GlobalSeqNum = uint64(len(files) - i + seqNumOffset) - readers = append(readers, r) - } - return readers, err -} - -// simpleLevelIter is similar to a levelIter in that it merges the points -// from multiple point iterators that are non-overlapping in the key ranges -// they return. It is only expected to support forward iteration and forward -// regular seeking; reverse iteration and prefix seeking is not supported. -// Intended to be a low-overhead, non-FileMetadata dependent option for -// NewExternalIter. To optimize seeking and forward iteration, it maintains -// two slices of child iterators; one of all iterators, and a subset of it that -// contains just the iterators that contain point keys within the current -// bounds. -// -// Note that this levelIter does not support pausing at file boundaries -// in case of range tombstones in this file that could apply to points outside -// of this file (and outside of this level). This is sufficient for optimizing -// the main use cases of NewExternalIter, however for completeness it would make -// sense to build this pausing functionality in. -type simpleLevelIter struct { - cmp Compare - err error - lowerBound []byte - iters []internalIterator - filtered []internalIterator - firstKeys [][]byte - firstKeysBuf []byte - currentIdx int -} - -var _ internalIterator = &simpleLevelIter{} - -// init initializes this simpleLevelIter. 
-func (s *simpleLevelIter) init(opts IterOptions) { - s.currentIdx = 0 - s.lowerBound = opts.LowerBound - s.resetFilteredIters() -} - -func (s *simpleLevelIter) resetFilteredIters() { - s.filtered = s.filtered[:0] - s.firstKeys = s.firstKeys[:0] - s.firstKeysBuf = s.firstKeysBuf[:0] - s.err = nil - for i := range s.iters { - var iterKey *base.InternalKey - if s.lowerBound != nil { - iterKey, _ = s.iters[i].SeekGE(s.lowerBound, base.SeekGEFlagsNone) - } else { - iterKey, _ = s.iters[i].First() - } - if iterKey != nil { - s.filtered = append(s.filtered, s.iters[i]) - bufStart := len(s.firstKeysBuf) - s.firstKeysBuf = append(s.firstKeysBuf, iterKey.UserKey...) - s.firstKeys = append(s.firstKeys, s.firstKeysBuf[bufStart:bufStart+len(iterKey.UserKey)]) - } else if err := s.iters[i].Error(); err != nil { - s.err = err - } - } -} - -func (s *simpleLevelIter) SeekGE( - key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - if s.err != nil { - return nil, base.LazyValue{} - } - // Find the first file that is entirely >= key. The file before that could - // contain the key we're looking for. 
- n := sort.Search(len(s.firstKeys), func(i int) bool { - return s.cmp(key, s.firstKeys[i]) <= 0 - }) - if n > 0 { - s.currentIdx = n - 1 - } else { - s.currentIdx = n - } - if s.currentIdx < len(s.filtered) { - if iterKey, val := s.filtered[s.currentIdx].SeekGE(key, flags); iterKey != nil { - return iterKey, val - } - if err := s.filtered[s.currentIdx].Error(); err != nil { - s.err = err - } - s.currentIdx++ - } - return s.skipEmptyFileForward(key, flags) -} - -func (s *simpleLevelIter) skipEmptyFileForward( - seekKey []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - var iterKey *base.InternalKey - var val base.LazyValue - for s.currentIdx >= 0 && s.currentIdx < len(s.filtered) && s.err == nil { - if seekKey != nil { - iterKey, val = s.filtered[s.currentIdx].SeekGE(seekKey, flags) - } else if s.lowerBound != nil { - iterKey, val = s.filtered[s.currentIdx].SeekGE(s.lowerBound, flags) - } else { - iterKey, val = s.filtered[s.currentIdx].First() - } - if iterKey != nil { - return iterKey, val - } - if err := s.filtered[s.currentIdx].Error(); err != nil { - s.err = err - } - s.currentIdx++ - } - return nil, base.LazyValue{} -} - -func (s *simpleLevelIter) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - panic("unimplemented") -} - -func (s *simpleLevelIter) SeekLT( - key []byte, flags base.SeekLTFlags, -) (*base.InternalKey, base.LazyValue) { - panic("unimplemented") -} - -func (s *simpleLevelIter) First() (*base.InternalKey, base.LazyValue) { - if s.err != nil { - return nil, base.LazyValue{} - } - s.currentIdx = 0 - return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone) -} - -func (s *simpleLevelIter) Last() (*base.InternalKey, base.LazyValue) { - panic("unimplemented") -} - -func (s *simpleLevelIter) Next() (*base.InternalKey, base.LazyValue) { - if s.err != nil { - return nil, base.LazyValue{} - } - if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) { - return nil, 
base.LazyValue{} - } - if iterKey, val := s.filtered[s.currentIdx].Next(); iterKey != nil { - return iterKey, val - } - s.currentIdx++ - return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone) -} - -func (s *simpleLevelIter) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) { - if s.err != nil { - return nil, base.LazyValue{} - } - if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) { - return nil, base.LazyValue{} - } - if iterKey, val := s.filtered[s.currentIdx].NextPrefix(succKey); iterKey != nil { - return iterKey, val - } - s.currentIdx++ - return s.skipEmptyFileForward(succKey /* seekKey */, base.SeekGEFlagsNone) -} - -func (s *simpleLevelIter) Prev() (*base.InternalKey, base.LazyValue) { - panic("unimplemented") -} - -func (s *simpleLevelIter) Error() error { - if s.currentIdx >= 0 && s.currentIdx < len(s.filtered) { - s.err = firstError(s.err, s.filtered[s.currentIdx].Error()) - } - return s.err -} - -func (s *simpleLevelIter) Close() error { - var err error - for i := range s.iters { - err = firstError(err, s.iters[i].Close()) - } - return err -} - -func (s *simpleLevelIter) SetBounds(lower, upper []byte) { - s.currentIdx = -1 - s.lowerBound = lower - for i := range s.iters { - s.iters[i].SetBounds(lower, upper) - } - s.resetFilteredIters() -} - -func (s *simpleLevelIter) String() string { - if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) { - return "simpleLevelIter: current=" - } - return fmt.Sprintf("simpleLevelIter: current=%s", s.filtered[s.currentIdx]) -} - -var _ internalIterator = &simpleLevelIter{} diff --git a/vendor/github.com/cockroachdb/pebble/filenames.go b/vendor/github.com/cockroachdb/pebble/filenames.go deleted file mode 100644 index 07d74c8..0000000 --- a/vendor/github.com/cockroachdb/pebble/filenames.go +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package pebble - -import ( - "fmt" - - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/vfs" -) - -type fileType = base.FileType - -// FileNum is an identifier for a file within a database. -type FileNum = base.FileNum - -const ( - fileTypeLog = base.FileTypeLog - fileTypeLock = base.FileTypeLock - fileTypeTable = base.FileTypeTable - fileTypeManifest = base.FileTypeManifest - fileTypeCurrent = base.FileTypeCurrent - fileTypeOptions = base.FileTypeOptions - fileTypeTemp = base.FileTypeTemp - fileTypeOldTemp = base.FileTypeOldTemp -) - -// setCurrentFile sets the CURRENT file to point to the manifest with -// provided file number. -// -// NB: This is a low-level routine and typically not what you want to -// use. Newer versions of Pebble running newer format major versions do -// not use the CURRENT file. See setCurrentFunc in version_set.go. -func setCurrentFile(dirname string, fs vfs.FS, fileNum base.DiskFileNum) error { - newFilename := base.MakeFilepath(fs, dirname, fileTypeCurrent, fileNum) - oldFilename := base.MakeFilepath(fs, dirname, fileTypeTemp, fileNum) - fs.Remove(oldFilename) - f, err := fs.Create(oldFilename) - if err != nil { - return err - } - if _, err := fmt.Fprintf(f, "MANIFEST-%s\n", fileNum); err != nil { - return err - } - if err := f.Sync(); err != nil { - return err - } - if err := f.Close(); err != nil { - return err - } - return fs.Rename(oldFilename, newFilename) -} diff --git a/vendor/github.com/cockroachdb/pebble/flushable.go b/vendor/github.com/cockroachdb/pebble/flushable.go deleted file mode 100644 index 09abee3..0000000 --- a/vendor/github.com/cockroachdb/pebble/flushable.go +++ /dev/null @@ -1,249 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. 
- -package pebble - -import ( - "context" - "fmt" - "sync/atomic" - "time" - - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" -) - -// flushable defines the interface for immutable memtables. -type flushable interface { - newIter(o *IterOptions) internalIterator - newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator - newRangeDelIter(o *IterOptions) keyspan.FragmentIterator - newRangeKeyIter(o *IterOptions) keyspan.FragmentIterator - containsRangeKeys() bool - // inuseBytes returns the number of inuse bytes by the flushable. - inuseBytes() uint64 - // totalBytes returns the total number of bytes allocated by the flushable. - totalBytes() uint64 - // readyForFlush returns true when the flushable is ready for flushing. See - // memTable.readyForFlush for one implementation which needs to check whether - // there are any outstanding write references. - readyForFlush() bool -} - -// flushableEntry wraps a flushable and adds additional metadata and -// functionality that is common to all flushables. -type flushableEntry struct { - flushable - // Channel which is closed when the flushable has been flushed. - flushed chan struct{} - // flushForced indicates whether a flush was forced on this memtable (either - // manual, or due to ingestion). Protected by DB.mu. - flushForced bool - // delayedFlushForcedAt indicates whether a timer has been set to force a - // flush on this memtable at some point in the future. Protected by DB.mu. - // Holds the timestamp of when the flush will be issued. - delayedFlushForcedAt time.Time - // logNum corresponds to the WAL that contains the records present in the - // receiver. - logNum FileNum - // logSize is the size in bytes of the associated WAL. Protected by DB.mu. - logSize uint64 - // The current logSeqNum at the time the memtable was created. This is - // guaranteed to be less than or equal to any seqnum stored in the memtable. 
- logSeqNum uint64 - // readerRefs tracks the read references on the flushable. The two sources of - // reader references are DB.mu.mem.queue and readState.memtables. The memory - // reserved by the flushable in the cache is released when the reader refs - // drop to zero. If the flushable is referencing sstables, then the file - // refount is also decreased once the reader refs drops to 0. If the - // flushable is a memTable, when the reader refs drops to zero, the writer - // refs will already be zero because the memtable will have been flushed and - // that only occurs once the writer refs drops to zero. - readerRefs atomic.Int32 - // Closure to invoke to release memory accounting. - releaseMemAccounting func() - // unrefFiles, if not nil, should be invoked to decrease the ref count of - // files which are backing the flushable. - unrefFiles func() []*fileBacking - // deleteFnLocked should be called if the caller is holding DB.mu. - deleteFnLocked func(obsolete []*fileBacking) - // deleteFn should be called if the caller is not holding DB.mu. - deleteFn func(obsolete []*fileBacking) -} - -func (e *flushableEntry) readerRef() { - switch v := e.readerRefs.Add(1); { - case v <= 1: - panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) - } -} - -// db.mu must not be held when this is called. -func (e *flushableEntry) readerUnref(deleteFiles bool) { - e.readerUnrefHelper(deleteFiles, e.deleteFn) -} - -// db.mu must be held when this is called. 
-func (e *flushableEntry) readerUnrefLocked(deleteFiles bool) { - e.readerUnrefHelper(deleteFiles, e.deleteFnLocked) -} - -func (e *flushableEntry) readerUnrefHelper( - deleteFiles bool, deleteFn func(obsolete []*fileBacking), -) { - switch v := e.readerRefs.Add(-1); { - case v < 0: - panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) - case v == 0: - if e.releaseMemAccounting == nil { - panic("pebble: memtable reservation already released") - } - e.releaseMemAccounting() - e.releaseMemAccounting = nil - if e.unrefFiles != nil { - obsolete := e.unrefFiles() - e.unrefFiles = nil - if deleteFiles { - deleteFn(obsolete) - } - } - } -} - -type flushableList []*flushableEntry - -// ingestedFlushable is the implementation of the flushable interface for the -// ingesting sstables which are added to the flushable list. -type ingestedFlushable struct { - files []physicalMeta - comparer *Comparer - newIters tableNewIters - newRangeKeyIters keyspan.TableNewSpanIter - - // Since the level slice is immutable, we construct and set it once. It - // should be safe to read from slice in future reads. - slice manifest.LevelSlice - // hasRangeKeys is set on ingestedFlushable construction. - hasRangeKeys bool -} - -func newIngestedFlushable( - files []*fileMetadata, - comparer *Comparer, - newIters tableNewIters, - newRangeKeyIters keyspan.TableNewSpanIter, -) *ingestedFlushable { - var physicalFiles []physicalMeta - var hasRangeKeys bool - for _, f := range files { - if f.HasRangeKeys { - hasRangeKeys = true - } - physicalFiles = append(physicalFiles, f.PhysicalMeta()) - } - - ret := &ingestedFlushable{ - files: physicalFiles, - comparer: comparer, - newIters: newIters, - newRangeKeyIters: newRangeKeyIters, - // slice is immutable and can be set once and used many times. 
- slice: manifest.NewLevelSliceKeySorted(comparer.Compare, files), - hasRangeKeys: hasRangeKeys, - } - - return ret -} - -// TODO(sumeer): ingestedFlushable iters also need to plumb context for -// tracing. - -// newIter is part of the flushable interface. -func (s *ingestedFlushable) newIter(o *IterOptions) internalIterator { - var opts IterOptions - if o != nil { - opts = *o - } - // TODO(bananabrick): The manifest.Level in newLevelIter is only used for - // logging. Update the manifest.Level encoding to account for levels which - // aren't truly levels in the lsm. Right now, the encoding only supports - // L0 sublevels, and the rest of the levels in the lsm. - return newLevelIter( - opts, s.comparer, s.newIters, s.slice.Iter(), manifest.Level(0), internalIterOpts{}, - ) -} - -// newFlushIter is part of the flushable interface. -func (s *ingestedFlushable) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator { - // newFlushIter is only used for writing memtables to disk as sstables. - // Since ingested sstables are already present on disk, they don't need to - // make use of a flush iter. - panic("pebble: not implemented") -} - -func (s *ingestedFlushable) constructRangeDelIter( - file *manifest.FileMetadata, _ keyspan.SpanIterOptions, -) (keyspan.FragmentIterator, error) { - // Note that the keyspan level iter expects a non-nil iterator to be - // returned even if there is an error. So, we return the emptyKeyspanIter. - iter, rangeDelIter, err := s.newIters(context.Background(), file, nil, internalIterOpts{}) - if err != nil { - return emptyKeyspanIter, err - } - iter.Close() - if rangeDelIter == nil { - return emptyKeyspanIter, nil - } - return rangeDelIter, nil -} - -// newRangeDelIter is part of the flushable interface. -// TODO(bananabrick): Using a level iter instead of a keyspan level iter to -// surface range deletes is more efficient. 
-func (s *ingestedFlushable) newRangeDelIter(_ *IterOptions) keyspan.FragmentIterator { - return keyspan.NewLevelIter( - keyspan.SpanIterOptions{}, s.comparer.Compare, - s.constructRangeDelIter, s.slice.Iter(), manifest.Level(0), - manifest.KeyTypePoint, - ) -} - -// newRangeKeyIter is part of the flushable interface. -func (s *ingestedFlushable) newRangeKeyIter(o *IterOptions) keyspan.FragmentIterator { - if !s.containsRangeKeys() { - return nil - } - - return keyspan.NewLevelIter( - keyspan.SpanIterOptions{}, s.comparer.Compare, s.newRangeKeyIters, - s.slice.Iter(), manifest.Level(0), manifest.KeyTypeRange, - ) -} - -// containsRangeKeys is part of the flushable interface. -func (s *ingestedFlushable) containsRangeKeys() bool { - return s.hasRangeKeys -} - -// inuseBytes is part of the flushable interface. -func (s *ingestedFlushable) inuseBytes() uint64 { - // inuseBytes is only used when memtables are flushed to disk as sstables. - panic("pebble: not implemented") -} - -// totalBytes is part of the flushable interface. -func (s *ingestedFlushable) totalBytes() uint64 { - // We don't allocate additional bytes for the ingestedFlushable. - return 0 -} - -// readyForFlush is part of the flushable interface. -func (s *ingestedFlushable) readyForFlush() bool { - // ingestedFlushable should always be ready to flush. However, note that - // memtables before the ingested sstables in the memtable queue must be - // flushed before an ingestedFlushable can be flushed. This is because the - // ingested sstables need an updated view of the Version to - // determine where to place the files in the lsm. - return true -} diff --git a/vendor/github.com/cockroachdb/pebble/get_iter.go b/vendor/github.com/cockroachdb/pebble/get_iter.go deleted file mode 100644 index 99c5d7c..0000000 --- a/vendor/github.com/cockroachdb/pebble/get_iter.go +++ /dev/null @@ -1,243 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package pebble - -import ( - "context" - "fmt" - - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" -) - -// getIter is an internal iterator used to perform gets. It iterates through -// the values for a particular key, level by level. It is not a general purpose -// internalIterator, but specialized for Get operations so that it loads data -// lazily. -type getIter struct { - logger Logger - comparer *Comparer - newIters tableNewIters - snapshot uint64 - key []byte - iter internalIterator - rangeDelIter keyspan.FragmentIterator - tombstone *keyspan.Span - levelIter levelIter - level int - batch *Batch - mem flushableList - l0 []manifest.LevelSlice - version *version - iterKey *InternalKey - iterValue base.LazyValue - err error -} - -// TODO(sumeer): CockroachDB code doesn't use getIter, but, for completeness, -// make this implement InternalIteratorWithStats. - -// getIter implements the base.InternalIterator interface. 
-var _ base.InternalIterator = (*getIter)(nil) - -func (g *getIter) String() string { - return fmt.Sprintf("len(l0)=%d, len(mem)=%d, level=%d", len(g.l0), len(g.mem), g.level) -} - -func (g *getIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { - panic("pebble: SeekGE unimplemented") -} - -func (g *getIter) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - panic("pebble: SeekPrefixGE unimplemented") -} - -func (g *getIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { - panic("pebble: SeekLT unimplemented") -} - -func (g *getIter) First() (*InternalKey, base.LazyValue) { - return g.Next() -} - -func (g *getIter) Last() (*InternalKey, base.LazyValue) { - panic("pebble: Last unimplemented") -} - -func (g *getIter) Next() (*InternalKey, base.LazyValue) { - if g.iter != nil { - g.iterKey, g.iterValue = g.iter.Next() - } - - for { - if g.iter != nil { - // We have to check rangeDelIter on each iteration because a single - // user-key can be spread across multiple tables in a level. A range - // tombstone will appear in the table corresponding to its start - // key. Every call to levelIter.Next() potentially switches to a new - // table and thus reinitializes rangeDelIter. - if g.rangeDelIter != nil { - g.tombstone = keyspan.Get(g.comparer.Compare, g.rangeDelIter, g.key) - if g.err = g.rangeDelIter.Close(); g.err != nil { - return nil, base.LazyValue{} - } - g.rangeDelIter = nil - } - - if g.iterKey != nil { - key := g.iterKey - if g.tombstone != nil && g.tombstone.CoversAt(g.snapshot, key.SeqNum()) { - // We have a range tombstone covering this key. Rather than return a - // point or range deletion here, we return false and close our - // internal iterator which will make Valid() return false, - // effectively stopping iteration. 
- g.err = g.iter.Close() - g.iter = nil - return nil, base.LazyValue{} - } - if g.comparer.Equal(g.key, key.UserKey) { - if !key.Visible(g.snapshot, base.InternalKeySeqNumMax) { - g.iterKey, g.iterValue = g.iter.Next() - continue - } - return g.iterKey, g.iterValue - } - } - // We've advanced the iterator passed the desired key. Move on to the - // next memtable / level. - g.err = g.iter.Close() - g.iter = nil - if g.err != nil { - return nil, base.LazyValue{} - } - } - - // Create an iterator from the batch. - if g.batch != nil { - if g.batch.index == nil { - g.err = ErrNotIndexed - g.iterKey, g.iterValue = nil, base.LazyValue{} - return nil, base.LazyValue{} - } - g.iter = g.batch.newInternalIter(nil) - g.rangeDelIter = g.batch.newRangeDelIter( - nil, - // Get always reads the entirety of the batch's history, so no - // batch keys should be filtered. - base.InternalKeySeqNumMax, - ) - g.iterKey, g.iterValue = g.iter.SeekGE(g.key, base.SeekGEFlagsNone) - g.batch = nil - continue - } - - // If we have a tombstone from a previous level it is guaranteed to delete - // keys in lower levels. - if g.tombstone != nil && g.tombstone.VisibleAt(g.snapshot) { - return nil, base.LazyValue{} - } - - // Create iterators from memtables from newest to oldest. - if n := len(g.mem); n > 0 { - m := g.mem[n-1] - g.iter = m.newIter(nil) - g.rangeDelIter = m.newRangeDelIter(nil) - g.mem = g.mem[:n-1] - g.iterKey, g.iterValue = g.iter.SeekGE(g.key, base.SeekGEFlagsNone) - continue - } - - if g.level == 0 { - // Create iterators from L0 from newest to oldest. 
- if n := len(g.l0); n > 0 { - files := g.l0[n-1].Iter() - g.l0 = g.l0[:n-1] - iterOpts := IterOptions{logger: g.logger, snapshotForHideObsoletePoints: g.snapshot} - g.levelIter.init(context.Background(), iterOpts, g.comparer, g.newIters, - files, manifest.L0Sublevel(n), internalIterOpts{}) - g.levelIter.initRangeDel(&g.rangeDelIter) - bc := levelIterBoundaryContext{} - g.levelIter.initBoundaryContext(&bc) - g.iter = &g.levelIter - - // Compute the key prefix for bloom filtering if split function is - // specified, or use the user key as default. - prefix := g.key - if g.comparer.Split != nil { - prefix = g.key[:g.comparer.Split(g.key)] - } - g.iterKey, g.iterValue = g.iter.SeekPrefixGE(prefix, g.key, base.SeekGEFlagsNone) - if bc.isSyntheticIterBoundsKey || bc.isIgnorableBoundaryKey { - g.iterKey = nil - g.iterValue = base.LazyValue{} - } - continue - } - g.level++ - } - - if g.level >= numLevels { - return nil, base.LazyValue{} - } - if g.version.Levels[g.level].Empty() { - g.level++ - continue - } - - iterOpts := IterOptions{logger: g.logger, snapshotForHideObsoletePoints: g.snapshot} - g.levelIter.init(context.Background(), iterOpts, g.comparer, g.newIters, - g.version.Levels[g.level].Iter(), manifest.Level(g.level), internalIterOpts{}) - g.levelIter.initRangeDel(&g.rangeDelIter) - bc := levelIterBoundaryContext{} - g.levelIter.initBoundaryContext(&bc) - g.level++ - g.iter = &g.levelIter - - // Compute the key prefix for bloom filtering if split function is - // specified, or use the user key as default. 
- prefix := g.key - if g.comparer.Split != nil { - prefix = g.key[:g.comparer.Split(g.key)] - } - g.iterKey, g.iterValue = g.iter.SeekPrefixGE(prefix, g.key, base.SeekGEFlagsNone) - if bc.isSyntheticIterBoundsKey || bc.isIgnorableBoundaryKey { - g.iterKey = nil - g.iterValue = base.LazyValue{} - } - } -} - -func (g *getIter) Prev() (*InternalKey, base.LazyValue) { - panic("pebble: Prev unimplemented") -} - -func (g *getIter) NextPrefix([]byte) (*InternalKey, base.LazyValue) { - panic("pebble: NextPrefix unimplemented") -} - -func (g *getIter) Valid() bool { - return g.iterKey != nil && g.err == nil -} - -func (g *getIter) Error() error { - return g.err -} - -func (g *getIter) Close() error { - if g.iter != nil { - if err := g.iter.Close(); err != nil && g.err == nil { - g.err = err - } - g.iter = nil - } - return g.err -} - -func (g *getIter) SetBounds(lower, upper []byte) { - panic("pebble: SetBounds unimplemented") -} diff --git a/vendor/github.com/cockroachdb/pebble/ingest.go b/vendor/github.com/cockroachdb/pebble/ingest.go deleted file mode 100644 index 2278a65..0000000 --- a/vendor/github.com/cockroachdb/pebble/ingest.go +++ /dev/null @@ -1,2381 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. 
- -package pebble - -import ( - "context" - "sort" - "time" - - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" - "github.com/cockroachdb/pebble/internal/private" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/objstorage/remote" - "github.com/cockroachdb/pebble/sstable" -) - -func sstableKeyCompare(userCmp Compare, a, b InternalKey) int { - c := userCmp(a.UserKey, b.UserKey) - if c != 0 { - return c - } - if a.IsExclusiveSentinel() { - if !b.IsExclusiveSentinel() { - return -1 - } - } else if b.IsExclusiveSentinel() { - return +1 - } - return 0 -} - -// KeyRange encodes a key range in user key space. A KeyRange's Start is -// inclusive while its End is exclusive. -type KeyRange struct { - Start, End []byte -} - -// Valid returns true if the KeyRange is defined. -func (k *KeyRange) Valid() bool { - return k.Start != nil && k.End != nil -} - -// Contains returns whether the specified key exists in the KeyRange. -func (k *KeyRange) Contains(cmp base.Compare, key InternalKey) bool { - v := cmp(key.UserKey, k.End) - return (v < 0 || (v == 0 && key.IsExclusiveSentinel())) && cmp(k.Start, key.UserKey) <= 0 -} - -// OverlapsInternalKeyRange checks if the specified internal key range has an -// overlap with the KeyRange. Note that we aren't checking for full containment -// of smallest-largest within k, rather just that there's some intersection -// between the two ranges. -func (k *KeyRange) OverlapsInternalKeyRange(cmp base.Compare, smallest, largest InternalKey) bool { - v := cmp(k.Start, largest.UserKey) - return v <= 0 && !(largest.IsExclusiveSentinel() && v == 0) && - cmp(k.End, smallest.UserKey) > 0 -} - -// Overlaps checks if the specified file has an overlap with the KeyRange. 
-// Note that we aren't checking for full containment of m within k, rather just -// that there's some intersection between m and k's bounds. -func (k *KeyRange) Overlaps(cmp base.Compare, m *fileMetadata) bool { - return k.OverlapsInternalKeyRange(cmp, m.Smallest, m.Largest) -} - -// OverlapsKeyRange checks if this span overlaps with the provided KeyRange. -// Note that we aren't checking for full containment of either span in the other, -// just that there's a key x that is in both key ranges. -func (k *KeyRange) OverlapsKeyRange(cmp Compare, span KeyRange) bool { - return cmp(k.Start, span.End) < 0 && cmp(k.End, span.Start) > 0 -} - -func ingestValidateKey(opts *Options, key *InternalKey) error { - if key.Kind() == InternalKeyKindInvalid { - return base.CorruptionErrorf("pebble: external sstable has corrupted key: %s", - key.Pretty(opts.Comparer.FormatKey)) - } - if key.SeqNum() != 0 { - return base.CorruptionErrorf("pebble: external sstable has non-zero seqnum: %s", - key.Pretty(opts.Comparer.FormatKey)) - } - return nil -} - -// ingestSynthesizeShared constructs a fileMetadata for one shared sstable owned -// or shared by another node. -func ingestSynthesizeShared( - opts *Options, sm SharedSSTMeta, fileNum base.DiskFileNum, -) (*fileMetadata, error) { - if sm.Size == 0 { - // Disallow 0 file sizes - return nil, errors.New("pebble: cannot ingest shared file with size 0") - } - // Don't load table stats. Doing a round trip to shared storage, one SST - // at a time is not worth it as it slows down ingestion. - meta := &fileMetadata{ - FileNum: fileNum.FileNum(), - CreationTime: time.Now().Unix(), - Virtual: true, - Size: sm.Size, - } - meta.InitProviderBacking(fileNum) - // Set the underlying FileBacking's size to the same size as the virtualized - // view of the sstable. This ensures that we don't over-prioritize this - // sstable for compaction just yet, as we do not have a clear sense of what - // parts of this sstable are referenced by other nodes. 
- meta.FileBacking.Size = sm.Size - if sm.LargestRangeKey.Valid() && sm.LargestRangeKey.UserKey != nil { - // Initialize meta.{HasRangeKeys,Smallest,Largest}, etc. - // - // NB: We create new internal keys and pass them into ExternalRangeKeyBounds - // so that we can sub a zero sequence number into the bounds. We can set - // the sequence number to anything here; it'll be reset in ingestUpdateSeqNum - // anyway. However we do need to use the same sequence number across all - // bound keys at this step so that we end up with bounds that are consistent - // across point/range keys. - smallestRangeKey := base.MakeInternalKey(sm.SmallestRangeKey.UserKey, 0, sm.SmallestRangeKey.Kind()) - largestRangeKey := base.MakeExclusiveSentinelKey(sm.LargestRangeKey.Kind(), sm.LargestRangeKey.UserKey) - meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallestRangeKey, largestRangeKey) - } - if sm.LargestPointKey.Valid() && sm.LargestPointKey.UserKey != nil { - // Initialize meta.{HasPointKeys,Smallest,Largest}, etc. - // - // See point above in the ExtendRangeKeyBounds call on why we use a zero - // sequence number here. - smallestPointKey := base.MakeInternalKey(sm.SmallestPointKey.UserKey, 0, sm.SmallestPointKey.Kind()) - largestPointKey := base.MakeInternalKey(sm.LargestPointKey.UserKey, 0, sm.LargestPointKey.Kind()) - if sm.LargestPointKey.IsExclusiveSentinel() { - largestPointKey = base.MakeRangeDeleteSentinelKey(sm.LargestPointKey.UserKey) - } - meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallestPointKey, largestPointKey) - } - if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil { - return nil, err - } - return meta, nil -} - -// ingestLoad1External loads the fileMetadata for one external sstable. -// Sequence number and target level calculation happens during prepare/apply. 
-func ingestLoad1External( - opts *Options, - e ExternalFile, - fileNum base.DiskFileNum, - objprovider objstorage.Provider, - jobID int, -) (*fileMetadata, error) { - if e.Size == 0 { - // Disallow 0 file sizes - return nil, errors.New("pebble: cannot ingest external file with size 0") - } - if !e.HasRangeKey && !e.HasPointKey { - return nil, errors.New("pebble: cannot ingest external file with no point or range keys") - } - // Don't load table stats. Doing a round trip to shared storage, one SST - // at a time is not worth it as it slows down ingestion. - meta := &fileMetadata{} - meta.FileNum = fileNum.FileNum() - meta.CreationTime = time.Now().Unix() - meta.Virtual = true - meta.Size = e.Size - meta.InitProviderBacking(fileNum) - - // Try to resolve a reference to the external file. - backing, err := objprovider.CreateExternalObjectBacking(e.Locator, e.ObjName) - if err != nil { - return nil, err - } - metas, err := objprovider.AttachRemoteObjects([]objstorage.RemoteObjectToAttach{{ - FileNum: fileNum, - FileType: fileTypeTable, - Backing: backing, - }}) - if err != nil { - return nil, err - } - if opts.EventListener.TableCreated != nil { - opts.EventListener.TableCreated(TableCreateInfo{ - JobID: jobID, - Reason: "ingesting", - Path: objprovider.Path(metas[0]), - FileNum: fileNum.FileNum(), - }) - } - // In the name of keeping this ingestion as fast as possible, we avoid - // *all* existence checks and synthesize a file metadata with smallest/largest - // keys that overlap whatever the passed-in span was. 
- smallestCopy := make([]byte, len(e.SmallestUserKey)) - copy(smallestCopy, e.SmallestUserKey) - largestCopy := make([]byte, len(e.LargestUserKey)) - copy(largestCopy, e.LargestUserKey) - if e.HasPointKey { - meta.ExtendPointKeyBounds(opts.Comparer.Compare, base.MakeInternalKey(smallestCopy, 0, InternalKeyKindMax), - base.MakeRangeDeleteSentinelKey(largestCopy)) - } - if e.HasRangeKey { - meta.ExtendRangeKeyBounds(opts.Comparer.Compare, base.MakeInternalKey(smallestCopy, 0, InternalKeyKindRangeKeySet), - base.MakeExclusiveSentinelKey(InternalKeyKindRangeKeyDelete, largestCopy)) - } - - // Set the underlying FileBacking's size to the same size as the virtualized - // view of the sstable. This ensures that we don't over-prioritize this - // sstable for compaction just yet, as we do not have a clear sense of - // what parts of this sstable are referenced by other nodes. - meta.FileBacking.Size = e.Size - - if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil { - return nil, err - } - return meta, nil -} - -// ingestLoad1 creates the FileMetadata for one file. This file will be owned -// by this store. -func ingestLoad1( - opts *Options, - fmv FormatMajorVersion, - readable objstorage.Readable, - cacheID uint64, - fileNum base.DiskFileNum, -) (*fileMetadata, error) { - cacheOpts := private.SSTableCacheOpts(cacheID, fileNum).(sstable.ReaderOption) - r, err := sstable.NewReader(readable, opts.MakeReaderOptions(), cacheOpts) - if err != nil { - return nil, err - } - defer r.Close() - - // Avoid ingesting tables with format versions this DB doesn't support. 
- tf, err := r.TableFormat() - if err != nil { - return nil, err - } - if tf < fmv.MinTableFormat() || tf > fmv.MaxTableFormat() { - return nil, errors.Newf( - "pebble: table format %s is not within range supported at DB format major version %d, (%s,%s)", - tf, fmv, fmv.MinTableFormat(), fmv.MaxTableFormat(), - ) - } - - meta := &fileMetadata{} - meta.FileNum = fileNum.FileNum() - meta.Size = uint64(readable.Size()) - meta.CreationTime = time.Now().Unix() - meta.InitPhysicalBacking() - - // Avoid loading into the table cache for collecting stats if we - // don't need to. If there are no range deletions, we have all the - // information to compute the stats here. - // - // This is helpful in tests for avoiding awkwardness around deletion of - // ingested files from MemFS. MemFS implements the Windows semantics of - // disallowing removal of an open file. Under MemFS, if we don't populate - // meta.Stats here, the file will be loaded into the table cache for - // calculating stats before we can remove the original link. 
- maybeSetStatsFromProperties(meta.PhysicalMeta(), &r.Properties) - - { - iter, err := r.NewIter(nil /* lower */, nil /* upper */) - if err != nil { - return nil, err - } - defer iter.Close() - var smallest InternalKey - if key, _ := iter.First(); key != nil { - if err := ingestValidateKey(opts, key); err != nil { - return nil, err - } - smallest = (*key).Clone() - } - if err := iter.Error(); err != nil { - return nil, err - } - if key, _ := iter.Last(); key != nil { - if err := ingestValidateKey(opts, key); err != nil { - return nil, err - } - meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, key.Clone()) - } - if err := iter.Error(); err != nil { - return nil, err - } - } - - iter, err := r.NewRawRangeDelIter() - if err != nil { - return nil, err - } - if iter != nil { - defer iter.Close() - var smallest InternalKey - if s := iter.First(); s != nil { - key := s.SmallestKey() - if err := ingestValidateKey(opts, &key); err != nil { - return nil, err - } - smallest = key.Clone() - } - if err := iter.Error(); err != nil { - return nil, err - } - if s := iter.Last(); s != nil { - k := s.SmallestKey() - if err := ingestValidateKey(opts, &k); err != nil { - return nil, err - } - largest := s.LargestKey().Clone() - meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest) - } - } - - // Update the range-key bounds for the table. 
- { - iter, err := r.NewRawRangeKeyIter() - if err != nil { - return nil, err - } - if iter != nil { - defer iter.Close() - var smallest InternalKey - if s := iter.First(); s != nil { - key := s.SmallestKey() - if err := ingestValidateKey(opts, &key); err != nil { - return nil, err - } - smallest = key.Clone() - } - if err := iter.Error(); err != nil { - return nil, err - } - if s := iter.Last(); s != nil { - k := s.SmallestKey() - if err := ingestValidateKey(opts, &k); err != nil { - return nil, err - } - // As range keys are fragmented, the end key of the last range key in - // the table provides the upper bound for the table. - largest := s.LargestKey().Clone() - meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallest, largest) - } - if err := iter.Error(); err != nil { - return nil, err - } - } - } - - if !meta.HasPointKeys && !meta.HasRangeKeys { - return nil, nil - } - - // Sanity check that the various bounds on the file were set consistently. - if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil { - return nil, err - } - - return meta, nil -} - -type ingestLoadResult struct { - localMeta, sharedMeta []*fileMetadata - externalMeta []*fileMetadata - localPaths []string - sharedLevels []uint8 - fileCount int -} - -func ingestLoad( - opts *Options, - fmv FormatMajorVersion, - paths []string, - shared []SharedSSTMeta, - external []ExternalFile, - cacheID uint64, - pending []base.DiskFileNum, - objProvider objstorage.Provider, - jobID int, -) (ingestLoadResult, error) { - meta := make([]*fileMetadata, 0, len(paths)) - newPaths := make([]string, 0, len(paths)) - for i := range paths { - f, err := opts.FS.Open(paths[i]) - if err != nil { - return ingestLoadResult{}, err - } - - readable, err := sstable.NewSimpleReadable(f) - if err != nil { - return ingestLoadResult{}, err - } - m, err := ingestLoad1(opts, fmv, readable, cacheID, pending[i]) - if err != nil { - return ingestLoadResult{}, err - } - if m != nil { - meta = 
append(meta, m) - newPaths = append(newPaths, paths[i]) - } - } - if len(shared) == 0 && len(external) == 0 { - return ingestLoadResult{localMeta: meta, localPaths: newPaths, fileCount: len(meta)}, nil - } - - // Sort the shared files according to level. - sort.Sort(sharedByLevel(shared)) - - sharedMeta := make([]*fileMetadata, 0, len(shared)) - levels := make([]uint8, 0, len(shared)) - for i := range shared { - m, err := ingestSynthesizeShared(opts, shared[i], pending[len(paths)+i]) - if err != nil { - return ingestLoadResult{}, err - } - if shared[i].Level < sharedLevelsStart { - return ingestLoadResult{}, errors.New("cannot ingest shared file in level below sharedLevelsStart") - } - sharedMeta = append(sharedMeta, m) - levels = append(levels, shared[i].Level) - } - externalMeta := make([]*fileMetadata, 0, len(external)) - for i := range external { - m, err := ingestLoad1External(opts, external[i], pending[len(paths)+len(shared)+i], objProvider, jobID) - if err != nil { - return ingestLoadResult{}, err - } - externalMeta = append(externalMeta, m) - } - result := ingestLoadResult{ - localMeta: meta, - sharedMeta: sharedMeta, - externalMeta: externalMeta, - localPaths: newPaths, - sharedLevels: levels, - fileCount: len(meta) + len(sharedMeta) + len(externalMeta), - } - return result, nil -} - -// Struct for sorting metadatas by smallest user keys, while ensuring the -// matching path also gets swapped to the same index. For use in -// ingestSortAndVerify. 
-type metaAndPaths struct { - meta []*fileMetadata - paths []string - cmp Compare -} - -func (m metaAndPaths) Len() int { - return len(m.meta) -} - -func (m metaAndPaths) Less(i, j int) bool { - return m.cmp(m.meta[i].Smallest.UserKey, m.meta[j].Smallest.UserKey) < 0 -} - -func (m metaAndPaths) Swap(i, j int) { - m.meta[i], m.meta[j] = m.meta[j], m.meta[i] - if m.paths != nil { - m.paths[i], m.paths[j] = m.paths[j], m.paths[i] - } -} - -func ingestSortAndVerify(cmp Compare, lr ingestLoadResult, exciseSpan KeyRange) error { - // Verify that all the shared files (i.e. files in sharedMeta) - // fit within the exciseSpan. - for i := range lr.sharedMeta { - f := lr.sharedMeta[i] - if !exciseSpan.Contains(cmp, f.Smallest) || !exciseSpan.Contains(cmp, f.Largest) { - return errors.AssertionFailedf("pebble: shared file outside of excise span, span [%s-%s), file = %s", exciseSpan.Start, exciseSpan.End, f.String()) - } - } - if len(lr.externalMeta) > 0 { - if len(lr.localMeta) > 0 || len(lr.sharedMeta) > 0 { - // Currently we only support external ingests on their own. If external - // files are present alongside local/shared files, return an error. 
- return errors.AssertionFailedf("pebble: external files cannot be ingested atomically alongside other types of files") - } - sort.Sort(&metaAndPaths{ - meta: lr.externalMeta, - cmp: cmp, - }) - for i := 1; i < len(lr.externalMeta); i++ { - if sstableKeyCompare(cmp, lr.externalMeta[i-1].Largest, lr.externalMeta[i].Smallest) >= 0 { - return errors.AssertionFailedf("pebble: external sstables have overlapping ranges") - } - } - return nil - } - if len(lr.localMeta) <= 1 || len(lr.localPaths) <= 1 { - return nil - } - - sort.Sort(&metaAndPaths{ - meta: lr.localMeta, - paths: lr.localPaths, - cmp: cmp, - }) - - for i := 1; i < len(lr.localPaths); i++ { - if sstableKeyCompare(cmp, lr.localMeta[i-1].Largest, lr.localMeta[i].Smallest) >= 0 { - return errors.AssertionFailedf("pebble: local ingestion sstables have overlapping ranges") - } - } - if len(lr.sharedMeta) == 0 { - return nil - } - filesInLevel := make([]*fileMetadata, 0, len(lr.sharedMeta)) - for l := sharedLevelsStart; l < numLevels; l++ { - filesInLevel = filesInLevel[:0] - for i := range lr.sharedMeta { - if lr.sharedLevels[i] == uint8(l) { - filesInLevel = append(filesInLevel, lr.sharedMeta[i]) - } - } - sort.Slice(filesInLevel, func(i, j int) bool { - return cmp(filesInLevel[i].Smallest.UserKey, filesInLevel[j].Smallest.UserKey) < 0 - }) - for i := 1; i < len(filesInLevel); i++ { - if sstableKeyCompare(cmp, filesInLevel[i-1].Largest, filesInLevel[i].Smallest) >= 0 { - return errors.AssertionFailedf("pebble: external shared sstables have overlapping ranges") - } - } - } - return nil -} - -func ingestCleanup(objProvider objstorage.Provider, meta []*fileMetadata) error { - var firstErr error - for i := range meta { - if err := objProvider.Remove(fileTypeTable, meta[i].FileBacking.DiskFileNum); err != nil { - firstErr = firstError(firstErr, err) - } - } - return firstErr -} - -// ingestLink creates new objects which are backed by either hardlinks to or -// copies of the ingested files. 
It also attaches shared objects to the provider. -func ingestLink( - jobID int, - opts *Options, - objProvider objstorage.Provider, - lr ingestLoadResult, - shared []SharedSSTMeta, -) error { - for i := range lr.localPaths { - objMeta, err := objProvider.LinkOrCopyFromLocal( - context.TODO(), opts.FS, lr.localPaths[i], fileTypeTable, lr.localMeta[i].FileBacking.DiskFileNum, - objstorage.CreateOptions{PreferSharedStorage: true}, - ) - if err != nil { - if err2 := ingestCleanup(objProvider, lr.localMeta[:i]); err2 != nil { - opts.Logger.Infof("ingest cleanup failed: %v", err2) - } - return err - } - if opts.EventListener.TableCreated != nil { - opts.EventListener.TableCreated(TableCreateInfo{ - JobID: jobID, - Reason: "ingesting", - Path: objProvider.Path(objMeta), - FileNum: lr.localMeta[i].FileNum, - }) - } - } - sharedObjs := make([]objstorage.RemoteObjectToAttach, 0, len(shared)) - for i := range shared { - backing, err := shared[i].Backing.Get() - if err != nil { - return err - } - sharedObjs = append(sharedObjs, objstorage.RemoteObjectToAttach{ - FileNum: lr.sharedMeta[i].FileBacking.DiskFileNum, - FileType: fileTypeTable, - Backing: backing, - }) - } - sharedObjMetas, err := objProvider.AttachRemoteObjects(sharedObjs) - if err != nil { - return err - } - for i := range sharedObjMetas { - // One corner case around file sizes we need to be mindful of, is that - // if one of the shareObjs was initially created by us (and has boomeranged - // back from another node), we'll need to update the FileBacking's size - // to be the true underlying size. Otherwise, we could hit errors when we - // open the db again after a crash/restart (see checkConsistency in open.go), - // plus it more accurately allows us to prioritize compactions of files - // that were originally created by us. 
- if sharedObjMetas[i].IsShared() && !objProvider.IsSharedForeign(sharedObjMetas[i]) { - size, err := objProvider.Size(sharedObjMetas[i]) - if err != nil { - return err - } - lr.sharedMeta[i].FileBacking.Size = uint64(size) - } - if opts.EventListener.TableCreated != nil { - opts.EventListener.TableCreated(TableCreateInfo{ - JobID: jobID, - Reason: "ingesting", - Path: objProvider.Path(sharedObjMetas[i]), - FileNum: lr.sharedMeta[i].FileNum, - }) - } - } - // We do not need to do anything about lr.externalMetas. Those were already - // linked in ingestLoad. - - return nil -} - -func ingestMemtableOverlaps(cmp Compare, mem flushable, keyRanges []internalKeyRange) bool { - iter := mem.newIter(nil) - rangeDelIter := mem.newRangeDelIter(nil) - rkeyIter := mem.newRangeKeyIter(nil) - - closeIters := func() error { - err := iter.Close() - if rangeDelIter != nil { - err = firstError(err, rangeDelIter.Close()) - } - if rkeyIter != nil { - err = firstError(err, rkeyIter.Close()) - } - return err - } - - for _, kr := range keyRanges { - if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, cmp) { - closeIters() - return true - } - } - - // Assume overlap if any iterator errored out. - return closeIters() != nil -} - -func ingestUpdateSeqNum( - cmp Compare, format base.FormatKey, seqNum uint64, loadResult ingestLoadResult, -) error { - setSeqFn := func(k base.InternalKey) base.InternalKey { - return base.MakeInternalKey(k.UserKey, seqNum, k.Kind()) - } - updateMetadata := func(m *fileMetadata) error { - // NB: we set the fields directly here, rather than via their Extend* - // methods, as we are updating sequence numbers. - if m.HasPointKeys { - m.SmallestPointKey = setSeqFn(m.SmallestPointKey) - } - if m.HasRangeKeys { - m.SmallestRangeKey = setSeqFn(m.SmallestRangeKey) - } - m.Smallest = setSeqFn(m.Smallest) - // Only update the seqnum for the largest key if that key is not an - // "exclusive sentinel" (i.e. 
a range deletion sentinel or a range key - // boundary), as doing so effectively drops the exclusive sentinel (by - // lowering the seqnum from the max value), and extends the bounds of the - // table. - // NB: as the largest range key is always an exclusive sentinel, it is never - // updated. - if m.HasPointKeys && !m.LargestPointKey.IsExclusiveSentinel() { - m.LargestPointKey = setSeqFn(m.LargestPointKey) - } - if !m.Largest.IsExclusiveSentinel() { - m.Largest = setSeqFn(m.Largest) - } - // Setting smallestSeqNum == largestSeqNum triggers the setting of - // Properties.GlobalSeqNum when an sstable is loaded. - m.SmallestSeqNum = seqNum - m.LargestSeqNum = seqNum - // Ensure the new bounds are consistent. - if err := m.Validate(cmp, format); err != nil { - return err - } - seqNum++ - return nil - } - - // Shared sstables are required to be sorted by level ascending. We then - // iterate the shared sstables in reverse, assigning the lower sequence - // numbers to the shared sstables that will be ingested into the lower - // (larger numbered) levels first. This ensures sequence number shadowing is - // correct. - for i := len(loadResult.sharedMeta) - 1; i >= 0; i-- { - if i-1 >= 0 && loadResult.sharedLevels[i-1] > loadResult.sharedLevels[i] { - panic(errors.AssertionFailedf("shared files %s, %s out of order", loadResult.sharedMeta[i-1], loadResult.sharedMeta[i])) - } - if err := updateMetadata(loadResult.sharedMeta[i]); err != nil { - return err - } - } - for i := range loadResult.localMeta { - if err := updateMetadata(loadResult.localMeta[i]); err != nil { - return err - } - } - for i := range loadResult.externalMeta { - if err := updateMetadata(loadResult.externalMeta[i]); err != nil { - return err - } - } - return nil -} - -// Denotes an internal key range. Smallest and largest are both inclusive. 
-type internalKeyRange struct { - smallest, largest InternalKey -} - -func overlapWithIterator( - iter internalIterator, - rangeDelIter *keyspan.FragmentIterator, - rkeyIter keyspan.FragmentIterator, - keyRange internalKeyRange, - cmp Compare, -) bool { - // Check overlap with point operations. - // - // When using levelIter, it seeks to the SST whose boundaries - // contain keyRange.smallest.UserKey(S). - // It then tries to find a point in that SST that is >= S. - // If there's no such point it means the SST ends in a tombstone in which case - // levelIter.SeekGE generates a boundary range del sentinel. - // The comparison of this boundary with keyRange.largest(L) below - // is subtle but maintains correctness. - // 1) boundary < L, - // since boundary is also > S (initial seek), - // whatever the boundary's start key may be, we're always overlapping. - // 2) boundary > L, - // overlap with boundary cannot be determined since we don't know boundary's start key. - // We require checking for overlap with rangeDelIter. - // 3) boundary == L and L is not sentinel, - // means boundary < L and hence is similar to 1). - // 4) boundary == L and L is sentinel, - // we'll always overlap since for any values of i,j ranges [i, k) and [j, k) always overlap. - key, _ := iter.SeekGE(keyRange.smallest.UserKey, base.SeekGEFlagsNone) - if key != nil { - c := sstableKeyCompare(cmp, *key, keyRange.largest) - if c <= 0 { - return true - } - } - // Assume overlap if iterator errored. - if err := iter.Error(); err != nil { - return true - } - - computeOverlapWithSpans := func(rIter keyspan.FragmentIterator) bool { - // NB: The spans surfaced by the fragment iterator are non-overlapping. 
- span := rIter.SeekLT(keyRange.smallest.UserKey) - if span == nil { - span = rIter.Next() - } - for ; span != nil; span = rIter.Next() { - if span.Empty() { - continue - } - key := span.SmallestKey() - c := sstableKeyCompare(cmp, key, keyRange.largest) - if c > 0 { - // The start of the span is after the largest key in the - // ingested table. - return false - } - if cmp(span.End, keyRange.smallest.UserKey) > 0 { - // The end of the span is greater than the smallest in the - // table. Note that the span end key is exclusive, thus ">0" - // instead of ">=0". - return true - } - } - // Assume overlap if iterator errored. - if err := rIter.Error(); err != nil { - return true - } - return false - } - - // rkeyIter is either a range key level iter, or a range key iterator - // over a single file. - if rkeyIter != nil { - if computeOverlapWithSpans(rkeyIter) { - return true - } - } - - // Check overlap with range deletions. - if rangeDelIter == nil || *rangeDelIter == nil { - return false - } - return computeOverlapWithSpans(*rangeDelIter) -} - -// ingestTargetLevel returns the target level for a file being ingested. -// If suggestSplit is true, it accounts for ingest-time splitting as part of -// its target level calculation, and if a split candidate is found, that file -// is returned as the splitFile. -func ingestTargetLevel( - newIters tableNewIters, - newRangeKeyIter keyspan.TableNewSpanIter, - iterOps IterOptions, - comparer *Comparer, - v *version, - baseLevel int, - compactions map[*compaction]struct{}, - meta *fileMetadata, - suggestSplit bool, -) (targetLevel int, splitFile *fileMetadata, err error) { - // Find the lowest level which does not have any files which overlap meta. We - // search from L0 to L6 looking for whether there are any files in the level - // which overlap meta. We want the "lowest" level (where lower means - // increasing level number) in order to reduce write amplification. 
- // - // There are 2 kinds of overlap we need to check for: file boundary overlap - // and data overlap. Data overlap implies file boundary overlap. Note that it - // is always possible to ingest into L0. - // - // To place meta at level i where i > 0: - // - there must not be any data overlap with levels <= i, since that will - // violate the sequence number invariant. - // - no file boundary overlap with level i, since that will violate the - // invariant that files do not overlap in levels i > 0. - // - if there is only a file overlap at a given level, and no data overlap, - // we can still slot a file at that level. We return the fileMetadata with - // which we have file boundary overlap (must be only one file, as sstable - // bounds are usually tight on user keys) and the caller is expected to split - // that sstable into two virtual sstables, allowing this file to go into that - // level. Note that if we have file boundary overlap with two files, which - // should only happen on rare occasions, we treat it as data overlap and - // don't use this optimization. - // - // The file boundary overlap check is simpler to conceptualize. Consider the - // following example, in which the ingested file lies completely before or - // after the file being considered. - // - // |--| |--| ingested file: [a,b] or [f,g] - // |-----| existing file: [c,e] - // _____________________ - // a b c d e f g - // - // In both cases the ingested file can move to considering the next level. - // - // File boundary overlap does not necessarily imply data overlap. The check - // for data overlap is a little more nuanced. Consider the following examples: - // - // 1. No data overlap: - // - // |-| |--| ingested file: [cc-d] or [ee-ff] - // |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g] - // _____________________ - // a b c d e f g - // - // In this case the ingested files can "fall through" this level. The checks - // continue at the next level. - // - // 2. 
Data overlap: - // - // |--| ingested file: [d-e] - // |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g] - // _____________________ - // a b c d e f g - // - // In this case the file cannot be ingested into this level as the point 'dd' - // is in the way. - // - // It is worth noting that the check for data overlap is only approximate. In - // the previous example, the ingested table [d-e] could contain only the - // points 'd' and 'e', in which case the table would be eligible for - // considering lower levels. However, such a fine-grained check would need to - // be exhaustive (comparing points and ranges in both the ingested existing - // tables) and such a check is prohibitively expensive. Thus Pebble treats any - // existing point that falls within the ingested table bounds as being "data - // overlap". - - // This assertion implicitly checks that we have the current version of - // the metadata. - if v.L0Sublevels == nil { - return 0, nil, errors.AssertionFailedf("could not read L0 sublevels") - } - // Check for overlap over the keys of L0 by iterating over the sublevels. - for subLevel := 0; subLevel < len(v.L0SublevelFiles); subLevel++ { - iter := newLevelIter(iterOps, comparer, newIters, - v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), internalIterOpts{}) - - var rangeDelIter keyspan.FragmentIterator - // Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE - // sets it up for the target file. - iter.initRangeDel(&rangeDelIter) - - levelIter := keyspan.LevelIter{} - levelIter.Init( - keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter, - v.L0Sublevels.Levels[subLevel].Iter(), manifest.Level(0), manifest.KeyTypeRange, - ) - - kr := internalKeyRange{ - smallest: meta.Smallest, - largest: meta.Largest, - } - overlap := overlapWithIterator(iter, &rangeDelIter, &levelIter, kr, comparer.Compare) - err := iter.Close() // Closes range del iter as well. 
- err = firstError(err, levelIter.Close()) - if err != nil { - return 0, nil, err - } - if overlap { - return targetLevel, nil, nil - } - } - - level := baseLevel - for ; level < numLevels; level++ { - levelIter := newLevelIter(iterOps, comparer, newIters, - v.Levels[level].Iter(), manifest.Level(level), internalIterOpts{}) - var rangeDelIter keyspan.FragmentIterator - // Pass in a non-nil pointer to rangeDelIter so that levelIter.findFileGE - // sets it up for the target file. - levelIter.initRangeDel(&rangeDelIter) - - rkeyLevelIter := &keyspan.LevelIter{} - rkeyLevelIter.Init( - keyspan.SpanIterOptions{}, comparer.Compare, newRangeKeyIter, - v.Levels[level].Iter(), manifest.Level(level), manifest.KeyTypeRange, - ) - - kr := internalKeyRange{ - smallest: meta.Smallest, - largest: meta.Largest, - } - overlap := overlapWithIterator(levelIter, &rangeDelIter, rkeyLevelIter, kr, comparer.Compare) - err := levelIter.Close() // Closes range del iter as well. - err = firstError(err, rkeyLevelIter.Close()) - if err != nil { - return 0, nil, err - } - if overlap { - return targetLevel, splitFile, nil - } - - // Check boundary overlap. - var candidateSplitFile *fileMetadata - boundaryOverlaps := v.Overlaps(level, comparer.Compare, meta.Smallest.UserKey, - meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel()) - if !boundaryOverlaps.Empty() { - // We are already guaranteed to not have any data overlaps with files - // in boundaryOverlaps, otherwise we'd have returned in the above if - // statements. Use this, plus boundaryOverlaps.Len() == 1 to detect for - // the case where we can slot this file into the current level despite - // a boundary overlap, by splitting one existing file into two virtual - // sstables. - if suggestSplit && boundaryOverlaps.Len() == 1 { - iter := boundaryOverlaps.Iter() - candidateSplitFile = iter.First() - } else { - // We either don't want to suggest ingest-time splits (i.e. 
- // !suggestSplit), or we boundary-overlapped with more than one file. - continue - } - } - - // Check boundary overlap with any ongoing compactions. We consider an - // overlapping compaction that's writing files to an output level as - // equivalent to boundary overlap with files in that output level. - // - // We cannot check for data overlap with the new SSTs compaction will produce - // since compaction hasn't been done yet. However, there's no need to check - // since all keys in them will be from levels in [c.startLevel, - // c.outputLevel], and all those levels have already had their data overlap - // tested negative (else we'd have returned earlier). - // - // An alternative approach would be to cancel these compactions and proceed - // with an ingest-time split on this level if necessary. However, compaction - // cancellation can result in significant wasted effort and is best avoided - // unless necessary. - overlaps := false - for c := range compactions { - if c.outputLevel == nil || level != c.outputLevel.level { - continue - } - if comparer.Compare(meta.Smallest.UserKey, c.largest.UserKey) <= 0 && - comparer.Compare(meta.Largest.UserKey, c.smallest.UserKey) >= 0 { - overlaps = true - break - } - } - if !overlaps { - targetLevel = level - splitFile = candidateSplitFile - } - } - return targetLevel, splitFile, nil -} - -// Ingest ingests a set of sstables into the DB. Ingestion of the files is -// atomic and semantically equivalent to creating a single batch containing all -// of the mutations in the sstables. Ingestion may require the memtable to be -// flushed. The ingested sstable files are moved into the DB and must reside on -// the same filesystem as the DB. Sstables can be created for ingestion using -// sstable.Writer. On success, Ingest removes the input paths. -// -// Two types of sstables are accepted for ingestion(s): one is sstables present -// in the instance's vfs.FS and can be referenced locally. 
The other is sstables -// present in remote.Storage, referred to as shared or foreign sstables. These -// shared sstables can be linked through objstorageprovider.Provider, and do not -// need to already be present on the local vfs.FS. Foreign sstables must all fit -// in an excise span, and are destined for a level specified in SharedSSTMeta. -// -// All sstables *must* be Sync()'d by the caller after all bytes are written -// and before its file handle is closed; failure to do so could violate -// durability or lead to corrupted on-disk state. This method cannot, in a -// platform-and-FS-agnostic way, ensure that all sstables in the input are -// properly synced to disk. Opening new file handles and Sync()-ing them -// does not always guarantee durability; see the discussion here on that: -// https://github.com/cockroachdb/pebble/pull/835#issuecomment-663075379 -// -// Ingestion loads each sstable into the lowest level of the LSM which it -// doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable, -// ingestion forces the memtable to flush, and then waits for the flush to -// occur. In some cases, such as with no foreign sstables and no excise span, -// ingestion that gets blocked on a memtable can join the flushable queue and -// finish even before the memtable has been flushed. -// -// The steps for ingestion are: -// -// 1. Allocate file numbers for every sstable being ingested. -// 2. Load the metadata for all sstables being ingested. -// 3. Sort the sstables by smallest key, verifying non overlap (for local -// sstables). -// 4. Hard link (or copy) the local sstables into the DB directory. -// 5. Allocate a sequence number to use for all of the entries in the -// local sstables. This is the step where overlap with memtables is -// determined. If there is overlap, we remember the most recent memtable -// that overlaps. -// 6. Update the sequence number in the ingested local sstables. 
(Remote -// sstables get fixed sequence numbers that were determined at load time.) -// 7. Wait for the most recent memtable that overlaps to flush (if any). -// 8. Add the ingested sstables to the version (DB.ingestApply). -// 8.1. If an excise span was specified, figure out what sstables in the -// current version overlap with the excise span, and create new virtual -// sstables out of those sstables that exclude the excised span (DB.excise). -// 9. Publish the ingestion sequence number. -// -// Note that if the mutable memtable overlaps with ingestion, a flush of the -// memtable is forced equivalent to DB.Flush. Additionally, subsequent -// mutations that get sequence numbers larger than the ingestion sequence -// number get queued up behind the ingestion waiting for it to complete. This -// can produce a noticeable hiccup in performance. See -// https://github.com/cockroachdb/pebble/issues/25 for an idea for how to fix -// this hiccup. -func (d *DB) Ingest(paths []string) error { - if err := d.closed.Load(); err != nil { - panic(err) - } - if d.opts.ReadOnly { - return ErrReadOnly - } - _, err := d.ingest(paths, ingestTargetLevel, nil /* shared */, KeyRange{}, nil /* external */) - return err -} - -// IngestOperationStats provides some information about where in the LSM the -// bytes were ingested. -type IngestOperationStats struct { - // Bytes is the total bytes in the ingested sstables. - Bytes uint64 - // ApproxIngestedIntoL0Bytes is the approximate number of bytes ingested - // into L0. This value is approximate when flushable ingests are active and - // an ingest overlaps an entry in the flushable queue. Currently, this - // approximation is very rough, only including tables that overlapped the - // memtable. This estimate may be improved with #2112. - ApproxIngestedIntoL0Bytes uint64 - // MemtableOverlappingFiles is the count of ingested sstables - // that overlapped keys in the memtables. 
- MemtableOverlappingFiles int -} - -// ExternalFile are external sstables that can be referenced through -// objprovider and ingested as remote files that will not be refcounted or -// cleaned up. For use with online restore. Note that the underlying sstable -// could contain keys outside the [Smallest,Largest) bounds; however Pebble -// is expected to only read the keys within those bounds. -type ExternalFile struct { - // Locator is the shared.Locator that can be used with objProvider to - // resolve a reference to this external sstable. - Locator remote.Locator - // ObjName is the unique name of this sstable on Locator. - ObjName string - // Size of the referenced proportion of the virtualized sstable. An estimate - // is acceptable in lieu of the backing file size. - Size uint64 - // SmallestUserKey and LargestUserKey are the [smallest,largest) user key - // bounds of the sstable. Both these bounds are loose i.e. it's possible for - // the sstable to not span the entirety of this range. However, multiple - // ExternalFiles in one ingestion must all have non-overlapping - // [smallest, largest) spans. Note that this Largest bound is exclusive. - SmallestUserKey, LargestUserKey []byte - // HasPointKey and HasRangeKey denote whether this file contains point keys - // or range keys. If both structs are false, an error is returned during - // ingestion. - HasPointKey, HasRangeKey bool -} - -// IngestWithStats does the same as Ingest, and additionally returns -// IngestOperationStats. -func (d *DB) IngestWithStats(paths []string) (IngestOperationStats, error) { - if err := d.closed.Load(); err != nil { - panic(err) - } - if d.opts.ReadOnly { - return IngestOperationStats{}, ErrReadOnly - } - return d.ingest(paths, ingestTargetLevel, nil /* shared */, KeyRange{}, nil /* external */) -} - -// IngestExternalFiles does the same as IngestWithStats, and additionally -// accepts external files (with locator info that can be resolved using -// d.opts.SharedStorage). 
These files must also be non-overlapping with -// each other, and must be resolvable through d.objProvider. -func (d *DB) IngestExternalFiles(external []ExternalFile) (IngestOperationStats, error) { - if err := d.closed.Load(); err != nil { - panic(err) - } - - if d.opts.ReadOnly { - return IngestOperationStats{}, ErrReadOnly - } - if d.opts.Experimental.RemoteStorage == nil { - return IngestOperationStats{}, errors.New("pebble: cannot ingest external files without shared storage configured") - } - return d.ingest(nil, ingestTargetLevel, nil /* shared */, KeyRange{}, external) -} - -// IngestAndExcise does the same as IngestWithStats, and additionally accepts a -// list of shared files to ingest that can be read from a remote.Storage through -// a Provider. All the shared files must live within exciseSpan, and any existing -// keys in exciseSpan are deleted by turning existing sstables into virtual -// sstables (if not virtual already) and shrinking their spans to exclude -// exciseSpan. See the comment at Ingest for a more complete picture of the -// ingestion process. -// -// Panics if this DB instance was not instantiated with a remote.Storage and -// shared sstables are present. -func (d *DB) IngestAndExcise( - paths []string, shared []SharedSSTMeta, exciseSpan KeyRange, -) (IngestOperationStats, error) { - if err := d.closed.Load(); err != nil { - panic(err) - } - if d.opts.ReadOnly { - return IngestOperationStats{}, ErrReadOnly - } - return d.ingest(paths, ingestTargetLevel, shared, exciseSpan, nil /* external */) -} - -// Both DB.mu and commitPipeline.mu must be held while this is called. -func (d *DB) newIngestedFlushableEntry( - meta []*fileMetadata, seqNum uint64, logNum FileNum, -) (*flushableEntry, error) { - // Update the sequence number for all of the sstables in the - // metadata. Writing the metadata to the manifest when the - // version edit is applied is the mechanism that persists the - // sequence number. 
The sstables themselves are left unmodified. - // In this case, a version edit will only be written to the manifest - // when the flushable is eventually flushed. If Pebble restarts in that - // time, then we'll lose the ingest sequence number information. But this - // information will also be reconstructed on node restart. - if err := ingestUpdateSeqNum( - d.cmp, d.opts.Comparer.FormatKey, seqNum, ingestLoadResult{localMeta: meta}, - ); err != nil { - return nil, err - } - - f := newIngestedFlushable(meta, d.opts.Comparer, d.newIters, d.tableNewRangeKeyIter) - - // NB: The logNum/seqNum are the WAL number which we're writing this entry - // to and the sequence number within the WAL which we'll write this entry - // to. - entry := d.newFlushableEntry(f, logNum, seqNum) - // The flushable entry starts off with a single reader ref, so increment - // the FileMetadata.Refs. - for _, file := range f.files { - file.Ref() - } - entry.unrefFiles = func() []*fileBacking { - var obsolete []*fileBacking - for _, file := range f.files { - if file.Unref() == 0 { - obsolete = append(obsolete, file.FileMetadata.FileBacking) - } - } - return obsolete - } - - entry.flushForced = true - entry.releaseMemAccounting = func() {} - return entry, nil -} - -// Both DB.mu and commitPipeline.mu must be held while this is called. Since -// we're holding both locks, the order in which we rotate the memtable or -// recycle the WAL in this function is irrelevant as long as the correct log -// numbers are assigned to the appropriate flushable. -func (d *DB) handleIngestAsFlushable(meta []*fileMetadata, seqNum uint64) error { - b := d.NewBatch() - for _, m := range meta { - b.ingestSST(m.FileNum) - } - b.setSeqNum(seqNum) - - // If the WAL is disabled, then the logNum used to create the flushable - // entry doesn't matter. We just use the logNum assigned to the current - // mutable memtable. 
If the WAL is enabled, then this logNum will be - // overwritten by the logNum of the log which will contain the log entry - // for the ingestedFlushable. - logNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum - if !d.opts.DisableWAL { - // We create a new WAL for the flushable instead of reusing the end of - // the previous WAL. This simplifies the increment of the minimum - // unflushed log number, and also simplifies WAL replay. - var prevLogSize uint64 - logNum, prevLogSize = d.rotateWAL() - // As the rotator of the WAL, we're responsible for updating the - // previous flushable queue tail's log size. - d.mu.mem.queue[len(d.mu.mem.queue)-1].logSize = prevLogSize - - d.mu.Unlock() - err := d.commit.directWrite(b) - if err != nil { - d.opts.Logger.Fatalf("%v", err) - } - d.mu.Lock() - } - - entry, err := d.newIngestedFlushableEntry(meta, seqNum, logNum) - if err != nil { - return err - } - nextSeqNum := seqNum + uint64(b.Count()) - - // Set newLogNum to the logNum of the previous flushable. This value is - // irrelevant if the WAL is disabled. If the WAL is enabled, then we set - // the appropriate value below. - newLogNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum - if !d.opts.DisableWAL { - // newLogNum will be the WAL num of the next mutable memtable which - // comes after the ingestedFlushable in the flushable queue. The mutable - // memtable will be created below. - // - // The prevLogSize returned by rotateWAL is the WAL to which the - // flushable ingest keys were appended. This intermediary WAL is only - // used to record the flushable ingest and nothing else. - newLogNum, entry.logSize = d.rotateWAL() - } - - currMem := d.mu.mem.mutable - // NB: Placing ingested sstables above the current memtables - // requires rotating of the existing memtables/WAL. There is - // some concern of churning through tiny memtables due to - // ingested sstables being placed on top of them, but those - // memtables would have to be flushed anyways. 
- d.mu.mem.queue = append(d.mu.mem.queue, entry) - d.rotateMemtable(newLogNum, nextSeqNum, currMem) - d.updateReadStateLocked(d.opts.DebugCheck) - d.maybeScheduleFlush() - return nil -} - -// See comment at Ingest() for details on how this works. -func (d *DB) ingest( - paths []string, - targetLevelFunc ingestTargetLevelFunc, - shared []SharedSSTMeta, - exciseSpan KeyRange, - external []ExternalFile, -) (IngestOperationStats, error) { - if len(shared) > 0 && d.opts.Experimental.RemoteStorage == nil { - panic("cannot ingest shared sstables with nil SharedStorage") - } - if (exciseSpan.Valid() || len(shared) > 0 || len(external) > 0) && d.FormatMajorVersion() < FormatVirtualSSTables { - return IngestOperationStats{}, errors.New("pebble: format major version too old for excise, shared or external sstable ingestion") - } - // Allocate file numbers for all of the files being ingested and mark them as - // pending in order to prevent them from being deleted. Note that this causes - // the file number ordering to be out of alignment with sequence number - // ordering. The sorting of L0 tables by sequence number avoids relying on - // that (busted) invariant. - d.mu.Lock() - pendingOutputs := make([]base.DiskFileNum, len(paths)+len(shared)+len(external)) - for i := 0; i < len(paths)+len(shared)+len(external); i++ { - pendingOutputs[i] = d.mu.versions.getNextFileNum().DiskFileNum() - } - - jobID := d.mu.nextJobID - d.mu.nextJobID++ - d.mu.Unlock() - - // Load the metadata for all the files being ingested. This step detects - // and elides empty sstables. - loadResult, err := ingestLoad(d.opts, d.FormatMajorVersion(), paths, shared, external, d.cacheID, pendingOutputs, d.objProvider, jobID) - if err != nil { - return IngestOperationStats{}, err - } - - if loadResult.fileCount == 0 { - // All of the sstables to be ingested were empty. Nothing to do. - return IngestOperationStats{}, nil - } - - // Verify the sstables do not overlap. 
- if err := ingestSortAndVerify(d.cmp, loadResult, exciseSpan); err != nil { - return IngestOperationStats{}, err - } - - // Hard link the sstables into the DB directory. Since the sstables aren't - // referenced by a version, they won't be used. If the hard linking fails - // (e.g. because the files reside on a different filesystem), ingestLink will - // fall back to copying, and if that fails we undo our work and return an - // error. - if err := ingestLink(jobID, d.opts, d.objProvider, loadResult, shared); err != nil { - return IngestOperationStats{}, err - } - - // Make the new tables durable. We need to do this at some point before we - // update the MANIFEST (via logAndApply), otherwise a crash can have the - // tables referenced in the MANIFEST, but not present in the provider. - if err := d.objProvider.Sync(); err != nil { - return IngestOperationStats{}, err - } - - // metaFlushableOverlaps is a slice parallel to meta indicating which of the - // ingested sstables overlap some table in the flushable queue. It's used to - // approximate ingest-into-L0 stats when using flushable ingests. - metaFlushableOverlaps := make([]bool, loadResult.fileCount) - var mem *flushableEntry - var mut *memTable - // asFlushable indicates whether the sstable was ingested as a flushable. - var asFlushable bool - prepare := func(seqNum uint64) { - // Note that d.commit.mu is held by commitPipeline when calling prepare. - - d.mu.Lock() - defer d.mu.Unlock() - - // Check to see if any files overlap with any of the memtables. The queue - // is ordered from oldest to newest with the mutable memtable being the - // last element in the slice. We want to wait for the newest table that - // overlaps. 
- - for i := len(d.mu.mem.queue) - 1; i >= 0; i-- { - m := d.mu.mem.queue[i] - iter := m.newIter(nil) - rangeDelIter := m.newRangeDelIter(nil) - rkeyIter := m.newRangeKeyIter(nil) - - checkForOverlap := func(i int, meta *fileMetadata) { - if metaFlushableOverlaps[i] { - // This table already overlapped a more recent flushable. - return - } - kr := internalKeyRange{ - smallest: meta.Smallest, - largest: meta.Largest, - } - if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, d.cmp) { - // If this is the first table to overlap a flushable, save - // the flushable. This ingest must be ingested or flushed - // after it. - if mem == nil { - mem = m - } - metaFlushableOverlaps[i] = true - } - } - for i := range loadResult.localMeta { - checkForOverlap(i, loadResult.localMeta[i]) - } - for i := range loadResult.sharedMeta { - checkForOverlap(len(loadResult.localMeta)+i, loadResult.sharedMeta[i]) - } - for i := range loadResult.externalMeta { - checkForOverlap(len(loadResult.localMeta)+len(loadResult.sharedMeta)+i, loadResult.externalMeta[i]) - } - if exciseSpan.Valid() { - kr := internalKeyRange{ - smallest: base.MakeInternalKey(exciseSpan.Start, InternalKeySeqNumMax, InternalKeyKindMax), - largest: base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, exciseSpan.End), - } - if overlapWithIterator(iter, &rangeDelIter, rkeyIter, kr, d.cmp) { - if mem == nil { - mem = m - } - } - } - err := iter.Close() - if rangeDelIter != nil { - err = firstError(err, rangeDelIter.Close()) - } - if rkeyIter != nil { - err = firstError(err, rkeyIter.Close()) - } - if err != nil { - d.opts.Logger.Infof("ingest error reading flushable for log %s: %s", m.logNum, err) - } - } - - if mem == nil { - // No overlap with any of the queued flushables, so no need to queue - // after them. - - // New writes with higher sequence numbers may be concurrently - // committed. We must ensure they don't flush before this ingest - // completes. 
To do that, we ref the mutable memtable as a writer, - // preventing its flushing (and the flushing of all subsequent - // flushables in the queue). Once we've acquired the manifest lock - // to add the ingested sstables to the LSM, we can unref as we're - // guaranteed that the flush won't edit the LSM before this ingest. - mut = d.mu.mem.mutable - mut.writerRef() - return - } - // The ingestion overlaps with some entry in the flushable queue. - if d.FormatMajorVersion() < FormatFlushableIngest || - d.opts.Experimental.DisableIngestAsFlushable() || - len(shared) > 0 || exciseSpan.Valid() || len(external) > 0 || - (len(d.mu.mem.queue) > d.opts.MemTableStopWritesThreshold-1) { - // We're not able to ingest as a flushable, - // so we must synchronously flush. - // - // TODO(bilal): Currently, if any of the files being ingested are shared or - // there's an excise span present, we cannot use flushable ingests and need - // to wait synchronously. Either remove this caveat by fleshing out - // flushable ingest logic to also account for these cases, or remove this - // comment. Tracking issue: https://github.com/cockroachdb/pebble/issues/2676 - if mem.flushable == d.mu.mem.mutable { - err = d.makeRoomForWrite(nil) - } - // New writes with higher sequence numbers may be concurrently - // committed. We must ensure they don't flush before this ingest - // completes. To do that, we ref the mutable memtable as a writer, - // preventing its flushing (and the flushing of all subsequent - // flushables in the queue). Once we've acquired the manifest lock - // to add the ingested sstables to the LSM, we can unref as we're - // guaranteed that the flush won't edit the LSM before this ingest. - mut = d.mu.mem.mutable - mut.writerRef() - mem.flushForced = true - d.maybeScheduleFlush() - return - } - // Since there aren't too many memtables already queued up, we can - // slide the ingested sstables on top of the existing memtables. 
- asFlushable = true - err = d.handleIngestAsFlushable(loadResult.localMeta, seqNum) - } - - var ve *versionEdit - apply := func(seqNum uint64) { - if err != nil || asFlushable { - // An error occurred during prepare. - if mut != nil { - if mut.writerUnref() { - d.mu.Lock() - d.maybeScheduleFlush() - d.mu.Unlock() - } - } - return - } - - // Update the sequence numbers for all ingested sstables' - // metadata. When the version edit is applied, the metadata is - // written to the manifest, persisting the sequence number. - // The sstables themselves are left unmodified. - if err = ingestUpdateSeqNum( - d.cmp, d.opts.Comparer.FormatKey, seqNum, loadResult, - ); err != nil { - if mut != nil { - if mut.writerUnref() { - d.mu.Lock() - d.maybeScheduleFlush() - d.mu.Unlock() - } - } - return - } - - // If we overlapped with a memtable in prepare wait for the flush to - // finish. - if mem != nil { - <-mem.flushed - } - - // Assign the sstables to the correct level in the LSM and apply the - // version edit. - ve, err = d.ingestApply(jobID, loadResult, targetLevelFunc, mut, exciseSpan) - } - - // Only one ingest can occur at a time because if not, one would block waiting - // for the other to finish applying. This blocking would happen while holding - // the commit mutex which would prevent unrelated batches from writing their - // changes to the WAL and memtable. This will cause a bigger commit hiccup - // during ingestion. - d.commit.ingestSem <- struct{}{} - d.commit.AllocateSeqNum(loadResult.fileCount, prepare, apply) - <-d.commit.ingestSem - - if err != nil { - if err2 := ingestCleanup(d.objProvider, loadResult.localMeta); err2 != nil { - d.opts.Logger.Infof("ingest cleanup failed: %v", err2) - } - } else { - // Since we either created a hard link to the ingesting files, or copied - // them over, it is safe to remove the originals paths. 
- for _, path := range loadResult.localPaths { - if err2 := d.opts.FS.Remove(path); err2 != nil { - d.opts.Logger.Infof("ingest failed to remove original file: %s", err2) - } - } - } - - if invariants.Enabled { - for _, sharedMeta := range loadResult.sharedMeta { - d.checkVirtualBounds(sharedMeta) - } - } - - info := TableIngestInfo{ - JobID: jobID, - Err: err, - flushable: asFlushable, - } - if len(loadResult.localMeta) > 0 { - info.GlobalSeqNum = loadResult.localMeta[0].SmallestSeqNum - } else if len(loadResult.sharedMeta) > 0 { - info.GlobalSeqNum = loadResult.sharedMeta[0].SmallestSeqNum - } else { - info.GlobalSeqNum = loadResult.externalMeta[0].SmallestSeqNum - } - var stats IngestOperationStats - if ve != nil { - info.Tables = make([]struct { - TableInfo - Level int - }, len(ve.NewFiles)) - for i := range ve.NewFiles { - e := &ve.NewFiles[i] - info.Tables[i].Level = e.Level - info.Tables[i].TableInfo = e.Meta.TableInfo() - stats.Bytes += e.Meta.Size - if e.Level == 0 { - stats.ApproxIngestedIntoL0Bytes += e.Meta.Size - } - if i < len(metaFlushableOverlaps) && metaFlushableOverlaps[i] { - stats.MemtableOverlappingFiles++ - } - } - } else if asFlushable { - // NB: If asFlushable == true, there are no shared sstables. - info.Tables = make([]struct { - TableInfo - Level int - }, len(loadResult.localMeta)) - for i, f := range loadResult.localMeta { - info.Tables[i].Level = -1 - info.Tables[i].TableInfo = f.TableInfo() - stats.Bytes += f.Size - // We don't have exact stats on which files will be ingested into - // L0, because actual ingestion into the LSM has been deferred until - // flush time. Instead, we infer based on memtable overlap. - // - // TODO(jackson): If we optimistically compute data overlap (#2112) - // before entering the commit pipeline, we can use that overlap to - // improve our approximation by incorporating overlap with L0, not - // just memtables. 
- if metaFlushableOverlaps[i] { - stats.ApproxIngestedIntoL0Bytes += f.Size - stats.MemtableOverlappingFiles++ - } - } - } - d.opts.EventListener.TableIngested(info) - - return stats, err -} - -// excise updates ve to include a replacement of the file m with new virtual -// sstables that exclude exciseSpan, returning a slice of newly-created files if -// any. If the entirety of m is deleted by exciseSpan, no new sstables are added -// and m is deleted. Note that ve is updated in-place. -// -// The manifest lock must be held when calling this method. -func (d *DB) excise( - exciseSpan KeyRange, m *fileMetadata, ve *versionEdit, level int, -) ([]manifest.NewFileEntry, error) { - numCreatedFiles := 0 - // Check if there's actually an overlap between m and exciseSpan. - if !exciseSpan.Overlaps(d.cmp, m) { - return nil, nil - } - ve.DeletedFiles[deletedFileEntry{ - Level: level, - FileNum: m.FileNum, - }] = m - // Fast path: m sits entirely within the exciseSpan, so just delete it. - if exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) { - return nil, nil - } - var iter internalIterator - var rangeDelIter keyspan.FragmentIterator - var rangeKeyIter keyspan.FragmentIterator - needsBacking := false - // Create a file to the left of the excise span, if necessary. - // The bounds of this file will be [m.Smallest, lastKeyBefore(exciseSpan.Start)]. - // - // We create bounds that are tight on user keys, and we make the effort to find - // the last key in the original sstable that's smaller than exciseSpan.Start - // even though it requires some sstable reads. We could choose to create - // virtual sstables on loose userKey bounds, in which case we could just set - // leftFile.Largest to an exclusive sentinel at exciseSpan.Start. The biggest - // issue with that approach would be that it'd lead to lots of small virtual - // sstables in the LSM that have no guarantee on containing even a single user - // key within the file bounds. 
This has the potential to increase both read and - // write-amp as we will be opening up these sstables only to find no relevant - // keys in the read path, and compacting sstables on top of them instead of - // directly into the space occupied by them. We choose to incur the cost of - // calculating tight bounds at this time instead of creating more work in the - // future. - // - // TODO(bilal): Some of this work can happen without grabbing the manifest - // lock; we could grab one currentVersion, release the lock, calculate excised - // files, then grab the lock again and recalculate for just the files that - // have changed since our previous calculation. Do this optimiaztino as part of - // https://github.com/cockroachdb/pebble/issues/2112 . - if d.cmp(m.Smallest.UserKey, exciseSpan.Start) < 0 { - leftFile := &fileMetadata{ - Virtual: true, - FileBacking: m.FileBacking, - FileNum: d.mu.versions.getNextFileNum(), - // Note that these are loose bounds for smallest/largest seqnums, but they're - // sufficient for maintaining correctness. - SmallestSeqNum: m.SmallestSeqNum, - LargestSeqNum: m.LargestSeqNum, - } - if m.HasPointKeys && !exciseSpan.Contains(d.cmp, m.SmallestPointKey) { - // This file will contain point keys - smallestPointKey := m.SmallestPointKey - var err error - iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{level: manifest.Level(level)}, internalIterOpts{}) - if err != nil { - return nil, err - } - var key *InternalKey - if iter != nil { - defer iter.Close() - key, _ = iter.SeekLT(exciseSpan.Start, base.SeekLTFlagsNone) - } else { - iter = emptyIter - } - if key != nil { - leftFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, key.Clone()) - } - // Store the min of (exciseSpan.Start, rdel.End) in lastRangeDel. This - // needs to be a copy if the key is owned by the range del iter. 
- var lastRangeDel []byte - if rangeDelIter != nil { - defer rangeDelIter.Close() - rdel := rangeDelIter.SeekLT(exciseSpan.Start) - if rdel != nil { - lastRangeDel = append(lastRangeDel[:0], rdel.End...) - if d.cmp(lastRangeDel, exciseSpan.Start) > 0 { - lastRangeDel = exciseSpan.Start - } - } - } else { - rangeDelIter = emptyKeyspanIter - } - if lastRangeDel != nil { - leftFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, lastRangeDel)) - } - } - if m.HasRangeKeys && !exciseSpan.Contains(d.cmp, m.SmallestRangeKey) { - // This file will contain range keys - var err error - smallestRangeKey := m.SmallestRangeKey - rangeKeyIter, err = d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{}) - if err != nil { - return nil, err - } - // Store the min of (exciseSpan.Start, rkey.End) in lastRangeKey. This - // needs to be a copy if the key is owned by the range key iter. - var lastRangeKey []byte - var lastRangeKeyKind InternalKeyKind - defer rangeKeyIter.Close() - rkey := rangeKeyIter.SeekLT(exciseSpan.Start) - if rkey != nil { - lastRangeKey = append(lastRangeKey[:0], rkey.End...) - if d.cmp(lastRangeKey, exciseSpan.Start) > 0 { - lastRangeKey = exciseSpan.Start - } - lastRangeKeyKind = rkey.Keys[0].Kind() - } - if lastRangeKey != nil { - leftFile.ExtendRangeKeyBounds(d.cmp, smallestRangeKey, base.MakeExclusiveSentinelKey(lastRangeKeyKind, lastRangeKey)) - } - } - if leftFile.HasRangeKeys || leftFile.HasPointKeys { - var err error - leftFile.Size, err = d.tableCache.estimateSize(m, leftFile.Smallest.UserKey, leftFile.Largest.UserKey) - if err != nil { - return nil, err - } - if leftFile.Size == 0 { - // On occasion, estimateSize gives us a low estimate, i.e. a 0 file size, - // such as if the excised file only has range keys/dels and no point - // keys. This can cause panics in places where we divide by file sizes. - // Correct for it here. 
- leftFile.Size = 1 - } - if err := leftFile.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil { - return nil, err - } - leftFile.ValidateVirtual(m) - d.checkVirtualBounds(leftFile) - ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: leftFile}) - needsBacking = true - numCreatedFiles++ - } - } - // Create a file to the right, if necessary. - if exciseSpan.Contains(d.cmp, m.Largest) { - // No key exists to the right of the excise span in this file. - if needsBacking && !m.Virtual { - // If m is virtual, then its file backing is already known to the manifest. - // We don't need to create another file backing. Note that there must be - // only one CreatedBackingTables entry per backing sstable. This is - // indicated by the VersionEdit.CreatedBackingTables invariant. - ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) - } - return ve.NewFiles[len(ve.NewFiles)-numCreatedFiles:], nil - } - // Create a new file, rightFile, between [firstKeyAfter(exciseSpan.End), m.Largest]. - // - // See comment before the definition of leftFile for the motivation behind - // calculating tight user-key bounds. - rightFile := &fileMetadata{ - Virtual: true, - FileBacking: m.FileBacking, - FileNum: d.mu.versions.getNextFileNum(), - // Note that these are loose bounds for smallest/largest seqnums, but they're - // sufficient for maintaining correctness. 
- SmallestSeqNum: m.SmallestSeqNum, - LargestSeqNum: m.LargestSeqNum, - } - if m.HasPointKeys && !exciseSpan.Contains(d.cmp, m.LargestPointKey) { - // This file will contain point keys - largestPointKey := m.LargestPointKey - var err error - if iter == nil && rangeDelIter == nil { - iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{level: manifest.Level(level)}, internalIterOpts{}) - if err != nil { - return nil, err - } - if iter != nil { - defer iter.Close() - } else { - iter = emptyIter - } - if rangeDelIter != nil { - defer rangeDelIter.Close() - } else { - rangeDelIter = emptyKeyspanIter - } - } - key, _ := iter.SeekGE(exciseSpan.End, base.SeekGEFlagsNone) - if key != nil { - rightFile.ExtendPointKeyBounds(d.cmp, key.Clone(), largestPointKey) - } - // Store the max of (exciseSpan.End, rdel.Start) in firstRangeDel. This - // needs to be a copy if the key is owned by the range del iter. - var firstRangeDel []byte - rdel := rangeDelIter.SeekGE(exciseSpan.End) - if rdel != nil { - firstRangeDel = append(firstRangeDel[:0], rdel.Start...) - if d.cmp(firstRangeDel, exciseSpan.End) < 0 { - firstRangeDel = exciseSpan.End - } - } - if firstRangeDel != nil { - smallestPointKey := rdel.SmallestKey() - smallestPointKey.UserKey = firstRangeDel - rightFile.ExtendPointKeyBounds(d.cmp, smallestPointKey, largestPointKey) - } - } - if m.HasRangeKeys && !exciseSpan.Contains(d.cmp, m.LargestRangeKey) { - // This file will contain range keys. - largestRangeKey := m.LargestRangeKey - if rangeKeyIter == nil { - var err error - rangeKeyIter, err = d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{}) - if err != nil { - return nil, err - } - defer rangeKeyIter.Close() - } - // Store the max of (exciseSpan.End, rkey.Start) in firstRangeKey. This - // needs to be a copy if the key is owned by the range key iter. - var firstRangeKey []byte - rkey := rangeKeyIter.SeekGE(exciseSpan.End) - if rkey != nil { - firstRangeKey = append(firstRangeKey[:0], rkey.Start...) 
- if d.cmp(firstRangeKey, exciseSpan.End) < 0 { - firstRangeKey = exciseSpan.End - } - } - if firstRangeKey != nil { - smallestRangeKey := rkey.SmallestKey() - smallestRangeKey.UserKey = firstRangeKey - // We call ExtendRangeKeyBounds so any internal boundType fields are - // set correctly. Note that this is mildly wasteful as we'll be comparing - // rightFile.{Smallest,Largest}RangeKey with themselves, which can be - // avoided if we exported ExtendOverallKeyBounds or so. - rightFile.ExtendRangeKeyBounds(d.cmp, smallestRangeKey, largestRangeKey) - } - } - if rightFile.HasRangeKeys || rightFile.HasPointKeys { - var err error - rightFile.Size, err = d.tableCache.estimateSize(m, rightFile.Smallest.UserKey, rightFile.Largest.UserKey) - if err != nil { - return nil, err - } - if rightFile.Size == 0 { - // On occasion, estimateSize gives us a low estimate, i.e. a 0 file size, - // such as if the excised file only has range keys/dels and no point keys. - // This can cause panics in places where we divide by file sizes. Correct - // for it here. - rightFile.Size = 1 - } - rightFile.ValidateVirtual(m) - d.checkVirtualBounds(rightFile) - ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: level, Meta: rightFile}) - needsBacking = true - numCreatedFiles++ - } - - if needsBacking && !m.Virtual { - // If m is virtual, then its file backing is already known to the manifest. - // We don't need to create another file backing. Note that there must be - // only one CreatedBackingTables entry per backing sstable. This is - // indicated by the VersionEdit.CreatedBackingTables invariant. 
- ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) - } - - if err := rightFile.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil { - return nil, err - } - return ve.NewFiles[len(ve.NewFiles)-numCreatedFiles:], nil -} - -type ingestTargetLevelFunc func( - newIters tableNewIters, - newRangeKeyIter keyspan.TableNewSpanIter, - iterOps IterOptions, - comparer *Comparer, - v *version, - baseLevel int, - compactions map[*compaction]struct{}, - meta *fileMetadata, - suggestSplit bool, -) (int, *fileMetadata, error) - -type ingestSplitFile struct { - // ingestFile is the file being ingested. - ingestFile *fileMetadata - // splitFile is the file that needs to be split to allow ingestFile to slot - // into `level` level. - splitFile *fileMetadata - // The level where ingestFile will go (and where splitFile already is). - level int -} - -// ingestSplit splits files specified in `files` and updates ve in-place to -// account for existing files getting split into two virtual sstables. The map -// `replacedFiles` contains an in-progress map of all files that have been -// replaced with new virtual sstables in this version edit so far, which is also -// updated in-place. -// -// d.mu as well as the manifest lock must be held when calling this method. -func (d *DB) ingestSplit( - ve *versionEdit, - updateMetrics func(*fileMetadata, int, []newFileEntry), - files []ingestSplitFile, - replacedFiles map[base.FileNum][]newFileEntry, -) error { - for _, s := range files { - // replacedFiles can be thought of as a tree, where we start iterating with - // s.splitFile and run its fileNum through replacedFiles, then find which of - // the replaced files overlaps with s.ingestFile, which becomes the new - // splitFile, then we check splitFile's replacements in replacedFiles again - // for overlap with s.ingestFile, and so on until we either can't find the - // current splitFile in replacedFiles (i.e. 
that's the file that now needs to - // be split), or we don't find a file that overlaps with s.ingestFile, which - // means a prior ingest split already produced enough room for s.ingestFile - // to go into this level without necessitating another ingest split. - splitFile := s.splitFile - for splitFile != nil { - replaced, ok := replacedFiles[splitFile.FileNum] - if !ok { - break - } - updatedSplitFile := false - for i := range replaced { - if replaced[i].Meta.Overlaps(d.cmp, s.ingestFile.Smallest.UserKey, s.ingestFile.Largest.UserKey, s.ingestFile.Largest.IsExclusiveSentinel()) { - if updatedSplitFile { - // This should never happen because the earlier ingestTargetLevel - // function only finds split file candidates that are guaranteed to - // have no data overlap, only boundary overlap. See the comments - // in that method to see the definitions of data vs boundary - // overlap. That, plus the fact that files in `replaced` are - // guaranteed to have file bounds that are tight on user keys - // (as that's what `d.excise` produces), means that the only case - // where we overlap with two or more files in `replaced` is if we - // actually had data overlap all along, or if the ingestion files - // were overlapping, either of which is an invariant violation. - panic("updated with two files in ingestSplit") - } - splitFile = replaced[i].Meta - updatedSplitFile = true - } - } - if !updatedSplitFile { - // None of the replaced files overlapped with the file being ingested. - // This can happen if we've already excised a span overlapping with - // this file, or if we have consecutive ingested files that can slide - // within the same gap between keys in an existing file. For instance, - // if an existing file has keys a and g and we're ingesting b-c, d-e, - // the first loop iteration will split the existing file into one that - // ends in a and another that starts at g, and the second iteration will - // fall into this case and require no splitting. 
- // - // No splitting necessary. - splitFile = nil - } - } - if splitFile == nil { - continue - } - // NB: excise operates on [start, end). We're splitting at [start, end] - // (assuming !s.ingestFile.Largest.IsExclusiveSentinel()). The conflation - // of exclusive vs inclusive end bounds should not make a difference here - // as we're guaranteed to not have any data overlap between splitFile and - // s.ingestFile, so panic if we do see a newly added file with an endKey - // equalling s.ingestFile.Largest, and !s.ingestFile.Largest.IsExclusiveSentinel() - added, err := d.excise(KeyRange{Start: s.ingestFile.Smallest.UserKey, End: s.ingestFile.Largest.UserKey}, splitFile, ve, s.level) - if err != nil { - return err - } - if _, ok := ve.DeletedFiles[deletedFileEntry{ - Level: s.level, - FileNum: splitFile.FileNum, - }]; !ok { - panic("did not split file that was expected to be split") - } - replacedFiles[splitFile.FileNum] = added - for i := range added { - if s.ingestFile.Overlaps(d.cmp, added[i].Meta.Smallest.UserKey, added[i].Meta.Largest.UserKey, added[i].Meta.Largest.IsExclusiveSentinel()) { - panic("ingest-time split produced a file that overlaps with ingested file") - } - } - updateMetrics(splitFile, s.level, added) - } - // Flatten the version edit by removing any entries from ve.NewFiles that - // are also in ve.DeletedFiles. 
- newNewFiles := ve.NewFiles[:0] - for i := range ve.NewFiles { - fn := ve.NewFiles[i].Meta.FileNum - deEntry := deletedFileEntry{Level: ve.NewFiles[i].Level, FileNum: fn} - if _, ok := ve.DeletedFiles[deEntry]; ok { - delete(ve.DeletedFiles, deEntry) - } else { - newNewFiles = append(newNewFiles, ve.NewFiles[i]) - } - } - ve.NewFiles = newNewFiles - return nil -} - -func (d *DB) ingestApply( - jobID int, - lr ingestLoadResult, - findTargetLevel ingestTargetLevelFunc, - mut *memTable, - exciseSpan KeyRange, -) (*versionEdit, error) { - d.mu.Lock() - defer d.mu.Unlock() - - ve := &versionEdit{ - NewFiles: make([]newFileEntry, lr.fileCount), - } - if exciseSpan.Valid() || (d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit()) { - ve.DeletedFiles = map[manifest.DeletedFileEntry]*manifest.FileMetadata{} - } - metrics := make(map[int]*LevelMetrics) - - // Lock the manifest for writing before we use the current version to - // determine the target level. This prevents two concurrent ingestion jobs - // from using the same version to determine the target level, and also - // provides serialization with concurrent compaction and flush jobs. - // logAndApply unconditionally releases the manifest lock, but any earlier - // returns must unlock the manifest. - d.mu.versions.logLock() - - if mut != nil { - // Unref the mutable memtable to allows its flush to proceed. Now that we've - // acquired the manifest lock, we can be certain that if the mutable - // memtable has received more recent conflicting writes, the flush won't - // beat us to applying to the manifest resulting in sequence number - // inversion. Even though we call maybeScheduleFlush right now, this flush - // will apply after our ingestion. 
- if mut.writerUnref() { - d.maybeScheduleFlush() - } - } - - shouldIngestSplit := d.opts.Experimental.IngestSplit != nil && - d.opts.Experimental.IngestSplit() && d.FormatMajorVersion() >= FormatVirtualSSTables - current := d.mu.versions.currentVersion() - baseLevel := d.mu.versions.picker.getBaseLevel() - iterOps := IterOptions{logger: d.opts.Logger} - // filesToSplit is a list where each element is a pair consisting of a file - // being ingested and a file being split to make room for an ingestion into - // that level. Each ingested file will appear at most once in this list. It - // is possible for split files to appear twice in this list. - filesToSplit := make([]ingestSplitFile, 0) - checkCompactions := false - for i := 0; i < lr.fileCount; i++ { - // Determine the lowest level in the LSM for which the sstable doesn't - // overlap any existing files in the level. - var m *fileMetadata - sharedIdx := -1 - sharedLevel := -1 - externalFile := false - if i < len(lr.localMeta) { - // local file. - m = lr.localMeta[i] - } else if (i - len(lr.localMeta)) < len(lr.sharedMeta) { - // shared file. - sharedIdx = i - len(lr.localMeta) - m = lr.sharedMeta[sharedIdx] - sharedLevel = int(lr.sharedLevels[sharedIdx]) - } else { - // external file. - externalFile = true - m = lr.externalMeta[i-(len(lr.localMeta)+len(lr.sharedMeta))] - } - f := &ve.NewFiles[i] - var err error - if sharedIdx >= 0 { - f.Level = sharedLevel - if f.Level < sharedLevelsStart { - panic("cannot slot a shared file higher than the highest shared level") - } - ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) - } else { - if externalFile { - ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.FileBacking) - } - var splitFile *fileMetadata - if exciseSpan.Valid() && exciseSpan.Contains(d.cmp, m.Smallest) && exciseSpan.Contains(d.cmp, m.Largest) { - // This file fits perfectly within the excise span. We can slot it at - // L6, or sharedLevelsStart - 1 if we have shared files. 
- if len(lr.sharedMeta) > 0 { - f.Level = sharedLevelsStart - 1 - if baseLevel > f.Level { - f.Level = 0 - } - } else { - f.Level = 6 - } - } else { - // TODO(bilal): findTargetLevel does disk IO (reading files for data - // overlap) even though we're holding onto d.mu. Consider unlocking - // d.mu while we do this. We already hold versions.logLock so we should - // not see any version applications while we're at this. The one - // complication here would be pulling out the mu.compact.inProgress - // check from findTargetLevel, as that requires d.mu to be held. - f.Level, splitFile, err = findTargetLevel( - d.newIters, d.tableNewRangeKeyIter, iterOps, d.opts.Comparer, current, baseLevel, d.mu.compact.inProgress, m, shouldIngestSplit) - } - - if splitFile != nil { - if invariants.Enabled { - if lf := current.Levels[f.Level].Find(d.cmp, splitFile); lf == nil { - panic("splitFile returned is not in level it should be") - } - } - // We take advantage of the fact that we won't drop the db mutex - // between now and the call to logAndApply. So, no files should - // get added to a new in-progress compaction at this point. We can - // avoid having to iterate on in-progress compactions to cancel them - // if none of the files being split have a compacting state. - if splitFile.IsCompacting() { - checkCompactions = true - } - filesToSplit = append(filesToSplit, ingestSplitFile{ingestFile: m, splitFile: splitFile, level: f.Level}) - } - } - if err != nil { - d.mu.versions.logUnlock() - return nil, err - } - f.Meta = m - levelMetrics := metrics[f.Level] - if levelMetrics == nil { - levelMetrics = &LevelMetrics{} - metrics[f.Level] = levelMetrics - } - levelMetrics.NumFiles++ - levelMetrics.Size += int64(m.Size) - levelMetrics.BytesIngested += m.Size - levelMetrics.TablesIngested++ - } - // replacedFiles maps files excised due to exciseSpan (or splitFiles returned - // by ingestTargetLevel), to files that were created to replace it. 
This map - // is used to resolve references to split files in filesToSplit, as it is - // possible for a file that we want to split to no longer exist or have a - // newer fileMetadata due to a split induced by another ingestion file, or an - // excise. - replacedFiles := make(map[base.FileNum][]newFileEntry) - updateLevelMetricsOnExcise := func(m *fileMetadata, level int, added []newFileEntry) { - levelMetrics := metrics[level] - if levelMetrics == nil { - levelMetrics = &LevelMetrics{} - metrics[level] = levelMetrics - } - levelMetrics.NumFiles-- - levelMetrics.Size -= int64(m.Size) - for i := range added { - levelMetrics.NumFiles++ - levelMetrics.Size += int64(added[i].Meta.Size) - } - } - if exciseSpan.Valid() { - // Iterate through all levels and find files that intersect with exciseSpan. - // - // TODO(bilal): We could drop the DB mutex here as we don't need it for - // excises; we only need to hold the version lock which we already are - // holding. However releasing the DB mutex could mess with the - // ingestTargetLevel calculation that happened above, as it assumed that it - // had a complete view of in-progress compactions that wouldn't change - // until logAndApply is called. If we were to drop the mutex now, we could - // schedule another in-progress compaction that would go into the chosen target - // level and lead to file overlap within level (which would panic in - // logAndApply). We should drop the db mutex here, do the excise, then - // re-grab the DB mutex and rerun just the in-progress compaction check to - // see if any new compactions are conflicting with our chosen target levels - // for files, and if they are, we should signal those compactions to error - // out. 
- for level := range current.Levels { - overlaps := current.Overlaps(level, d.cmp, exciseSpan.Start, exciseSpan.End, true /* exclusiveEnd */) - iter := overlaps.Iter() - - for m := iter.First(); m != nil; m = iter.Next() { - newFiles, err := d.excise(exciseSpan, m, ve, level) - if err != nil { - return nil, err - } - - if _, ok := ve.DeletedFiles[deletedFileEntry{ - Level: level, - FileNum: m.FileNum, - }]; !ok { - // We did not excise this file. - continue - } - replacedFiles[m.FileNum] = newFiles - updateLevelMetricsOnExcise(m, level, newFiles) - } - } - } - if len(filesToSplit) > 0 { - // For the same reasons as the above call to excise, we hold the db mutex - // while calling this method. - if err := d.ingestSplit(ve, updateLevelMetricsOnExcise, filesToSplit, replacedFiles); err != nil { - return nil, err - } - } - if len(filesToSplit) > 0 || exciseSpan.Valid() { - for c := range d.mu.compact.inProgress { - if c.versionEditApplied { - continue - } - // Check if this compaction overlaps with the excise span. Note that just - // checking if the inputs individually overlap with the excise span - // isn't sufficient; for instance, a compaction could have [a,b] and [e,f] - // as inputs and write it all out as [a,b,e,f] in one sstable. If we're - // doing a [c,d) excise at the same time as this compaction, we will have - // to error out the whole compaction as we can't guarantee it hasn't/won't - // write a file overlapping with the excise span. - if exciseSpan.OverlapsInternalKeyRange(d.cmp, c.smallest, c.largest) { - c.cancel.Store(true) - } - // Check if this compaction's inputs have been replaced due to an - // ingest-time split. In that case, cancel the compaction as a newly picked - // compaction would need to include any new files that slid in between - // previously-existing files. Note that we cancel any compaction that has a - // file that was ingest-split as an input, even if it started before this - // ingestion. 
- if checkCompactions { - for i := range c.inputs { - iter := c.inputs[i].files.Iter() - for f := iter.First(); f != nil; f = iter.Next() { - if _, ok := replacedFiles[f.FileNum]; ok { - c.cancel.Store(true) - break - } - } - } - } - } - // Check for any EventuallyFileOnlySnapshots that could be watching for - // an excise on this span. - if exciseSpan.Valid() { - for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; s = s.next { - if s.efos == nil { - continue - } - efos := s.efos - // TODO(bilal): We can make this faster by taking advantage of the sorted - // nature of protectedRanges to do a sort.Search, or even maintaining a - // global list of all protected ranges instead of having to peer into every - // snapshot. - for i := range efos.protectedRanges { - if efos.protectedRanges[i].OverlapsKeyRange(d.cmp, exciseSpan) { - efos.excised.Store(true) - break - } - } - } - } - } - if err := d.mu.versions.logAndApply(jobID, ve, metrics, false /* forceRotation */, func() []compactionInfo { - return d.getInProgressCompactionInfoLocked(nil) - }); err != nil { - return nil, err - } - - d.mu.versions.metrics.Ingest.Count++ - - d.updateReadStateLocked(d.opts.DebugCheck) - // updateReadStateLocked could have generated obsolete tables, schedule a - // cleanup job if necessary. - d.deleteObsoleteFiles(jobID) - d.updateTableStatsLocked(ve.NewFiles) - // The ingestion may have pushed a level over the threshold for compaction, - // so check to see if one is necessary and schedule it. 
- d.maybeScheduleCompaction() - var toValidate []manifest.NewFileEntry - dedup := make(map[base.DiskFileNum]struct{}) - for _, entry := range ve.NewFiles { - if _, ok := dedup[entry.Meta.FileBacking.DiskFileNum]; !ok { - toValidate = append(toValidate, entry) - dedup[entry.Meta.FileBacking.DiskFileNum] = struct{}{} - } - } - d.maybeValidateSSTablesLocked(toValidate) - return ve, nil -} - -// maybeValidateSSTablesLocked adds the slice of newFileEntrys to the pending -// queue of files to be validated, when the feature is enabled. -// -// Note that if two entries with the same backing file are added twice, then the -// block checksums for the backing file will be validated twice. -// -// DB.mu must be locked when calling. -func (d *DB) maybeValidateSSTablesLocked(newFiles []newFileEntry) { - // Only add to the validation queue when the feature is enabled. - if !d.opts.Experimental.ValidateOnIngest { - return - } - - d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, newFiles...) - if d.shouldValidateSSTablesLocked() { - go d.validateSSTables() - } -} - -// shouldValidateSSTablesLocked returns true if SSTable validation should run. -// DB.mu must be locked when calling. -func (d *DB) shouldValidateSSTablesLocked() bool { - return !d.mu.tableValidation.validating && - d.closed.Load() == nil && - d.opts.Experimental.ValidateOnIngest && - len(d.mu.tableValidation.pending) > 0 -} - -// validateSSTables runs a round of validation on the tables in the pending -// queue. -func (d *DB) validateSSTables() { - d.mu.Lock() - if !d.shouldValidateSSTablesLocked() { - d.mu.Unlock() - return - } - - pending := d.mu.tableValidation.pending - d.mu.tableValidation.pending = nil - d.mu.tableValidation.validating = true - jobID := d.mu.nextJobID - d.mu.nextJobID++ - rs := d.loadReadState() - - // Drop DB.mu before performing IO. - d.mu.Unlock() - - // Validate all tables in the pending queue. 
This could lead to a situation - // where we are starving IO from other tasks due to having to page through - // all the blocks in all the sstables in the queue. - // TODO(travers): Add some form of pacing to avoid IO starvation. - for _, f := range pending { - // The file may have been moved or deleted since it was ingested, in - // which case we skip. - if !rs.current.Contains(f.Level, d.cmp, f.Meta) { - // Assume the file was moved to a lower level. It is rare enough - // that a table is moved or deleted between the time it was ingested - // and the time the validation routine runs that the overall cost of - // this inner loop is tolerably low, when amortized over all - // ingested tables. - found := false - for i := f.Level + 1; i < numLevels; i++ { - if rs.current.Contains(i, d.cmp, f.Meta) { - found = true - break - } - } - if !found { - continue - } - } - - var err error - if f.Meta.Virtual { - err = d.tableCache.withVirtualReader( - f.Meta.VirtualMeta(), func(v sstable.VirtualReader) error { - return v.ValidateBlockChecksumsOnBacking() - }) - } else { - err = d.tableCache.withReader( - f.Meta.PhysicalMeta(), func(r *sstable.Reader) error { - return r.ValidateBlockChecksums() - }) - } - - if err != nil { - // TODO(travers): Hook into the corruption reporting pipeline, once - // available. See pebble#1192. 
- d.opts.Logger.Fatalf("pebble: encountered corruption during ingestion: %s", err) - } - - d.opts.EventListener.TableValidated(TableValidatedInfo{ - JobID: jobID, - Meta: f.Meta, - }) - } - rs.unref() - - d.mu.Lock() - defer d.mu.Unlock() - d.mu.tableValidation.validating = false - d.mu.tableValidation.cond.Broadcast() - if d.shouldValidateSSTablesLocked() { - go d.validateSSTables() - } -} diff --git a/vendor/github.com/cockroachdb/pebble/internal.go b/vendor/github.com/cockroachdb/pebble/internal.go deleted file mode 100644 index 61a4284..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal.go +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package pebble - -import "github.com/cockroachdb/pebble/internal/base" - -// InternalKeyKind exports the base.InternalKeyKind type. -type InternalKeyKind = base.InternalKeyKind - -// These constants are part of the file format, and should not be changed. 
-const ( - InternalKeyKindDelete = base.InternalKeyKindDelete - InternalKeyKindSet = base.InternalKeyKindSet - InternalKeyKindMerge = base.InternalKeyKindMerge - InternalKeyKindLogData = base.InternalKeyKindLogData - InternalKeyKindSingleDelete = base.InternalKeyKindSingleDelete - InternalKeyKindRangeDelete = base.InternalKeyKindRangeDelete - InternalKeyKindMax = base.InternalKeyKindMax - InternalKeyKindSetWithDelete = base.InternalKeyKindSetWithDelete - InternalKeyKindRangeKeySet = base.InternalKeyKindRangeKeySet - InternalKeyKindRangeKeyUnset = base.InternalKeyKindRangeKeyUnset - InternalKeyKindRangeKeyDelete = base.InternalKeyKindRangeKeyDelete - InternalKeyKindIngestSST = base.InternalKeyKindIngestSST - InternalKeyKindDeleteSized = base.InternalKeyKindDeleteSized - InternalKeyKindInvalid = base.InternalKeyKindInvalid - InternalKeySeqNumBatch = base.InternalKeySeqNumBatch - InternalKeySeqNumMax = base.InternalKeySeqNumMax - InternalKeyRangeDeleteSentinel = base.InternalKeyRangeDeleteSentinel -) - -// InternalKey exports the base.InternalKey type. -type InternalKey = base.InternalKey - -type internalIterator = base.InternalIterator - -// ErrCorruption is a marker to indicate that data in a file (WAL, MANIFEST, -// sstable) isn't in the expected format. -var ErrCorruption = base.ErrCorruption - -// AttributeAndLen exports the base.AttributeAndLen type. -type AttributeAndLen = base.AttributeAndLen - -// ShortAttribute exports the base.ShortAttribute type. -type ShortAttribute = base.ShortAttribute - -// LazyFetcher exports the base.LazyFetcher type. This export is needed since -// LazyValue.Clone requires a pointer to a LazyFetcher struct to avoid -// allocations. No code outside Pebble needs to peer into a LazyFetcher. 
-type LazyFetcher = base.LazyFetcher diff --git a/vendor/github.com/cockroachdb/pebble/internal/arenaskl/iterator.go b/vendor/github.com/cockroachdb/pebble/internal/arenaskl/iterator.go deleted file mode 100644 index bad4909..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/arenaskl/iterator.go +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright 2017 Dgraph Labs, Inc. and Contributors - * Modifications copyright (C) 2017 Andy Kimball and Contributors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package arenaskl - -import ( - "sync" - - "github.com/cockroachdb/pebble/internal/base" -) - -type splice struct { - prev *node - next *node -} - -func (s *splice) init(prev, next *node) { - s.prev = prev - s.next = next -} - -// Iterator is an iterator over the skiplist object. Use Skiplist.NewIter -// to construct an iterator. The current state of the iterator can be cloned by -// simply value copying the struct. All iterator methods are thread-safe. -type Iterator struct { - list *Skiplist - nd *node - key base.InternalKey - lower []byte - upper []byte -} - -// Iterator implements the base.InternalIterator interface. -var _ base.InternalIterator = (*Iterator)(nil) - -var iterPool = sync.Pool{ - New: func() interface{} { - return &Iterator{} - }, -} - -// Close resets the iterator. 
-func (it *Iterator) Close() error { - it.list = nil - it.nd = nil - it.lower = nil - it.upper = nil - iterPool.Put(it) - return nil -} - -func (it *Iterator) String() string { - return "memtable" -} - -// Error returns any accumulated error. -func (it *Iterator) Error() error { - return nil -} - -// SeekGE moves the iterator to the first entry whose key is greater than or -// equal to the given key. Returns the key and value if the iterator is -// pointing at a valid entry, and (nil, nil) otherwise. Note that SeekGE only -// checks the upper bound. It is up to the caller to ensure that key is greater -// than or equal to the lower bound. -func (it *Iterator) SeekGE(key []byte, flags base.SeekGEFlags) (*base.InternalKey, base.LazyValue) { - if flags.TrySeekUsingNext() { - if it.nd == it.list.tail { - // Iterator is done. - return nil, base.LazyValue{} - } - less := it.list.cmp(it.key.UserKey, key) < 0 - // Arbitrary constant. By measuring the seek cost as a function of the - // number of elements in the skip list, and fitting to a model, we - // could adjust the number of nexts based on the current size of the - // skip list. - const numNexts = 5 - for i := 0; less && i < numNexts; i++ { - k, _ := it.Next() - if k == nil { - // Iterator is done. - return nil, base.LazyValue{} - } - less = it.list.cmp(it.key.UserKey, key) < 0 - } - if !less { - return &it.key, base.MakeInPlaceValue(it.value()) - } - } - _, it.nd, _ = it.seekForBaseSplice(key) - if it.nd == it.list.tail { - return nil, base.LazyValue{} - } - it.decodeKey() - if it.upper != nil && it.list.cmp(it.upper, it.key.UserKey) <= 0 { - it.nd = it.list.tail - return nil, base.LazyValue{} - } - return &it.key, base.MakeInPlaceValue(it.value()) -} - -// SeekPrefixGE moves the iterator to the first entry whose key is greater than -// or equal to the given key. This method is equivalent to SeekGE and is -// provided so that an arenaskl.Iterator implements the -// internal/base.InternalIterator interface. 
-func (it *Iterator) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - return it.SeekGE(key, flags) -} - -// SeekLT moves the iterator to the last entry whose key is less than the given -// key. Returns the key and value if the iterator is pointing at a valid entry, -// and (nil, nil) otherwise. Note that SeekLT only checks the lower bound. It -// is up to the caller to ensure that key is less than the upper bound. -func (it *Iterator) SeekLT(key []byte, flags base.SeekLTFlags) (*base.InternalKey, base.LazyValue) { - // NB: the top-level Iterator has already adjusted key based on - // the upper-bound. - it.nd, _, _ = it.seekForBaseSplice(key) - if it.nd == it.list.head { - return nil, base.LazyValue{} - } - it.decodeKey() - if it.lower != nil && it.list.cmp(it.lower, it.key.UserKey) > 0 { - it.nd = it.list.head - return nil, base.LazyValue{} - } - return &it.key, base.MakeInPlaceValue(it.value()) -} - -// First seeks position at the first entry in list. Returns the key and value -// if the iterator is pointing at a valid entry, and (nil, nil) otherwise. Note -// that First only checks the upper bound. It is up to the caller to ensure -// that key is greater than or equal to the lower bound (e.g. via a call to SeekGE(lower)). -func (it *Iterator) First() (*base.InternalKey, base.LazyValue) { - it.nd = it.list.getNext(it.list.head, 0) - if it.nd == it.list.tail { - return nil, base.LazyValue{} - } - it.decodeKey() - if it.upper != nil && it.list.cmp(it.upper, it.key.UserKey) <= 0 { - it.nd = it.list.tail - return nil, base.LazyValue{} - } - return &it.key, base.MakeInPlaceValue(it.value()) -} - -// Last seeks position at the last entry in list. Returns the key and value if -// the iterator is pointing at a valid entry, and (nil, nil) otherwise. Note -// that Last only checks the lower bound. It is up to the caller to ensure that -// key is less than the upper bound (e.g. via a call to SeekLT(upper)). 
-func (it *Iterator) Last() (*base.InternalKey, base.LazyValue) { - it.nd = it.list.getPrev(it.list.tail, 0) - if it.nd == it.list.head { - return nil, base.LazyValue{} - } - it.decodeKey() - if it.lower != nil && it.list.cmp(it.lower, it.key.UserKey) > 0 { - it.nd = it.list.head - return nil, base.LazyValue{} - } - return &it.key, base.MakeInPlaceValue(it.value()) -} - -// Next advances to the next position. Returns the key and value if the -// iterator is pointing at a valid entry, and (nil, nil) otherwise. -// Note: flushIterator.Next mirrors the implementation of Iterator.Next -// due to performance. Keep the two in sync. -func (it *Iterator) Next() (*base.InternalKey, base.LazyValue) { - it.nd = it.list.getNext(it.nd, 0) - if it.nd == it.list.tail { - return nil, base.LazyValue{} - } - it.decodeKey() - if it.upper != nil && it.list.cmp(it.upper, it.key.UserKey) <= 0 { - it.nd = it.list.tail - return nil, base.LazyValue{} - } - return &it.key, base.MakeInPlaceValue(it.value()) -} - -// NextPrefix advances to the next position with a new prefix. Returns the key -// and value if the iterator is pointing at a valid entry, and (nil, nil) -// otherwise. -func (it *Iterator) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) { - return it.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext()) -} - -// Prev moves to the previous position. Returns the key and value if the -// iterator is pointing at a valid entry, and (nil, nil) otherwise. -func (it *Iterator) Prev() (*base.InternalKey, base.LazyValue) { - it.nd = it.list.getPrev(it.nd, 0) - if it.nd == it.list.head { - return nil, base.LazyValue{} - } - it.decodeKey() - if it.lower != nil && it.list.cmp(it.lower, it.key.UserKey) > 0 { - it.nd = it.list.head - return nil, base.LazyValue{} - } - return &it.key, base.MakeInPlaceValue(it.value()) -} - -// value returns the value at the current position. 
-func (it *Iterator) value() []byte { - return it.nd.getValue(it.list.arena) -} - -// Head true iff the iterator is positioned at the sentinel head node. -func (it *Iterator) Head() bool { - return it.nd == it.list.head -} - -// Tail true iff the iterator is positioned at the sentinel tail node. -func (it *Iterator) Tail() bool { - return it.nd == it.list.tail -} - -// SetBounds sets the lower and upper bounds for the iterator. Note that the -// result of Next and Prev will be undefined until the iterator has been -// repositioned with SeekGE, SeekPrefixGE, SeekLT, First, or Last. -func (it *Iterator) SetBounds(lower, upper []byte) { - it.lower = lower - it.upper = upper -} - -func (it *Iterator) decodeKey() { - it.key.UserKey = it.list.arena.getBytes(it.nd.keyOffset, it.nd.keySize) - it.key.Trailer = it.nd.keyTrailer -} - -func (it *Iterator) seekForBaseSplice(key []byte) (prev, next *node, found bool) { - ikey := base.MakeSearchKey(key) - level := int(it.list.Height() - 1) - - prev = it.list.head - for { - prev, next, found = it.list.findSpliceForLevel(ikey, level, prev) - - if found { - if level != 0 { - // next is pointing at the target node, but we need to find previous on - // the bottom level. - prev = it.list.getPrev(next, 0) - } - break - } - - if level == 0 { - break - } - - level-- - } - - return -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/base/comparer.go b/vendor/github.com/cockroachdb/pebble/internal/base/comparer.go deleted file mode 100644 index a630962..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/base/comparer.go +++ /dev/null @@ -1,260 +0,0 @@ -// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. 
- -package base - -import ( - "bytes" - "encoding/binary" - "fmt" - "strconv" - "unicode/utf8" -) - -// Compare returns -1, 0, or +1 depending on whether a is 'less than', 'equal -// to' or 'greater than' b. The two arguments can only be 'equal' if their -// contents are exactly equal. Furthermore, the empty slice must be 'less than' -// any non-empty slice. Compare is used to compare user keys, such as those -// passed as arguments to the various DB methods, as well as those returned -// from Separator, Successor, and Split. -type Compare func(a, b []byte) int - -// Equal returns true if a and b are equivalent. For a given Compare, -// Equal(a,b) must return true iff Compare(a,b) returns zero, that is, -// Equal is a (potentially faster) specialization of Compare. -type Equal func(a, b []byte) bool - -// AbbreviatedKey returns a fixed length prefix of a user key such that AbbreviatedKey(a) -// < AbbreviatedKey(b) iff a < b and AbbreviatedKey(a) > AbbreviatedKey(b) iff a > b. If -// AbbreviatedKey(a) == AbbreviatedKey(b) an additional comparison is required to -// determine if the two keys are actually equal. -// -// This helps optimize indexed batch comparisons for cache locality. If a Split -// function is specified, AbbreviatedKey usually returns the first eight bytes -// of the user key prefix in the order that gives the correct ordering. -type AbbreviatedKey func(key []byte) uint64 - -// FormatKey returns a formatter for the user key. -type FormatKey func(key []byte) fmt.Formatter - -// FormatValue returns a formatter for the user value. The key is also -// specified for the value formatter in order to support value formatting that -// is dependent on the key. -type FormatValue func(key, value []byte) fmt.Formatter - -// Separator is used to construct SSTable index blocks. A trivial implementation -// is `return a`, but appending fewer bytes leads to smaller SSTables. 
-// -// Given keys a, b for which Compare(a, b) < 0, Separator returns a key k such -// that: -// -// 1. Compare(a, k) <= 0, and -// 2. Compare(k, b) < 0. -// -// As a special case, b may be nil in which case the second condition is dropped. -// -// For example, if dst, a and b are the []byte equivalents of the strings -// "aqua", "black" and "blue", then the result may be "aquablb". -// Similarly, if the arguments were "aqua", "green" and "", then the result -// may be "aquah". -type Separator func(dst, a, b []byte) []byte - -// Successor returns a shortened key given a key a, such that Compare(k, a) >= -// 0. A simple implementation may return a unchanged. The dst parameter may be -// used to store the returned key, though it is valid to pass nil. The returned -// key must be valid to pass to Compare. -type Successor func(dst, a []byte) []byte - -// ImmediateSuccessor is invoked with a prefix key ([Split(a) == len(a)]) and -// returns the smallest key that is larger than the given prefix a. -// ImmediateSuccessor must return a prefix key k such that: -// -// Split(k) == len(k) and Compare(k, a) > 0 -// -// and there exists no representable k2 such that: -// -// Split(k2) == len(k2) and Compare(k2, a) > 0 and Compare(k2, k) < 0 -// -// As an example, an implementation built on the natural byte ordering using -// bytes.Compare could append a `\0` to `a`. -// -// The dst parameter may be used to store the returned key, though it is valid -// to pass nil. The returned key must be valid to pass to Compare. -type ImmediateSuccessor func(dst, a []byte) []byte - -// Split returns the length of the prefix of the user key that corresponds to -// the key portion of an MVCC encoding scheme to enable the use of prefix bloom -// filters. -// -// The method will only ever be called with valid MVCC keys, that is, keys that -// the user could potentially store in the database. 
Pebble does not know which -// keys are MVCC keys and which are not, and may call Split on both MVCC keys -// and non-MVCC keys. -// -// A trivial MVCC scheme is one in which Split() returns len(a). This -// corresponds to assigning a constant version to each key in the database. For -// performance reasons, it is preferable to use a `nil` split in this case. -// -// The returned prefix must have the following properties: -// -// 1. The prefix must be a byte prefix: -// -// bytes.HasPrefix(a, prefix(a)) -// -// 2. A key consisting of just a prefix must sort before all other keys with -// that prefix: -// -// Compare(prefix(a), a) < 0 if len(suffix(a)) > 0 -// -// 3. Prefixes must be used to order keys before suffixes: -// -// If Compare(a, b) <= 0, then Compare(prefix(a), prefix(b)) <= 0 -// -// 4. Suffixes themselves must be valid keys and comparable, respecting the same -// ordering as within a key. -// -// If Compare(prefix(a), prefix(b)) == 0, then Compare(suffix(a), suffix(b)) == Compare(a, b) -type Split func(a []byte) int - -// Comparer defines a total ordering over the space of []byte keys: a 'less -// than' relationship. -type Comparer struct { - Compare Compare - Equal Equal - AbbreviatedKey AbbreviatedKey - FormatKey FormatKey - FormatValue FormatValue - Separator Separator - Split Split - Successor Successor - ImmediateSuccessor ImmediateSuccessor - - // Name is the name of the comparer. - // - // The Level-DB on-disk format stores the comparer name, and opening a - // database with a different comparer from the one it was created with - // will result in an error. - Name string -} - -// DefaultFormatter is the default implementation of user key formatting: -// non-ASCII data is formatted as escaped hexadecimal values. -var DefaultFormatter = func(key []byte) fmt.Formatter { - return FormatBytes(key) -} - -// DefaultComparer is the default implementation of the Comparer interface. -// It uses the natural ordering, consistent with bytes.Compare. 
-var DefaultComparer = &Comparer{ - Compare: bytes.Compare, - Equal: bytes.Equal, - - AbbreviatedKey: func(key []byte) uint64 { - if len(key) >= 8 { - return binary.BigEndian.Uint64(key) - } - var v uint64 - for _, b := range key { - v <<= 8 - v |= uint64(b) - } - return v << uint(8*(8-len(key))) - }, - - FormatKey: DefaultFormatter, - - Separator: func(dst, a, b []byte) []byte { - i, n := SharedPrefixLen(a, b), len(dst) - dst = append(dst, a...) - - min := len(a) - if min > len(b) { - min = len(b) - } - if i >= min { - // Do not shorten if one string is a prefix of the other. - return dst - } - - if a[i] >= b[i] { - // b is smaller than a or a is already the shortest possible. - return dst - } - - if i < len(b)-1 || a[i]+1 < b[i] { - i += n - dst[i]++ - return dst[:i+1] - } - - i += n + 1 - for ; i < len(dst); i++ { - if dst[i] != 0xff { - dst[i]++ - return dst[:i+1] - } - } - return dst - }, - - Successor: func(dst, a []byte) (ret []byte) { - for i := 0; i < len(a); i++ { - if a[i] != 0xff { - dst = append(dst, a[:i+1]...) - dst[len(dst)-1]++ - return dst - } - } - // a is a run of 0xffs, leave it alone. - return append(dst, a...) - }, - - ImmediateSuccessor: func(dst, a []byte) (ret []byte) { - return append(append(dst, a...), 0x00) - }, - - // This name is part of the C++ Level-DB implementation's default file - // format, and should not be changed. - Name: "leveldb.BytewiseComparator", -} - -// SharedPrefixLen returns the largest i such that a[:i] equals b[:i]. -// This function can be useful in implementing the Comparer interface. -func SharedPrefixLen(a, b []byte) int { - i, n := 0, len(a) - if n > len(b) { - n = len(b) - } - asUint64 := func(c []byte, i int) uint64 { - return binary.LittleEndian.Uint64(c[i:]) - } - for i < n-7 && asUint64(a, i) == asUint64(b, i) { - i += 8 - } - for i < n && a[i] == b[i] { - i++ - } - return i -} - -// FormatBytes formats a byte slice using hexadecimal escapes for non-ASCII -// data. 
-type FormatBytes []byte - -const lowerhex = "0123456789abcdef" - -// Format implements the fmt.Formatter interface. -func (p FormatBytes) Format(s fmt.State, c rune) { - buf := make([]byte, 0, len(p)) - for _, b := range p { - if b < utf8.RuneSelf && strconv.IsPrint(rune(b)) { - buf = append(buf, b) - continue - } - buf = append(buf, `\x`...) - buf = append(buf, lowerhex[b>>4]) - buf = append(buf, lowerhex[b&0xF]) - } - s.Write(buf) -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/base/error.go b/vendor/github.com/cockroachdb/pebble/internal/base/error.go deleted file mode 100644 index 6ef7783..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/base/error.go +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package base - -import "github.com/cockroachdb/errors" - -// ErrNotFound means that a get or delete call did not find the requested key. -var ErrNotFound = errors.New("pebble: not found") - -// ErrCorruption is a marker to indicate that data in a file (WAL, MANIFEST, -// sstable) isn't in the expected format. -var ErrCorruption = errors.New("pebble: corruption") - -// MarkCorruptionError marks given error as a corruption error. -func MarkCorruptionError(err error) error { - if errors.Is(err, ErrCorruption) { - return err - } - return errors.Mark(err, ErrCorruption) -} - -// CorruptionErrorf formats according to a format specifier and returns -// the string as an error value that is marked as a corruption error. 
-func CorruptionErrorf(format string, args ...interface{}) error { - return errors.Mark(errors.Newf(format, args...), ErrCorruption) -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/base/filenames.go b/vendor/github.com/cockroachdb/pebble/internal/base/filenames.go deleted file mode 100644 index d2ba2f0..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/base/filenames.go +++ /dev/null @@ -1,204 +0,0 @@ -// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package base - -import ( - "fmt" - "strconv" - "strings" - - "github.com/cockroachdb/errors/oserror" - "github.com/cockroachdb/pebble/vfs" - "github.com/cockroachdb/redact" -) - -// FileNum is an internal DB identifier for a file. -type FileNum uint64 - -// String returns a string representation of the file number. -func (fn FileNum) String() string { return fmt.Sprintf("%06d", fn) } - -// SafeFormat implements redact.SafeFormatter. -func (fn FileNum) SafeFormat(w redact.SafePrinter, _ rune) { - w.Printf("%06d", redact.SafeUint(fn)) -} - -// DiskFileNum converts a FileNum to a DiskFileNum. DiskFileNum should only be -// called if the caller can ensure that the FileNum belongs to a physical file -// on disk. These could be manifests, log files, physical sstables on disk, the -// options file, but not virtual sstables. -func (fn FileNum) DiskFileNum() DiskFileNum { - return DiskFileNum{fn} -} - -// A DiskFileNum is just a FileNum belonging to a file which exists on disk. -// Note that a FileNum is an internal DB identifier and it could belong to files -// which don't exist on disk. An example would be virtual sstable FileNums. -// Converting a DiskFileNum to a FileNum is always valid, whereas converting a -// FileNum to DiskFileNum may not be valid and care should be taken to prove -// that the FileNum actually exists on disk. 
-type DiskFileNum struct { - fn FileNum -} - -func (dfn DiskFileNum) String() string { return dfn.fn.String() } - -// SafeFormat implements redact.SafeFormatter. -func (dfn DiskFileNum) SafeFormat(w redact.SafePrinter, verb rune) { - dfn.fn.SafeFormat(w, verb) -} - -// FileNum converts a DiskFileNum to a FileNum. This conversion is always valid. -func (dfn DiskFileNum) FileNum() FileNum { - return dfn.fn -} - -// FileType enumerates the types of files found in a DB. -type FileType int - -// The FileType enumeration. -const ( - FileTypeLog FileType = iota - FileTypeLock - FileTypeTable - FileTypeManifest - FileTypeCurrent - FileTypeOptions - FileTypeOldTemp - FileTypeTemp -) - -// MakeFilename builds a filename from components. -func MakeFilename(fileType FileType, dfn DiskFileNum) string { - switch fileType { - case FileTypeLog: - return fmt.Sprintf("%s.log", dfn) - case FileTypeLock: - return "LOCK" - case FileTypeTable: - return fmt.Sprintf("%s.sst", dfn) - case FileTypeManifest: - return fmt.Sprintf("MANIFEST-%s", dfn) - case FileTypeCurrent: - return "CURRENT" - case FileTypeOptions: - return fmt.Sprintf("OPTIONS-%s", dfn) - case FileTypeOldTemp: - return fmt.Sprintf("CURRENT.%s.dbtmp", dfn) - case FileTypeTemp: - return fmt.Sprintf("temporary.%s.dbtmp", dfn) - } - panic("unreachable") -} - -// MakeFilepath builds a filepath from components. -func MakeFilepath(fs vfs.FS, dirname string, fileType FileType, dfn DiskFileNum) string { - return fs.PathJoin(dirname, MakeFilename(fileType, dfn)) -} - -// ParseFilename parses the components from a filename. 
-func ParseFilename(fs vfs.FS, filename string) (fileType FileType, dfn DiskFileNum, ok bool) { - filename = fs.PathBase(filename) - switch { - case filename == "CURRENT": - return FileTypeCurrent, DiskFileNum{0}, true - case filename == "LOCK": - return FileTypeLock, DiskFileNum{0}, true - case strings.HasPrefix(filename, "MANIFEST-"): - dfn, ok = parseDiskFileNum(filename[len("MANIFEST-"):]) - if !ok { - break - } - return FileTypeManifest, dfn, true - case strings.HasPrefix(filename, "OPTIONS-"): - dfn, ok = parseDiskFileNum(filename[len("OPTIONS-"):]) - if !ok { - break - } - return FileTypeOptions, dfn, ok - case strings.HasPrefix(filename, "CURRENT.") && strings.HasSuffix(filename, ".dbtmp"): - s := strings.TrimSuffix(filename[len("CURRENT."):], ".dbtmp") - dfn, ok = parseDiskFileNum(s) - if !ok { - break - } - return FileTypeOldTemp, dfn, ok - case strings.HasPrefix(filename, "temporary.") && strings.HasSuffix(filename, ".dbtmp"): - s := strings.TrimSuffix(filename[len("temporary."):], ".dbtmp") - dfn, ok = parseDiskFileNum(s) - if !ok { - break - } - return FileTypeTemp, dfn, ok - default: - i := strings.IndexByte(filename, '.') - if i < 0 { - break - } - dfn, ok = parseDiskFileNum(filename[:i]) - if !ok { - break - } - switch filename[i+1:] { - case "sst": - return FileTypeTable, dfn, true - case "log": - return FileTypeLog, dfn, true - } - } - return 0, dfn, false -} - -func parseDiskFileNum(s string) (dfn DiskFileNum, ok bool) { - u, err := strconv.ParseUint(s, 10, 64) - if err != nil { - return dfn, false - } - return DiskFileNum{FileNum(u)}, true -} - -// A Fataler fatals a process with a message when called. -type Fataler interface { - Fatalf(format string, args ...interface{}) -} - -// MustExist checks if err is an error indicating a file does not exist. -// If it is, it lists the containing directory's files to annotate the error -// with counts of the various types of files and invokes the provided fataler. -// See cockroachdb/cockroach#56490. 
-func MustExist(fs vfs.FS, filename string, fataler Fataler, err error) { - if err == nil || !oserror.IsNotExist(err) { - return - } - - ls, lsErr := fs.List(fs.PathDir(filename)) - if lsErr != nil { - // TODO(jackson): if oserror.IsNotExist(lsErr), the the data directory - // doesn't exist anymore. Another process likely deleted it before - // killing the process. We want to fatal the process, but without - // triggering error reporting like Sentry. - fataler.Fatalf("%s:\norig err: %s\nlist err: %s", redact.Safe(fs.PathBase(filename)), err, lsErr) - } - var total, unknown, tables, logs, manifests int - total = len(ls) - for _, f := range ls { - typ, _, ok := ParseFilename(fs, f) - if !ok { - unknown++ - continue - } - switch typ { - case FileTypeTable: - tables++ - case FileTypeLog: - logs++ - case FileTypeManifest: - manifests++ - } - } - - fataler.Fatalf("%s:\n%s\ndirectory contains %d files, %d unknown, %d tables, %d logs, %d manifests", - fs.PathBase(filename), err, total, unknown, tables, logs, manifests) -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/cgo_disabled.go b/vendor/github.com/cockroachdb/pebble/internal/cache/cgo_disabled.go deleted file mode 100644 index 0e75574..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/cache/cgo_disabled.go +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -//go:build !cgo -// +build !cgo - -package cache - -const cgoEnabled = false diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/cgo_enabled.go b/vendor/github.com/cockroachdb/pebble/internal/cache/cgo_enabled.go deleted file mode 100644 index b7014cb..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/cache/cgo_enabled.go +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -//go:build cgo -// +build cgo - -package cache - -const cgoEnabled = true diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/entry.go b/vendor/github.com/cockroachdb/pebble/internal/cache/entry.go deleted file mode 100644 index a49fde6..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/cache/entry.go +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package cache - -import "sync/atomic" - -type entryType int8 - -const ( - etTest entryType = iota - etCold - etHot -) - -func (p entryType) String() string { - switch p { - case etTest: - return "test" - case etCold: - return "cold" - case etHot: - return "hot" - } - return "unknown" -} - -// entry holds the metadata for a cache entry. The memory for an entry is -// allocated from manually managed memory. -// -// Using manual memory management for entries is technically a volation of the -// Cgo pointer rules: -// -// https://golang.org/cmd/cgo/#hdr-Passing_pointers -// -// Specifically, Go pointers should not be stored in C allocated memory. The -// reason for this rule is that the Go GC will not look at C allocated memory -// to find pointers to Go objects. If the only reference to a Go object is -// stored in C allocated memory, the object will be reclaimed. The shard field -// of the entry struct points to a Go allocated object, thus the -// violation. What makes this "safe" is that the Cache guarantees that there -// are other pointers to the shard which will keep it alive. -type entry struct { - key key - // The value associated with the entry. The entry holds a reference on the - // value which is maintained by entry.setValue(). 
- val *Value - blockLink struct { - next *entry - prev *entry - } - fileLink struct { - next *entry - prev *entry - } - size int64 - ptype entryType - // referenced is atomically set to indicate that this entry has been accessed - // since the last time one of the clock hands swept it. - referenced atomic.Bool - shard *shard - // Reference count for the entry. The entry is freed when the reference count - // drops to zero. - ref refcnt -} - -func newEntry(s *shard, key key, size int64) *entry { - e := entryAllocNew() - *e = entry{ - key: key, - size: size, - ptype: etCold, - shard: s, - } - e.blockLink.next = e - e.blockLink.prev = e - e.fileLink.next = e - e.fileLink.prev = e - e.ref.init(1) - return e -} - -func (e *entry) free() { - e.setValue(nil) - *e = entry{} - entryAllocFree(e) -} - -func (e *entry) next() *entry { - if e == nil { - return nil - } - return e.blockLink.next -} - -func (e *entry) prev() *entry { - if e == nil { - return nil - } - return e.blockLink.prev -} - -func (e *entry) link(s *entry) { - s.blockLink.prev = e.blockLink.prev - s.blockLink.prev.blockLink.next = s - s.blockLink.next = e - s.blockLink.next.blockLink.prev = s -} - -func (e *entry) unlink() *entry { - next := e.blockLink.next - e.blockLink.prev.blockLink.next = e.blockLink.next - e.blockLink.next.blockLink.prev = e.blockLink.prev - e.blockLink.prev = e - e.blockLink.next = e - return next -} - -func (e *entry) linkFile(s *entry) { - s.fileLink.prev = e.fileLink.prev - s.fileLink.prev.fileLink.next = s - s.fileLink.next = e - s.fileLink.next.fileLink.prev = s -} - -func (e *entry) unlinkFile() *entry { - next := e.fileLink.next - e.fileLink.prev.fileLink.next = e.fileLink.next - e.fileLink.next.fileLink.prev = e.fileLink.prev - e.fileLink.prev = e - e.fileLink.next = e - return next -} - -func (e *entry) setValue(v *Value) { - if v != nil { - v.acquire() - } - old := e.val - e.val = v - old.release() -} - -func (e *entry) peekValue() *Value { - return e.val -} - -func (e 
*entry) acquireValue() *Value { - v := e.val - if v != nil { - v.acquire() - } - return v -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/entry_invariants.go b/vendor/github.com/cockroachdb/pebble/internal/cache/entry_invariants.go deleted file mode 100644 index 31c54e4..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/cache/entry_invariants.go +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. -// -//go:build (invariants && !race) || (tracing && !race) -// +build invariants,!race tracing,!race - -package cache - -import ( - "fmt" - "os" - - "github.com/cockroachdb/pebble/internal/invariants" -) - -// When the "invariants" or "tracing" build tags are enabled, we need to -// allocate entries using the Go allocator so entry.val properly maintains a -// reference to the Value. -const entriesGoAllocated = true - -func entryAllocNew() *entry { - e := &entry{} - // Note: this is a no-op if invariants and tracing are disabled or race is - // enabled. - invariants.SetFinalizer(e, func(obj interface{}) { - e := obj.(*entry) - if v := e.ref.refs(); v != 0 { - fmt.Fprintf(os.Stderr, "%p: cache entry has non-zero reference count: %d\n%s", - e, v, e.ref.traces()) - os.Exit(1) - } - }) - return e -} - -func entryAllocFree(e *entry) { -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/entry_normal.go b/vendor/github.com/cockroachdb/pebble/internal/cache/entry_normal.go deleted file mode 100644 index 92afb04..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/cache/entry_normal.go +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. 
-// -//go:build (!invariants && !tracing) || race -// +build !invariants,!tracing race - -package cache - -import ( - "runtime" - "sync" - "unsafe" - - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/manual" -) - -const ( - entrySize = int(unsafe.Sizeof(entry{})) - entryAllocCacheLimit = 128 - // Avoid using runtime.SetFinalizer in race builds as finalizers tickle a bug - // in the Go race detector in go1.15 and earlier versions. This requires that - // entries are Go allocated rather than manually allocated. - // - // If cgo is disabled we need to allocate the entries using the Go allocator - // and is violates the Go GC rules to put Go pointers (such as the entry - // pointer fields) into untyped memory (i.e. a []byte). - entriesGoAllocated = invariants.RaceEnabled || !cgoEnabled -) - -var entryAllocPool = sync.Pool{ - New: func() interface{} { - return newEntryAllocCache() - }, -} - -func entryAllocNew() *entry { - a := entryAllocPool.Get().(*entryAllocCache) - e := a.alloc() - entryAllocPool.Put(a) - return e -} - -func entryAllocFree(e *entry) { - a := entryAllocPool.Get().(*entryAllocCache) - a.free(e) - entryAllocPool.Put(a) -} - -type entryAllocCache struct { - entries []*entry -} - -func newEntryAllocCache() *entryAllocCache { - c := &entryAllocCache{} - if !entriesGoAllocated { - // Note the use of a "real" finalizer here (as opposed to a build tag-gated - // no-op finalizer). Without the finalizer, objects released from the pool - // and subsequently GC'd by the Go runtime would fail to have their manually - // allocated memory freed, which results in a memory leak. 
- // lint:ignore SetFinalizer - runtime.SetFinalizer(c, freeEntryAllocCache) - } - return c -} - -func freeEntryAllocCache(obj interface{}) { - c := obj.(*entryAllocCache) - for i, e := range c.entries { - c.dealloc(e) - c.entries[i] = nil - } -} - -func (c *entryAllocCache) alloc() *entry { - n := len(c.entries) - if n == 0 { - if entriesGoAllocated { - return &entry{} - } - b := manual.New(entrySize) - return (*entry)(unsafe.Pointer(&b[0])) - } - e := c.entries[n-1] - c.entries = c.entries[:n-1] - return e -} - -func (c *entryAllocCache) dealloc(e *entry) { - if !entriesGoAllocated { - buf := (*[manual.MaxArrayLen]byte)(unsafe.Pointer(e))[:entrySize:entrySize] - manual.Free(buf) - } -} - -func (c *entryAllocCache) free(e *entry) { - if len(c.entries) == entryAllocCacheLimit { - c.dealloc(e) - return - } - c.entries = append(c.entries, e) -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/robin_hood.go b/vendor/github.com/cockroachdb/pebble/internal/cache/robin_hood.go deleted file mode 100644 index 6e093fd..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/cache/robin_hood.go +++ /dev/null @@ -1,320 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. 
- -package cache - -import ( - "fmt" - "math/bits" - "os" - "runtime/debug" - "strings" - "time" - "unsafe" - - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/manual" -) - -var hashSeed = uint64(time.Now().UnixNano()) - -// Fibonacci hash: https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ -func robinHoodHash(k key, shift uint32) uint32 { - const m = 11400714819323198485 - h := hashSeed - h ^= k.id * m - h ^= uint64(k.fileNum.FileNum()) * m - h ^= k.offset * m - return uint32(h >> shift) -} - -type robinHoodEntry struct { - key key - // Note that value may point to a Go allocated object (if the "invariants" - // build tag was specified), even though the memory for the entry itself is - // manually managed. This is technically a volation of the Cgo pointer rules: - // - // https://golang.org/cmd/cgo/#hdr-Passing_pointers - // - // Specifically, Go pointers should not be stored in C allocated memory. The - // reason for this rule is that the Go GC will not look at C allocated memory - // to find pointers to Go objects. If the only reference to a Go object is - // stored in C allocated memory, the object will be reclaimed. What makes - // this "safe" is that the Cache guarantees that there are other pointers to - // the entry and shard which will keep them alive. In particular, every Go - // allocated entry in the cache is referenced by the shard.entries map. And - // every shard is referenced by the Cache.shards map. - value *entry - // The distance the entry is from its desired position. 
- dist uint32 -} - -type robinHoodEntries struct { - ptr unsafe.Pointer - len uint32 -} - -func newRobinHoodEntries(n uint32) robinHoodEntries { - size := uintptr(n) * unsafe.Sizeof(robinHoodEntry{}) - return robinHoodEntries{ - ptr: unsafe.Pointer(&(manual.New(int(size)))[0]), - len: n, - } -} - -func (e robinHoodEntries) at(i uint32) *robinHoodEntry { - return (*robinHoodEntry)(unsafe.Pointer(uintptr(e.ptr) + - uintptr(i)*unsafe.Sizeof(robinHoodEntry{}))) -} - -func (e robinHoodEntries) free() { - size := uintptr(e.len) * unsafe.Sizeof(robinHoodEntry{}) - buf := (*[manual.MaxArrayLen]byte)(e.ptr)[:size:size] - manual.Free(buf) -} - -// robinHoodMap is an implementation of Robin Hood hashing. Robin Hood hashing -// is an open-address hash table using linear probing. The twist is that the -// linear probe distance is reduced by moving existing entries when inserting -// and deleting. This is accomplished by keeping track of how far an entry is -// from its "desired" slot (hash of key modulo number of slots). During -// insertion, if the new entry being inserted is farther from its desired slot -// than the target entry, we swap the target and new entry. This effectively -// steals from the "rich" target entry and gives to the "poor" new entry (thus -// the origin of the name). -// -// An extension over the base Robin Hood hashing idea comes from -// https://probablydance.com/2017/02/26/i-wrote-the-fastest-hashtable/. A cap -// is placed on the max distance an entry can be from its desired slot. When -// this threshold is reached during insertion, the size of the table is doubled -// and insertion is restarted. Additionally, the entries slice is given "max -// dist" extra entries on the end. The very last entry in the entries slice is -// never used and acts as a sentinel which terminates loops. The previous -// maxDist-1 entries act as the extra entries. 
For example, if the size of the -// table is 2, maxDist is computed as 4 and the actual size of the entry slice -// is 6. -// -// +---+---+---+---+---+---+ -// | 0 | 1 | 2 | 3 | 4 | 5 | -// +---+---+---+---+---+---+ -// ^ -// size -// -// In this scenario, the target entry for a key will always be in the range -// [0,1]. Valid entries may reside in the range [0,4] due to the linear probing -// of up to maxDist entries. The entry at index 5 will never contain a value, -// and instead acts as a sentinel (its distance is always 0). The max distance -// threshold is set to log2(num-entries). This ensures that retrieval is O(log -// N), though note that N is the number of total entries, not the count of -// valid entries. -// -// Deletion is implemented via the backward shift delete mechanism instead of -// tombstones. This preserves the performance of the table in the presence of -// deletions. See -// http://codecapsule.com/2013/11/17/robin-hood-hashing-backward-shift-deletion -// for details. -type robinHoodMap struct { - entries robinHoodEntries - size uint32 - shift uint32 - count uint32 - maxDist uint32 -} - -func maxDistForSize(size uint32) uint32 { - desired := uint32(bits.Len32(size)) - if desired < 4 { - desired = 4 - } - return desired -} - -func newRobinHoodMap(initialCapacity int) *robinHoodMap { - m := &robinHoodMap{} - m.init(initialCapacity) - - // Note: this is a no-op if invariants are disabled or race is enabled. 
- invariants.SetFinalizer(m, func(obj interface{}) { - m := obj.(*robinHoodMap) - if m.entries.ptr != nil { - fmt.Fprintf(os.Stderr, "%p: robin-hood map not freed\n", m) - os.Exit(1) - } - }) - return m -} - -func (m *robinHoodMap) init(initialCapacity int) { - if initialCapacity < 1 { - initialCapacity = 1 - } - targetSize := 1 << (uint(bits.Len(uint(2*initialCapacity-1))) - 1) - m.rehash(uint32(targetSize)) -} - -func (m *robinHoodMap) free() { - if m.entries.ptr != nil { - m.entries.free() - m.entries.ptr = nil - } -} - -func (m *robinHoodMap) rehash(size uint32) { - oldEntries := m.entries - - m.size = size - m.shift = uint32(64 - bits.Len32(m.size-1)) - m.maxDist = maxDistForSize(size) - m.entries = newRobinHoodEntries(size + m.maxDist) - m.count = 0 - - for i := uint32(0); i < oldEntries.len; i++ { - e := oldEntries.at(i) - if e.value != nil { - m.Put(e.key, e.value) - } - } - - if oldEntries.ptr != nil { - oldEntries.free() - } -} - -// Find an entry containing the specified value. This is intended to be used -// from debug and test code. -func (m *robinHoodMap) findByValue(v *entry) *robinHoodEntry { - for i := uint32(0); i < m.entries.len; i++ { - e := m.entries.at(i) - if e.value == v { - return e - } - } - return nil -} - -func (m *robinHoodMap) Count() int { - return int(m.count) -} - -func (m *robinHoodMap) Put(k key, v *entry) { - maybeExists := true - n := robinHoodEntry{key: k, value: v, dist: 0} - for i := robinHoodHash(k, m.shift); ; i++ { - e := m.entries.at(i) - if maybeExists && k == e.key { - // Entry already exists: overwrite. - e.value = n.value - m.checkEntry(i) - return - } - - if e.value == nil { - // Found an empty entry: insert here. - *e = n - m.count++ - m.checkEntry(i) - return - } - - if e.dist < n.dist { - // Swap the new entry with the current entry because the current is - // rich. We then continue to loop, looking for a new location for the - // current entry. 
Note that this is also the not-found condition for - // retrieval, which means that "k" is not present in the map. See Get(). - n, *e = *e, n - m.checkEntry(i) - maybeExists = false - } - - // The new entry gradually moves away from its ideal position. - n.dist++ - - // If we've reached the max distance threshold, grow the table and restart - // the insertion. - if n.dist == m.maxDist { - m.rehash(2 * m.size) - i = robinHoodHash(n.key, m.shift) - 1 - n.dist = 0 - maybeExists = false - } - } -} - -func (m *robinHoodMap) Get(k key) *entry { - var dist uint32 - for i := robinHoodHash(k, m.shift); ; i++ { - e := m.entries.at(i) - if k == e.key { - // Found. - return e.value - } - if e.dist < dist { - // Not found. - return nil - } - dist++ - } -} - -func (m *robinHoodMap) Delete(k key) { - var dist uint32 - for i := robinHoodHash(k, m.shift); ; i++ { - e := m.entries.at(i) - if k == e.key { - m.checkEntry(i) - // We found the entry to delete. Shift the following entries backwards - // until the next empty value or entry with a zero distance. Note that - // empty values are guaranteed to have "dist == 0". - m.count-- - for j := i + 1; ; j++ { - t := m.entries.at(j) - if t.dist == 0 { - *e = robinHoodEntry{} - return - } - e.key = t.key - e.value = t.value - e.dist = t.dist - 1 - e = t - m.checkEntry(j) - } - } - if dist > e.dist { - // Not found. 
- return - } - dist++ - } -} - -func (m *robinHoodMap) checkEntry(i uint32) { - if invariants.Enabled { - e := m.entries.at(i) - if e.value != nil { - pos := robinHoodHash(e.key, m.shift) - if (uint32(i) - pos) != e.dist { - fmt.Fprintf(os.Stderr, "%d: invalid dist=%d, expected %d: %s\n%s", - i, e.dist, uint32(i)-pos, e.key, debug.Stack()) - os.Exit(1) - } - if e.dist > m.maxDist { - fmt.Fprintf(os.Stderr, "%d: invalid dist=%d > maxDist=%d: %s\n%s", - i, e.dist, m.maxDist, e.key, debug.Stack()) - os.Exit(1) - } - } - } -} - -func (m *robinHoodMap) String() string { - var buf strings.Builder - fmt.Fprintf(&buf, "count: %d\n", m.count) - for i := uint32(0); i < m.entries.len; i++ { - e := m.entries.at(i) - if e.value != nil { - fmt.Fprintf(&buf, "%d: [%s,%p,%d]\n", i, e.key, e.value, e.dist) - } - } - return buf.String() -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/value.go b/vendor/github.com/cockroachdb/pebble/internal/cache/value.go deleted file mode 100644 index 6d2cae1..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/cache/value.go +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package cache - -// Value holds a reference counted immutable value. -type Value struct { - buf []byte - // Reference count for the value. The value is freed when the reference count - // drops to zero. - ref refcnt -} - -// Buf returns the buffer associated with the value. The contents of the buffer -// should not be changed once the value has been added to the cache. Instead, a -// new Value should be created and added to the cache to replace the existing -// value. -func (v *Value) Buf() []byte { - if v == nil { - return nil - } - return v.buf -} - -// Truncate the buffer to the specified length. 
The buffer length should not be -// changed once the value has been added to the cache as there may be -// concurrent readers of the Value. Instead, a new Value should be created and -// added to the cache to replace the existing value. -func (v *Value) Truncate(n int) { - v.buf = v.buf[:n] -} - -func (v *Value) refs() int32 { - return v.ref.refs() -} - -func (v *Value) acquire() { - v.ref.acquire() -} - -func (v *Value) release() { - if v != nil && v.ref.release() { - v.free() - } -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/value_invariants.go b/vendor/github.com/cockroachdb/pebble/internal/cache/value_invariants.go deleted file mode 100644 index 1e30d27..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/cache/value_invariants.go +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -//go:build (invariants && !race) || (tracing && !race) -// +build invariants,!race tracing,!race - -package cache - -import ( - "fmt" - "os" - - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/manual" -) - -// newValue creates a Value with a manually managed buffer of size n. -// -// This definition of newValue is used when either the "invariants" or -// "tracing" build tags are specified. It hooks up a finalizer to the returned -// Value that checks for memory leaks when the GC determines the Value is no -// longer reachable. -func newValue(n int) *Value { - if n == 0 { - return nil - } - b := manual.New(n) - v := &Value{buf: b} - v.ref.init(1) - // Note: this is a no-op if invariants and tracing are disabled or race is - // enabled. 
- invariants.SetFinalizer(v, func(obj interface{}) { - v := obj.(*Value) - if v.buf != nil { - fmt.Fprintf(os.Stderr, "%p: cache value was not freed: refs=%d\n%s", - v, v.refs(), v.ref.traces()) - os.Exit(1) - } - }) - return v -} - -func (v *Value) free() { - // When "invariants" are enabled set the value contents to 0xff in order to - // cache use-after-free bugs. - for i := range v.buf { - v.buf[i] = 0xff - } - manual.Free(v.buf) - // Setting Value.buf to nil is needed for correctness of the leak checking - // that is performed when the "invariants" or "tracing" build tags are - // enabled. - v.buf = nil -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/value_normal.go b/vendor/github.com/cockroachdb/pebble/internal/cache/value_normal.go deleted file mode 100644 index e03379d..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/cache/value_normal.go +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -//go:build (!invariants && !tracing) || race -// +build !invariants,!tracing race - -package cache - -import ( - "unsafe" - - "github.com/cockroachdb/pebble/internal/manual" -) - -const valueSize = int(unsafe.Sizeof(Value{})) - -func newValue(n int) *Value { - if n == 0 { - return nil - } - - if !cgoEnabled { - // If Cgo is disabled then all memory is allocated from the Go heap and we - // can't play the trick below to combine the Value and buffer allocation. - v := &Value{buf: make([]byte, n)} - v.ref.init(1) - return v - } - - // When we're not performing leak detection, the lifetime of the returned - // Value is exactly the lifetime of the backing buffer and we can manually - // allocate both. - // - // TODO(peter): It may be better to separate the allocation of the value and - // the buffer in order to reduce internal fragmentation in malloc. 
If the - // buffer is right at a power of 2, adding valueSize might push the - // allocation over into the next larger size. - b := manual.New(valueSize + n) - v := (*Value)(unsafe.Pointer(&b[0])) - v.buf = b[valueSize:] - v.ref.init(1) - return v -} - -func (v *Value) free() { - if !cgoEnabled { - return - } - - // When we're not performing leak detection, the Value and buffer were - // allocated contiguously. - n := valueSize + cap(v.buf) - buf := (*[manual.MaxArrayLen]byte)(unsafe.Pointer(v))[:n:n] - v.buf = nil - manual.Free(buf) -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/fastrand/fastrand.go b/vendor/github.com/cockroachdb/pebble/internal/fastrand/fastrand.go deleted file mode 100644 index dd3ec9c..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/fastrand/fastrand.go +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package fastrand - -import _ "unsafe" // required by go:linkname - -// Uint32 returns a lock free uint32 value. -// -//go:linkname Uint32 runtime.fastrand -func Uint32() uint32 - -// Uint32n returns a lock free uint32 value in the interval [0, n). -// -//go:linkname Uint32n runtime.fastrandn -func Uint32n(n uint32) uint32 diff --git a/vendor/github.com/cockroachdb/pebble/internal/invariants/finalizer_off.go b/vendor/github.com/cockroachdb/pebble/internal/invariants/finalizer_off.go deleted file mode 100644 index d2c600a..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/invariants/finalizer_off.go +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. 
- -//go:build (!invariants && !tracing) || race -// +build !invariants,!tracing race - -package invariants - -// SetFinalizer is a wrapper around runtime.SetFinalizer that is a no-op under -// race builds or if neither the invariants or tracing build tags are -// specified. -func SetFinalizer(obj, finalizer interface{}) { -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/invariants/finalizer_on.go b/vendor/github.com/cockroachdb/pebble/internal/invariants/finalizer_on.go deleted file mode 100644 index da4e307..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/invariants/finalizer_on.go +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -//go:build (invariants && !race) || (tracing && !race) -// +build invariants,!race tracing,!race - -package invariants - -import "runtime" - -// SetFinalizer is a wrapper around runtime.SetFinalizer that is a no-op under -// race builds or if neither the invariants or tracing build tags are -// specified. -func SetFinalizer(obj, finalizer interface{}) { - runtime.SetFinalizer(obj, finalizer) -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/invariants/off.go b/vendor/github.com/cockroachdb/pebble/internal/invariants/off.go deleted file mode 100644 index 01513f2..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/invariants/off.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -//go:build !invariants && !race -// +build !invariants,!race - -package invariants - -// Enabled is true if we were built with the "invariants" or "race" build tags. 
-const Enabled = false diff --git a/vendor/github.com/cockroachdb/pebble/internal/invariants/on.go b/vendor/github.com/cockroachdb/pebble/internal/invariants/on.go deleted file mode 100644 index b418680..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/invariants/on.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -//go:build invariants || race -// +build invariants race - -package invariants - -// Enabled is true if we were built with the "invariants" or "race" build tags. -const Enabled = true diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/filter.go b/vendor/github.com/cockroachdb/pebble/internal/keyspan/filter.go deleted file mode 100644 index a63a43c..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/filter.go +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package keyspan - -import "github.com/cockroachdb/pebble/internal/base" - -// FilterFunc defines a transform from the input Span into the output Span. The -// function returns true if the Span should be returned by the iterator, and -// false if the Span should be skipped. The FilterFunc is permitted to mutate -// the output Span, for example, to elice certain keys, or update the Span's -// bounds if so desired. The output Span's Keys slice may be reused to reduce -// allocations. -type FilterFunc func(in *Span, out *Span) (keep bool) - -// filteringIter is a FragmentIterator that uses a FilterFunc to select which -// Spans from the input iterator are returned in the output. 
-// -// A note on Span lifetimes: as the FilterFunc reuses a Span with a mutable -// slice of Keys to reduce allocations, Spans returned by this iterator are only -// valid until the next relative or absolute positioning method is called. -type filteringIter struct { - iter FragmentIterator - filterFn FilterFunc - cmp base.Compare - - // span is a mutable Span passed to the filterFn. The filterFn is free to - // mutate this Span. The slice of Keys in the Span is reused with every call - // to the filterFn. - span Span -} - -var _ FragmentIterator = (*filteringIter)(nil) - -// Filter returns a new filteringIter that will filter the Spans from the -// provided child iterator using the provided FilterFunc. -func Filter(iter FragmentIterator, filter FilterFunc, cmp base.Compare) FragmentIterator { - return &filteringIter{iter: iter, filterFn: filter, cmp: cmp} -} - -// SeekGE implements FragmentIterator. -func (i *filteringIter) SeekGE(key []byte) *Span { - span := i.filter(i.iter.SeekGE(key), +1) - // i.filter could return a span that's less than key, _if_ the filterFunc - // (which has no knowledge of the seek key) mutated the span to end at a key - // less than or equal to `key`. Detect this case and next/invalidate the iter. - if span != nil && i.cmp(span.End, key) <= 0 { - return i.Next() - } - return span -} - -// SeekLT implements FragmentIterator. -func (i *filteringIter) SeekLT(key []byte) *Span { - span := i.filter(i.iter.SeekLT(key), -1) - // i.filter could return a span that's >= key, _if_ the filterFunc (which has - // no knowledge of the seek key) mutated the span to start at a key greater - // than or equal to `key`. Detect this case and prev/invalidate the iter. - if span != nil && i.cmp(span.Start, key) >= 0 { - return i.Prev() - } - return span -} - -// First implements FragmentIterator. -func (i *filteringIter) First() *Span { - return i.filter(i.iter.First(), +1) -} - -// Last implements FragmentIterator. 
-func (i *filteringIter) Last() *Span { - return i.filter(i.iter.Last(), -1) -} - -// Next implements FragmentIterator. -func (i *filteringIter) Next() *Span { - return i.filter(i.iter.Next(), +1) -} - -// Prev implements FragmentIterator. -func (i *filteringIter) Prev() *Span { - return i.filter(i.iter.Prev(), -1) -} - -// Error implements FragmentIterator. -func (i *filteringIter) Error() error { - return i.iter.Error() -} - -// Close implements FragmentIterator. -func (i *filteringIter) Close() error { - return i.iter.Close() -} - -// filter uses the filterFn (if configured) to filter and possibly mutate the -// given Span. If the current Span is to be skipped, the iterator continues -// iterating in the given direction until it lands on a Span that should be -// returned, or the iterator becomes invalid. -func (i *filteringIter) filter(span *Span, dir int8) *Span { - if i.filterFn == nil { - return span - } - for i.Error() == nil && span != nil { - if keep := i.filterFn(span, &i.span); keep { - return &i.span - } - if dir == +1 { - span = i.iter.Next() - } else { - span = i.iter.Prev() - } - } - return span -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/get.go b/vendor/github.com/cockroachdb/pebble/internal/keyspan/get.go deleted file mode 100644 index c07f8c8..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/get.go +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package keyspan - -import "github.com/cockroachdb/pebble/internal/base" - -// Get returns the newest span that contains the target key. If no span -// contains the target key, an empty span is returned. The snapshot -// parameter controls the visibility of spans (only spans older than the -// snapshot sequence number are visible). 
The iterator must contain -// fragmented spans: no span may overlap another. -func Get(cmp base.Compare, iter FragmentIterator, key []byte) *Span { - // NB: We use SeekLT in order to land on the proper span for a search - // key that resides in the middle of a span. Consider the scenario: - // - // a---e - // e---i - // - // The spans are indexed by their start keys `a` and `e`. If the - // search key is `c` we want to land on the span [a,e). If we were - // to use SeekGE then the search key `c` would land on the span - // [e,i) and we'd have to backtrack. The one complexity here is what - // happens for the search key `e`. In that case SeekLT will land us - // on the span [a,e) and we'll have to move forward. - iterSpan := iter.SeekLT(key) - if iterSpan == nil { - iterSpan = iter.Next() - if iterSpan == nil { - // The iterator is empty. - return nil - } - if cmp(key, iterSpan.Start) < 0 { - // The search key lies before the first span. - return nil - } - } - - // Invariant: key > iterSpan.Start - if cmp(key, iterSpan.End) >= 0 { - // The current span lies before the search key. Advance the iterator - // once to potentially land on a key with a start key exactly equal to - // key. (See the comment at the beginning of this function.) - iterSpan = iter.Next() - if iterSpan == nil || cmp(key, iterSpan.Start) < 0 { - // We've run out of spans or we've moved on to a span which - // starts after our search key. - return nil - } - } - return iterSpan -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/internal_iter_shim.go b/vendor/github.com/cockroachdb/pebble/internal/keyspan/internal_iter_shim.go deleted file mode 100644 index cf2adc3..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/internal_iter_shim.go +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. 
- -package keyspan - -import "github.com/cockroachdb/pebble/internal/base" - -// InternalIteratorShim is a temporary iterator type used as a shim between -// keyspan.MergingIter and base.InternalIterator. It's used temporarily for -// range deletions during compactions, allowing range deletions to be -// interleaved by a compaction input iterator. -// -// TODO(jackson): This type should be removed, and the usages converted to using -// an InterleavingIterator type that interleaves keyspan.Spans from a -// keyspan.FragmentIterator with point keys. -type InternalIteratorShim struct { - miter MergingIter - mbufs MergingBuffers - span *Span - iterKey base.InternalKey -} - -// Assert that InternalIteratorShim implements InternalIterator. -var _ base.InternalIterator = &InternalIteratorShim{} - -// Init initializes the internal iterator shim to merge the provided fragment -// iterators. -func (i *InternalIteratorShim) Init(cmp base.Compare, iters ...FragmentIterator) { - i.miter.Init(cmp, noopTransform, &i.mbufs, iters...) -} - -// Span returns the span containing the full set of keys over the key span at -// the current iterator position. -func (i *InternalIteratorShim) Span() *Span { - return i.span -} - -// SeekGE implements (base.InternalIterator).SeekGE. -func (i *InternalIteratorShim) SeekGE( - key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - panic("unimplemented") -} - -// SeekPrefixGE implements (base.InternalIterator).SeekPrefixGE. -func (i *InternalIteratorShim) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - panic("unimplemented") -} - -// SeekLT implements (base.InternalIterator).SeekLT. -func (i *InternalIteratorShim) SeekLT( - key []byte, flags base.SeekLTFlags, -) (*base.InternalKey, base.LazyValue) { - panic("unimplemented") -} - -// First implements (base.InternalIterator).First. 
-func (i *InternalIteratorShim) First() (*base.InternalKey, base.LazyValue) { - i.span = i.miter.First() - for i.span != nil && i.span.Empty() { - i.span = i.miter.Next() - } - if i.span == nil { - return nil, base.LazyValue{} - } - i.iterKey = base.InternalKey{UserKey: i.span.Start, Trailer: i.span.Keys[0].Trailer} - return &i.iterKey, base.MakeInPlaceValue(i.span.End) -} - -// Last implements (base.InternalIterator).Last. -func (i *InternalIteratorShim) Last() (*base.InternalKey, base.LazyValue) { - panic("unimplemented") -} - -// Next implements (base.InternalIterator).Next. -func (i *InternalIteratorShim) Next() (*base.InternalKey, base.LazyValue) { - i.span = i.miter.Next() - for i.span != nil && i.span.Empty() { - i.span = i.miter.Next() - } - if i.span == nil { - return nil, base.LazyValue{} - } - i.iterKey = base.InternalKey{UserKey: i.span.Start, Trailer: i.span.Keys[0].Trailer} - return &i.iterKey, base.MakeInPlaceValue(i.span.End) -} - -// NextPrefix implements (base.InternalIterator).NextPrefix. -func (i *InternalIteratorShim) NextPrefix([]byte) (*base.InternalKey, base.LazyValue) { - panic("unimplemented") -} - -// Prev implements (base.InternalIterator).Prev. -func (i *InternalIteratorShim) Prev() (*base.InternalKey, base.LazyValue) { - panic("unimplemented") -} - -// Error implements (base.InternalIterator).Error. -func (i *InternalIteratorShim) Error() error { - return i.miter.Error() -} - -// Close implements (base.InternalIterator).Close. -func (i *InternalIteratorShim) Close() error { - return i.miter.Close() -} - -// SetBounds implements (base.InternalIterator).SetBounds. -func (i *InternalIteratorShim) SetBounds(lower, upper []byte) { -} - -// String implements fmt.Stringer. 
-func (i *InternalIteratorShim) String() string { - return i.miter.String() -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/level_iter.go b/vendor/github.com/cockroachdb/pebble/internal/keyspan/level_iter.go deleted file mode 100644 index 6dd7ac6..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/level_iter.go +++ /dev/null @@ -1,521 +0,0 @@ -// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package keyspan - -import ( - "fmt" - - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/manifest" -) - -// LevelIter provides a merged view of spans from sstables in a level. -// It takes advantage of level invariants to only have one sstable span block -// open at one time, opened using the newIter function passed in. -type LevelIter struct { - cmp base.Compare - // Denotes the kind of key the level iterator should read. If the key type - // is KeyTypePoint, the level iterator will read range tombstones (which - // only affect point keys). If the key type is KeyTypeRange, the level - // iterator will read range keys. It is invalid to configure an iterator - // with the KeyTypePointAndRange key type. - // - // If key type is KeyTypePoint, no straddle spans are emitted between files, - // and point key bounds are used to find files instead of range key bounds. - // - // TODO(bilal): Straddle spans can safely be produced in rangedel mode once - // we can guarantee that we will never read sstables in a level that split - // user keys across them. This might be guaranteed in a future release, but - // as of CockroachDB 22.2 it is not guaranteed, so to be safe disable it when - // keyType == KeyTypePoint - keyType manifest.KeyType - // The LSM level this LevelIter is initialized for. Used in logging. 
- level manifest.Level - // The below fields are used to fill in gaps between adjacent files' range - // key spaces. This is an optimization to avoid unnecessarily loading files - // in cases where range keys are sparse and rare. dir is set by every - // positioning operation, straddleDir is set to dir whenever a straddling - // Span is synthesized and the last positioning operation returned a - // synthesized straddle span. - // - // Note that when a straddle span is initialized, iterFile is modified to - // point to the next file in the straddleDir direction. A change of direction - // on a straddle key therefore necessitates the value of iterFile to be - // reverted. - dir int - straddle Span - straddleDir int - // The iter for the current file (iterFile). It is nil under any of the - // following conditions: - // - files.Current() == nil - // - err != nil - // - straddleDir != 0, in which case iterFile is not nil and points to the - // next file (in the straddleDir direction). - // - some other constraint, like the bounds in opts, caused the file at index to not - // be relevant to the iteration. - iter FragmentIterator - // iterFile holds the current file. - // INVARIANT: iterFile = files.Current() - iterFile *manifest.FileMetadata - newIter TableNewSpanIter - files manifest.LevelIterator - err error - - // The options that were passed in. - tableOpts SpanIterOptions - - // TODO(bilal): Add InternalIteratorStats. -} - -// LevelIter implements the keyspan.FragmentIterator interface. -var _ FragmentIterator = (*LevelIter)(nil) - -// NewLevelIter returns a LevelIter. -func NewLevelIter( - opts SpanIterOptions, - cmp base.Compare, - newIter TableNewSpanIter, - files manifest.LevelIterator, - level manifest.Level, - keyType manifest.KeyType, -) *LevelIter { - l := &LevelIter{} - l.Init(opts, cmp, newIter, files, level, keyType) - return l -} - -// Init initializes a LevelIter. 
-func (l *LevelIter) Init( - opts SpanIterOptions, - cmp base.Compare, - newIter TableNewSpanIter, - files manifest.LevelIterator, - level manifest.Level, - keyType manifest.KeyType, -) { - l.err = nil - l.level = level - l.tableOpts = opts - l.cmp = cmp - l.iterFile = nil - l.newIter = newIter - switch keyType { - case manifest.KeyTypePoint: - l.keyType = keyType - l.files = files.Filter(keyType) - case manifest.KeyTypeRange: - l.keyType = keyType - l.files = files.Filter(keyType) - default: - panic(fmt.Sprintf("unsupported key type: %v", keyType)) - } -} - -func (l *LevelIter) findFileGE(key []byte) *manifest.FileMetadata { - // Find the earliest file whose largest key is >= key. - // - // If the earliest file has its largest key == key and that largest key is a - // range deletion sentinel, we know that we manufactured this sentinel to convert - // the exclusive range deletion end key into an inclusive key (reminder: [start, end)#seqnum - // is the form of a range deletion sentinel which can contribute a largest key = end#sentinel). - // In this case we don't return this as the earliest file since there is nothing actually - // equal to key in it. - - m := l.files.SeekGE(l.cmp, key) - for m != nil { - largestKey := m.LargestRangeKey - if l.keyType == manifest.KeyTypePoint { - largestKey = m.LargestPointKey - } - if !largestKey.IsExclusiveSentinel() || l.cmp(largestKey.UserKey, key) != 0 { - break - } - m = l.files.Next() - } - return m -} - -func (l *LevelIter) findFileLT(key []byte) *manifest.FileMetadata { - // Find the last file whose smallest key is < key. 
- return l.files.SeekLT(l.cmp, key) -} - -type loadFileReturnIndicator int8 - -const ( - noFileLoaded loadFileReturnIndicator = iota - fileAlreadyLoaded - newFileLoaded -) - -func (l *LevelIter) loadFile(file *manifest.FileMetadata, dir int) loadFileReturnIndicator { - indicator := noFileLoaded - if l.iterFile == file { - if l.err != nil { - return noFileLoaded - } - if l.iter != nil { - // We are already at the file, but we would need to check for bounds. - // Set indicator accordingly. - indicator = fileAlreadyLoaded - } - // We were already at file, but don't have an iterator, probably because the file was - // beyond the iteration bounds. It may still be, but it is also possible that the bounds - // have changed. We handle that below. - } - - // Note that LevelIter.Close() can be called multiple times. - if indicator != fileAlreadyLoaded { - if err := l.Close(); err != nil { - return noFileLoaded - } - } - - l.iterFile = file - if file == nil { - return noFileLoaded - } - if indicator != fileAlreadyLoaded { - l.iter, l.err = l.newIter(file, l.tableOpts) - indicator = newFileLoaded - } - if l.err != nil { - return noFileLoaded - } - return indicator -} - -// SeekGE implements keyspan.FragmentIterator. -func (l *LevelIter) SeekGE(key []byte) *Span { - l.dir = +1 - l.straddle = Span{} - l.straddleDir = 0 - l.err = nil // clear cached iteration error - - f := l.findFileGE(key) - if f != nil && l.keyType == manifest.KeyTypeRange && l.cmp(key, f.SmallestRangeKey.UserKey) < 0 { - // Peek at the previous file. - prevFile := l.files.Prev() - l.files.Next() - if prevFile != nil { - // We could unconditionally return an empty span between the seek key and - // f.SmallestRangeKey, however if this span is to the left of all range - // keys on this level, it could lead to inconsistent behaviour in relative - // positioning operations. 
Consider this example, with a b-c range key: - // - // SeekGE(a) -> a-b:{} - // Next() -> b-c{(#5,RANGEKEYSET,@4,foo)} - // Prev() -> nil - // - // Iterators higher up in the iterator stack rely on this sort of relative - // positioning consistency. - // - // TODO(bilal): Investigate ways to be able to return straddle spans in - // cases similar to the above, while still retaining correctness. - // Return a straddling key instead of loading the file. - l.iterFile = f - if err := l.Close(); err != nil { - return l.verify(nil) - } - l.straddleDir = +1 - l.straddle = Span{ - Start: prevFile.LargestRangeKey.UserKey, - End: f.SmallestRangeKey.UserKey, - Keys: nil, - } - return l.verify(&l.straddle) - } - } - loadFileIndicator := l.loadFile(f, +1) - if loadFileIndicator == noFileLoaded { - return l.verify(nil) - } - if span := l.iter.SeekGE(key); span != nil { - return l.verify(span) - } - return l.skipEmptyFileForward() -} - -// SeekLT implements keyspan.FragmentIterator. -func (l *LevelIter) SeekLT(key []byte) *Span { - l.dir = -1 - l.straddle = Span{} - l.straddleDir = 0 - l.err = nil // clear cached iteration error - - f := l.findFileLT(key) - if f != nil && l.keyType == manifest.KeyTypeRange && l.cmp(f.LargestRangeKey.UserKey, key) < 0 { - // Peek at the next file. - nextFile := l.files.Next() - l.files.Prev() - if nextFile != nil { - // We could unconditionally return an empty span between f.LargestRangeKey - // and the seek key, however if this span is to the right of all range keys - // on this level, it could lead to inconsistent behaviour in relative - // positioning operations. Consider this example, with a b-c range key: - // - // SeekLT(d) -> c-d:{} - // Prev() -> b-c{(#5,RANGEKEYSET,@4,foo)} - // Next() -> nil - // - // Iterators higher up in the iterator stack rely on this sort of relative - // positioning consistency. 
- // - // TODO(bilal): Investigate ways to be able to return straddle spans in - // cases similar to the above, while still retaining correctness. - // Return a straddling key instead of loading the file. - l.iterFile = f - if err := l.Close(); err != nil { - return l.verify(nil) - } - l.straddleDir = -1 - l.straddle = Span{ - Start: f.LargestRangeKey.UserKey, - End: nextFile.SmallestRangeKey.UserKey, - Keys: nil, - } - return l.verify(&l.straddle) - } - } - if l.loadFile(f, -1) == noFileLoaded { - return l.verify(nil) - } - if span := l.iter.SeekLT(key); span != nil { - return l.verify(span) - } - return l.skipEmptyFileBackward() -} - -// First implements keyspan.FragmentIterator. -func (l *LevelIter) First() *Span { - l.dir = +1 - l.straddle = Span{} - l.straddleDir = 0 - l.err = nil // clear cached iteration error - - if l.loadFile(l.files.First(), +1) == noFileLoaded { - return l.verify(nil) - } - if span := l.iter.First(); span != nil { - return l.verify(span) - } - return l.skipEmptyFileForward() -} - -// Last implements keyspan.FragmentIterator. -func (l *LevelIter) Last() *Span { - l.dir = -1 - l.straddle = Span{} - l.straddleDir = 0 - l.err = nil // clear cached iteration error - - if l.loadFile(l.files.Last(), -1) == noFileLoaded { - return l.verify(nil) - } - if span := l.iter.Last(); span != nil { - return l.verify(span) - } - return l.skipEmptyFileBackward() -} - -// Next implements keyspan.FragmentIterator. -func (l *LevelIter) Next() *Span { - if l.err != nil || (l.iter == nil && l.iterFile == nil && l.dir > 0) { - return l.verify(nil) - } - if l.iter == nil && l.iterFile == nil { - // l.dir <= 0 - return l.First() - } - l.dir = +1 - - if l.iter != nil { - if span := l.iter.Next(); span != nil { - return l.verify(span) - } - } - return l.skipEmptyFileForward() -} - -// Prev implements keyspan.FragmentIterator. 
-func (l *LevelIter) Prev() *Span { - if l.err != nil || (l.iter == nil && l.iterFile == nil && l.dir < 0) { - return l.verify(nil) - } - if l.iter == nil && l.iterFile == nil { - // l.dir >= 0 - return l.Last() - } - l.dir = -1 - - if l.iter != nil { - if span := l.iter.Prev(); span != nil { - return l.verify(span) - } - } - return l.skipEmptyFileBackward() -} - -func (l *LevelIter) skipEmptyFileForward() *Span { - if l.straddleDir == 0 && l.keyType == manifest.KeyTypeRange && - l.iterFile != nil && l.iter != nil { - // We were at a file that had spans. Check if the next file that has - // spans is not directly adjacent to the current file i.e. there is a - // gap in the span keyspace between the two files. In that case, synthesize - // a "straddle span" in l.straddle and return that. - // - // Straddle spans are not created in rangedel mode. - if err := l.Close(); err != nil { - l.err = err - return l.verify(nil) - } - startKey := l.iterFile.LargestRangeKey.UserKey - // Resetting l.iterFile without loading the file into l.iter is okay and - // does not change the logic in loadFile() as long as l.iter is also nil; - // which it should be due to the Close() call above. - l.iterFile = l.files.Next() - if l.iterFile == nil { - return l.verify(nil) - } - endKey := l.iterFile.SmallestRangeKey.UserKey - if l.cmp(startKey, endKey) < 0 { - // There is a gap between the two files. Synthesize a straddling span - // to avoid unnecessarily loading the next file. - l.straddle = Span{ - Start: startKey, - End: endKey, - } - l.straddleDir = +1 - return l.verify(&l.straddle) - } - } else if l.straddleDir < 0 { - // We were at a straddle key, but are now changing directions. l.iterFile - // was already moved backward by skipEmptyFileBackward, so advance it - // forward. 
- l.iterFile = l.files.Next() - } - l.straddle = Span{} - l.straddleDir = 0 - var span *Span - for span.Empty() { - fileToLoad := l.iterFile - if l.keyType == manifest.KeyTypePoint { - // We haven't iterated to the next file yet if we're in point key - // (rangedel) mode. - fileToLoad = l.files.Next() - } - if l.loadFile(fileToLoad, +1) == noFileLoaded { - return l.verify(nil) - } - span = l.iter.First() - // In rangedel mode, we can expect to get empty files that we'd need to - // skip over, but not in range key mode. - if l.keyType == manifest.KeyTypeRange { - break - } - } - return l.verify(span) -} - -func (l *LevelIter) skipEmptyFileBackward() *Span { - // We were at a file that had spans. Check if the previous file that has - // spans is not directly adjacent to the current file i.e. there is a - // gap in the span keyspace between the two files. In that case, synthesize - // a "straddle span" in l.straddle and return that. - // - // Straddle spans are not created in rangedel mode. - if l.straddleDir == 0 && l.keyType == manifest.KeyTypeRange && - l.iterFile != nil && l.iter != nil { - if err := l.Close(); err != nil { - l.err = err - return l.verify(nil) - } - endKey := l.iterFile.SmallestRangeKey.UserKey - // Resetting l.iterFile without loading the file into l.iter is okay and - // does not change the logic in loadFile() as long as l.iter is also nil; - // which it should be due to the Close() call above. - l.iterFile = l.files.Prev() - if l.iterFile == nil { - return l.verify(nil) - } - startKey := l.iterFile.LargestRangeKey.UserKey - if l.cmp(startKey, endKey) < 0 { - // There is a gap between the two files. Synthesize a straddling span - // to avoid unnecessarily loading the next file. - l.straddle = Span{ - Start: startKey, - End: endKey, - } - l.straddleDir = -1 - return l.verify(&l.straddle) - } - } else if l.straddleDir > 0 { - // We were at a straddle key, but are now changing directions. 
l.iterFile - // was already advanced forward by skipEmptyFileForward, so move it - // backward. - l.iterFile = l.files.Prev() - } - l.straddle = Span{} - l.straddleDir = 0 - var span *Span - for span.Empty() { - fileToLoad := l.iterFile - if l.keyType == manifest.KeyTypePoint { - fileToLoad = l.files.Prev() - } - if l.loadFile(fileToLoad, -1) == noFileLoaded { - return l.verify(nil) - } - span = l.iter.Last() - // In rangedel mode, we can expect to get empty files that we'd need to - // skip over, but not in range key mode as the filter on the FileMetadata - // should guarantee we always get a non-empty file. - if l.keyType == manifest.KeyTypeRange { - break - } - } - return l.verify(span) -} - -// verify is invoked whenever a span is returned from an iterator positioning -// method to a caller. During invariant builds, it asserts invariants to the -// caller. -func (l *LevelIter) verify(s *Span) *Span { - // NB: Do not add any logic outside the invariants.Enabled conditional to - // ensure that verify is always compiled away in production builds. - if invariants.Enabled { - if f := l.files.Current(); f != l.iterFile { - panic(fmt.Sprintf("LevelIter.files.Current (%s) and l.iterFile (%s) diverged", - f, l.iterFile)) - } - } - return s -} - -// Error implements keyspan.FragmentIterator. -func (l *LevelIter) Error() error { - if l.err != nil || l.iter == nil { - return l.err - } - return l.iter.Error() -} - -// Close implements keyspan.FragmentIterator. -func (l *LevelIter) Close() error { - if l.iter != nil { - l.err = l.iter.Close() - l.iter = nil - } - return l.err -} - -// String implements keyspan.FragmentIterator. 
-func (l *LevelIter) String() string { - if l.iterFile != nil { - return fmt.Sprintf("%s: fileNum=%s", l.level, l.iterFile.FileNum) - } - return fmt.Sprintf("%s: fileNum=", l.level) -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/seek.go b/vendor/github.com/cockroachdb/pebble/internal/keyspan/seek.go deleted file mode 100644 index efcf682..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/seek.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package keyspan - -import "github.com/cockroachdb/pebble/internal/base" - -// SeekLE seeks to the span that contains or is before the target key. -func SeekLE(cmp base.Compare, iter FragmentIterator, key []byte) *Span { - // NB: We use SeekLT in order to land on the proper span for a search - // key that resides in the middle of a span. Consider the scenario: - // - // a---e - // e---i - // - // The spans are indexed by their start keys `a` and `e`. If the - // search key is `c` we want to land on the span [a,e). If we were to - // use SeekGE then the search key `c` would land on the span [e,i) and - // we'd have to backtrack. The one complexity here is what happens for the - // search key `e`. In that case SeekLT will land us on the span [a,e) - // and we'll have to move forward. - iterSpan := iter.SeekLT(key) - - if iterSpan == nil { - // Advance the iterator once to see if the next span has a start key - // equal to key. - iterSpan = iter.Next() - if iterSpan == nil || cmp(key, iterSpan.Start) < 0 { - // The iterator is exhausted or we've hit the next span. - return nil - } - } else { - // Invariant: key > iterSpan.Start - if cmp(key, iterSpan.End) >= 0 { - // The current span lies entirely before the search key. Check to see if - // the next span contains the search key. 
If it doesn't, we'll backup - // and return to our earlier candidate. - iterSpan = iter.Next() - if iterSpan == nil || cmp(key, iterSpan.Start) < 0 { - // The next span is past our search key or there is no next span. Go - // back. - iterSpan = iter.Prev() - } - } - } - return iterSpan -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/transformer.go b/vendor/github.com/cockroachdb/pebble/internal/keyspan/transformer.go deleted file mode 100644 index b5e8735..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/transformer.go +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package keyspan - -import "github.com/cockroachdb/pebble/internal/base" - -// Transformer defines a transformation to be applied to a Span. -type Transformer interface { - // Transform takes a Span as input and writes the transformed Span to the - // provided output *Span pointer. The output Span's Keys slice may be reused - // by Transform to reduce allocations. - Transform(cmp base.Compare, in Span, out *Span) error -} - -// The TransformerFunc type is an adapter to allow the use of ordinary functions -// as Transformers. If f is a function with the appropriate signature, -// TransformerFunc(f) is a Transformer that calls f. -type TransformerFunc func(base.Compare, Span, *Span) error - -// Transform calls f(cmp, in, out). -func (tf TransformerFunc) Transform(cmp base.Compare, in Span, out *Span) error { - return tf(cmp, in, out) -} - -var noopTransform Transformer = TransformerFunc(func(_ base.Compare, s Span, dst *Span) error { - dst.Start, dst.End = s.Start, s.End - dst.Keys = append(dst.Keys[:0], s.Keys...) - return nil -}) - -// VisibleTransform filters keys that are invisible at the provided snapshot -// sequence number. 
-func VisibleTransform(snapshot uint64) Transformer { - return TransformerFunc(func(_ base.Compare, s Span, dst *Span) error { - dst.Start, dst.End = s.Start, s.End - dst.Keys = dst.Keys[:0] - for _, k := range s.Keys { - // NB: The InternalKeySeqNumMax value is used for the batch snapshot - // because a batch's visible span keys are filtered when they're - // fragmented. There's no requirement to enforce visibility at - // iteration time. - if base.Visible(k.SeqNum(), snapshot, base.InternalKeySeqNumMax) { - dst.Keys = append(dst.Keys, k) - } - } - return nil - }) -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/truncate.go b/vendor/github.com/cockroachdb/pebble/internal/keyspan/truncate.go deleted file mode 100644 index c0e609b..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/truncate.go +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package keyspan - -import "github.com/cockroachdb/pebble/internal/base" - -// Truncate creates a new iterator where every span in the supplied iterator is -// truncated to be contained within the range [lower, upper). If start and end -// are specified, filter out any spans that are completely outside those bounds. -func Truncate( - cmp base.Compare, - iter FragmentIterator, - lower, upper []byte, - start, end *base.InternalKey, - panicOnUpperTruncate bool, -) FragmentIterator { - return Filter(iter, func(in *Span, out *Span) (keep bool) { - out.Start, out.End = in.Start, in.End - out.Keys = append(out.Keys[:0], in.Keys...) - - // Ignore this span if it lies completely outside start, end. Note that - // end endInclusive indicated whether end is inclusive. - // - // The comparison between s.End and start is by user key only, as - // the span is exclusive at s.End, so comparing by user keys - // is sufficient. 
- if start != nil && cmp(in.End, start.UserKey) <= 0 { - return false - } - if end != nil { - v := cmp(in.Start, end.UserKey) - switch { - case v > 0: - // Wholly outside the end bound. Skip it. - return false - case v == 0: - // This span begins at the same user key as `end`. Whether or - // not any of the keys contained within the span are relevant is - // dependent on Trailers. Any keys contained within the span - // with trailers larger than end cover the small sliver of - // keyspace between [k#inf, k#]. Since keys are - // sorted descending by Trailer within the span, we need to find - // the prefix of keys with larger trailers. - for i := range in.Keys { - if in.Keys[i].Trailer < end.Trailer { - out.Keys = out.Keys[:i] - break - } - } - default: - // Wholly within the end bound. Keep it. - } - } - - var truncated bool - // Truncate the bounds to lower and upper. - if cmp(in.Start, lower) < 0 { - out.Start = lower - } - if cmp(in.End, upper) > 0 { - truncated = true - out.End = upper - } - - if panicOnUpperTruncate && truncated { - panic("pebble: upper bound should not be truncated") - } - - return !out.Empty() && cmp(out.Start, out.End) < 0 - }, cmp) -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/manifest/level.go b/vendor/github.com/cockroachdb/pebble/internal/manifest/level.go deleted file mode 100644 index 1a971f6..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/manifest/level.go +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package manifest - -import "fmt" - -const ( - // 3 bits are necessary to represent level values from 0-6. - levelBits = 3 - levelMask = (1 << levelBits) - 1 - // invalidSublevel denotes an invalid or non-applicable sublevel. - invalidSublevel = -1 -) - -// Level encodes a level and optional sublevel for use in log and error -// messages. 
The encoding has the property that Level(0) == -// L0Sublevel(invalidSublevel). -type Level uint32 - -func makeLevel(level, sublevel int) Level { - return Level(((sublevel + 1) << levelBits) | level) -} - -// LevelToInt returns the int representation of a Level -func LevelToInt(l Level) int { - return int(l) & levelMask -} - -// L0Sublevel returns a Level representing the specified L0 sublevel. -func L0Sublevel(sublevel int) Level { - if sublevel < 0 { - panic(fmt.Sprintf("invalid L0 sublevel: %d", sublevel)) - } - return makeLevel(0, sublevel) -} - -func (l Level) String() string { - level := int(l) & levelMask - sublevel := (int(l) >> levelBits) - 1 - if sublevel != invalidSublevel { - return fmt.Sprintf("L%d.%d", level, sublevel) - } - return fmt.Sprintf("L%d", level) -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/manifest/version.go b/vendor/github.com/cockroachdb/pebble/internal/manifest/version.go deleted file mode 100644 index 0a3b774..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/manifest/version.go +++ /dev/null @@ -1,1571 +0,0 @@ -// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package manifest - -import ( - "bytes" - "fmt" - "sort" - "strconv" - "strings" - "sync" - "sync/atomic" - "unicode" - - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" -) - -// Compare exports the base.Compare type. -type Compare = base.Compare - -// InternalKey exports the base.InternalKey type. -type InternalKey = base.InternalKey - -// TableInfo contains the common information for table related events. -type TableInfo struct { - // FileNum is the internal DB identifier for the table. - FileNum base.FileNum - // Size is the size of the file in bytes. - Size uint64 - // Smallest is the smallest internal key in the table. 
- Smallest InternalKey - // Largest is the largest internal key in the table. - Largest InternalKey - // SmallestSeqNum is the smallest sequence number in the table. - SmallestSeqNum uint64 - // LargestSeqNum is the largest sequence number in the table. - LargestSeqNum uint64 -} - -// TableStats contains statistics on a table used for compaction heuristics, -// and export via Metrics. -type TableStats struct { - // The total number of entries in the table. - NumEntries uint64 - // The number of point and range deletion entries in the table. - NumDeletions uint64 - // NumRangeKeySets is the total number of range key sets in the table. - // - // NB: If there's a chance that the sstable contains any range key sets, - // then NumRangeKeySets must be > 0. - NumRangeKeySets uint64 - // Estimate of the total disk space that may be dropped by this table's - // point deletions by compacting them. - PointDeletionsBytesEstimate uint64 - // Estimate of the total disk space that may be dropped by this table's - // range deletions by compacting them. This estimate is at data-block - // granularity and is not updated if compactions beneath the table reduce - // the amount of reclaimable disk space. It also does not account for - // overlapping data in L0 and ignores L0 sublevels, but the error that - // introduces is expected to be small. - // - // Tables in the bottommost level of the LSM may have a nonzero estimate if - // snapshots or move compactions prevented the elision of their range - // tombstones. A table in the bottommost level that was ingested into L6 - // will have a zero estimate, because the file's sequence numbers indicate - // that the tombstone cannot drop any data contained within the file itself. - RangeDeletionsBytesEstimate uint64 - // Total size of value blocks and value index block. - ValueBlocksSize uint64 -} - -// boundType represents the type of key (point or range) present as the smallest -// and largest keys. 
-type boundType uint8 - -const ( - boundTypePointKey boundType = iota + 1 - boundTypeRangeKey -) - -// CompactionState is the compaction state of a file. -// -// The following shows the valid state transitions: -// -// NotCompacting --> Compacting --> Compacted -// ^ | -// | | -// +-------<-------+ -// -// Input files to a compaction transition to Compacting when a compaction is -// picked. A file that has finished compacting typically transitions into the -// Compacted state, at which point it is effectively obsolete ("zombied") and -// will eventually be removed from the LSM. A file that has been move-compacted -// will transition from Compacting back into the NotCompacting state, signaling -// that the file may be selected for a subsequent compaction. A failed -// compaction will result in all input tables transitioning from Compacting to -// NotCompacting. -// -// This state is in-memory only. It is not persisted to the manifest. -type CompactionState uint8 - -// CompactionStates. -const ( - CompactionStateNotCompacting CompactionState = iota - CompactionStateCompacting - CompactionStateCompacted -) - -// String implements fmt.Stringer. -func (s CompactionState) String() string { - switch s { - case CompactionStateNotCompacting: - return "NotCompacting" - case CompactionStateCompacting: - return "Compacting" - case CompactionStateCompacted: - return "Compacted" - default: - panic(fmt.Sprintf("pebble: unknown compaction state %d", s)) - } -} - -// FileMetadata is maintained for leveled-ssts, i.e., they belong to a level of -// some version. FileMetadata does not contain the actual level of the sst, -// since such leveled-ssts can move across levels in different versions, while -// sharing the same FileMetadata. There are two kinds of leveled-ssts, physical -// and virtual. Underlying both leveled-ssts is a backing-sst, for which the -// only state is FileBacking. A backing-sst is level-less. 
It is possible for a -// backing-sst to be referred to by a physical sst in one version and by one or -// more virtual ssts in one or more versions. A backing-sst becomes obsolete -// and can be deleted once it is no longer required by any physical or virtual -// sst in any version. -// -// We maintain some invariants: -// -// 1. Each physical and virtual sst will have a unique FileMetadata.FileNum, -// and there will be exactly one FileMetadata associated with the FileNum. -// -// 2. Within a version, a backing-sst is either only referred to by one -// physical sst or one or more virtual ssts. -// -// 3. Once a backing-sst is referred to by a virtual sst in the latest version, -// it cannot go back to being referred to by a physical sst in any future -// version. -// -// Once a physical sst is no longer needed by any version, we will no longer -// maintain the file metadata associated with it. We will still maintain the -// FileBacking associated with the physical sst if the backing sst is required -// by any virtual ssts in any version. -type FileMetadata struct { - // AllowedSeeks is used to determine if a file should be picked for - // a read triggered compaction. It is decremented when read sampling - // in pebble.Iterator after every after every positioning operation - // that returns a user key (eg. Next, Prev, SeekGE, SeekLT, etc). - AllowedSeeks atomic.Int64 - - // statsValid indicates if stats have been loaded for the table. The - // TableStats structure is populated only if valid is true. - statsValid atomic.Bool - - // FileBacking is the state which backs either a physical or virtual - // sstables. - FileBacking *FileBacking - - // InitAllowedSeeks is the inital value of allowed seeks. This is used - // to re-set allowed seeks on a file once it hits 0. - InitAllowedSeeks int64 - // FileNum is the file number. - // - // INVARIANT: when !FileMetadata.Virtual, FileNum == FileBacking.DiskFileNum. 
- FileNum base.FileNum - // Size is the size of the file, in bytes. Size is an approximate value for - // virtual sstables. - // - // INVARIANTS: - // - When !FileMetadata.Virtual, Size == FileBacking.Size. - // - Size should be non-zero. Size 0 virtual sstables must not be created. - Size uint64 - // File creation time in seconds since the epoch (1970-01-01 00:00:00 - // UTC). For ingested sstables, this corresponds to the time the file was - // ingested. For virtual sstables, this corresponds to the wall clock time - // when the FileMetadata for the virtual sstable was first created. - CreationTime int64 - // Lower and upper bounds for the smallest and largest sequence numbers in - // the table, across both point and range keys. For physical sstables, these - // values are tight bounds. For virtual sstables, there is no guarantee that - // there will be keys with SmallestSeqNum or LargestSeqNum within virtual - // sstable bounds. - SmallestSeqNum uint64 - LargestSeqNum uint64 - // SmallestPointKey and LargestPointKey are the inclusive bounds for the - // internal point keys stored in the table. This includes RANGEDELs, which - // alter point keys. - // NB: these field should be set using ExtendPointKeyBounds. They are left - // exported for reads as an optimization. - SmallestPointKey InternalKey - LargestPointKey InternalKey - // SmallestRangeKey and LargestRangeKey are the inclusive bounds for the - // internal range keys stored in the table. - // NB: these field should be set using ExtendRangeKeyBounds. They are left - // exported for reads as an optimization. - SmallestRangeKey InternalKey - LargestRangeKey InternalKey - // Smallest and Largest are the inclusive bounds for the internal keys stored - // in the table, across both point and range keys. - // NB: these fields are derived from their point and range key equivalents, - // and are updated via the MaybeExtend{Point,Range}KeyBounds methods. 
- Smallest InternalKey - Largest InternalKey - // Stats describe table statistics. Protected by DB.mu. - // - // For virtual sstables, set stats upon virtual sstable creation as - // asynchronous computation of stats is not currently supported. - // - // TODO(bananabrick): To support manifest replay for virtual sstables, we - // probably need to compute virtual sstable stats asynchronously. Otherwise, - // we'd have to write virtual sstable stats to the version edit. - Stats TableStats - - // For L0 files only. Protected by DB.mu. Used to generate L0 sublevels and - // pick L0 compactions. Only accurate for the most recent Version. - SubLevel int - L0Index int - minIntervalIndex int - maxIntervalIndex int - - // NB: the alignment of this struct is 8 bytes. We pack all the bools to - // ensure an optimal packing. - - // IsIntraL0Compacting is set to True if this file is part of an intra-L0 - // compaction. When it's true, IsCompacting must also return true. If - // Compacting is true and IsIntraL0Compacting is false for an L0 file, the - // file must be part of a compaction to Lbase. - IsIntraL0Compacting bool - CompactionState CompactionState - // True if compaction of this file has been explicitly requested. - // Previously, RocksDB and earlier versions of Pebble allowed this - // flag to be set by a user table property collector. Some earlier - // versions of Pebble respected this flag, while other more recent - // versions ignored this flag. - // - // More recently this flag has been repurposed to facilitate the - // compaction of 'atomic compaction units'. Files marked for - // compaction are compacted in a rewrite compaction at the lowest - // possible compaction priority. - // - // NB: A count of files marked for compaction is maintained on - // Version, and compaction picking reads cached annotations - // determined by this field. - // - // Protected by DB.mu. 
- MarkedForCompaction bool - // HasPointKeys tracks whether the table contains point keys (including - // RANGEDELs). If a table contains only range deletions, HasPointsKeys is - // still true. - HasPointKeys bool - // HasRangeKeys tracks whether the table contains any range keys. - HasRangeKeys bool - // smallestSet and largestSet track whether the overall bounds have been set. - boundsSet bool - // boundTypeSmallest and boundTypeLargest provide an indication as to which - // key type (point or range) corresponds to the smallest and largest overall - // table bounds. - boundTypeSmallest, boundTypeLargest boundType - // Virtual is true if the FileMetadata belongs to a virtual sstable. - Virtual bool -} - -// PhysicalFileMeta is used by functions which want a guarantee that their input -// belongs to a physical sst and not a virtual sst. -// -// NB: This type should only be constructed by calling -// FileMetadata.PhysicalMeta. -type PhysicalFileMeta struct { - *FileMetadata -} - -// VirtualFileMeta is used by functions which want a guarantee that their input -// belongs to a virtual sst and not a physical sst. -// -// A VirtualFileMeta inherits all the same fields as a FileMetadata. These -// fields have additional invariants imposed on them, and/or slightly varying -// meanings: -// - Smallest and Largest (and their counterparts -// {Smallest, Largest}{Point,Range}Key) remain tight bounds that represent a -// key at that exact bound. We make the effort to determine the next smallest -// or largest key in an sstable after virtualizing it, to maintain this -// tightness. If the largest is a sentinel key (IsExclusiveSentinel()), it -// could mean that a rangedel or range key ends at that user key, or has been -// truncated to that user key. -// - One invariant is that if a rangedel or range key is truncated on its -// upper bound, the virtual sstable *must* have a rangedel or range key -// sentinel key as its upper bound. 
This is because truncation yields -// an exclusive upper bound for the rangedel/rangekey, and if there are -// any points at that exclusive upper bound within the same virtual -// sstable, those could get uncovered by this truncation. We enforce this -// invariant in calls to keyspan.Truncate. -// - Size is an estimate of the size of the virtualized portion of this sstable. -// The underlying file's size is stored in FileBacking.Size, though it could -// also be estimated or could correspond to just the referenced portion of -// a file (eg. if the file originated on another node). -// - Size must be > 0. -// - SmallestSeqNum and LargestSeqNum are loose bounds for virtual sstables. -// This means that all keys in the virtual sstable must have seqnums within -// [SmallestSeqNum, LargestSeqNum], however there's no guarantee that there's -// a key with a seqnum at either of the bounds. Calculating tight seqnum -// bounds would be too expensive and deliver little value. -// -// NB: This type should only be constructed by calling FileMetadata.VirtualMeta. -type VirtualFileMeta struct { - *FileMetadata -} - -// PhysicalMeta should be the only source of creating the PhysicalFileMeta -// wrapper type. -func (m *FileMetadata) PhysicalMeta() PhysicalFileMeta { - if m.Virtual { - panic("pebble: file metadata does not belong to a physical sstable") - } - return PhysicalFileMeta{ - m, - } -} - -// VirtualMeta should be the only source of creating the VirtualFileMeta wrapper -// type. -func (m *FileMetadata) VirtualMeta() VirtualFileMeta { - if !m.Virtual { - panic("pebble: file metadata does not belong to a virtual sstable") - } - return VirtualFileMeta{ - m, - } -} - -// FileBacking either backs a single physical sstable, or one or more virtual -// sstables. -// -// See the comment above the FileMetadata type for sstable terminology. 
-type FileBacking struct { - // Reference count for the backing file on disk: incremented when a - // physical or virtual sstable which is backed by the FileBacking is - // added to a version and decremented when the version is unreferenced. - // We ref count in order to determine when it is safe to delete a - // backing sst file from disk. The backing file is obsolete when the - // reference count falls to zero. - refs atomic.Int32 - // latestVersionRefs are the references to the FileBacking in the - // latest version. This reference can be through a single physical - // sstable in the latest version, or one or more virtual sstables in the - // latest version. - // - // INVARIANT: latestVersionRefs <= refs. - latestVersionRefs atomic.Int32 - // VirtualizedSize is set iff the backing sst is only referred to by - // virtual ssts in the latest version. VirtualizedSize is the sum of the - // virtual sstable sizes of all of the virtual sstables in the latest - // version which are backed by the physical sstable. When a virtual - // sstable is removed from the latest version, we will decrement the - // VirtualizedSize. During compaction picking, we'll compensate a - // virtual sstable file size by - // (FileBacking.Size - FileBacking.VirtualizedSize) / latestVersionRefs. - // The intuition is that if FileBacking.Size - FileBacking.VirtualizedSize - // is high, then the space amplification due to virtual sstables is - // high, and we should pick the virtual sstable with a higher priority. - // - // TODO(bananabrick): Compensate the virtual sstable file size using - // the VirtualizedSize during compaction picking and test. - VirtualizedSize atomic.Uint64 - DiskFileNum base.DiskFileNum - Size uint64 -} - -// InitPhysicalBacking allocates and sets the FileBacking which is required by a -// physical sstable FileMetadata. -// -// Ensure that the state required by FileBacking, such as the FileNum, is -// already set on the FileMetadata before InitPhysicalBacking is called. 
-// Calling InitPhysicalBacking only after the relevant state has been set in the -// FileMetadata is not necessary in tests which don't rely on FileBacking. -func (m *FileMetadata) InitPhysicalBacking() { - if m.Virtual { - panic("pebble: virtual sstables should use a pre-existing FileBacking") - } - if m.FileBacking == nil { - m.FileBacking = &FileBacking{Size: m.Size, DiskFileNum: m.FileNum.DiskFileNum()} - } -} - -// InitProviderBacking creates a new FileBacking for a file backed by -// an objstorage.Provider. -func (m *FileMetadata) InitProviderBacking(fileNum base.DiskFileNum) { - if !m.Virtual { - panic("pebble: provider-backed sstables must be virtual") - } - if m.FileBacking == nil { - m.FileBacking = &FileBacking{DiskFileNum: fileNum} - } -} - -// ValidateVirtual should be called once the FileMetadata for a virtual sstable -// is created to verify that the fields of the virtual sstable are sound. -func (m *FileMetadata) ValidateVirtual(createdFrom *FileMetadata) { - if !m.Virtual { - panic("pebble: invalid virtual sstable") - } - - if createdFrom.SmallestSeqNum != m.SmallestSeqNum { - panic("pebble: invalid smallest sequence number for virtual sstable") - } - - if createdFrom.LargestSeqNum != m.LargestSeqNum { - panic("pebble: invalid largest sequence number for virtual sstable") - } - - if createdFrom.FileBacking != nil && createdFrom.FileBacking != m.FileBacking { - panic("pebble: invalid physical sstable state for virtual sstable") - } - - if m.Size == 0 { - panic("pebble: virtual sstable size must be set upon creation") - } -} - -// Refs returns the refcount of backing sstable. -func (m *FileMetadata) Refs() int32 { - return m.FileBacking.refs.Load() -} - -// Ref increments the ref count associated with the backing sstable. -func (m *FileMetadata) Ref() { - m.FileBacking.refs.Add(1) -} - -// Unref decrements the ref count associated with the backing sstable. 
-func (m *FileMetadata) Unref() int32 { - v := m.FileBacking.refs.Add(-1) - if invariants.Enabled && v < 0 { - panic("pebble: invalid FileMetadata refcounting") - } - return v -} - -// LatestRef increments the latest ref count associated with the backing -// sstable. -func (m *FileMetadata) LatestRef() { - m.FileBacking.latestVersionRefs.Add(1) - - if m.Virtual { - m.FileBacking.VirtualizedSize.Add(m.Size) - } -} - -// LatestUnref decrements the latest ref count associated with the backing -// sstable. -func (m *FileMetadata) LatestUnref() int32 { - if m.Virtual { - m.FileBacking.VirtualizedSize.Add(-m.Size) - } - - v := m.FileBacking.latestVersionRefs.Add(-1) - if invariants.Enabled && v < 0 { - panic("pebble: invalid FileMetadata latest refcounting") - } - return v -} - -// LatestRefs returns the latest ref count associated with the backing sstable. -func (m *FileMetadata) LatestRefs() int32 { - return m.FileBacking.latestVersionRefs.Load() -} - -// SetCompactionState transitions this file's compaction state to the given -// state. Protected by DB.mu. -func (m *FileMetadata) SetCompactionState(to CompactionState) { - if invariants.Enabled { - transitionErr := func() error { - return errors.Newf("pebble: invalid compaction state transition: %s -> %s", m.CompactionState, to) - } - switch m.CompactionState { - case CompactionStateNotCompacting: - if to != CompactionStateCompacting { - panic(transitionErr()) - } - case CompactionStateCompacting: - if to != CompactionStateCompacted && to != CompactionStateNotCompacting { - panic(transitionErr()) - } - case CompactionStateCompacted: - panic(transitionErr()) - default: - panic(fmt.Sprintf("pebble: unknown compaction state: %d", m.CompactionState)) - } - } - m.CompactionState = to -} - -// IsCompacting returns true if this file's compaction state is -// CompactionStateCompacting. Protected by DB.mu. 
-func (m *FileMetadata) IsCompacting() bool { - return m.CompactionState == CompactionStateCompacting -} - -// StatsValid returns true if the table stats have been populated. If StatValid -// returns true, the Stats field may be read (with or without holding the -// database mutex). -func (m *FileMetadata) StatsValid() bool { - return m.statsValid.Load() -} - -// StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu -// while populating TableStats and calling StatsMarkValud. Once stats are -// populated, they must not be mutated. -func (m *FileMetadata) StatsMarkValid() { - m.statsValid.Store(true) -} - -// ExtendPointKeyBounds attempts to extend the lower and upper point key bounds -// and overall table bounds with the given smallest and largest keys. The -// smallest and largest bounds may not be extended if the table already has a -// bound that is smaller or larger, respectively. The receiver is returned. -// NB: calling this method should be preferred to manually setting the bounds by -// manipulating the fields directly, to maintain certain invariants. -func (m *FileMetadata) ExtendPointKeyBounds( - cmp Compare, smallest, largest InternalKey, -) *FileMetadata { - // Update the point key bounds. - if !m.HasPointKeys { - m.SmallestPointKey, m.LargestPointKey = smallest, largest - m.HasPointKeys = true - } else { - if base.InternalCompare(cmp, smallest, m.SmallestPointKey) < 0 { - m.SmallestPointKey = smallest - } - if base.InternalCompare(cmp, largest, m.LargestPointKey) > 0 { - m.LargestPointKey = largest - } - } - // Update the overall bounds. - m.extendOverallBounds(cmp, m.SmallestPointKey, m.LargestPointKey, boundTypePointKey) - return m -} - -// ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds -// and overall table bounds with the given smallest and largest keys. The -// smallest and largest bounds may not be extended if the table already has a -// bound that is smaller or larger, respectively. 
The receiver is returned. -// NB: calling this method should be preferred to manually setting the bounds by -// manipulating the fields directly, to maintain certain invariants. -func (m *FileMetadata) ExtendRangeKeyBounds( - cmp Compare, smallest, largest InternalKey, -) *FileMetadata { - // Update the range key bounds. - if !m.HasRangeKeys { - m.SmallestRangeKey, m.LargestRangeKey = smallest, largest - m.HasRangeKeys = true - } else { - if base.InternalCompare(cmp, smallest, m.SmallestRangeKey) < 0 { - m.SmallestRangeKey = smallest - } - if base.InternalCompare(cmp, largest, m.LargestRangeKey) > 0 { - m.LargestRangeKey = largest - } - } - // Update the overall bounds. - m.extendOverallBounds(cmp, m.SmallestRangeKey, m.LargestRangeKey, boundTypeRangeKey) - return m -} - -// extendOverallBounds attempts to extend the overall table lower and upper -// bounds. The given bounds may not be used if a lower or upper bound already -// exists that is smaller or larger than the given keys, respectively. The given -// boundType will be used if the bounds are updated. -func (m *FileMetadata) extendOverallBounds( - cmp Compare, smallest, largest InternalKey, bTyp boundType, -) { - if !m.boundsSet { - m.Smallest, m.Largest = smallest, largest - m.boundsSet = true - m.boundTypeSmallest, m.boundTypeLargest = bTyp, bTyp - } else { - if base.InternalCompare(cmp, smallest, m.Smallest) < 0 { - m.Smallest = smallest - m.boundTypeSmallest = bTyp - } - if base.InternalCompare(cmp, largest, m.Largest) > 0 { - m.Largest = largest - m.boundTypeLargest = bTyp - } - } -} - -// Overlaps returns true if the file key range overlaps with the given range. -func (m *FileMetadata) Overlaps(cmp Compare, start []byte, end []byte, exclusiveEnd bool) bool { - if c := cmp(m.Largest.UserKey, start); c < 0 || (c == 0 && m.Largest.IsExclusiveSentinel()) { - // f is completely before the specified range; no overlap. 
- return false - } - if c := cmp(m.Smallest.UserKey, end); c > 0 || (c == 0 && exclusiveEnd) { - // f is completely after the specified range; no overlap. - return false - } - return true -} - -// ContainedWithinSpan returns true if the file key range completely overlaps with the -// given range ("end" is assumed to exclusive). -func (m *FileMetadata) ContainedWithinSpan(cmp Compare, start, end []byte) bool { - lowerCmp, upperCmp := cmp(m.Smallest.UserKey, start), cmp(m.Largest.UserKey, end) - return lowerCmp >= 0 && (upperCmp < 0 || (upperCmp == 0 && m.Largest.IsExclusiveSentinel())) -} - -// ContainsKeyType returns whether or not the file contains keys of the provided -// type. -func (m *FileMetadata) ContainsKeyType(kt KeyType) bool { - switch kt { - case KeyTypePointAndRange: - return true - case KeyTypePoint: - return m.HasPointKeys - case KeyTypeRange: - return m.HasRangeKeys - default: - panic("unrecognized key type") - } -} - -// SmallestBound returns the file's smallest bound of the key type. It returns a -// false second return value if the file does not contain any keys of the key -// type. -func (m *FileMetadata) SmallestBound(kt KeyType) (*InternalKey, bool) { - switch kt { - case KeyTypePointAndRange: - return &m.Smallest, true - case KeyTypePoint: - return &m.SmallestPointKey, m.HasPointKeys - case KeyTypeRange: - return &m.SmallestRangeKey, m.HasRangeKeys - default: - panic("unrecognized key type") - } -} - -// LargestBound returns the file's largest bound of the key type. It returns a -// false second return value if the file does not contain any keys of the key -// type. 
-func (m *FileMetadata) LargestBound(kt KeyType) (*InternalKey, bool) { - switch kt { - case KeyTypePointAndRange: - return &m.Largest, true - case KeyTypePoint: - return &m.LargestPointKey, m.HasPointKeys - case KeyTypeRange: - return &m.LargestRangeKey, m.HasRangeKeys - default: - panic("unrecognized key type") - } -} - -const ( - maskContainsPointKeys = 1 << 0 - maskSmallest = 1 << 1 - maskLargest = 1 << 2 -) - -// boundsMarker returns a marker byte whose bits encode the following -// information (in order from least significant bit): -// - if the table contains point keys -// - if the table's smallest key is a point key -// - if the table's largest key is a point key -func (m *FileMetadata) boundsMarker() (sentinel uint8, err error) { - if m.HasPointKeys { - sentinel |= maskContainsPointKeys - } - switch m.boundTypeSmallest { - case boundTypePointKey: - sentinel |= maskSmallest - case boundTypeRangeKey: - // No op - leave bit unset. - default: - return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.FileNum) - } - switch m.boundTypeLargest { - case boundTypePointKey: - sentinel |= maskLargest - case boundTypeRangeKey: - // No op - leave bit unset. - default: - return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.FileNum) - } - return -} - -// String implements fmt.Stringer, printing the file number and the overall -// table bounds. -func (m *FileMetadata) String() string { - return fmt.Sprintf("%s:[%s-%s]", m.FileNum, m.Smallest, m.Largest) -} - -// DebugString returns a verbose representation of FileMetadata, typically for -// use in tests and debugging, returning the file number and the point, range -// and overall bounds for the table. 
-func (m *FileMetadata) DebugString(format base.FormatKey, verbose bool) string { - var b bytes.Buffer - fmt.Fprintf(&b, "%s:[%s-%s]", - m.FileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format)) - if !verbose { - return b.String() - } - fmt.Fprintf(&b, " seqnums:[%d-%d]", m.SmallestSeqNum, m.LargestSeqNum) - if m.HasPointKeys { - fmt.Fprintf(&b, " points:[%s-%s]", - m.SmallestPointKey.Pretty(format), m.LargestPointKey.Pretty(format)) - } - if m.HasRangeKeys { - fmt.Fprintf(&b, " ranges:[%s-%s]", - m.SmallestRangeKey.Pretty(format), m.LargestRangeKey.Pretty(format)) - } - return b.String() -} - -// ParseFileMetadataDebug parses a FileMetadata from its DebugString -// representation. -func ParseFileMetadataDebug(s string) (*FileMetadata, error) { - // Split lines of the form: - // 000000:[a#0,SET-z#0,SET] seqnums:[5-5] points:[...] ranges:[...] - fields := strings.FieldsFunc(s, func(c rune) bool { - switch c { - case ':', '[', '-', ']': - return true - default: - return unicode.IsSpace(c) // NB: also trim whitespace padding. 
- } - }) - if len(fields)%3 != 0 { - return nil, errors.Newf("malformed input: %s", s) - } - m := &FileMetadata{} - for len(fields) > 0 { - prefix := fields[0] - if prefix == "seqnums" { - smallestSeqNum, err := strconv.ParseUint(fields[1], 10, 64) - if err != nil { - return m, errors.Newf("malformed input: %s: %s", s, err) - } - largestSeqNum, err := strconv.ParseUint(fields[2], 10, 64) - if err != nil { - return m, errors.Newf("malformed input: %s: %s", s, err) - } - m.SmallestSeqNum, m.LargestSeqNum = smallestSeqNum, largestSeqNum - fields = fields[3:] - continue - } - smallest := base.ParsePrettyInternalKey(fields[1]) - largest := base.ParsePrettyInternalKey(fields[2]) - switch prefix { - case "points": - m.SmallestPointKey, m.LargestPointKey = smallest, largest - m.HasPointKeys = true - case "ranges": - m.SmallestRangeKey, m.LargestRangeKey = smallest, largest - m.HasRangeKeys = true - default: - fileNum, err := strconv.ParseUint(prefix, 10, 64) - if err != nil { - return m, errors.Newf("malformed input: %s: %s", s, err) - } - m.FileNum = base.FileNum(fileNum) - m.Smallest, m.Largest = smallest, largest - m.boundsSet = true - } - fields = fields[3:] - } - // By default, when the parser sees just the overall bounds, we set the point - // keys. This preserves backwards compatability with existing test cases that - // specify only the overall bounds. - if !m.HasPointKeys && !m.HasRangeKeys { - m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest - m.HasPointKeys = true - } - m.InitPhysicalBacking() - return m, nil -} - -// Validate validates the metadata for consistency with itself, returning an -// error if inconsistent. -func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error { - // Combined range and point key validation. 
- - if !m.HasPointKeys && !m.HasRangeKeys { - return base.CorruptionErrorf("file %s has neither point nor range keys", - errors.Safe(m.FileNum)) - } - if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 { - return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s", - errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey), - m.Largest.Pretty(formatKey)) - } - if m.SmallestSeqNum > m.LargestSeqNum { - return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d", - errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum) - } - - // Point key validation. - - if m.HasPointKeys { - if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 { - return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s", - errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey), - m.LargestPointKey.Pretty(formatKey)) - } - if base.InternalCompare(cmp, m.SmallestPointKey, m.Smallest) < 0 || - base.InternalCompare(cmp, m.LargestPointKey, m.Largest) > 0 { - return base.CorruptionErrorf( - "file %s has inconsistent point key bounds relative to overall bounds: "+ - "overall = [%s-%s], point keys = [%s-%s]", - errors.Safe(m.FileNum), - m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey), - m.SmallestPointKey.Pretty(formatKey), m.LargestPointKey.Pretty(formatKey), - ) - } - } - - // Range key validation. 
- - if m.HasRangeKeys { - if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 { - return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s", - errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey), - m.LargestRangeKey.Pretty(formatKey)) - } - if base.InternalCompare(cmp, m.SmallestRangeKey, m.Smallest) < 0 || - base.InternalCompare(cmp, m.LargestRangeKey, m.Largest) > 0 { - return base.CorruptionErrorf( - "file %s has inconsistent range key bounds relative to overall bounds: "+ - "overall = [%s-%s], range keys = [%s-%s]", - errors.Safe(m.FileNum), - m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey), - m.SmallestRangeKey.Pretty(formatKey), m.LargestRangeKey.Pretty(formatKey), - ) - } - } - - // Ensure that FileMetadata.Init was called. - if m.FileBacking == nil { - return base.CorruptionErrorf("file metadata FileBacking not set") - } - - return nil -} - -// TableInfo returns a subset of the FileMetadata state formatted as a -// TableInfo. -func (m *FileMetadata) TableInfo() TableInfo { - return TableInfo{ - FileNum: m.FileNum, - Size: m.Size, - Smallest: m.Smallest, - Largest: m.Largest, - SmallestSeqNum: m.SmallestSeqNum, - LargestSeqNum: m.LargestSeqNum, - } -} - -func cmpUint64(a, b uint64) int { - switch { - case a < b: - return -1 - case a > b: - return +1 - default: - return 0 - } -} - -func (m *FileMetadata) cmpSeqNum(b *FileMetadata) int { - // NB: This is the same ordering that RocksDB uses for L0 files. - - // Sort first by largest sequence number. - if m.LargestSeqNum != b.LargestSeqNum { - return cmpUint64(m.LargestSeqNum, b.LargestSeqNum) - } - // Then by smallest sequence number. - if m.SmallestSeqNum != b.SmallestSeqNum { - return cmpUint64(m.SmallestSeqNum, b.SmallestSeqNum) - } - // Break ties by file number. 
- return cmpUint64(uint64(m.FileNum), uint64(b.FileNum)) -} - -func (m *FileMetadata) lessSeqNum(b *FileMetadata) bool { - return m.cmpSeqNum(b) < 0 -} - -func (m *FileMetadata) cmpSmallestKey(b *FileMetadata, cmp Compare) int { - return base.InternalCompare(cmp, m.Smallest, b.Smallest) -} - -// KeyRange returns the minimum smallest and maximum largest internalKey for -// all the FileMetadata in iters. -func KeyRange(ucmp Compare, iters ...LevelIterator) (smallest, largest InternalKey) { - first := true - for _, iter := range iters { - for meta := iter.First(); meta != nil; meta = iter.Next() { - if first { - first = false - smallest, largest = meta.Smallest, meta.Largest - continue - } - if base.InternalCompare(ucmp, smallest, meta.Smallest) >= 0 { - smallest = meta.Smallest - } - if base.InternalCompare(ucmp, largest, meta.Largest) <= 0 { - largest = meta.Largest - } - } - } - return smallest, largest -} - -type bySeqNum []*FileMetadata - -func (b bySeqNum) Len() int { return len(b) } -func (b bySeqNum) Less(i, j int) bool { - return b[i].lessSeqNum(b[j]) -} -func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] } - -// SortBySeqNum sorts the specified files by increasing sequence number. -func SortBySeqNum(files []*FileMetadata) { - sort.Sort(bySeqNum(files)) -} - -type bySmallest struct { - files []*FileMetadata - cmp Compare -} - -func (b bySmallest) Len() int { return len(b.files) } -func (b bySmallest) Less(i, j int) bool { - return b.files[i].cmpSmallestKey(b.files[j], b.cmp) < 0 -} -func (b bySmallest) Swap(i, j int) { b.files[i], b.files[j] = b.files[j], b.files[i] } - -// SortBySmallest sorts the specified files by smallest key using the supplied -// comparison function to order user keys. 
-func SortBySmallest(files []*FileMetadata, cmp Compare) { - sort.Sort(bySmallest{files, cmp}) -} - -func overlaps(iter LevelIterator, cmp Compare, start, end []byte, exclusiveEnd bool) LevelSlice { - startIter := iter.Clone() - { - startIterFile := startIter.SeekGE(cmp, start) - // SeekGE compares user keys. The user key `start` may be equal to the - // f.Largest because f.Largest is a range deletion sentinel, indicating - // that the user key `start` is NOT contained within the file f. If - // that's the case, we can narrow the overlapping bounds to exclude the - // file with the sentinel. - if startIterFile != nil && startIterFile.Largest.IsExclusiveSentinel() && - cmp(startIterFile.Largest.UserKey, start) == 0 { - startIterFile = startIter.Next() - } - _ = startIterFile // Ignore unused assignment. - } - - endIter := iter.Clone() - { - endIterFile := endIter.SeekGE(cmp, end) - - if !exclusiveEnd { - // endIter is now pointing at the *first* file with a largest key >= end. - // If there are multiple files including the user key `end`, we want all - // of them, so move forward. - for endIterFile != nil && cmp(endIterFile.Largest.UserKey, end) == 0 { - endIterFile = endIter.Next() - } - } - - // LevelSlice uses inclusive bounds, so if we seeked to the end sentinel - // or nexted too far because Largest.UserKey equaled `end`, go back. - // - // Consider !exclusiveEnd and end = 'f', with the following file bounds: - // - // [b,d] [e, f] [f, f] [g, h] - // - // the above for loop will Next until it arrives at [g, h]. We need to - // observe that g > f, and Prev to the file with bounds [f, f]. - if endIterFile == nil { - endIterFile = endIter.Prev() - } else if c := cmp(endIterFile.Smallest.UserKey, end); c > 0 || c == 0 && exclusiveEnd { - endIterFile = endIter.Prev() - } - _ = endIterFile // Ignore unused assignment. 
- } - return newBoundedLevelSlice(startIter.Clone().iter, &startIter.iter, &endIter.iter) -} - -// NumLevels is the number of levels a Version contains. -const NumLevels = 7 - -// NewVersion constructs a new Version with the provided files. It requires -// the provided files are already well-ordered. It's intended for testing. -func NewVersion( - cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, files [NumLevels][]*FileMetadata, -) *Version { - var v Version - for l := range files { - // NB: We specifically insert `files` into the B-Tree in the order - // they appear within `files`. Some tests depend on this behavior in - // order to test consistency checking, etc. Once we've constructed the - // initial B-Tree, we swap out the btreeCmp for the correct one. - // TODO(jackson): Adjust or remove the tests and remove this. - v.Levels[l].tree, _ = makeBTree(btreeCmpSpecificOrder(files[l]), files[l]) - v.Levels[l].level = l - if l == 0 { - v.Levels[l].tree.cmp = btreeCmpSeqNum - } else { - v.Levels[l].tree.cmp = btreeCmpSmallestKey(cmp) - } - for _, f := range files[l] { - v.Levels[l].totalSize += f.Size - } - } - if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil { - panic(err) - } - return &v -} - -// Version is a collection of file metadata for on-disk tables at various -// levels. In-memory DBs are written to level-0 tables, and compactions -// migrate data from level N to level N+1. The tables map internal keys (which -// are a user key, a delete or set bit, and a sequence number) to user values. -// -// The tables at level 0 are sorted by largest sequence number. Due to file -// ingestion, there may be overlap in the ranges of sequence numbers contain in -// level 0 sstables. In particular, it is valid for one level 0 sstable to have -// the seqnum range [1,100] while an adjacent sstable has the seqnum range -// [50,50]. This occurs when the [50,50] table was ingested and given a global -// seqnum. 
The ingestion code will have ensured that the [50,50] sstable will -// not have any keys that overlap with the [1,100] in the seqnum range -// [1,49]. The range of internal keys [fileMetadata.smallest, -// fileMetadata.largest] in each level 0 table may overlap. -// -// The tables at any non-0 level are sorted by their internal key range and any -// two tables at the same non-0 level do not overlap. -// -// The internal key ranges of two tables at different levels X and Y may -// overlap, for any X != Y. -// -// Finally, for every internal key in a table at level X, there is no internal -// key in a higher level table that has both the same user key and a higher -// sequence number. -type Version struct { - refs atomic.Int32 - - // The level 0 sstables are organized in a series of sublevels. Similar to - // the seqnum invariant in normal levels, there is no internal key in a - // higher level table that has both the same user key and a higher sequence - // number. Within a sublevel, tables are sorted by their internal key range - // and any two tables at the same sublevel do not overlap. Unlike the normal - // levels, sublevel n contains older tables (lower sequence numbers) than - // sublevel n+1. - // - // The L0Sublevels struct is mostly used for compaction picking. As most - // internal data structures in it are only necessary for compaction picking - // and not for iterator creation, the reference to L0Sublevels is nil'd - // after this version becomes the non-newest version, to reduce memory - // usage. - // - // L0Sublevels.Levels contains L0 files ordered by sublevels. All the files - // in Levels[0] are in L0Sublevels.Levels. L0SublevelFiles is also set to - // a reference to that slice, as that slice is necessary for iterator - // creation and needs to outlast L0Sublevels. 
- L0Sublevels *L0Sublevels - L0SublevelFiles []LevelSlice - - Levels [NumLevels]LevelMetadata - - // RangeKeyLevels holds a subset of the same files as Levels that contain range - // keys (i.e. fileMeta.HasRangeKeys == true). The memory amplification of this - // duplication should be minimal, as range keys are expected to be rare. - RangeKeyLevels [NumLevels]LevelMetadata - - // The callback to invoke when the last reference to a version is - // removed. Will be called with list.mu held. - Deleted func(obsolete []*FileBacking) - - // Stats holds aggregated stats about the version maintained from - // version to version. - Stats struct { - // MarkedForCompaction records the count of files marked for - // compaction within the version. - MarkedForCompaction int - } - - // The list the version is linked into. - list *VersionList - - // The next/prev link for the versionList doubly-linked list of versions. - prev, next *Version -} - -// String implements fmt.Stringer, printing the FileMetadata for each level in -// the Version. -func (v *Version) String() string { - return v.string(base.DefaultFormatter, false) -} - -// DebugString returns an alternative format to String() which includes sequence -// number and kind information for the sstable boundaries. 
-func (v *Version) DebugString(format base.FormatKey) string { - return v.string(format, true) -} - -func describeSublevels(format base.FormatKey, verbose bool, sublevels []LevelSlice) string { - var buf bytes.Buffer - for sublevel := len(sublevels) - 1; sublevel >= 0; sublevel-- { - fmt.Fprintf(&buf, "0.%d:\n", sublevel) - sublevels[sublevel].Each(func(f *FileMetadata) { - fmt.Fprintf(&buf, " %s\n", f.DebugString(format, verbose)) - }) - } - return buf.String() -} - -func (v *Version) string(format base.FormatKey, verbose bool) string { - var buf bytes.Buffer - if len(v.L0SublevelFiles) > 0 { - fmt.Fprintf(&buf, "%s", describeSublevels(format, verbose, v.L0SublevelFiles)) - } - for level := 1; level < NumLevels; level++ { - if v.Levels[level].Empty() { - continue - } - fmt.Fprintf(&buf, "%d:\n", level) - iter := v.Levels[level].Iter() - for f := iter.First(); f != nil; f = iter.Next() { - fmt.Fprintf(&buf, " %s\n", f.DebugString(format, verbose)) - } - } - return buf.String() -} - -// ParseVersionDebug parses a Version from its DebugString output. -func ParseVersionDebug( - cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, s string, -) (*Version, error) { - var level int - var files [NumLevels][]*FileMetadata - for _, l := range strings.Split(s, "\n") { - l = strings.TrimSpace(l) - - switch l[:2] { - case "0.", "0:", "1:", "2:", "3:", "4:", "5:", "6:": - var err error - level, err = strconv.Atoi(l[:1]) - if err != nil { - return nil, err - } - default: - m, err := ParseFileMetadataDebug(l) - if err != nil { - return nil, err - } - // If we only parsed overall bounds, default to setting the point bounds. - if !m.HasPointKeys && !m.HasRangeKeys { - m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest - m.HasPointKeys = true - } - files[level] = append(files[level], m) - } - } - // Reverse the order of L0 files. This ensures we construct the same - // sublevels. 
(They're printed from higher sublevel to lower, which means in - // a partial order that represents newest to oldest). - for i := 0; i < len(files[0])/2; i++ { - files[0][i], files[0][len(files[0])-i-1] = files[0][len(files[0])-i-1], files[0][i] - } - return NewVersion(cmp, formatKey, flushSplitBytes, files), nil -} - -// Refs returns the number of references to the version. -func (v *Version) Refs() int32 { - return v.refs.Load() -} - -// Ref increments the version refcount. -func (v *Version) Ref() { - v.refs.Add(1) -} - -// Unref decrements the version refcount. If the last reference to the version -// was removed, the version is removed from the list of versions and the -// Deleted callback is invoked. Requires that the VersionList mutex is NOT -// locked. -func (v *Version) Unref() { - if v.refs.Add(-1) == 0 { - l := v.list - l.mu.Lock() - l.Remove(v) - v.Deleted(v.unrefFiles()) - l.mu.Unlock() - } -} - -// UnrefLocked decrements the version refcount. If the last reference to the -// version was removed, the version is removed from the list of versions and -// the Deleted callback is invoked. Requires that the VersionList mutex is -// already locked. -func (v *Version) UnrefLocked() { - if v.refs.Add(-1) == 0 { - v.list.Remove(v) - v.Deleted(v.unrefFiles()) - } -} - -func (v *Version) unrefFiles() []*FileBacking { - var obsolete []*FileBacking - for _, lm := range v.Levels { - obsolete = append(obsolete, lm.release()...) - } - for _, lm := range v.RangeKeyLevels { - obsolete = append(obsolete, lm.release()...) - } - return obsolete -} - -// Next returns the next version in the list of versions. 
-func (v *Version) Next() *Version { - return v.next -} - -// InitL0Sublevels initializes the L0Sublevels -func (v *Version) InitL0Sublevels( - cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, -) error { - var err error - v.L0Sublevels, err = NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes) - if err == nil && v.L0Sublevels != nil { - v.L0SublevelFiles = v.L0Sublevels.Levels - } - return err -} - -// Contains returns a boolean indicating whether the provided file exists in -// the version at the given level. If level is non-zero then Contains binary -// searches among the files. If level is zero, Contains scans the entire -// level. -func (v *Version) Contains(level int, cmp Compare, m *FileMetadata) bool { - iter := v.Levels[level].Iter() - if level > 0 { - overlaps := v.Overlaps(level, cmp, m.Smallest.UserKey, m.Largest.UserKey, - m.Largest.IsExclusiveSentinel()) - iter = overlaps.Iter() - } - for f := iter.First(); f != nil; f = iter.Next() { - if f == m { - return true - } - } - return false -} - -// Overlaps returns all elements of v.files[level] whose user key range -// intersects the given range. If level is non-zero then the user key ranges of -// v.files[level] are assumed to not overlap (although they may touch). If level -// is zero then that assumption cannot be made, and the [start, end] range is -// expanded to the union of those matching ranges so far and the computation is -// repeated until [start, end] stabilizes. -// The returned files are a subsequence of the input files, i.e., the ordering -// is not changed. -func (v *Version) Overlaps( - level int, cmp Compare, start, end []byte, exclusiveEnd bool, -) LevelSlice { - if level == 0 { - // Indices that have been selected as overlapping. 
- l0 := v.Levels[level] - l0Iter := l0.Iter() - selectedIndices := make([]bool, l0.Len()) - numSelected := 0 - var slice LevelSlice - for { - restart := false - for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() { - selected := selectedIndices[i] - if selected { - continue - } - if !meta.Overlaps(cmp, start, end, exclusiveEnd) { - // meta is completely outside the specified range; skip it. - continue - } - // Overlaps. - selectedIndices[i] = true - numSelected++ - - smallest := meta.Smallest.UserKey - largest := meta.Largest.UserKey - // Since level == 0, check if the newly added fileMetadata has - // expanded the range. We expand the range immediately for files - // we have remaining to check in this loop. All already checked - // and unselected files will need to be rechecked via the - // restart below. - if cmp(smallest, start) < 0 { - start = smallest - restart = true - } - if v := cmp(largest, end); v > 0 { - end = largest - exclusiveEnd = meta.Largest.IsExclusiveSentinel() - restart = true - } else if v == 0 && exclusiveEnd && !meta.Largest.IsExclusiveSentinel() { - // Only update the exclusivity of our existing `end` - // bound. - exclusiveEnd = false - restart = true - } - } - - if !restart { - // Construct a B-Tree containing only the matching items. - var tr btree - tr.cmp = v.Levels[level].tree.cmp - for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() { - if selectedIndices[i] { - err := tr.Insert(meta) - if err != nil { - panic(err) - } - } - } - slice = newLevelSlice(tr.Iter()) - // TODO(jackson): Avoid the oddity of constructing and - // immediately releasing a B-Tree. Make LevelSlice an - // interface? - tr.Release() - break - } - // Continue looping to retry the files that were not selected. 
- } - return slice - } - - return overlaps(v.Levels[level].Iter(), cmp, start, end, exclusiveEnd) -} - -// CheckOrdering checks that the files are consistent with respect to -// increasing file numbers (for level 0 files) and increasing and non- -// overlapping internal key ranges (for level non-0 files). -func (v *Version) CheckOrdering( - cmp Compare, format base.FormatKey, order OrderingInvariants, -) error { - for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- { - sublevelIter := v.L0SublevelFiles[sublevel].Iter() - // Sublevels have NEVER allowed split user keys, so we can pass - // ProhibitSplitUserKeys. - if err := CheckOrdering(cmp, format, L0Sublevel(sublevel), sublevelIter, ProhibitSplitUserKeys); err != nil { - return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format)) - } - } - - for level, lm := range v.Levels { - if err := CheckOrdering(cmp, format, Level(level), lm.Iter(), order); err != nil { - return base.CorruptionErrorf("%s\n%s", err, v.DebugString(format)) - } - } - return nil -} - -// VersionList holds a list of versions. The versions are ordered from oldest -// to newest. -type VersionList struct { - mu *sync.Mutex - root Version -} - -// Init initializes the version list. -func (l *VersionList) Init(mu *sync.Mutex) { - l.mu = mu - l.root.next = &l.root - l.root.prev = &l.root -} - -// Empty returns true if the list is empty, and false otherwise. -func (l *VersionList) Empty() bool { - return l.root.next == &l.root -} - -// Front returns the oldest version in the list. Note that this version is only -// valid if Empty() returns true. -func (l *VersionList) Front() *Version { - return l.root.next -} - -// Back returns the newest version in the list. Note that this version is only -// valid if Empty() returns true. -func (l *VersionList) Back() *Version { - return l.root.prev -} - -// PushBack adds a new version to the back of the list. This new version -// becomes the "newest" version in the list. 
-func (l *VersionList) PushBack(v *Version) { - if v.list != nil || v.prev != nil || v.next != nil { - panic("pebble: version list is inconsistent") - } - v.prev = l.root.prev - v.prev.next = v - v.next = &l.root - v.next.prev = v - v.list = l - // Let L0Sublevels on the second newest version get GC'd, as it is no longer - // necessary. See the comment in Version. - v.prev.L0Sublevels = nil -} - -// Remove removes the specified version from the list. -func (l *VersionList) Remove(v *Version) { - if v == &l.root { - panic("pebble: cannot remove version list root node") - } - if v.list != l { - panic("pebble: version list is inconsistent") - } - v.prev.next = v.next - v.next.prev = v.prev - v.next = nil // avoid memory leaks - v.prev = nil // avoid memory leaks - v.list = nil // avoid memory leaks -} - -// OrderingInvariants dictates the file ordering invariants active. -type OrderingInvariants int8 - -const ( - // ProhibitSplitUserKeys indicates that adjacent files within a level cannot - // contain the same user key. - ProhibitSplitUserKeys OrderingInvariants = iota - // AllowSplitUserKeys indicates that adjacent files within a level may - // contain the same user key. This is only allowed by historical format - // major versions. - // - // TODO(jackson): Remove. - AllowSplitUserKeys -) - -// CheckOrdering checks that the files are consistent with respect to -// seqnums (for level 0 files -- see detailed comment below) and increasing and non- -// overlapping internal key ranges (for non-level 0 files). -// -// The ordering field may be passed AllowSplitUserKeys to allow adjacent files that are both -// inclusive of the same user key. Pebble no longer creates version edits -// installing such files, and Pebble databases with sufficiently high format -// major version should no longer have any such files within their LSM. -// TODO(jackson): Remove AllowSplitUserKeys when we remove support for the -// earlier format major versions. 
-func CheckOrdering( - cmp Compare, format base.FormatKey, level Level, files LevelIterator, ordering OrderingInvariants, -) error { - // The invariants to check for L0 sublevels are the same as the ones to - // check for all other levels. However, if L0 is not organized into - // sublevels, or if all L0 files are being passed in, we do the legacy L0 - // checks, defined in the detailed comment below. - if level == Level(0) { - // We have 2 kinds of files: - // - Files with exactly one sequence number: these could be either ingested files - // or flushed files. We cannot tell the difference between them based on FileMetadata, - // so our consistency checking here uses the weaker checks assuming it is a narrow - // flushed file. We cannot error on ingested files having sequence numbers coincident - // with flushed files as the seemingly ingested file could just be a flushed file - // with just one key in it which is a truncated range tombstone sharing sequence numbers - // with other files in the same flush. - // - Files with multiple sequence numbers: these are necessarily flushed files. - // - // Three cases of overlapping sequence numbers: - // Case 1: - // An ingested file contained in the sequence numbers of the flushed file -- it must be - // fully contained (not coincident with either end of the flushed file) since the memtable - // must have been at [a, b-1] (where b > a) when the ingested file was assigned sequence - // num b, and the memtable got a subsequent update that was given sequence num b+1, before - // being flushed. - // - // So a sequence [1000, 1000] [1002, 1002] [1000, 2000] is invalid since the first and - // third file are inconsistent with each other. So comparing adjacent files is insufficient - // for consistency checking. 
- // - // Visually we have something like - // x------y x-----------yx-------------y (flushed files where x, y are the endpoints) - // y y y y (y's represent ingested files) - // And these are ordered in increasing order of y. Note that y's must be unique. - // - // Case 2: - // A flushed file that did not overlap in keys with any file in any level, but does overlap - // in the file key intervals. This file is placed in L0 since it overlaps in the file - // key intervals but since it has no overlapping data, it is assigned a sequence number - // of 0 in RocksDB. We handle this case for compatibility with RocksDB. - // - // Case 3: - // A sequence of flushed files that overlap in sequence numbers with one another, - // but do not overlap in keys inside the sstables. These files correspond to - // partitioned flushes or the results of intra-L0 compactions of partitioned - // flushes. - // - // Since these types of SSTables violate most other sequence number - // overlap invariants, and handling this case is important for compatibility - // with future versions of pebble, this method relaxes most L0 invariant - // checks. - - var prev *FileMetadata - for f := files.First(); f != nil; f, prev = files.Next(), f { - if prev == nil { - continue - } - // Validate that the sorting is sane. - if prev.LargestSeqNum == 0 && f.LargestSeqNum == prev.LargestSeqNum { - // Multiple files satisfying case 2 mentioned above. 
- } else if !prev.lessSeqNum(f) { - return base.CorruptionErrorf("L0 files %s and %s are not properly ordered: <#%d-#%d> vs <#%d-#%d>", - errors.Safe(prev.FileNum), errors.Safe(f.FileNum), - errors.Safe(prev.SmallestSeqNum), errors.Safe(prev.LargestSeqNum), - errors.Safe(f.SmallestSeqNum), errors.Safe(f.LargestSeqNum)) - } - } - } else { - var prev *FileMetadata - for f := files.First(); f != nil; f, prev = files.Next(), f { - if err := f.Validate(cmp, format); err != nil { - return errors.Wrapf(err, "%s ", level) - } - if prev != nil { - if prev.cmpSmallestKey(f, cmp) >= 0 { - return base.CorruptionErrorf("%s files %s and %s are not properly ordered: [%s-%s] vs [%s-%s]", - errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum), - prev.Smallest.Pretty(format), prev.Largest.Pretty(format), - f.Smallest.Pretty(format), f.Largest.Pretty(format)) - } - - // What's considered "overlapping" is dependent on the format - // major version. If ordering=ProhibitSplitUserKeys, then both - // files cannot contain keys with the same user keys. If the - // bounds have the same user key, the previous file's boundary - // must have a Trailer indicating that it's exclusive. 
- switch ordering { - case AllowSplitUserKeys: - if base.InternalCompare(cmp, prev.Largest, f.Smallest) >= 0 { - return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]", - errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum), - prev.Smallest.Pretty(format), prev.Largest.Pretty(format), - f.Smallest.Pretty(format), f.Largest.Pretty(format)) - } - case ProhibitSplitUserKeys: - if v := cmp(prev.Largest.UserKey, f.Smallest.UserKey); v > 0 || (v == 0 && !prev.Largest.IsExclusiveSentinel()) { - return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]", - errors.Safe(level), errors.Safe(prev.FileNum), errors.Safe(f.FileNum), - prev.Smallest.Pretty(format), prev.Largest.Pretty(format), - f.Smallest.Pretty(format), f.Largest.Pretty(format)) - } - default: - panic("unreachable") - } - } - } - } - return nil -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/manifest/version_edit.go b/vendor/github.com/cockroachdb/pebble/internal/manifest/version_edit.go deleted file mode 100644 index d7853c2..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/manifest/version_edit.go +++ /dev/null @@ -1,1121 +0,0 @@ -// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package manifest - -import ( - "bufio" - "bytes" - "encoding/binary" - "fmt" - "io" - "sort" - "time" - - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" -) - -// TODO(peter): describe the MANIFEST file format, independently of the C++ -// project. - -var errCorruptManifest = base.CorruptionErrorf("pebble: corrupt manifest") - -type byteReader interface { - io.ByteReader - io.Reader -} - -// Tags for the versionEdit disk format. -// Tag 8 is no longer used. -const ( - // LevelDB tags. 
- tagComparator = 1 - tagLogNumber = 2 - tagNextFileNumber = 3 - tagLastSequence = 4 - tagCompactPointer = 5 - tagDeletedFile = 6 - tagNewFile = 7 - tagPrevLogNumber = 9 - - // RocksDB tags. - tagNewFile2 = 100 - tagNewFile3 = 102 - tagNewFile4 = 103 - tagColumnFamily = 200 - tagColumnFamilyAdd = 201 - tagColumnFamilyDrop = 202 - tagMaxColumnFamily = 203 - - // Pebble tags. - tagNewFile5 = 104 // Range keys. - tagCreatedBackingTable = 105 - tagRemovedBackingTable = 106 - - // The custom tags sub-format used by tagNewFile4 and above. - customTagTerminate = 1 - customTagNeedsCompaction = 2 - customTagCreationTime = 6 - customTagPathID = 65 - customTagNonSafeIgnoreMask = 1 << 6 - customTagVirtual = 66 -) - -// DeletedFileEntry holds the state for a file deletion from a level. The file -// itself might still be referenced by another level. -type DeletedFileEntry struct { - Level int - FileNum base.FileNum -} - -// NewFileEntry holds the state for a new file or one moved from a different -// level. -type NewFileEntry struct { - Level int - Meta *FileMetadata - // BackingFileNum is only set during manifest replay, and only for virtual - // sstables. - BackingFileNum base.DiskFileNum -} - -// VersionEdit holds the state for an edit to a Version along with other -// on-disk state (log numbers, next file number, and the last sequence number). -type VersionEdit struct { - // ComparerName is the value of Options.Comparer.Name. This is only set in - // the first VersionEdit in a manifest (either when the DB is created, or - // when a new manifest is created) and is used to verify that the comparer - // specified at Open matches the comparer that was previously used. - ComparerName string - - // MinUnflushedLogNum is the smallest WAL log file number corresponding to - // mutations that have not been flushed to an sstable. - // - // This is an optional field, and 0 represents it is not set. 
- MinUnflushedLogNum base.FileNum - - // ObsoletePrevLogNum is a historic artifact from LevelDB that is not used by - // Pebble, RocksDB, or even LevelDB. Its use in LevelDB was deprecated in - // 6/2011. We keep it around purely for informational purposes when - // displaying MANIFEST contents. - ObsoletePrevLogNum uint64 - - // The next file number. A single counter is used to assign file numbers - // for the WAL, MANIFEST, sstable, and OPTIONS files. - NextFileNum base.FileNum - - // LastSeqNum is an upper bound on the sequence numbers that have been - // assigned in flushed WALs. Unflushed WALs (that will be replayed during - // recovery) may contain sequence numbers greater than this value. - LastSeqNum uint64 - - // A file num may be present in both deleted files and new files when it - // is moved from a lower level to a higher level (when the compaction - // found that there was no overlapping file at the higher level). - DeletedFiles map[DeletedFileEntry]*FileMetadata - NewFiles []NewFileEntry - // CreatedBackingTables can be used to preserve the FileBacking associated - // with a physical sstable. This is useful when virtual sstables in the - // latest version are reconstructed during manifest replay, and we also need - // to reconstruct the FileBacking which is required by these virtual - // sstables. - // - // INVARIANT: The FileBacking associated with a physical sstable must only - // be added as a backing file in the same version edit where the physical - // sstable is first virtualized. This means that the physical sstable must - // be present in DeletedFiles and that there must be at least one virtual - // sstable with the same FileBacking as the physical sstable in NewFiles. A - // file must be present in CreatedBackingTables in exactly one version edit. - // The physical sstable associated with the FileBacking must also not be - // present in NewFiles. 
- CreatedBackingTables []*FileBacking - // RemovedBackingTables is used to remove the FileBacking associated with a - // virtual sstable. Note that a backing sstable can be removed as soon as - // there are no virtual sstables in the latest version which are using the - // backing sstable, but the backing sstable doesn't necessarily have to be - // removed atomically with the version edit which removes the last virtual - // sstable associated with the backing sstable. The removal can happen in a - // future version edit. - // - // INVARIANT: A file must only be added to RemovedBackingTables if it was - // added to CreateBackingTables in a prior version edit. The same version - // edit also cannot have the same file present in both CreateBackingTables - // and RemovedBackingTables. A file must be present in RemovedBackingTables - // in exactly one version edit. - RemovedBackingTables []base.DiskFileNum -} - -// Decode decodes an edit from the specified reader. -// -// Note that the Decode step will not set the FileBacking for virtual sstables -// and the responsibility is left to the caller. However, the Decode step will -// populate the NewFileEntry.BackingFileNum in VersionEdit.NewFiles. 
-func (v *VersionEdit) Decode(r io.Reader) error { - br, ok := r.(byteReader) - if !ok { - br = bufio.NewReader(r) - } - d := versionEditDecoder{br} - for { - tag, err := binary.ReadUvarint(br) - if err == io.EOF { - break - } - if err != nil { - return err - } - switch tag { - case tagComparator: - s, err := d.readBytes() - if err != nil { - return err - } - v.ComparerName = string(s) - - case tagLogNumber: - n, err := d.readFileNum() - if err != nil { - return err - } - v.MinUnflushedLogNum = n - - case tagNextFileNumber: - n, err := d.readFileNum() - if err != nil { - return err - } - v.NextFileNum = n - - case tagLastSequence: - n, err := d.readUvarint() - if err != nil { - return err - } - v.LastSeqNum = n - - case tagCompactPointer: - if _, err := d.readLevel(); err != nil { - return err - } - if _, err := d.readBytes(); err != nil { - return err - } - // NB: RocksDB does not use compaction pointers anymore. - - case tagRemovedBackingTable: - n, err := d.readUvarint() - if err != nil { - return err - } - v.RemovedBackingTables = append( - v.RemovedBackingTables, base.FileNum(n).DiskFileNum(), - ) - case tagCreatedBackingTable: - dfn, err := d.readUvarint() - if err != nil { - return err - } - size, err := d.readUvarint() - if err != nil { - return err - } - fileBacking := &FileBacking{ - DiskFileNum: base.FileNum(dfn).DiskFileNum(), - Size: size, - } - v.CreatedBackingTables = append(v.CreatedBackingTables, fileBacking) - case tagDeletedFile: - level, err := d.readLevel() - if err != nil { - return err - } - fileNum, err := d.readFileNum() - if err != nil { - return err - } - if v.DeletedFiles == nil { - v.DeletedFiles = make(map[DeletedFileEntry]*FileMetadata) - } - v.DeletedFiles[DeletedFileEntry{level, fileNum}] = nil - - case tagNewFile, tagNewFile2, tagNewFile3, tagNewFile4, tagNewFile5: - level, err := d.readLevel() - if err != nil { - return err - } - fileNum, err := d.readFileNum() - if err != nil { - return err - } - if tag == tagNewFile3 { - // The 
pathID field appears unused in RocksDB. - _ /* pathID */, err := d.readUvarint() - if err != nil { - return err - } - } - size, err := d.readUvarint() - if err != nil { - return err - } - // We read the smallest / largest key bounds differently depending on - // whether we have point, range or both types of keys present in the - // table. - var ( - smallestPointKey, largestPointKey []byte - smallestRangeKey, largestRangeKey []byte - parsedPointBounds bool - boundsMarker byte - ) - if tag != tagNewFile5 { - // Range keys not present in the table. Parse the point key bounds. - smallestPointKey, err = d.readBytes() - if err != nil { - return err - } - largestPointKey, err = d.readBytes() - if err != nil { - return err - } - } else { - // Range keys are present in the table. Determine whether we have point - // keys to parse, in addition to the bounds. - boundsMarker, err = d.ReadByte() - if err != nil { - return err - } - // Parse point key bounds, if present. - if boundsMarker&maskContainsPointKeys > 0 { - smallestPointKey, err = d.readBytes() - if err != nil { - return err - } - largestPointKey, err = d.readBytes() - if err != nil { - return err - } - parsedPointBounds = true - } else { - // The table does not have point keys. - // Sanity check: the bounds must be range keys. - if boundsMarker&maskSmallest != 0 || boundsMarker&maskLargest != 0 { - return base.CorruptionErrorf( - "new-file-4-range-keys: table without point keys has point key bounds: marker=%x", - boundsMarker, - ) - } - } - // Parse range key bounds. 
- smallestRangeKey, err = d.readBytes() - if err != nil { - return err - } - largestRangeKey, err = d.readBytes() - if err != nil { - return err - } - } - var smallestSeqNum uint64 - var largestSeqNum uint64 - if tag != tagNewFile { - smallestSeqNum, err = d.readUvarint() - if err != nil { - return err - } - largestSeqNum, err = d.readUvarint() - if err != nil { - return err - } - } - var markedForCompaction bool - var creationTime uint64 - virtualState := struct { - virtual bool - backingFileNum uint64 - }{} - if tag == tagNewFile4 || tag == tagNewFile5 { - for { - customTag, err := d.readUvarint() - if err != nil { - return err - } - if customTag == customTagTerminate { - break - } else if customTag == customTagVirtual { - virtualState.virtual = true - n, err := d.readUvarint() - if err != nil { - return err - } - virtualState.backingFileNum = n - continue - } - - field, err := d.readBytes() - if err != nil { - return err - } - switch customTag { - case customTagNeedsCompaction: - if len(field) != 1 { - return base.CorruptionErrorf("new-file4: need-compaction field wrong size") - } - markedForCompaction = (field[0] == 1) - - case customTagCreationTime: - var n int - creationTime, n = binary.Uvarint(field) - if n != len(field) { - return base.CorruptionErrorf("new-file4: invalid file creation time") - } - - case customTagPathID: - return base.CorruptionErrorf("new-file4: path-id field not supported") - - default: - if (customTag & customTagNonSafeIgnoreMask) != 0 { - return base.CorruptionErrorf("new-file4: custom field not supported: %d", customTag) - } - } - } - } - m := &FileMetadata{ - FileNum: fileNum, - Size: size, - CreationTime: int64(creationTime), - SmallestSeqNum: smallestSeqNum, - LargestSeqNum: largestSeqNum, - MarkedForCompaction: markedForCompaction, - Virtual: virtualState.virtual, - } - if tag != tagNewFile5 { // no range keys present - m.SmallestPointKey = base.DecodeInternalKey(smallestPointKey) - m.LargestPointKey = 
base.DecodeInternalKey(largestPointKey) - m.HasPointKeys = true - m.Smallest, m.Largest = m.SmallestPointKey, m.LargestPointKey - m.boundTypeSmallest, m.boundTypeLargest = boundTypePointKey, boundTypePointKey - } else { // range keys present - // Set point key bounds, if parsed. - if parsedPointBounds { - m.SmallestPointKey = base.DecodeInternalKey(smallestPointKey) - m.LargestPointKey = base.DecodeInternalKey(largestPointKey) - m.HasPointKeys = true - } - // Set range key bounds. - m.SmallestRangeKey = base.DecodeInternalKey(smallestRangeKey) - m.LargestRangeKey = base.DecodeInternalKey(largestRangeKey) - m.HasRangeKeys = true - // Set overall bounds (by default assume range keys). - m.Smallest, m.Largest = m.SmallestRangeKey, m.LargestRangeKey - m.boundTypeSmallest, m.boundTypeLargest = boundTypeRangeKey, boundTypeRangeKey - if boundsMarker&maskSmallest == maskSmallest { - m.Smallest = m.SmallestPointKey - m.boundTypeSmallest = boundTypePointKey - } - if boundsMarker&maskLargest == maskLargest { - m.Largest = m.LargestPointKey - m.boundTypeLargest = boundTypePointKey - } - } - m.boundsSet = true - if !virtualState.virtual { - m.InitPhysicalBacking() - } - - nfe := NewFileEntry{ - Level: level, - Meta: m, - } - if virtualState.virtual { - nfe.BackingFileNum = base.FileNum(virtualState.backingFileNum).DiskFileNum() - } - v.NewFiles = append(v.NewFiles, nfe) - - case tagPrevLogNumber: - n, err := d.readUvarint() - if err != nil { - return err - } - v.ObsoletePrevLogNum = n - - case tagColumnFamily, tagColumnFamilyAdd, tagColumnFamilyDrop, tagMaxColumnFamily: - return base.CorruptionErrorf("column families are not supported") - - default: - return errCorruptManifest - } - } - return nil -} - -func (v *VersionEdit) string(verbose bool, fmtKey base.FormatKey) string { - var buf bytes.Buffer - if v.ComparerName != "" { - fmt.Fprintf(&buf, " comparer: %s", v.ComparerName) - } - if v.MinUnflushedLogNum != 0 { - fmt.Fprintf(&buf, " log-num: %d\n", v.MinUnflushedLogNum) - } 
- if v.ObsoletePrevLogNum != 0 { - fmt.Fprintf(&buf, " prev-log-num: %d\n", v.ObsoletePrevLogNum) - } - if v.NextFileNum != 0 { - fmt.Fprintf(&buf, " next-file-num: %d\n", v.NextFileNum) - } - if v.LastSeqNum != 0 { - fmt.Fprintf(&buf, " last-seq-num: %d\n", v.LastSeqNum) - } - entries := make([]DeletedFileEntry, 0, len(v.DeletedFiles)) - for df := range v.DeletedFiles { - entries = append(entries, df) - } - sort.Slice(entries, func(i, j int) bool { - if entries[i].Level != entries[j].Level { - return entries[i].Level < entries[j].Level - } - return entries[i].FileNum < entries[j].FileNum - }) - for _, df := range entries { - fmt.Fprintf(&buf, " deleted: L%d %s\n", df.Level, df.FileNum) - } - for _, nf := range v.NewFiles { - fmt.Fprintf(&buf, " added: L%d", nf.Level) - if verbose { - fmt.Fprintf(&buf, " %s", nf.Meta.DebugString(fmtKey, true /* verbose */)) - } else { - fmt.Fprintf(&buf, " %s", nf.Meta.String()) - } - if nf.Meta.CreationTime != 0 { - fmt.Fprintf(&buf, " (%s)", - time.Unix(nf.Meta.CreationTime, 0).UTC().Format(time.RFC3339)) - } - fmt.Fprintln(&buf) - } - return buf.String() -} - -// DebugString is a more verbose version of String(). Use this in tests. -func (v *VersionEdit) DebugString(fmtKey base.FormatKey) string { - return v.string(true /* verbose */, fmtKey) -} - -// String implements fmt.Stringer for a VersionEdit. -func (v *VersionEdit) String() string { - return v.string(false /* verbose */, base.DefaultFormatter) -} - -// Encode encodes an edit to the specified writer. 
-func (v *VersionEdit) Encode(w io.Writer) error { - e := versionEditEncoder{new(bytes.Buffer)} - - if v.ComparerName != "" { - e.writeUvarint(tagComparator) - e.writeString(v.ComparerName) - } - if v.MinUnflushedLogNum != 0 { - e.writeUvarint(tagLogNumber) - e.writeUvarint(uint64(v.MinUnflushedLogNum)) - } - if v.ObsoletePrevLogNum != 0 { - e.writeUvarint(tagPrevLogNumber) - e.writeUvarint(v.ObsoletePrevLogNum) - } - if v.NextFileNum != 0 { - e.writeUvarint(tagNextFileNumber) - e.writeUvarint(uint64(v.NextFileNum)) - } - for _, dfn := range v.RemovedBackingTables { - e.writeUvarint(tagRemovedBackingTable) - e.writeUvarint(uint64(dfn.FileNum())) - } - for _, fileBacking := range v.CreatedBackingTables { - e.writeUvarint(tagCreatedBackingTable) - e.writeUvarint(uint64(fileBacking.DiskFileNum.FileNum())) - e.writeUvarint(fileBacking.Size) - } - // RocksDB requires LastSeqNum to be encoded for the first MANIFEST entry, - // even though its value is zero. We detect this by encoding LastSeqNum when - // ComparerName is set. - if v.LastSeqNum != 0 || v.ComparerName != "" { - e.writeUvarint(tagLastSequence) - e.writeUvarint(v.LastSeqNum) - } - for x := range v.DeletedFiles { - e.writeUvarint(tagDeletedFile) - e.writeUvarint(uint64(x.Level)) - e.writeUvarint(uint64(x.FileNum)) - } - for _, x := range v.NewFiles { - customFields := x.Meta.MarkedForCompaction || x.Meta.CreationTime != 0 || x.Meta.Virtual - var tag uint64 - switch { - case x.Meta.HasRangeKeys: - tag = tagNewFile5 - case customFields: - tag = tagNewFile4 - default: - tag = tagNewFile2 - } - e.writeUvarint(tag) - e.writeUvarint(uint64(x.Level)) - e.writeUvarint(uint64(x.Meta.FileNum)) - e.writeUvarint(x.Meta.Size) - if !x.Meta.HasRangeKeys { - // If we have no range keys, preserve the original format and write the - // smallest and largest point keys. 
- e.writeKey(x.Meta.SmallestPointKey) - e.writeKey(x.Meta.LargestPointKey) - } else { - // When range keys are present, we first write a marker byte that - // indicates if the table also contains point keys, in addition to how the - // overall bounds for the table should be reconstructed. This byte is - // followed by the keys themselves. - b, err := x.Meta.boundsMarker() - if err != nil { - return err - } - if err = e.WriteByte(b); err != nil { - return err - } - // Write point key bounds (if present). - if x.Meta.HasPointKeys { - e.writeKey(x.Meta.SmallestPointKey) - e.writeKey(x.Meta.LargestPointKey) - } - // Write range key bounds. - e.writeKey(x.Meta.SmallestRangeKey) - e.writeKey(x.Meta.LargestRangeKey) - } - e.writeUvarint(x.Meta.SmallestSeqNum) - e.writeUvarint(x.Meta.LargestSeqNum) - if customFields { - if x.Meta.CreationTime != 0 { - e.writeUvarint(customTagCreationTime) - var buf [binary.MaxVarintLen64]byte - n := binary.PutUvarint(buf[:], uint64(x.Meta.CreationTime)) - e.writeBytes(buf[:n]) - } - if x.Meta.MarkedForCompaction { - e.writeUvarint(customTagNeedsCompaction) - e.writeBytes([]byte{1}) - } - if x.Meta.Virtual { - e.writeUvarint(customTagVirtual) - e.writeUvarint(uint64(x.Meta.FileBacking.DiskFileNum.FileNum())) - } - e.writeUvarint(customTagTerminate) - } - } - _, err := w.Write(e.Bytes()) - return err -} - -// versionEditDecoder should be used to decode version edits. 
-type versionEditDecoder struct { - byteReader -} - -func (d versionEditDecoder) readBytes() ([]byte, error) { - n, err := d.readUvarint() - if err != nil { - return nil, err - } - s := make([]byte, n) - _, err = io.ReadFull(d, s) - if err != nil { - if err == io.ErrUnexpectedEOF { - return nil, errCorruptManifest - } - return nil, err - } - return s, nil -} - -func (d versionEditDecoder) readLevel() (int, error) { - u, err := d.readUvarint() - if err != nil { - return 0, err - } - if u >= NumLevels { - return 0, errCorruptManifest - } - return int(u), nil -} - -func (d versionEditDecoder) readFileNum() (base.FileNum, error) { - u, err := d.readUvarint() - if err != nil { - return 0, err - } - return base.FileNum(u), nil -} - -func (d versionEditDecoder) readUvarint() (uint64, error) { - u, err := binary.ReadUvarint(d) - if err != nil { - if err == io.EOF { - return 0, errCorruptManifest - } - return 0, err - } - return u, nil -} - -type versionEditEncoder struct { - *bytes.Buffer -} - -func (e versionEditEncoder) writeBytes(p []byte) { - e.writeUvarint(uint64(len(p))) - e.Write(p) -} - -func (e versionEditEncoder) writeKey(k InternalKey) { - e.writeUvarint(uint64(k.Size())) - e.Write(k.UserKey) - buf := k.EncodeTrailer() - e.Write(buf[:]) -} - -func (e versionEditEncoder) writeString(s string) { - e.writeUvarint(uint64(len(s))) - e.WriteString(s) -} - -func (e versionEditEncoder) writeUvarint(u uint64) { - var buf [binary.MaxVarintLen64]byte - n := binary.PutUvarint(buf[:], u) - e.Write(buf[:n]) -} - -// BulkVersionEdit summarizes the files added and deleted from a set of version -// edits. -// -// INVARIANTS: -// No file can be added to a level more than once. This is true globally, and -// also true for all of the calls to Accumulate for a single bulk version edit. -// -// No file can be removed from a level more than once. This is true globally, -// and also true for all of the calls to Accumulate for a single bulk version -// edit. 
-// -// A file must not be added and removed from a given level in the same version -// edit. -// -// A file that is being removed from a level must have been added to that level -// before (in a prior version edit). Note that a given file can be deleted from -// a level and added to another level in a single version edit -type BulkVersionEdit struct { - Added [NumLevels]map[base.FileNum]*FileMetadata - Deleted [NumLevels]map[base.FileNum]*FileMetadata - - // AddedFileBacking is a map to support lookup so that we can populate the - // FileBacking of virtual sstables during manifest replay. - AddedFileBacking map[base.DiskFileNum]*FileBacking - RemovedFileBacking []base.DiskFileNum - - // AddedByFileNum maps file number to file metadata for all added files - // from accumulated version edits. AddedByFileNum is only populated if set - // to non-nil by a caller. It must be set to non-nil when replaying - // version edits read from a MANIFEST (as opposed to VersionEdits - // constructed in-memory). While replaying a MANIFEST file, - // VersionEdit.DeletedFiles map entries have nil values, because the - // on-disk deletion record encodes only the file number. Accumulate - // uses AddedByFileNum to correctly populate the BulkVersionEdit's Deleted - // field with non-nil *FileMetadata. - AddedByFileNum map[base.FileNum]*FileMetadata - - // MarkedForCompactionCountDiff holds the aggregated count of files - // marked for compaction added or removed. - MarkedForCompactionCountDiff int -} - -// Accumulate adds the file addition and deletions in the specified version -// edit to the bulk edit's internal state. -// -// INVARIANTS: -// If a file is added to a given level in a call to Accumulate and then removed -// from that level in a subsequent call, the file will not be present in the -// resulting BulkVersionEdit.Deleted for that level. 
-// -// After accumulation of version edits, the bulk version edit may have -// information about a file which has been deleted from a level, but it may -// not have information about the same file added to the same level. The add -// could've occurred as part of a previous bulk version edit. In this case, -// the deleted file must be present in BulkVersionEdit.Deleted, at the end -// of the accumulation, because we need to decrease the refcount of the -// deleted file in Apply. -func (b *BulkVersionEdit) Accumulate(ve *VersionEdit) error { - for df, m := range ve.DeletedFiles { - dmap := b.Deleted[df.Level] - if dmap == nil { - dmap = make(map[base.FileNum]*FileMetadata) - b.Deleted[df.Level] = dmap - } - - if m == nil { - // m is nil only when replaying a MANIFEST. - if b.AddedByFileNum == nil { - return errors.Errorf("deleted file L%d.%s's metadata is absent and bve.AddedByFileNum is nil", df.Level, df.FileNum) - } - m = b.AddedByFileNum[df.FileNum] - if m == nil { - return base.CorruptionErrorf("pebble: file deleted L%d.%s before it was inserted", df.Level, df.FileNum) - } - } - if m.MarkedForCompaction { - b.MarkedForCompactionCountDiff-- - } - if _, ok := b.Added[df.Level][df.FileNum]; !ok { - dmap[df.FileNum] = m - } else { - // Present in b.Added for the same level. - delete(b.Added[df.Level], df.FileNum) - } - } - - // Generate state for Added backing files. Note that these must be generated - // before we loop through the NewFiles, because we need to populate the - // FileBackings which might be used by the NewFiles loop. - if b.AddedFileBacking == nil { - b.AddedFileBacking = make(map[base.DiskFileNum]*FileBacking) - } - for _, fb := range ve.CreatedBackingTables { - if _, ok := b.AddedFileBacking[fb.DiskFileNum]; ok { - // There is already a FileBacking associated with fb.DiskFileNum. - // This should never happen. There must always be only one FileBacking - // associated with a backing sstable. 
- panic(fmt.Sprintf("pebble: duplicate file backing %s", fb.DiskFileNum.String())) - } - b.AddedFileBacking[fb.DiskFileNum] = fb - } - - for _, nf := range ve.NewFiles { - // A new file should not have been deleted in this or a preceding - // VersionEdit at the same level (though files can move across levels). - if dmap := b.Deleted[nf.Level]; dmap != nil { - if _, ok := dmap[nf.Meta.FileNum]; ok { - return base.CorruptionErrorf("pebble: file deleted L%d.%s before it was inserted", nf.Level, nf.Meta.FileNum) - } - } - if nf.Meta.Virtual && nf.Meta.FileBacking == nil { - // FileBacking for a virtual sstable must only be nil if we're performing - // manifest replay. - nf.Meta.FileBacking = b.AddedFileBacking[nf.BackingFileNum] - if nf.Meta.FileBacking == nil { - return errors.Errorf("FileBacking for virtual sstable must not be nil") - } - } else if nf.Meta.FileBacking == nil { - return errors.Errorf("Added file L%d.%s's has no FileBacking", nf.Level, nf.Meta.FileNum) - } - - if b.Added[nf.Level] == nil { - b.Added[nf.Level] = make(map[base.FileNum]*FileMetadata) - } - b.Added[nf.Level][nf.Meta.FileNum] = nf.Meta - if b.AddedByFileNum != nil { - b.AddedByFileNum[nf.Meta.FileNum] = nf.Meta - } - if nf.Meta.MarkedForCompaction { - b.MarkedForCompactionCountDiff++ - } - } - - // Since a file can be removed from backing files in exactly one version - // edit it is safe to just append without any de-duplication. - b.RemovedFileBacking = append(b.RemovedFileBacking, ve.RemovedBackingTables...) - - return nil -} - -// AccumulateIncompleteAndApplySingleVE should be called if a single version edit -// is to be applied to the provided curr Version and if the caller needs to -// update the versionSet.zombieTables map. This function exists separately from -// BulkVersionEdit.Apply because it is easier to reason about properties -// regarding BulkVersionedit.Accumulate/Apply and zombie table generation, if we -// know that exactly one version edit is being accumulated. 
-// -// Note that the version edit passed into this function may be incomplete -// because compactions don't have the ref counting information necessary to -// populate VersionEdit.RemovedBackingTables. This function will complete such a -// version edit by populating RemovedBackingTables. -// -// Invariant: Any file being deleted through ve must belong to the curr Version. -// We can't have a delete for some arbitrary file which does not exist in curr. -func AccumulateIncompleteAndApplySingleVE( - ve *VersionEdit, - curr *Version, - cmp Compare, - formatKey base.FormatKey, - flushSplitBytes int64, - readCompactionRate int64, - backingStateMap map[base.DiskFileNum]*FileBacking, - addBackingFunc func(*FileBacking), - removeBackingFunc func(base.DiskFileNum), - orderingInvariants OrderingInvariants, -) (_ *Version, zombies map[base.DiskFileNum]uint64, _ error) { - if len(ve.RemovedBackingTables) != 0 { - panic("pebble: invalid incomplete version edit") - } - var b BulkVersionEdit - err := b.Accumulate(ve) - if err != nil { - return nil, nil, err - } - zombies = make(map[base.DiskFileNum]uint64) - v, err := b.Apply( - curr, cmp, formatKey, flushSplitBytes, readCompactionRate, zombies, orderingInvariants, - ) - if err != nil { - return nil, nil, err - } - - for _, s := range b.AddedFileBacking { - addBackingFunc(s) - } - - for fileNum := range zombies { - if _, ok := backingStateMap[fileNum]; ok { - // This table was backing some virtual sstable in the latest version, - // but is now a zombie. We add RemovedBackingTables entries for - // these, before the version edit is written to disk. - ve.RemovedBackingTables = append( - ve.RemovedBackingTables, fileNum, - ) - removeBackingFunc(fileNum) - } - } - return v, zombies, nil -} - -// Apply applies the delta b to the current version to produce a new -// version. The new version is consistent with respect to the comparer cmp. -// -// curr may be nil, which is equivalent to a pointer to a zero version. 
-// -// On success, if a non-nil zombies map is provided to Apply, the map is updated -// with file numbers and files sizes of deleted files. These files are -// considered zombies because they are no longer referenced by the returned -// Version, but cannot be deleted from disk as they are still in use by the -// incoming Version. -func (b *BulkVersionEdit) Apply( - curr *Version, - cmp Compare, - formatKey base.FormatKey, - flushSplitBytes int64, - readCompactionRate int64, - zombies map[base.DiskFileNum]uint64, - orderingInvariants OrderingInvariants, -) (*Version, error) { - addZombie := func(state *FileBacking) { - if zombies != nil { - zombies[state.DiskFileNum] = state.Size - } - } - removeZombie := func(state *FileBacking) { - if zombies != nil { - delete(zombies, state.DiskFileNum) - } - } - - v := new(Version) - - // Adjust the count of files marked for compaction. - if curr != nil { - v.Stats.MarkedForCompaction = curr.Stats.MarkedForCompaction - } - v.Stats.MarkedForCompaction += b.MarkedForCompactionCountDiff - if v.Stats.MarkedForCompaction < 0 { - return nil, base.CorruptionErrorf("pebble: version marked for compaction count negative") - } - - for level := range v.Levels { - if curr == nil || curr.Levels[level].tree.root == nil { - v.Levels[level] = makeLevelMetadata(cmp, level, nil /* files */) - } else { - v.Levels[level] = curr.Levels[level].clone() - } - if curr == nil || curr.RangeKeyLevels[level].tree.root == nil { - v.RangeKeyLevels[level] = makeLevelMetadata(cmp, level, nil /* files */) - } else { - v.RangeKeyLevels[level] = curr.RangeKeyLevels[level].clone() - } - - if len(b.Added[level]) == 0 && len(b.Deleted[level]) == 0 { - // There are no edits on this level. - if level == 0 { - // Initialize L0Sublevels. 
- if curr == nil || curr.L0Sublevels == nil { - if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil { - return nil, errors.Wrap(err, "pebble: internal error") - } - } else { - v.L0Sublevels = curr.L0Sublevels - v.L0SublevelFiles = v.L0Sublevels.Levels - } - } - continue - } - - // Some edits on this level. - lm := &v.Levels[level] - lmRange := &v.RangeKeyLevels[level] - - addedFilesMap := b.Added[level] - deletedFilesMap := b.Deleted[level] - if n := v.Levels[level].Len() + len(addedFilesMap); n == 0 { - return nil, base.CorruptionErrorf( - "pebble: internal error: No current or added files but have deleted files: %d", - errors.Safe(len(deletedFilesMap))) - } - - // NB: addedFilesMap may be empty. If a file is present in addedFilesMap - // for a level, it won't be present in deletedFilesMap for the same - // level. - - for _, f := range deletedFilesMap { - if obsolete := v.Levels[level].remove(f); obsolete { - // Deleting a file from the B-Tree may decrement its - // reference count. However, because we cloned the - // previous level's B-Tree, this should never result in a - // file's reference count dropping to zero. - err := errors.Errorf("pebble: internal error: file L%d.%s obsolete during B-Tree removal", level, f.FileNum) - return nil, err - } - if f.HasRangeKeys { - if obsolete := v.RangeKeyLevels[level].remove(f); obsolete { - // Deleting a file from the B-Tree may decrement its - // reference count. However, because we cloned the - // previous level's B-Tree, this should never result in a - // file's reference count dropping to zero. - err := errors.Errorf("pebble: internal error: file L%d.%s obsolete during range-key B-Tree removal", level, f.FileNum) - return nil, err - } - } - - // Note that a backing sst will only become a zombie if the - // references to it in the latest version is 0. 
We will remove the - // backing sst from the zombie list in the next loop if one of the - // addedFiles in any of the levels is referencing the backing sst. - // This is possible if a physical sstable is virtualized, or if it - // is moved. - latestRefCount := f.LatestRefs() - if latestRefCount <= 0 { - // If a file is present in deletedFilesMap for a level, then it - // must have already been added to the level previously, which - // means that its latest ref count cannot be 0. - err := errors.Errorf("pebble: internal error: incorrect latestRefs reference counting for file", f.FileNum) - return nil, err - } else if f.LatestUnref() == 0 { - addZombie(f.FileBacking) - } - } - - addedFiles := make([]*FileMetadata, 0, len(addedFilesMap)) - for _, f := range addedFilesMap { - addedFiles = append(addedFiles, f) - } - // Sort addedFiles by file number. This isn't necessary, but tests which - // replay invalid manifests check the error output, and the error output - // depends on the order in which files are added to the btree. - sort.Slice(addedFiles, func(i, j int) bool { - return addedFiles[i].FileNum < addedFiles[j].FileNum - }) - - var sm, la *FileMetadata - for _, f := range addedFiles { - // NB: allowedSeeks is used for read triggered compactions. It is set using - // Options.Experimental.ReadCompactionRate which defaults to 32KB. - var allowedSeeks int64 - if readCompactionRate != 0 { - allowedSeeks = int64(f.Size) / readCompactionRate - } - if allowedSeeks < 100 { - allowedSeeks = 100 - } - f.AllowedSeeks.Store(allowedSeeks) - f.InitAllowedSeeks = allowedSeeks - - err := lm.insert(f) - // We're adding this file to the new version, so increment the - // latest refs count. 
- f.LatestRef() - if err != nil { - return nil, errors.Wrap(err, "pebble") - } - if f.HasRangeKeys { - err = lmRange.insert(f) - if err != nil { - return nil, errors.Wrap(err, "pebble") - } - } - removeZombie(f.FileBacking) - // Track the keys with the smallest and largest keys, so that we can - // check consistency of the modified span. - if sm == nil || base.InternalCompare(cmp, sm.Smallest, f.Smallest) > 0 { - sm = f - } - if la == nil || base.InternalCompare(cmp, la.Largest, f.Largest) < 0 { - la = f - } - } - - if level == 0 { - if curr != nil && curr.L0Sublevels != nil && len(deletedFilesMap) == 0 { - // Flushes and ingestions that do not delete any L0 files do not require - // a regeneration of L0Sublevels from scratch. We can instead generate - // it incrementally. - var err error - // AddL0Files requires addedFiles to be sorted in seqnum order. - SortBySeqNum(addedFiles) - v.L0Sublevels, err = curr.L0Sublevels.AddL0Files(addedFiles, flushSplitBytes, &v.Levels[0]) - if errors.Is(err, errInvalidL0SublevelsOpt) { - err = v.InitL0Sublevels(cmp, formatKey, flushSplitBytes) - } else if invariants.Enabled && err == nil { - copyOfSublevels, err := NewL0Sublevels(&v.Levels[0], cmp, formatKey, flushSplitBytes) - if err != nil { - panic(fmt.Sprintf("error when regenerating sublevels: %s", err)) - } - s1 := describeSublevels(base.DefaultFormatter, false /* verbose */, copyOfSublevels.Levels) - s2 := describeSublevels(base.DefaultFormatter, false /* verbose */, v.L0Sublevels.Levels) - if s1 != s2 { - panic(fmt.Sprintf("incremental L0 sublevel generation produced different output than regeneration: %s != %s", s1, s2)) - } - } - if err != nil { - return nil, errors.Wrap(err, "pebble: internal error") - } - v.L0SublevelFiles = v.L0Sublevels.Levels - } else if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil { - return nil, errors.Wrap(err, "pebble: internal error") - } - if err := CheckOrdering(cmp, formatKey, Level(0), v.Levels[level].Iter(), 
orderingInvariants); err != nil { - return nil, errors.Wrap(err, "pebble: internal error") - } - continue - } - - // Check consistency of the level in the vicinity of our edits. - if sm != nil && la != nil { - overlap := overlaps(v.Levels[level].Iter(), cmp, sm.Smallest.UserKey, - la.Largest.UserKey, la.Largest.IsExclusiveSentinel()) - // overlap contains all of the added files. We want to ensure that - // the added files are consistent with neighboring existing files - // too, so reslice the overlap to pull in a neighbor on each side. - check := overlap.Reslice(func(start, end *LevelIterator) { - if m := start.Prev(); m == nil { - start.Next() - } - if m := end.Next(); m == nil { - end.Prev() - } - }) - if err := CheckOrdering(cmp, formatKey, Level(level), check.Iter(), orderingInvariants); err != nil { - return nil, errors.Wrap(err, "pebble: internal error") - } - } - } - return v, nil -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/manual/manual_32bit.go b/vendor/github.com/cockroachdb/pebble/internal/manual/manual_32bit.go deleted file mode 100644 index 19369fa..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/manual/manual_32bit.go +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -//go:build 386 || amd64p32 || arm || armbe || ppc || sparc -// +build 386 amd64p32 arm armbe ppc sparc - -package manual - -const ( - // MaxArrayLen is a safe maximum length for slices on this architecture. - MaxArrayLen = 1<<31 - 1 -) diff --git a/vendor/github.com/cockroachdb/pebble/internal/manual/manual_64bit.go b/vendor/github.com/cockroachdb/pebble/internal/manual/manual_64bit.go deleted file mode 100644 index 0709dd5..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/manual/manual_64bit.go +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. 
All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -//go:build amd64 || arm64 || arm64be || ppc64 || ppc64le || mips64 || mips64le || s390x || sparc64 || riscv64 || loong64 -// +build amd64 arm64 arm64be ppc64 ppc64le mips64 mips64le s390x sparc64 riscv64 loong64 - -package manual - -const ( - // MaxArrayLen is a safe maximum length for slices on this architecture. - MaxArrayLen = 1<<50 - 1 -) diff --git a/vendor/github.com/cockroachdb/pebble/internal/manual/manual_mips.go b/vendor/github.com/cockroachdb/pebble/internal/manual/manual_mips.go deleted file mode 100644 index 08bb880..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/manual/manual_mips.go +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -//go:build mips || mipsle || mips64p32 || mips64p32le -// +build mips mipsle mips64p32 mips64p32le - -package manual - -const ( - // MaxArrayLen is a safe maximum length for slices on this architecture. - MaxArrayLen = 1 << 30 -) diff --git a/vendor/github.com/cockroachdb/pebble/internal/manual/manual_nocgo.go b/vendor/github.com/cockroachdb/pebble/internal/manual/manual_nocgo.go deleted file mode 100644 index 74befbd..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/manual/manual_nocgo.go +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -//go:build !cgo -// +build !cgo - -package manual - -// Provides versions of New and Free when cgo is not available (e.g. cross -// compilation). - -// New allocates a slice of size n. -func New(n int) []byte { - return make([]byte, n) -} - -// Free frees the specified slice. 
-func Free(b []byte) { -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/private/sstable.go b/vendor/github.com/cockroachdb/pebble/internal/private/sstable.go deleted file mode 100644 index 780dd56..0000000 --- a/vendor/github.com/cockroachdb/pebble/internal/private/sstable.go +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package private - -import "github.com/cockroachdb/pebble/internal/base" - -// SSTableCacheOpts is a hook for specifying cache options to -// sstable.NewReader. -var SSTableCacheOpts func(cacheID uint64, fileNum base.DiskFileNum) interface{} - -// SSTableRawTombstonesOpt is a sstable.Reader option for disabling -// fragmentation of the range tombstones returned by -// sstable.Reader.NewRangeDelIter(). Used by debug tools to get a raw view of -// the tombstones contained in an sstable. -var SSTableRawTombstonesOpt interface{} - -// SSTableWriterDisableKeyOrderChecks is a hook for disabling the key ordering -// invariant check performed by sstable.Writer. It is intended for internal use -// only in the construction of invalid sstables for testing. See -// tool/make_test_sstables.go. -var SSTableWriterDisableKeyOrderChecks func(interface{}) - -// SSTableInternalProperties is a func(*sstable.Writer) *sstable.Properties -// function that allows Pebble-internal code to mutate properties that external -// sstable writers are not permitted to edit. It's an untyped interface{} to -// avoid a cyclic dependency. -var SSTableInternalProperties interface{} diff --git a/vendor/github.com/cockroachdb/pebble/level_iter.go b/vendor/github.com/cockroachdb/pebble/level_iter.go deleted file mode 100644 index ae6b045..0000000 --- a/vendor/github.com/cockroachdb/pebble/level_iter.go +++ /dev/null @@ -1,1242 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package pebble - -import ( - "context" - "fmt" - "runtime/debug" - - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" - "github.com/cockroachdb/pebble/sstable" -) - -// tableNewIters creates a new point and range-del iterator for the given file -// number. -// -// On success, the internalIterator is not-nil and must be closed; the -// FragmentIterator can be nil. -// TODO(radu): always return a non-nil FragmentIterator. -// -// On error, the iterators are nil. -// -// The only (non-test) implementation of tableNewIters is tableCacheContainer.newIters(). -type tableNewIters func( - ctx context.Context, - file *manifest.FileMetadata, - opts *IterOptions, - internalOpts internalIterOpts, -) (internalIterator, keyspan.FragmentIterator, error) - -// tableNewRangeDelIter takes a tableNewIters and returns a TableNewSpanIter -// for the rangedel iterator returned by tableNewIters. -func tableNewRangeDelIter(ctx context.Context, newIters tableNewIters) keyspan.TableNewSpanIter { - return func(file *manifest.FileMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) { - iter, rangeDelIter, err := newIters(ctx, file, nil, internalIterOpts{}) - if iter != nil { - _ = iter.Close() - } - if rangeDelIter == nil { - rangeDelIter = emptyKeyspanIter - } - return rangeDelIter, err - } -} - -type internalIterOpts struct { - bytesIterated *uint64 - bufferPool *sstable.BufferPool - stats *base.InternalIteratorStats - boundLimitedFilter sstable.BoundLimitedBlockPropertyFilter -} - -// levelIter provides a merged view of the sstables in a level. -// -// levelIter is used during compaction and as part of the Iterator -// implementation. 
When used as part of the Iterator implementation, level -// iteration needs to "pause" at sstable boundaries if a range deletion -// tombstone is the source of that boundary. We know if a range tombstone is -// the smallest or largest key in a file because the kind will be -// InternalKeyKindRangeDeletion. If the boundary key is a range deletion -// tombstone, we materialize a fake entry to return from levelIter. This -// prevents mergingIter from advancing past the sstable until the sstable -// contains the smallest (or largest for reverse iteration) key in the merged -// heap. Note that mergingIter treats a range deletion tombstone returned by -// the point iterator as a no-op. -// -// SeekPrefixGE presents the need for a second type of pausing. If an sstable -// iterator returns "not found" for a SeekPrefixGE operation, we don't want to -// advance to the next sstable as the "not found" does not indicate that all of -// the keys in the sstable are less than the search key. Advancing to the next -// sstable would cause us to skip over range tombstones, violating -// correctness. Instead, SeekPrefixGE creates a synthetic boundary key with the -// kind InternalKeyKindRangeDeletion which will be used to pause the levelIter -// at the sstable until the mergingIter is ready to advance past it. -type levelIter struct { - // The context is stored here since (a) iterators are expected to be - // short-lived (since they pin sstables), (b) plumbing a context into every - // method is very painful, (c) they do not (yet) respect context - // cancellation and are only used for tracing. - ctx context.Context - logger Logger - comparer *Comparer - cmp Compare - split Split - // The lower/upper bounds for iteration as specified at creation or the most - // recent call to SetBounds. - lower []byte - upper []byte - // The iterator options for the currently open table. 
If - // tableOpts.{Lower,Upper}Bound are nil, the corresponding iteration boundary - // does not lie within the table bounds. - tableOpts IterOptions - // The LSM level this levelIter is initialized for. - level manifest.Level - // The keys to return when iterating past an sstable boundary and that - // boundary is a range deletion tombstone. The boundary could be smallest - // (i.e. arrived at with Prev), or largest (arrived at with Next). - smallestBoundary *InternalKey - largestBoundary *InternalKey - // combinedIterState may be set when a levelIter is used during user - // iteration. Although levelIter only iterates over point keys, it's also - // responsible for lazily constructing the combined range & point iterator - // when it observes a file containing range keys. If the combined iter - // state's initialized field is true, the iterator is already using combined - // iterator, OR the iterator is not configured to use combined iteration. If - // it's false, the levelIter must set the `triggered` and `key` fields when - // the levelIter passes over a file containing range keys. See the - // lazyCombinedIter for more details. - combinedIterState *combinedIterState - // A synthetic boundary key to return when SeekPrefixGE finds an sstable - // which doesn't contain the search key, but which does contain range - // tombstones. - syntheticBoundary InternalKey - // The iter for the current file. It is nil under any of the following conditions: - // - files.Current() == nil - // - err != nil - // - some other constraint, like the bounds in opts, caused the file at index to not - // be relevant to the iteration. - iter internalIterator - // iterFile holds the current file. It is always equal to l.files.Current(). - iterFile *fileMetadata - // filteredIter is an optional interface that may be implemented by internal - // iterators that perform filtering of keys. When a new file's iterator is - // opened, it's tested to see if it implements filteredIter. 
If it does, - // it's stored here to allow the level iterator to recognize when keys were - // omitted from iteration results due to filtering. This is important when a - // file contains range deletions that may delete keys from other files. The - // levelIter must not advance to the next file until the mergingIter has - // advanced beyond the file's bounds. See - // levelIterBoundaryContext.isIgnorableBoundaryKey. - filteredIter filteredIter - newIters tableNewIters - // When rangeDelIterPtr != nil, the caller requires that *rangeDelIterPtr must - // point to a range del iterator corresponding to the current file. When this - // iterator returns nil, *rangeDelIterPtr should also be set to nil. Whenever - // a non-nil internalIterator is placed in rangeDelIterPtr, a copy is placed - // in rangeDelIterCopy. This is done for the following special case: - // when this iterator returns nil because of exceeding the bounds, we don't - // close iter and *rangeDelIterPtr since we could reuse it in the next seek. But - // we need to set *rangeDelIterPtr to nil because of the aforementioned contract. - // This copy is used to revive the *rangeDelIterPtr in the case of reuse. - rangeDelIterPtr *keyspan.FragmentIterator - rangeDelIterCopy keyspan.FragmentIterator - files manifest.LevelIterator - err error - - // Pointer into this level's entry in `mergingIterLevel::levelIterBoundaryContext`. - // We populate it with the corresponding bounds for the currently opened file. It is used for - // two purposes (described for forward iteration. The explanation for backward iteration is - // similar.) - // - To limit the optimization that seeks lower-level iterators past keys shadowed by a range - // tombstone. Limiting this seek to the file largestUserKey is necessary since - // range tombstones are stored untruncated, while they only apply to keys within their - // containing file's boundaries. For a detailed example, see comment above `mergingIter`. 
- // - To constrain the tombstone to act-within the bounds of the sstable when checking - // containment. For forward iteration we need the smallestUserKey. - // - // An example is sstable bounds [c#8, g#12] containing a tombstone [b, i)#7. - // - When doing a SeekGE to user key X, the levelIter is at this sstable because X is either within - // the sstable bounds or earlier than the start of the sstable (and there is no sstable in - // between at this level). If X >= smallestUserKey, and the tombstone [b, i) contains X, - // it is correct to SeekGE the sstables at lower levels to min(g, i) (i.e., min of - // largestUserKey, tombstone.End) since any user key preceding min(g, i) must be covered by this - // tombstone (since it cannot have a version younger than this tombstone as it is at a lower - // level). And even if X = smallestUserKey or equal to the start user key of the tombstone, - // if the above conditions are satisfied we know that the internal keys corresponding to X at - // lower levels must have a version smaller than that in this file (again because of the level - // argument). So we don't need to use sequence numbers for this comparison. - // - When checking whether this tombstone deletes internal key X we know that the levelIter is at this - // sstable so (repeating the above) X.UserKey is either within the sstable bounds or earlier than the - // start of the sstable (and there is no sstable in between at this level). - // - X is at at a lower level. If X.UserKey >= smallestUserKey, and the tombstone contains - // X.UserKey, we know X is deleted. This argument also works when X is a user key (we use - // it when seeking to test whether a user key is deleted). - // - X is at the same level. X must be within the sstable bounds of the tombstone so the - // X.UserKey >= smallestUserKey comparison is trivially true. 
In addition to the tombstone containing - // X we need to compare the sequence number of X and the tombstone (we don't need to look - // at how this tombstone is truncated to act-within the file bounds, which are InternalKeys, - // since X and the tombstone are from the same file). - // - // Iterating backwards has one more complication when checking whether a tombstone deletes - // internal key X at a lower level (the construction we do here also works for a user key X). - // Consider sstable bounds [c#8, g#InternalRangeDelSentinel] containing a tombstone [b, i)#7. - // If we are positioned at key g#10 at a lower sstable, the tombstone we will see is [b, i)#7, - // since the higher sstable is positioned at a key <= g#10. We should not use this tombstone - // to delete g#10. This requires knowing that the largestUserKey is a range delete sentinel, - // which we set in a separate bool below. - // - // These fields differs from the `*Boundary` fields in a few ways: - // - `*Boundary` is only populated when the iterator is positioned exactly on the sentinel key. - // - `*Boundary` can hold either the lower- or upper-bound, depending on the iterator direction. - // - `*Boundary` is not exposed to the next higher-level iterator, i.e., `mergingIter`. - boundaryContext *levelIterBoundaryContext - - // internalOpts holds the internal iterator options to pass to the table - // cache when constructing new table iterators. - internalOpts internalIterOpts - - // Scratch space for the obsolete keys filter, when there are no other block - // property filters specified. See the performance note where - // IterOptions.PointKeyFilters is declared. - filtersBuf [1]BlockPropertyFilter - - // Disable invariant checks even if they are otherwise enabled. Used by tests - // which construct "impossible" situations (e.g. seeking to a key before the - // lower bound). 
- disableInvariants bool -} - -// filteredIter is an additional interface implemented by iterators that may -// skip over point keys during iteration. The sstable.Iterator implements this -// interface. -type filteredIter interface { - // MaybeFilteredKeys may be called when an iterator is exhausted, indicating - // whether or not the iterator's last positioning method may have skipped - // any keys due to low-level filters. - // - // When an iterator is configured to use block-property filters, the - // low-level iterator may skip over blocks or whole sstables of keys. - // Implementations that implement skipping must implement this interface. - // Higher-level iterators require it to preserve invariants (eg, a levelIter - // used in a mergingIter must keep the file's range-del iterator open until - // the mergingIter has moved past the file's bounds, even if all of the - // file's point keys were filtered). - // - // MaybeFilteredKeys may always return false positives, that is it may - // return true when no keys were filtered. It should only be called when the - // iterator is exhausted. It must never return false negatives when the - // iterator is exhausted. - MaybeFilteredKeys() bool -} - -// levelIter implements the base.InternalIterator interface. -var _ base.InternalIterator = (*levelIter)(nil) - -// newLevelIter returns a levelIter. It is permissible to pass a nil split -// parameter if the caller is never going to call SeekPrefixGE. 
-func newLevelIter( - opts IterOptions, - comparer *Comparer, - newIters tableNewIters, - files manifest.LevelIterator, - level manifest.Level, - internalOpts internalIterOpts, -) *levelIter { - l := &levelIter{} - l.init(context.Background(), opts, comparer, newIters, files, level, - internalOpts) - return l -} - -func (l *levelIter) init( - ctx context.Context, - opts IterOptions, - comparer *Comparer, - newIters tableNewIters, - files manifest.LevelIterator, - level manifest.Level, - internalOpts internalIterOpts, -) { - l.ctx = ctx - l.err = nil - l.level = level - l.logger = opts.getLogger() - l.lower = opts.LowerBound - l.upper = opts.UpperBound - l.tableOpts.TableFilter = opts.TableFilter - l.tableOpts.PointKeyFilters = opts.PointKeyFilters - if len(opts.PointKeyFilters) == 0 { - l.tableOpts.PointKeyFilters = l.filtersBuf[:0:1] - } - l.tableOpts.UseL6Filters = opts.UseL6Filters - l.tableOpts.level = l.level - l.tableOpts.snapshotForHideObsoletePoints = opts.snapshotForHideObsoletePoints - l.comparer = comparer - l.cmp = comparer.Compare - l.split = comparer.Split - l.iterFile = nil - l.newIters = newIters - l.files = files - l.internalOpts = internalOpts -} - -func (l *levelIter) initRangeDel(rangeDelIter *keyspan.FragmentIterator) { - l.rangeDelIterPtr = rangeDelIter -} - -func (l *levelIter) initBoundaryContext(context *levelIterBoundaryContext) { - l.boundaryContext = context -} - -func (l *levelIter) initCombinedIterState(state *combinedIterState) { - l.combinedIterState = state -} - -func (l *levelIter) maybeTriggerCombinedIteration(file *fileMetadata, dir int) { - // If we encounter a file that contains range keys, we may need to - // trigger a switch to combined range-key and point-key iteration, - // if the *pebble.Iterator is configured for it. This switch is done - // lazily because range keys are intended to be rare, and - // constructing the range-key iterator substantially adds to the - // cost of iterator construction and seeking. 
- // - // If l.combinedIterState.initialized is already true, either the - // iterator is already using combined iteration or the iterator is not - // configured to observe range keys. Either way, there's nothing to do. - // If false, trigger the switch to combined iteration, using the the - // file's bounds to seek the range-key iterator appropriately. - // - // We only need to trigger combined iteration if the file contains - // RangeKeySets: if there are only Unsets and Dels, the user will observe no - // range keys regardless. If this file has table stats available, they'll - // tell us whether the file has any RangeKeySets. Otherwise, we must - // fallback to assuming it does if HasRangeKeys=true. - if file != nil && file.HasRangeKeys && l.combinedIterState != nil && !l.combinedIterState.initialized && - (l.upper == nil || l.cmp(file.SmallestRangeKey.UserKey, l.upper) < 0) && - (l.lower == nil || l.cmp(file.LargestRangeKey.UserKey, l.lower) > 0) && - (!file.StatsValid() || file.Stats.NumRangeKeySets > 0) { - // The file contains range keys, and we're not using combined iteration yet. - // Trigger a switch to combined iteration. It's possible that a switch has - // already been triggered if multiple levels encounter files containing - // range keys while executing a single mergingIter operation. In this case, - // we need to compare the existing key recorded to l.combinedIterState.key, - // adjusting it if our key is smaller (forward iteration) or larger - // (backward iteration) than the existing key. - // - // These key comparisons are only required during a single high-level - // iterator operation. When the high-level iter op completes, - // iinitialized will be true, and future calls to this function will be - // no-ops. 
- switch dir { - case +1: - if !l.combinedIterState.triggered { - l.combinedIterState.triggered = true - l.combinedIterState.key = file.SmallestRangeKey.UserKey - } else if l.cmp(l.combinedIterState.key, file.SmallestRangeKey.UserKey) > 0 { - l.combinedIterState.key = file.SmallestRangeKey.UserKey - } - case -1: - if !l.combinedIterState.triggered { - l.combinedIterState.triggered = true - l.combinedIterState.key = file.LargestRangeKey.UserKey - } else if l.cmp(l.combinedIterState.key, file.LargestRangeKey.UserKey) < 0 { - l.combinedIterState.key = file.LargestRangeKey.UserKey - } - } - } -} - -func (l *levelIter) findFileGE(key []byte, flags base.SeekGEFlags) *fileMetadata { - // Find the earliest file whose largest key is >= key. - - // NB: if flags.TrySeekUsingNext()=true, the levelIter must respect it. If - // the levelIter is positioned at the key P, it must return a key ≥ P. If - // used within a merging iterator, the merging iterator will depend on the - // levelIter only moving forward to maintain heap invariants. - - // Ordinarily we seek the LevelIterator using SeekGE. In some instances, we - // Next instead. In other instances, we try Next-ing first, falling back to - // seek: - // a) flags.TrySeekUsingNext(): The top-level Iterator knows we're seeking - // to a key later than the current iterator position. We don't know how - // much later the seek key is, so it's possible there are many sstables - // between the current position and the seek key. However in most real- - // world use cases, the seek key is likely to be nearby. Rather than - // performing a log(N) seek through the file metadata, we next a few - // times from from our existing location. If we don't find a file whose - // largest is >= key within a few nexts, we fall back to seeking. - // - // Note that in this case, the file returned by findFileGE may be - // different than the file returned by a raw binary search (eg, when - // TrySeekUsingNext=false). 
This is possible because the most recent - // positioning operation may have already determined that previous - // files' keys that are ≥ key are all deleted. This information is - // encoded within the iterator's current iterator position and is - // unavailable to a fresh binary search. - // - // b) flags.RelativeSeek(): The merging iterator decided to re-seek this - // level according to a range tombstone. When lazy combined iteration - // is enabled, the level iterator is responsible for watching for - // files containing range keys and triggering the switch to combined - // iteration when such a file is observed. If a range deletion was - // observed in a higher level causing the merging iterator to seek the - // level to the range deletion's end key, we need to check whether all - // of the files between the old position and the new position contain - // any range keys. - // - // In this scenario, we don't seek the LevelIterator and instead we - // Next it, one file at a time, checking each for range keys. The - // merging iterator sets this flag to inform us that we're moving - // forward relative to the existing position and that we must examine - // each intermediate sstable's metadata for lazy-combined iteration. - // In this case, we only Next and never Seek. We set nextsUntilSeek=-1 - // to signal this intention. - // - // NB: At most one of flags.RelativeSeek() and flags.TrySeekUsingNext() may - // be set, because the merging iterator re-seeks relative seeks with - // explicitly only the RelativeSeek flag set. 
- var nextsUntilSeek int - var nextInsteadOfSeek bool - if flags.TrySeekUsingNext() { - nextInsteadOfSeek = true - nextsUntilSeek = 4 // arbitrary - } - if flags.RelativeSeek() && l.combinedIterState != nil && !l.combinedIterState.initialized { - nextInsteadOfSeek = true - nextsUntilSeek = -1 - } - - var m *fileMetadata - if nextInsteadOfSeek { - m = l.iterFile - } else { - m = l.files.SeekGE(l.cmp, key) - } - // The below loop has a bit of an unusual organization. There are several - // conditions under which we need to Next to a later file. If none of those - // conditions are met, the file in `m` is okay to return. The loop body is - // structured with a series of if statements, each of which may continue the - // loop to the next file. If none of the statements are met, the end of the - // loop body is a break. - for m != nil { - if m.HasRangeKeys { - l.maybeTriggerCombinedIteration(m, +1) - - // Some files may only contain range keys, which we can skip. - // NB: HasPointKeys=true if the file contains any points or range - // deletions (which delete points). - if !m.HasPointKeys { - m = l.files.Next() - continue - } - } - - // This file has point keys. - // - // However, there are a couple reasons why `m` may not be positioned ≥ - // `key` yet: - // - // 1. If SeekGE(key) landed on a file containing range keys, the file - // may contain range keys ≥ `key` but no point keys ≥ `key`. - // 2. When nexting instead of seeking, we must check to see whether - // we've nexted sufficiently far, or we need to next again. - // - // If the file does not contain point keys ≥ `key`, next to continue - // looking for a file that does. - if (m.HasRangeKeys || nextInsteadOfSeek) && l.cmp(m.LargestPointKey.UserKey, key) < 0 { - // If nextInsteadOfSeek is set and nextsUntilSeek is non-negative, - // the iterator has been nexting hoping to discover the relevant - // file without seeking. It's exhausted the allotted nextsUntilSeek - // and should seek to the sought key. 
- if nextInsteadOfSeek && nextsUntilSeek == 0 { - nextInsteadOfSeek = false - m = l.files.SeekGE(l.cmp, key) - continue - } else if nextsUntilSeek > 0 { - nextsUntilSeek-- - } - m = l.files.Next() - continue - } - - // This file has a point key bound ≥ `key`. But the largest point key - // bound may still be a range deletion sentinel, which is exclusive. In - // this case, the file doesn't actually contain any point keys equal to - // `key`. We next to keep searching for a file that actually contains - // point keys ≥ key. - // - // Additionally, this prevents loading untruncated range deletions from - // a table which can't possibly contain the target key and is required - // for correctness by mergingIter.SeekGE (see the comment in that - // function). - if m.LargestPointKey.IsExclusiveSentinel() && l.cmp(m.LargestPointKey.UserKey, key) == 0 { - m = l.files.Next() - continue - } - - // This file contains point keys ≥ `key`. Break and return it. - break - } - return m -} - -func (l *levelIter) findFileLT(key []byte, flags base.SeekLTFlags) *fileMetadata { - // Find the last file whose smallest key is < ikey. - - // Ordinarily we seek the LevelIterator using SeekLT. - // - // When lazy combined iteration is enabled, there's a complication. The - // level iterator is responsible for watching for files containing range - // keys and triggering the switch to combined iteration when such a file is - // observed. If a range deletion was observed in a higher level causing the - // merging iterator to seek the level to the range deletion's start key, we - // need to check whether all of the files between the old position and the - // new position contain any range keys. - // - // In this scenario, we don't seek the LevelIterator and instead we Prev it, - // one file at a time, checking each for range keys. 
- prevInsteadOfSeek := flags.RelativeSeek() && l.combinedIterState != nil && !l.combinedIterState.initialized - - var m *fileMetadata - if prevInsteadOfSeek { - m = l.iterFile - } else { - m = l.files.SeekLT(l.cmp, key) - } - // The below loop has a bit of an unusual organization. There are several - // conditions under which we need to Prev to a previous file. If none of - // those conditions are met, the file in `m` is okay to return. The loop - // body is structured with a series of if statements, each of which may - // continue the loop to the previous file. If none of the statements are - // met, the end of the loop body is a break. - for m != nil { - if m.HasRangeKeys { - l.maybeTriggerCombinedIteration(m, -1) - - // Some files may only contain range keys, which we can skip. - // NB: HasPointKeys=true if the file contains any points or range - // deletions (which delete points). - if !m.HasPointKeys { - m = l.files.Prev() - continue - } - } - - // This file has point keys. - // - // However, there are a couple reasons why `m` may not be positioned < - // `key` yet: - // - // 1. If SeekLT(key) landed on a file containing range keys, the file - // may contain range keys < `key` but no point keys < `key`. - // 2. When preving instead of seeking, we must check to see whether - // we've preved sufficiently far, or we need to prev again. - // - // If the file does not contain point keys < `key`, prev to continue - // looking for a file that does. - if (m.HasRangeKeys || prevInsteadOfSeek) && l.cmp(m.SmallestPointKey.UserKey, key) >= 0 { - m = l.files.Prev() - continue - } - - // This file contains point keys < `key`. Break and return it. - break - } - return m -} - -// Init the iteration bounds for the current table. Returns -1 if the table -// lies fully before the lower bound, +1 if the table lies fully after the -// upper bound, and 0 if the table overlaps the iteration bounds. 
-func (l *levelIter) initTableBounds(f *fileMetadata) int { - l.tableOpts.LowerBound = l.lower - if l.tableOpts.LowerBound != nil { - if l.cmp(f.LargestPointKey.UserKey, l.tableOpts.LowerBound) < 0 { - // The largest key in the sstable is smaller than the lower bound. - return -1 - } - if l.cmp(l.tableOpts.LowerBound, f.SmallestPointKey.UserKey) <= 0 { - // The lower bound is smaller or equal to the smallest key in the - // table. Iteration within the table does not need to check the lower - // bound. - l.tableOpts.LowerBound = nil - } - } - l.tableOpts.UpperBound = l.upper - if l.tableOpts.UpperBound != nil { - if l.cmp(f.SmallestPointKey.UserKey, l.tableOpts.UpperBound) >= 0 { - // The smallest key in the sstable is greater than or equal to the upper - // bound. - return 1 - } - if l.cmp(l.tableOpts.UpperBound, f.LargestPointKey.UserKey) > 0 { - // The upper bound is greater than the largest key in the - // table. Iteration within the table does not need to check the upper - // bound. NB: tableOpts.UpperBound is exclusive and f.LargestPointKey is - // inclusive. - l.tableOpts.UpperBound = nil - } - } - return 0 -} - -type loadFileReturnIndicator int8 - -const ( - noFileLoaded loadFileReturnIndicator = iota - fileAlreadyLoaded - newFileLoaded -) - -func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicator { - l.smallestBoundary = nil - l.largestBoundary = nil - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } - if l.iterFile == file { - if l.err != nil { - return noFileLoaded - } - if l.iter != nil { - // We don't bother comparing the file bounds with the iteration bounds when we have - // an already open iterator. It is possible that the iter may not be relevant given the - // current iteration bounds, but it knows those bounds, so it will enforce them. 
- if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = l.rangeDelIterCopy - } - - // There are a few reasons we might not have triggered combined - // iteration yet, even though we already had `file` open. - // 1. If the bounds changed, we might have previously avoided - // switching to combined iteration because the bounds excluded - // the range keys contained in this file. - // 2. If an existing iterator was reconfigured to iterate over range - // keys (eg, using SetOptions), then we wouldn't have triggered - // the switch to combined iteration yet. - l.maybeTriggerCombinedIteration(file, dir) - return fileAlreadyLoaded - } - // We were already at file, but don't have an iterator, probably because the file was - // beyond the iteration bounds. It may still be, but it is also possible that the bounds - // have changed. We handle that below. - } - - // Close both iter and rangeDelIterPtr. While mergingIter knows about - // rangeDelIterPtr, it can't call Close() on it because it does not know - // when the levelIter will switch it. Note that levelIter.Close() can be - // called multiple times. - if err := l.Close(); err != nil { - return noFileLoaded - } - - for { - l.iterFile = file - if file == nil { - return noFileLoaded - } - - l.maybeTriggerCombinedIteration(file, dir) - if !file.HasPointKeys { - switch dir { - case +1: - file = l.files.Next() - continue - case -1: - file = l.files.Prev() - continue - } - } - - switch l.initTableBounds(file) { - case -1: - // The largest key in the sstable is smaller than the lower bound. - if dir < 0 { - return noFileLoaded - } - file = l.files.Next() - continue - case +1: - // The smallest key in the sstable is greater than or equal to the upper - // bound. 
- if dir > 0 { - return noFileLoaded - } - file = l.files.Prev() - continue - } - - var rangeDelIter keyspan.FragmentIterator - var iter internalIterator - iter, rangeDelIter, l.err = l.newIters(l.ctx, l.iterFile, &l.tableOpts, l.internalOpts) - l.iter = iter - if l.err != nil { - return noFileLoaded - } - if rangeDelIter != nil { - if fi, ok := iter.(filteredIter); ok { - l.filteredIter = fi - } else { - l.filteredIter = nil - } - } else { - l.filteredIter = nil - } - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = rangeDelIter - l.rangeDelIterCopy = rangeDelIter - } else if rangeDelIter != nil { - rangeDelIter.Close() - } - if l.boundaryContext != nil { - l.boundaryContext.smallestUserKey = file.Smallest.UserKey - l.boundaryContext.largestUserKey = file.Largest.UserKey - l.boundaryContext.isLargestUserKeyExclusive = file.Largest.IsExclusiveSentinel() - } - return newFileLoaded - } -} - -// In race builds we verify that the keys returned by levelIter lie within -// [lower,upper). -func (l *levelIter) verify(key *InternalKey, val base.LazyValue) (*InternalKey, base.LazyValue) { - // Note that invariants.Enabled is a compile time constant, which means the - // block of code will be compiled out of normal builds making this method - // eligible for inlining. Do not change this to use a variable. - if invariants.Enabled && !l.disableInvariants && key != nil { - // We allow returning a boundary key that is outside of the lower/upper - // bounds as such keys are always range tombstones which will be skipped by - // the Iterator. 
- if l.lower != nil && key != l.smallestBoundary && l.cmp(key.UserKey, l.lower) < 0 { - l.logger.Fatalf("levelIter %s: lower bound violation: %s < %s\n%s", l.level, key, l.lower, debug.Stack()) - } - if l.upper != nil && key != l.largestBoundary && l.cmp(key.UserKey, l.upper) > 0 { - l.logger.Fatalf("levelIter %s: upper bound violation: %s > %s\n%s", l.level, key, l.upper, debug.Stack()) - } - } - return key, val -} - -func (l *levelIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { - l.err = nil // clear cached iteration error - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } - // NB: the top-level Iterator has already adjusted key based on - // IterOptions.LowerBound. - loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1) - if loadFileIndicator == noFileLoaded { - return nil, base.LazyValue{} - } - if loadFileIndicator == newFileLoaded { - // File changed, so l.iter has changed, and that iterator is not - // positioned appropriately. - flags = flags.DisableTrySeekUsingNext() - } - if ikey, val := l.iter.SeekGE(key, flags); ikey != nil { - return l.verify(ikey, val) - } - return l.verify(l.skipEmptyFileForward()) -} - -func (l *levelIter) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - l.err = nil // clear cached iteration error - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } - - // NB: the top-level Iterator has already adjusted key based on - // IterOptions.LowerBound. - loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1) - if loadFileIndicator == noFileLoaded { - return nil, base.LazyValue{} - } - if loadFileIndicator == newFileLoaded { - // File changed, so l.iter has changed, and that iterator is not - // positioned appropriately. 
- flags = flags.DisableTrySeekUsingNext() - } - if key, val := l.iter.SeekPrefixGE(prefix, key, flags); key != nil { - return l.verify(key, val) - } - // When SeekPrefixGE returns nil, we have not necessarily reached the end of - // the sstable. All we know is that a key with prefix does not exist in the - // current sstable. We do know that the key lies within the bounds of the - // table as findFileGE found the table where key <= meta.Largest. We return - // the table's bound with isIgnorableBoundaryKey set. - if l.rangeDelIterPtr != nil && *l.rangeDelIterPtr != nil { - if l.tableOpts.UpperBound != nil { - l.syntheticBoundary.UserKey = l.tableOpts.UpperBound - l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel - l.largestBoundary = &l.syntheticBoundary - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = true - l.boundaryContext.isIgnorableBoundaryKey = false - } - return l.verify(l.largestBoundary, base.LazyValue{}) - } - // Return the file's largest bound, ensuring this file stays open until - // the mergingIter advances beyond the file's bounds. We set - // isIgnorableBoundaryKey to signal that the actual key returned should - // be ignored, and does not represent a real key in the database. - l.largestBoundary = &l.iterFile.LargestPointKey - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = true - } - return l.verify(l.largestBoundary, base.LazyValue{}) - } - // It is possible that we are here because bloom filter matching failed. In - // that case it is likely that all keys matching the prefix are wholly - // within the current file and cannot be in the subsequent file. In that - // case we don't want to go to the next file, since loading and seeking in - // there has some cost. 
Additionally, for sparse key spaces, loading the - // next file will defeat the optimization for the next SeekPrefixGE that is - // called with flags.TrySeekUsingNext(), since for sparse key spaces it is - // likely that the next key will also be contained in the current file. - var n int - if l.split != nil { - // If the split function is specified, calculate the prefix length accordingly. - n = l.split(l.iterFile.LargestPointKey.UserKey) - } else { - // If the split function is not specified, the entire key is used as the - // prefix. This case can occur when getIter uses SeekPrefixGE. - n = len(l.iterFile.LargestPointKey.UserKey) - } - if l.cmp(prefix, l.iterFile.LargestPointKey.UserKey[:n]) < 0 { - return nil, base.LazyValue{} - } - return l.verify(l.skipEmptyFileForward()) -} - -func (l *levelIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { - l.err = nil // clear cached iteration error - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } - - // NB: the top-level Iterator has already adjusted key based on - // IterOptions.UpperBound. - if l.loadFile(l.findFileLT(key, flags), -1) == noFileLoaded { - return nil, base.LazyValue{} - } - if key, val := l.iter.SeekLT(key, flags); key != nil { - return l.verify(key, val) - } - return l.verify(l.skipEmptyFileBackward()) -} - -func (l *levelIter) First() (*InternalKey, base.LazyValue) { - l.err = nil // clear cached iteration error - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } - - // NB: the top-level Iterator will call SeekGE if IterOptions.LowerBound is - // set. 
- if l.loadFile(l.files.First(), +1) == noFileLoaded { - return nil, base.LazyValue{} - } - if key, val := l.iter.First(); key != nil { - return l.verify(key, val) - } - return l.verify(l.skipEmptyFileForward()) -} - -func (l *levelIter) Last() (*InternalKey, base.LazyValue) { - l.err = nil // clear cached iteration error - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } - - // NB: the top-level Iterator will call SeekLT if IterOptions.UpperBound is - // set. - if l.loadFile(l.files.Last(), -1) == noFileLoaded { - return nil, base.LazyValue{} - } - if key, val := l.iter.Last(); key != nil { - return l.verify(key, val) - } - return l.verify(l.skipEmptyFileBackward()) -} - -func (l *levelIter) Next() (*InternalKey, base.LazyValue) { - if l.err != nil || l.iter == nil { - return nil, base.LazyValue{} - } - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } - - switch { - case l.largestBoundary != nil: - if l.tableOpts.UpperBound != nil { - // The UpperBound was within this file, so don't load the next - // file. We leave the largestBoundary unchanged so that subsequent - // calls to Next() stay at this file. If a Seek/First/Last call is - // made and this file continues to be relevant, loadFile() will - // set the largestBoundary to nil. - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = nil - } - return nil, base.LazyValue{} - } - // We're stepping past the boundary key, so now we can load the next file. - if l.loadFile(l.files.Next(), +1) != noFileLoaded { - if key, val := l.iter.First(); key != nil { - return l.verify(key, val) - } - return l.verify(l.skipEmptyFileForward()) - } - return nil, base.LazyValue{} - - default: - // Reset the smallest boundary since we're moving away from it. 
- l.smallestBoundary = nil - if key, val := l.iter.Next(); key != nil { - return l.verify(key, val) - } - } - return l.verify(l.skipEmptyFileForward()) -} - -func (l *levelIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { - if l.err != nil || l.iter == nil { - return nil, base.LazyValue{} - } - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } - - switch { - case l.largestBoundary != nil: - if l.tableOpts.UpperBound != nil { - // The UpperBound was within this file, so don't load the next - // file. We leave the largestBoundary unchanged so that subsequent - // calls to Next() stay at this file. If a Seek/First/Last call is - // made and this file continues to be relevant, loadFile() will - // set the largestBoundary to nil. - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = nil - } - return nil, base.LazyValue{} - } - // We're stepping past the boundary key, so we need to load a later - // file. - - default: - // Reset the smallest boundary since we're moving away from it. - l.smallestBoundary = nil - - if key, val := l.iter.NextPrefix(succKey); key != nil { - return l.verify(key, val) - } - // Fall through to seeking. - } - - // Seek the manifest level iterator using TrySeekUsingNext=true and - // RelativeSeek=true so that we take advantage of the knowledge that - // `succKey` can only be contained in later files. - metadataSeekFlags := base.SeekGEFlagsNone.EnableTrySeekUsingNext().EnableRelativeSeek() - if l.loadFile(l.findFileGE(succKey, metadataSeekFlags), +1) != noFileLoaded { - // NB: The SeekGE on the file's iterator must not set TrySeekUsingNext, - // because l.iter is unpositioned. 
- if key, val := l.iter.SeekGE(succKey, base.SeekGEFlagsNone); key != nil { - return l.verify(key, val) - } - return l.verify(l.skipEmptyFileForward()) - } - return nil, base.LazyValue{} -} - -func (l *levelIter) Prev() (*InternalKey, base.LazyValue) { - if l.err != nil || l.iter == nil { - return nil, base.LazyValue{} - } - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = false - l.boundaryContext.isIgnorableBoundaryKey = false - } - - switch { - case l.smallestBoundary != nil: - if l.tableOpts.LowerBound != nil { - // The LowerBound was within this file, so don't load the previous - // file. We leave the smallestBoundary unchanged so that - // subsequent calls to Prev() stay at this file. If a - // Seek/First/Last call is made and this file continues to be - // relevant, loadFile() will set the smallestBoundary to nil. - if l.rangeDelIterPtr != nil { - *l.rangeDelIterPtr = nil - } - return nil, base.LazyValue{} - } - // We're stepping past the boundary key, so now we can load the prev file. - if l.loadFile(l.files.Prev(), -1) != noFileLoaded { - if key, val := l.iter.Last(); key != nil { - return l.verify(key, val) - } - return l.verify(l.skipEmptyFileBackward()) - } - return nil, base.LazyValue{} - - default: - // Reset the largest boundary since we're moving away from it. - l.largestBoundary = nil - if key, val := l.iter.Prev(); key != nil { - return l.verify(key, val) - } - } - return l.verify(l.skipEmptyFileBackward()) -} - -func (l *levelIter) skipEmptyFileForward() (*InternalKey, base.LazyValue) { - var key *InternalKey - var val base.LazyValue - // The first iteration of this loop starts with an already exhausted - // l.iter. The reason for the exhaustion is either that we iterated to the - // end of the sstable, or our iteration was terminated early due to the - // presence of an upper-bound or the use of SeekPrefixGE. 
If - // l.rangeDelIterPtr is non-nil, we may need to pretend the iterator is - // not exhausted to allow for the merging to finish consuming the - // l.rangeDelIterPtr before levelIter switches the rangeDelIter from - // under it. This pretense is done by either generating a synthetic - // boundary key or returning the largest key of the file, depending on the - // exhaustion reason. - - // Subsequent iterations will examine consecutive files such that the first - // file that does not have an exhausted iterator causes the code to return - // that key, else the behavior described above if there is a corresponding - // rangeDelIterPtr. - for ; key == nil; key, val = l.iter.First() { - if l.rangeDelIterPtr != nil { - // We're being used as part of a mergingIter and we've exhausted the - // current sstable. If an upper bound is present and the upper bound lies - // within the current sstable, then we will have reached the upper bound - // rather than the end of the sstable. We need to return a synthetic - // boundary key so that mergingIter can use the range tombstone iterator - // until the other levels have reached this boundary. - // - // It is safe to set the boundary key to the UpperBound user key - // with the RANGEDEL sentinel since it is the smallest InternalKey - // that matches the exclusive upper bound, and does not represent - // a real key. - if l.tableOpts.UpperBound != nil { - if *l.rangeDelIterPtr != nil { - l.syntheticBoundary.UserKey = l.tableOpts.UpperBound - l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel - l.largestBoundary = &l.syntheticBoundary - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = true - } - return l.largestBoundary, base.LazyValue{} - } - // Else there are no range deletions in this sstable. This - // helps with performance when many levels are populated with - // sstables and most don't have any actual keys within the - // bounds. 
- return nil, base.LazyValue{} - } - // If the boundary is a range deletion tombstone, return that key. - if l.iterFile.LargestPointKey.Kind() == InternalKeyKindRangeDelete { - l.largestBoundary = &l.iterFile.LargestPointKey - if l.boundaryContext != nil { - l.boundaryContext.isIgnorableBoundaryKey = true - } - return l.largestBoundary, base.LazyValue{} - } - // If the last point iterator positioning op might've skipped keys, - // it's possible the file's range deletions are still relevant to - // other levels. Return the largest boundary as a special ignorable - // marker to avoid advancing to the next file. - // - // The sstable iterator cannot guarantee that keys were skipped. A - // SeekGE that lands on a index separator k only knows that the - // block at the index entry contains keys ≤ k. We can't know whether - // there were actually keys between the seek key and the index - // separator key. If the block is then excluded due to block - // property filters, the iterator does not know whether keys were - // actually skipped by the block's exclusion. - // - // Since MaybeFilteredKeys cannot guarantee that keys were skipped, - // it's possible l.iterFile.Largest was already returned. Returning - // l.iterFile.Largest again is a violation of the strict - // monotonicity normally provided. The mergingIter's heap can - // tolerate this repeat key and in this case will keep the level at - // the top of the heap and immediately skip the entry, advancing to - // the next file. - if *l.rangeDelIterPtr != nil && l.filteredIter != nil && - l.filteredIter.MaybeFilteredKeys() { - l.largestBoundary = &l.iterFile.Largest - if l.boundaryContext != nil { - l.boundaryContext.isIgnorableBoundaryKey = true - } - return l.largestBoundary, base.LazyValue{} - } - } - - // Current file was exhausted. Move to the next file. 
- if l.loadFile(l.files.Next(), +1) == noFileLoaded { - return nil, base.LazyValue{} - } - } - return key, val -} - -func (l *levelIter) skipEmptyFileBackward() (*InternalKey, base.LazyValue) { - var key *InternalKey - var val base.LazyValue - // The first iteration of this loop starts with an already exhausted - // l.iter. The reason for the exhaustion is either that we iterated to the - // end of the sstable, or our iteration was terminated early due to the - // presence of a lower-bound. If l.rangeDelIterPtr is non-nil, we may need - // to pretend the iterator is not exhausted to allow for the merging to - // finish consuming the l.rangeDelIterPtr before levelIter switches the - // rangeDelIter from under it. This pretense is done by either generating - // a synthetic boundary key or returning the smallest key of the file, - // depending on the exhaustion reason. - - // Subsequent iterations will examine consecutive files such that the first - // file that does not have an exhausted iterator causes the code to return - // that key, else the behavior described above if there is a corresponding - // rangeDelIterPtr. - for ; key == nil; key, val = l.iter.Last() { - if l.rangeDelIterPtr != nil { - // We're being used as part of a mergingIter and we've exhausted the - // current sstable. If a lower bound is present and the lower bound lies - // within the current sstable, then we will have reached the lower bound - // rather than the beginning of the sstable. We need to return a - // synthetic boundary key so that mergingIter can use the range tombstone - // iterator until the other levels have reached this boundary. - // - // It is safe to set the boundary key to the LowerBound user key - // with the RANGEDEL sentinel since it is the smallest InternalKey - // that is within the inclusive lower bound, and does not - // represent a real key. 
- if l.tableOpts.LowerBound != nil { - if *l.rangeDelIterPtr != nil { - l.syntheticBoundary.UserKey = l.tableOpts.LowerBound - l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel - l.smallestBoundary = &l.syntheticBoundary - if l.boundaryContext != nil { - l.boundaryContext.isSyntheticIterBoundsKey = true - } - return l.smallestBoundary, base.LazyValue{} - } - // Else there are no range deletions in this sstable. This - // helps with performance when many levels are populated with - // sstables and most don't have any actual keys within the - // bounds. - return nil, base.LazyValue{} - } - // If the boundary is a range deletion tombstone, return that key. - if l.iterFile.SmallestPointKey.Kind() == InternalKeyKindRangeDelete { - l.smallestBoundary = &l.iterFile.SmallestPointKey - if l.boundaryContext != nil { - l.boundaryContext.isIgnorableBoundaryKey = true - } - return l.smallestBoundary, base.LazyValue{} - } - // If the last point iterator positioning op skipped keys, it's - // possible the file's range deletions are still relevant to other - // levels. Return the smallest boundary as a special ignorable key - // to avoid advancing to the next file. - // - // The sstable iterator cannot guarantee that keys were skipped. A - // SeekGE that lands on a index separator k only knows that the - // block at the index entry contains keys ≤ k. We can't know whether - // there were actually keys between the seek key and the index - // separator key. If the block is then excluded due to block - // property filters, the iterator does not know whether keys were - // actually skipped by the block's exclusion. - // - // Since MaybeFilteredKeys cannot guarantee that keys were skipped, - // it's possible l.iterFile.Smallest was already returned. Returning - // l.iterFile.Smallest again is a violation of the strict - // monotonicity normally provided. 
The mergingIter's heap can - // tolerate this repeat key and in this case will keep the level at - // the top of the heap and immediately skip the entry, advancing to - // the next file. - if *l.rangeDelIterPtr != nil && l.filteredIter != nil && l.filteredIter.MaybeFilteredKeys() { - l.smallestBoundary = &l.iterFile.Smallest - if l.boundaryContext != nil { - l.boundaryContext.isIgnorableBoundaryKey = true - } - return l.smallestBoundary, base.LazyValue{} - } - } - - // Current file was exhausted. Move to the previous file. - if l.loadFile(l.files.Prev(), -1) == noFileLoaded { - return nil, base.LazyValue{} - } - } - return key, val -} - -func (l *levelIter) Error() error { - if l.err != nil || l.iter == nil { - return l.err - } - return l.iter.Error() -} - -func (l *levelIter) Close() error { - if l.iter != nil { - l.err = l.iter.Close() - l.iter = nil - } - if l.rangeDelIterPtr != nil { - if t := l.rangeDelIterCopy; t != nil { - l.err = firstError(l.err, t.Close()) - } - *l.rangeDelIterPtr = nil - l.rangeDelIterCopy = nil - } - return l.err -} - -func (l *levelIter) SetBounds(lower, upper []byte) { - l.lower = lower - l.upper = upper - - if l.iter == nil { - return - } - - // Update tableOpts.{Lower,Upper}Bound in case the new boundaries fall within - // the boundaries of the current table. - if l.initTableBounds(l.iterFile) != 0 { - // The table does not overlap the bounds. Close() will set levelIter.err if - // an error occurs. 
- _ = l.Close() - return - } - - l.iter.SetBounds(l.tableOpts.LowerBound, l.tableOpts.UpperBound) -} - -func (l *levelIter) String() string { - if l.iterFile != nil { - return fmt.Sprintf("%s: fileNum=%s", l.level, l.iter.String()) - } - return fmt.Sprintf("%s: fileNum=", l.level) -} - -var _ internalIterator = &levelIter{} diff --git a/vendor/github.com/cockroachdb/pebble/log_recycler.go b/vendor/github.com/cockroachdb/pebble/log_recycler.go deleted file mode 100644 index c8c2ff4..0000000 --- a/vendor/github.com/cockroachdb/pebble/log_recycler.go +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package pebble - -import ( - "sync" - - "github.com/cockroachdb/errors" -) - -type logRecycler struct { - // The maximum number of log files to maintain for recycling. - limit int - - // The minimum log number that is allowed to be recycled. Log numbers smaller - // than this will be subject to immediate deletion. This is used to prevent - // recycling a log written by a previous instance of the DB which may not - // have had log recycling enabled. If that previous instance of the DB was - // RocksDB, the old non-recyclable log record headers will be present. - minRecycleLogNum FileNum - - mu struct { - sync.Mutex - logs []fileInfo - maxLogNum FileNum - } -} - -// add attempts to recycle the log file specified by logInfo. Returns true if -// the log file should not be deleted (i.e. the log is being recycled), and -// false otherwise. -func (r *logRecycler) add(logInfo fileInfo) bool { - if logInfo.fileNum.FileNum() < r.minRecycleLogNum { - return false - } - - r.mu.Lock() - defer r.mu.Unlock() - - if logInfo.fileNum.FileNum() <= r.mu.maxLogNum { - // The log file number was already considered for recycling. Don't consider - // it again. 
This avoids a race between adding the same log file for - // recycling multiple times, and removing the log file for actual - // reuse. Note that we return true because the log was already considered - // for recycling and either it was deleted on the previous attempt (which - // means we shouldn't get here) or it was recycled and thus the file - // shouldn't be deleted. - return true - } - r.mu.maxLogNum = logInfo.fileNum.FileNum() - if len(r.mu.logs) >= r.limit { - return false - } - r.mu.logs = append(r.mu.logs, logInfo) - return true -} - -// peek returns the log at the head of the recycling queue, or the zero value -// fileInfo and false if the queue is empty. -func (r *logRecycler) peek() (fileInfo, bool) { - r.mu.Lock() - defer r.mu.Unlock() - - if len(r.mu.logs) == 0 { - return fileInfo{}, false - } - return r.mu.logs[0], true -} - -func (r *logRecycler) stats() (count int, size uint64) { - r.mu.Lock() - defer r.mu.Unlock() - count = len(r.mu.logs) - for i := 0; i < count; i++ { - size += r.mu.logs[i].fileSize - } - return count, size -} - -// pop removes the log number at the head of the recycling queue, enforcing -// that it matches the specified logNum. An error is returned of the recycling -// queue is empty or the head log number does not match the specified one. 
-func (r *logRecycler) pop(logNum FileNum) error { - r.mu.Lock() - defer r.mu.Unlock() - - if len(r.mu.logs) == 0 { - return errors.New("pebble: log recycler empty") - } - if r.mu.logs[0].fileNum.FileNum() != logNum { - return errors.Errorf("pebble: log recycler invalid %d vs %d", errors.Safe(logNum), errors.Safe(fileInfoNums(r.mu.logs))) - } - r.mu.logs = r.mu.logs[1:] - return nil -} - -func fileInfoNums(finfos []fileInfo) []FileNum { - if len(finfos) == 0 { - return nil - } - nums := make([]FileNum, len(finfos)) - for i := range finfos { - nums[i] = finfos[i].fileNum.FileNum() - } - return nums -} diff --git a/vendor/github.com/cockroachdb/pebble/merging_iter_heap.go b/vendor/github.com/cockroachdb/pebble/merging_iter_heap.go deleted file mode 100644 index c8c336f..0000000 --- a/vendor/github.com/cockroachdb/pebble/merging_iter_heap.go +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package pebble - -type mergingIterHeap struct { - cmp Compare - reverse bool - items []*mergingIterLevel -} - -func (h *mergingIterHeap) len() int { - return len(h.items) -} - -func (h *mergingIterHeap) clear() { - h.items = h.items[:0] -} - -func (h *mergingIterHeap) less(i, j int) bool { - ikey, jkey := h.items[i].iterKey, h.items[j].iterKey - if c := h.cmp(ikey.UserKey, jkey.UserKey); c != 0 { - if h.reverse { - return c > 0 - } - return c < 0 - } - if h.reverse { - return ikey.Trailer < jkey.Trailer - } - return ikey.Trailer > jkey.Trailer -} - -func (h *mergingIterHeap) swap(i, j int) { - h.items[i], h.items[j] = h.items[j], h.items[i] -} - -// init, fix, up and down are copied from the go stdlib. 
-func (h *mergingIterHeap) init() { - // heapify - n := h.len() - for i := n/2 - 1; i >= 0; i-- { - h.down(i, n) - } -} - -func (h *mergingIterHeap) fix(i int) { - if !h.down(i, h.len()) { - h.up(i) - } -} - -func (h *mergingIterHeap) pop() *mergingIterLevel { - n := h.len() - 1 - h.swap(0, n) - h.down(0, n) - item := h.items[n] - h.items = h.items[:n] - return item -} - -func (h *mergingIterHeap) up(j int) { - for { - i := (j - 1) / 2 // parent - if i == j || !h.less(j, i) { - break - } - h.swap(i, j) - j = i - } -} - -func (h *mergingIterHeap) down(i0, n int) bool { - i := i0 - for { - j1 := 2*i + 1 - if j1 >= n || j1 < 0 { // j1 < 0 after int overflow - break - } - j := j1 // left child - if j2 := j1 + 1; j2 < n && h.less(j2, j1) { - j = j2 // = 2*i + 2 // right child - } - if !h.less(j, i) { - break - } - h.swap(i, j) - i = j - } - return i > i0 -} diff --git a/vendor/github.com/cockroachdb/pebble/metrics.go b/vendor/github.com/cockroachdb/pebble/metrics.go deleted file mode 100644 index cab9291..0000000 --- a/vendor/github.com/cockroachdb/pebble/metrics.go +++ /dev/null @@ -1,625 +0,0 @@ -// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package pebble - -import ( - "fmt" - "math" - "time" - - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/cache" - "github.com/cockroachdb/pebble/internal/humanize" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache" - "github.com/cockroachdb/pebble/record" - "github.com/cockroachdb/pebble/sstable" - "github.com/cockroachdb/redact" - "github.com/prometheus/client_golang/prometheus" -) - -// CacheMetrics holds metrics for the block and table cache. 
-type CacheMetrics = cache.Metrics - -// FilterMetrics holds metrics for the filter policy -type FilterMetrics = sstable.FilterMetrics - -// ThroughputMetric is a cumulative throughput metric. See the detailed -// comment in base. -type ThroughputMetric = base.ThroughputMetric - -// SecondaryCacheMetrics holds metrics for the persistent secondary cache -// that caches commonly accessed blocks from blob storage on a local -// file system. -type SecondaryCacheMetrics = sharedcache.Metrics - -// LevelMetrics holds per-level metrics such as the number of files and total -// size of the files, and compaction related metrics. -type LevelMetrics struct { - // The number of sublevels within the level. The sublevel count corresponds - // to the read amplification for the level. An empty level will have a - // sublevel count of 0, implying no read amplification. Only L0 will have - // a sublevel count other than 0 or 1. - Sublevels int32 - // The total number of files in the level. - NumFiles int64 - // The total number of virtual sstables in the level. - NumVirtualFiles uint64 - // The total size in bytes of the files in the level. - Size int64 - // The total size of the virtual sstables in the level. - VirtualSize uint64 - // The level's compaction score. This is the compensatedScoreRatio in the - // candidateLevelInfo. - Score float64 - // The number of incoming bytes from other levels read during - // compactions. This excludes bytes moved and bytes ingested. For L0 this is - // the bytes written to the WAL. - BytesIn uint64 - // The number of bytes ingested. The sibling metric for tables is - // TablesIngested. - BytesIngested uint64 - // The number of bytes moved into the level by a "move" compaction. The - // sibling metric for tables is TablesMoved. - BytesMoved uint64 - // The number of bytes read for compactions at the level. This includes bytes - // read from other levels (BytesIn), as well as bytes read for the level. 
- BytesRead uint64 - // The number of bytes written during compactions. The sibling - // metric for tables is TablesCompacted. This metric may be summed - // with BytesFlushed to compute the total bytes written for the level. - BytesCompacted uint64 - // The number of bytes written during flushes. The sibling - // metrics for tables is TablesFlushed. This metric is always - // zero for all levels other than L0. - BytesFlushed uint64 - // The number of sstables compacted to this level. - TablesCompacted uint64 - // The number of sstables flushed to this level. - TablesFlushed uint64 - // The number of sstables ingested into the level. - TablesIngested uint64 - // The number of sstables moved to this level by a "move" compaction. - TablesMoved uint64 - - MultiLevel struct { - // BytesInTop are the total bytes in a multilevel compaction coming from the top level. - BytesInTop uint64 - - // BytesIn, exclusively for multiLevel compactions. - BytesIn uint64 - - // BytesRead, exclusively for multilevel compactions. - BytesRead uint64 - } - - // Additional contains misc additional metrics that are not always printed. - Additional struct { - // The sum of Properties.ValueBlocksSize for all the sstables in this - // level. Printed by LevelMetrics.format iff there is at least one level - // with a non-zero value. - ValueBlocksSize uint64 - // Cumulative metrics about bytes written to data blocks and value blocks, - // via compactions (except move compactions) or flushes. Not printed by - // LevelMetrics.format, but are available to sophisticated clients. - BytesWrittenDataBlocks uint64 - BytesWrittenValueBlocks uint64 - } -} - -// Add updates the counter metrics for the level. 
-func (m *LevelMetrics) Add(u *LevelMetrics) { - m.NumFiles += u.NumFiles - m.NumVirtualFiles += u.NumVirtualFiles - m.VirtualSize += u.VirtualSize - m.Size += u.Size - m.BytesIn += u.BytesIn - m.BytesIngested += u.BytesIngested - m.BytesMoved += u.BytesMoved - m.BytesRead += u.BytesRead - m.BytesCompacted += u.BytesCompacted - m.BytesFlushed += u.BytesFlushed - m.TablesCompacted += u.TablesCompacted - m.TablesFlushed += u.TablesFlushed - m.TablesIngested += u.TablesIngested - m.TablesMoved += u.TablesMoved - m.MultiLevel.BytesInTop += u.MultiLevel.BytesInTop - m.MultiLevel.BytesRead += u.MultiLevel.BytesRead - m.MultiLevel.BytesIn += u.MultiLevel.BytesIn - m.Additional.BytesWrittenDataBlocks += u.Additional.BytesWrittenDataBlocks - m.Additional.BytesWrittenValueBlocks += u.Additional.BytesWrittenValueBlocks - m.Additional.ValueBlocksSize += u.Additional.ValueBlocksSize -} - -// WriteAmp computes the write amplification for compactions at this -// level. Computed as (BytesFlushed + BytesCompacted) / BytesIn. -func (m *LevelMetrics) WriteAmp() float64 { - if m.BytesIn == 0 { - return 0 - } - return float64(m.BytesFlushed+m.BytesCompacted) / float64(m.BytesIn) -} - -// Metrics holds metrics for various subsystems of the DB such as the Cache, -// Compactions, WAL, and per-Level metrics. -// -// TODO(peter): The testing of these metrics is relatively weak. There should -// be testing that performs various operations on a DB and verifies that the -// metrics reflect those operations. -type Metrics struct { - BlockCache CacheMetrics - - Compact struct { - // The total number of compactions, and per-compaction type counts. - Count int64 - DefaultCount int64 - DeleteOnlyCount int64 - ElisionOnlyCount int64 - MoveCount int64 - ReadCount int64 - RewriteCount int64 - MultiLevelCount int64 - CounterLevelCount int64 - // An estimate of the number of bytes that need to be compacted for the LSM - // to reach a stable state. 
- EstimatedDebt uint64 - // Number of bytes present in sstables being written by in-progress - // compactions. This value will be zero if there are no in-progress - // compactions. - InProgressBytes int64 - // Number of compactions that are in-progress. - NumInProgress int64 - // MarkedFiles is a count of files that are marked for - // compaction. Such files are compacted in a rewrite compaction - // when no other compactions are picked. - MarkedFiles int - // Duration records the cumulative duration of all compactions since the - // database was opened. - Duration time.Duration - } - - Ingest struct { - // The total number of ingestions - Count uint64 - } - - Flush struct { - // The total number of flushes. - Count int64 - WriteThroughput ThroughputMetric - // Number of flushes that are in-progress. In the current implementation - // this will always be zero or one. - NumInProgress int64 - // AsIngestCount is a monotonically increasing counter of flush operations - // handling ingested tables. - AsIngestCount uint64 - // AsIngestCount is a monotonically increasing counter of tables ingested as - // flushables. - AsIngestTableCount uint64 - // AsIngestBytes is a monotonically increasing counter of the bytes flushed - // for flushables that originated as ingestion operations. - AsIngestBytes uint64 - } - - Filter FilterMetrics - - Levels [numLevels]LevelMetrics - - MemTable struct { - // The number of bytes allocated by memtables and large (flushable) - // batches. - Size uint64 - // The count of memtables. - Count int64 - // The number of bytes present in zombie memtables which are no longer - // referenced by the current DB state. An unbounded number of memtables - // may be zombie if they're still in use by an iterator. One additional - // memtable may be zombie if it's no longer in use and waiting to be - // recycled. - ZombieSize uint64 - // The count of zombie memtables. 
- ZombieCount int64 - } - - Keys struct { - // The approximate count of internal range key set keys in the database. - RangeKeySetsCount uint64 - // The approximate count of internal tombstones (DEL, SINGLEDEL and - // RANGEDEL key kinds) within the database. - TombstoneCount uint64 - // A cumulative total number of missized DELSIZED keys encountered by - // compactions since the database was opened. - MissizedTombstonesCount uint64 - } - - Snapshots struct { - // The number of currently open snapshots. - Count int - // The sequence number of the earliest, currently open snapshot. - EarliestSeqNum uint64 - // A running tally of keys written to sstables during flushes or - // compactions that would've been elided if it weren't for open - // snapshots. - PinnedKeys uint64 - // A running cumulative sum of the size of keys and values written to - // sstables during flushes or compactions that would've been elided if - // it weren't for open snapshots. - PinnedSize uint64 - } - - Table struct { - // The number of bytes present in obsolete tables which are no longer - // referenced by the current DB state or any open iterators. - ObsoleteSize uint64 - // The count of obsolete tables. - ObsoleteCount int64 - // The number of bytes present in zombie tables which are no longer - // referenced by the current DB state but are still in use by an iterator. - ZombieSize uint64 - // The count of zombie tables. - ZombieCount int64 - // The count of the backing sstables. - BackingTableCount uint64 - // The sum of the sizes of the all of the backing sstables. - BackingTableSize uint64 - } - - TableCache CacheMetrics - - // Count of the number of open sstable iterators. - TableIters int64 - // Uptime is the total time since this DB was opened. - Uptime time.Duration - - WAL struct { - // Number of live WAL files. - Files int64 - // Number of obsolete WAL files. - ObsoleteFiles int64 - // Physical size of the obsolete WAL files. 
- ObsoletePhysicalSize uint64 - // Size of the live data in the WAL files. Note that with WAL file - // recycling this is less than the actual on-disk size of the WAL files. - Size uint64 - // Physical size of the WAL files on-disk. With WAL file recycling, - // this is greater than the live data in WAL files. - PhysicalSize uint64 - // Number of logical bytes written to the WAL. - BytesIn uint64 - // Number of bytes written to the WAL. - BytesWritten uint64 - } - - LogWriter struct { - FsyncLatency prometheus.Histogram - record.LogWriterMetrics - } - - SecondaryCacheMetrics SecondaryCacheMetrics - - private struct { - optionsFileSize uint64 - manifestFileSize uint64 - } -} - -var ( - // FsyncLatencyBuckets are prometheus histogram buckets suitable for a histogram - // that records latencies for fsyncs. - FsyncLatencyBuckets = append( - prometheus.LinearBuckets(0.0, float64(time.Microsecond*100), 50), - prometheus.ExponentialBucketsRange(float64(time.Millisecond*5), float64(10*time.Second), 50)..., - ) - - // SecondaryCacheIOBuckets exported to enable exporting from package pebble to - // enable exporting metrics with below buckets in CRDB. - SecondaryCacheIOBuckets = sharedcache.IOBuckets - // SecondaryCacheChannelWriteBuckets exported to enable exporting from package - // pebble to enable exporting metrics with below buckets in CRDB. - SecondaryCacheChannelWriteBuckets = sharedcache.ChannelWriteBuckets -) - -// DiskSpaceUsage returns the total disk space used by the database in bytes, -// including live and obsolete files. 
-func (m *Metrics) DiskSpaceUsage() uint64 { - var usageBytes uint64 - usageBytes += m.WAL.PhysicalSize - usageBytes += m.WAL.ObsoletePhysicalSize - for _, lm := range m.Levels { - usageBytes += uint64(lm.Size) - } - usageBytes += m.Table.ObsoleteSize - usageBytes += m.Table.ZombieSize - usageBytes += m.private.optionsFileSize - usageBytes += m.private.manifestFileSize - usageBytes += uint64(m.Compact.InProgressBytes) - return usageBytes -} - -// NumVirtual is the number of virtual sstables in the latest version -// summed over every level in the lsm. -func (m *Metrics) NumVirtual() uint64 { - var n uint64 - for _, level := range m.Levels { - n += level.NumVirtualFiles - } - return n -} - -// VirtualSize is the sum of the sizes of the virtual sstables in the -// latest version. BackingTableSize - VirtualSize gives an estimate for -// the space amplification caused by not compacting virtual sstables. -func (m *Metrics) VirtualSize() uint64 { - var size uint64 - for _, level := range m.Levels { - size += level.VirtualSize - } - return size -} - -// ReadAmp returns the current read amplification of the database. -// It's computed as the number of sublevels in L0 + the number of non-empty -// levels below L0. -func (m *Metrics) ReadAmp() int { - var ramp int32 - for _, l := range m.Levels { - ramp += l.Sublevels - } - return int(ramp) -} - -// Total returns the sum of the per-level metrics and WAL metrics. -func (m *Metrics) Total() LevelMetrics { - var total LevelMetrics - for level := 0; level < numLevels; level++ { - l := &m.Levels[level] - total.Add(l) - total.Sublevels += l.Sublevels - } - // Compute total bytes-in as the bytes written to the WAL + bytes ingested. - total.BytesIn = m.WAL.BytesWritten + total.BytesIngested - // Add the total bytes-in to the total bytes-flushed. This is to account for - // the bytes written to the log and bytes written externally and then - // ingested. 
- total.BytesFlushed += total.BytesIn - return total -} - -// String pretty-prints the metrics as below: -// -// | | | | ingested | moved | written | | amp -// level | tables size val-bl vtables | score | in | tables size | tables size | tables size | read | r w -// ------+-----------------------------+-------+-------+--------------+--------------+--------------+-------+--------- -// 0 | 101 102B 0B 0 | 103.0 | 104B | 112 104B | 113 106B | 221 217B | 107B | 1 2.1 -// 1 | 201 202B 0B 0 | 203.0 | 204B | 212 204B | 213 206B | 421 417B | 207B | 2 2.0 -// 2 | 301 302B 0B 0 | 303.0 | 304B | 312 304B | 313 306B | 621 617B | 307B | 3 2.0 -// 3 | 401 402B 0B 0 | 403.0 | 404B | 412 404B | 413 406B | 821 817B | 407B | 4 2.0 -// 4 | 501 502B 0B 0 | 503.0 | 504B | 512 504B | 513 506B | 1.0K 1017B | 507B | 5 2.0 -// 5 | 601 602B 0B 0 | 603.0 | 604B | 612 604B | 613 606B | 1.2K 1.2KB | 607B | 6 2.0 -// 6 | 701 702B 0B 0 | - | 704B | 712 704B | 713 706B | 1.4K 1.4KB | 707B | 7 2.0 -// total | 2.8K 2.7KB 0B 0 | - | 2.8KB | 2.9K 2.8KB | 2.9K 2.8KB | 5.7K 8.4KB | 2.8KB | 28 3.0 -// ------------------------------------------------------------------------------------------------------------------- -// WAL: 22 files (24B) in: 25B written: 26B (4% overhead) -// Flushes: 8 -// Compactions: 5 estimated debt: 6B in progress: 2 (7B) -// default: 27 delete: 28 elision: 29 move: 30 read: 31 rewrite: 32 multi-level: 33 -// MemTables: 12 (11B) zombie: 14 (13B) -// Zombie tables: 16 (15B) -// Backing tables: 0 (0B) -// Block cache: 2 entries (1B) hit rate: 42.9% -// Table cache: 18 entries (17B) hit rate: 48.7% -// Secondary cache: 40 entries (40B) hit rate: 49.9% -// Snapshots: 4 earliest seq num: 1024 -// Table iters: 21 -// Filter utility: 47.4% -// Ingestions: 27 as flushable: 36 (34B in 35 tables) -func (m *Metrics) String() string { - return redact.StringWithoutMarkers(m) -} - -var _ redact.SafeFormatter = &Metrics{} - -// SafeFormat implements redact.SafeFormatter. 
-func (m *Metrics) SafeFormat(w redact.SafePrinter, _ rune) { - // NB: Pebble does not make any assumptions as to which Go primitive types - // have been registered as safe with redact.RegisterSafeType and does not - // register any types itself. Some of the calls to `redact.Safe`, etc are - // superfluous in the context of CockroachDB, which registers all the Go - // numeric types as safe. - - // TODO(jackson): There are a few places where we use redact.SafeValue - // instead of redact.RedactableString. This is necessary because of a bug - // whereby formatting a redact.RedactableString argument does not respect - // width specifiers. When the issue is fixed, we can convert these to - // RedactableStrings. https://github.com/cockroachdb/redact/issues/17 - - multiExists := m.Compact.MultiLevelCount > 0 - appendIfMulti := func(line redact.SafeString) { - if multiExists { - w.SafeString(line) - } - } - newline := func() { - w.SafeString("\n") - } - - w.SafeString(" | | | | ingested | moved | written | | amp") - appendIfMulti(" | multilevel") - newline() - w.SafeString("level | tables size val-bl vtables | score | in | tables size | tables size | tables size | read | r w") - appendIfMulti(" | top in read") - newline() - w.SafeString("------+-----------------------------+-------+-------+--------------+--------------+--------------+-------+---------") - appendIfMulti("-+------------------") - newline() - - // formatRow prints out a row of the table. - formatRow := func(m *LevelMetrics, score float64) { - scoreStr := "-" - if !math.IsNaN(score) { - // Try to keep the string no longer than 5 characters. 
- switch { - case score < 99.995: - scoreStr = fmt.Sprintf("%.2f", score) - case score < 999.95: - scoreStr = fmt.Sprintf("%.1f", score) - default: - scoreStr = fmt.Sprintf("%.0f", score) - } - } - var wampStr string - if wamp := m.WriteAmp(); wamp > 99.5 { - wampStr = fmt.Sprintf("%.0f", wamp) - } else { - wampStr = fmt.Sprintf("%.1f", wamp) - } - - w.Printf("| %5s %6s %6s %7s | %5s | %5s | %5s %6s | %5s %6s | %5s %6s | %5s | %3d %4s", - humanize.Count.Int64(m.NumFiles), - humanize.Bytes.Int64(m.Size), - humanize.Bytes.Uint64(m.Additional.ValueBlocksSize), - humanize.Count.Uint64(m.NumVirtualFiles), - redact.Safe(scoreStr), - humanize.Bytes.Uint64(m.BytesIn), - humanize.Count.Uint64(m.TablesIngested), - humanize.Bytes.Uint64(m.BytesIngested), - humanize.Count.Uint64(m.TablesMoved), - humanize.Bytes.Uint64(m.BytesMoved), - humanize.Count.Uint64(m.TablesFlushed+m.TablesCompacted), - humanize.Bytes.Uint64(m.BytesFlushed+m.BytesCompacted), - humanize.Bytes.Uint64(m.BytesRead), - redact.Safe(m.Sublevels), - redact.Safe(wampStr)) - - if multiExists { - w.Printf(" | %5s %5s %5s", - humanize.Bytes.Uint64(m.MultiLevel.BytesInTop), - humanize.Bytes.Uint64(m.MultiLevel.BytesIn), - humanize.Bytes.Uint64(m.MultiLevel.BytesRead)) - } - newline() - } - - var total LevelMetrics - for level := 0; level < numLevels; level++ { - l := &m.Levels[level] - w.Printf("%5d ", redact.Safe(level)) - - // Format the score. - score := math.NaN() - if level < numLevels-1 { - score = l.Score - } - formatRow(l, score) - total.Add(l) - total.Sublevels += l.Sublevels - } - // Compute total bytes-in as the bytes written to the WAL + bytes ingested. - total.BytesIn = m.WAL.BytesWritten + total.BytesIngested - // Add the total bytes-in to the total bytes-flushed. This is to account for - // the bytes written to the log and bytes written externally and then - // ingested. 
- total.BytesFlushed += total.BytesIn - w.SafeString("total ") - formatRow(&total, math.NaN()) - - w.SafeString("-------------------------------------------------------------------------------------------------------------------") - appendIfMulti("--------------------") - newline() - w.Printf("WAL: %d files (%s) in: %s written: %s (%.0f%% overhead)\n", - redact.Safe(m.WAL.Files), - humanize.Bytes.Uint64(m.WAL.Size), - humanize.Bytes.Uint64(m.WAL.BytesIn), - humanize.Bytes.Uint64(m.WAL.BytesWritten), - redact.Safe(percent(int64(m.WAL.BytesWritten)-int64(m.WAL.BytesIn), int64(m.WAL.BytesIn)))) - - w.Printf("Flushes: %d\n", redact.Safe(m.Flush.Count)) - - w.Printf("Compactions: %d estimated debt: %s in progress: %d (%s)\n", - redact.Safe(m.Compact.Count), - humanize.Bytes.Uint64(m.Compact.EstimatedDebt), - redact.Safe(m.Compact.NumInProgress), - humanize.Bytes.Int64(m.Compact.InProgressBytes)) - - w.Printf(" default: %d delete: %d elision: %d move: %d read: %d rewrite: %d multi-level: %d\n", - redact.Safe(m.Compact.DefaultCount), - redact.Safe(m.Compact.DeleteOnlyCount), - redact.Safe(m.Compact.ElisionOnlyCount), - redact.Safe(m.Compact.MoveCount), - redact.Safe(m.Compact.ReadCount), - redact.Safe(m.Compact.RewriteCount), - redact.Safe(m.Compact.MultiLevelCount)) - - w.Printf("MemTables: %d (%s) zombie: %d (%s)\n", - redact.Safe(m.MemTable.Count), - humanize.Bytes.Uint64(m.MemTable.Size), - redact.Safe(m.MemTable.ZombieCount), - humanize.Bytes.Uint64(m.MemTable.ZombieSize)) - - w.Printf("Zombie tables: %d (%s)\n", - redact.Safe(m.Table.ZombieCount), - humanize.Bytes.Uint64(m.Table.ZombieSize)) - - w.Printf("Backing tables: %d (%s)\n", - redact.Safe(m.Table.BackingTableCount), - humanize.Bytes.Uint64(m.Table.BackingTableSize)) - w.Printf("Virtual tables: %d (%s)\n", - redact.Safe(m.NumVirtual()), - humanize.Bytes.Uint64(m.VirtualSize())) - - formatCacheMetrics := func(m *CacheMetrics, name redact.SafeString) { - w.Printf("%s: %s entries (%s) hit rate: %.1f%%\n", - 
name, - humanize.Count.Int64(m.Count), - humanize.Bytes.Int64(m.Size), - redact.Safe(hitRate(m.Hits, m.Misses))) - } - formatCacheMetrics(&m.BlockCache, "Block cache") - formatCacheMetrics(&m.TableCache, "Table cache") - - formatSharedCacheMetrics := func(w redact.SafePrinter, m *SecondaryCacheMetrics, name redact.SafeString) { - w.Printf("%s: %s entries (%s) hit rate: %.1f%%\n", - name, - humanize.Count.Int64(m.Count), - humanize.Bytes.Int64(m.Size), - redact.Safe(hitRate(m.ReadsWithFullHit, m.ReadsWithPartialHit+m.ReadsWithNoHit))) - } - formatSharedCacheMetrics(w, &m.SecondaryCacheMetrics, "Secondary cache") - - w.Printf("Snapshots: %d earliest seq num: %d\n", - redact.Safe(m.Snapshots.Count), - redact.Safe(m.Snapshots.EarliestSeqNum)) - - w.Printf("Table iters: %d\n", redact.Safe(m.TableIters)) - w.Printf("Filter utility: %.1f%%\n", redact.Safe(hitRate(m.Filter.Hits, m.Filter.Misses))) - w.Printf("Ingestions: %d as flushable: %d (%s in %d tables)\n", - redact.Safe(m.Ingest.Count), - redact.Safe(m.Flush.AsIngestCount), - humanize.Bytes.Uint64(m.Flush.AsIngestBytes), - redact.Safe(m.Flush.AsIngestTableCount)) -} - -func hitRate(hits, misses int64) float64 { - return percent(hits, hits+misses) -} - -func percent(numerator, denominator int64) float64 { - if denominator == 0 { - return 0 - } - return 100 * float64(numerator) / float64(denominator) -} - -// StringForTests is identical to m.String() on 64-bit platforms. It is used to -// provide a platform-independent result for tests. -func (m *Metrics) StringForTests() string { - mCopy := *m - if math.MaxInt == math.MaxInt32 { - // This is the difference in Sizeof(sstable.Reader{})) between 64 and 32 bit - // platforms. 
- const tableCacheSizeAdjustment = 212 - mCopy.TableCache.Size += mCopy.TableCache.Count * tableCacheSizeAdjustment - } - return redact.StringWithoutMarkers(&mCopy) -} diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remote_readable.go b/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remote_readable.go deleted file mode 100644 index 991a1ba..0000000 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remote_readable.go +++ /dev/null @@ -1,162 +0,0 @@ -// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package objstorageprovider - -import ( - "context" - "io" - - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache" - "github.com/cockroachdb/pebble/objstorage/remote" -) - -const remoteMaxReadaheadSize = 1024 * 1024 /* 1MB */ - -// remoteReadable is a very simple implementation of Readable on top of the -// ReadCloser returned by remote.Storage.CreateObject. -type remoteReadable struct { - objReader remote.ObjectReader - size int64 - fileNum base.DiskFileNum - provider *provider -} - -var _ objstorage.Readable = (*remoteReadable)(nil) - -func (p *provider) newRemoteReadable( - objReader remote.ObjectReader, size int64, fileNum base.DiskFileNum, -) *remoteReadable { - return &remoteReadable{ - objReader: objReader, - size: size, - fileNum: fileNum, - provider: p, - } -} - -// ReadAt is part of the objstorage.Readable interface. -func (r *remoteReadable) ReadAt(ctx context.Context, p []byte, offset int64) error { - return r.readInternal(ctx, p, offset, false /* forCompaction */) -} - -// readInternal performs a read for the object, using the cache when -// appropriate. 
-func (r *remoteReadable) readInternal( - ctx context.Context, p []byte, offset int64, forCompaction bool, -) error { - if cache := r.provider.remote.cache; cache != nil { - flags := sharedcache.ReadFlags{ - // Don't add data to the cache if this read is for a compaction. - ReadOnly: forCompaction, - } - return r.provider.remote.cache.ReadAt(ctx, r.fileNum, p, offset, r.objReader, r.size, flags) - } - return r.objReader.ReadAt(ctx, p, offset) -} - -func (r *remoteReadable) Close() error { - defer func() { r.objReader = nil }() - return r.objReader.Close() -} - -func (r *remoteReadable) Size() int64 { - return r.size -} - -func (r *remoteReadable) NewReadHandle(_ context.Context) objstorage.ReadHandle { - // TODO(radu): use a pool. - rh := &remoteReadHandle{readable: r} - rh.readahead.state = makeReadaheadState(remoteMaxReadaheadSize) - return rh -} - -type remoteReadHandle struct { - readable *remoteReadable - readahead struct { - state readaheadState - data []byte - offset int64 - } - forCompaction bool -} - -var _ objstorage.ReadHandle = (*remoteReadHandle)(nil) - -// ReadAt is part of the objstorage.ReadHandle interface. -func (r *remoteReadHandle) ReadAt(ctx context.Context, p []byte, offset int64) error { - readaheadSize := r.maybeReadahead(offset, len(p)) - - // Check if we already have the data from a previous read-ahead. - if rhSize := int64(len(r.readahead.data)); rhSize > 0 { - if r.readahead.offset <= offset && r.readahead.offset+rhSize > offset { - n := copy(p, r.readahead.data[offset-r.readahead.offset:]) - if n == len(p) { - // All data was available. - return nil - } - // Use the data that we had and do a shorter read. - offset += int64(n) - p = p[n:] - readaheadSize -= n - } - } - - if readaheadSize > len(p) { - // Don't try to read past EOF. 
- if offset+int64(readaheadSize) > r.readable.size { - readaheadSize = int(r.readable.size - offset) - if readaheadSize <= 0 { - // This shouldn't happen in practice (Pebble should never try to read - // past EOF). - return io.EOF - } - } - r.readahead.offset = offset - // TODO(radu): we need to somehow account for this memory. - if cap(r.readahead.data) >= readaheadSize { - r.readahead.data = r.readahead.data[:readaheadSize] - } else { - r.readahead.data = make([]byte, readaheadSize) - } - - if err := r.readable.readInternal(ctx, r.readahead.data, offset, r.forCompaction); err != nil { - // Make sure we don't treat the data as valid next time. - r.readahead.data = r.readahead.data[:0] - return err - } - copy(p, r.readahead.data) - return nil - } - - return r.readable.readInternal(ctx, p, offset, r.forCompaction) -} - -func (r *remoteReadHandle) maybeReadahead(offset int64, len int) int { - if r.forCompaction { - return remoteMaxReadaheadSize - } - return int(r.readahead.state.maybeReadahead(offset, int64(len))) -} - -// Close is part of the objstorage.ReadHandle interface. -func (r *remoteReadHandle) Close() error { - r.readable = nil - r.readahead.data = nil - return nil -} - -// SetupForCompaction is part of the objstorage.ReadHandle interface. -func (r *remoteReadHandle) SetupForCompaction() { - r.forCompaction = true -} - -// RecordCacheHit is part of the objstorage.ReadHandle interface. -func (r *remoteReadHandle) RecordCacheHit(_ context.Context, offset, size int64) { - if !r.forCompaction { - r.readahead.state.recordCacheHit(offset, size) - } -} diff --git a/vendor/github.com/cockroachdb/pebble/open.go b/vendor/github.com/cockroachdb/pebble/open.go deleted file mode 100644 index 3963d9c..0000000 --- a/vendor/github.com/cockroachdb/pebble/open.go +++ /dev/null @@ -1,1191 +0,0 @@ -// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. 
- -package pebble - -import ( - "bytes" - "context" - "encoding/binary" - "fmt" - "io" - "math" - "os" - "sort" - "sync/atomic" - "time" - - "github.com/cockroachdb/errors" - "github.com/cockroachdb/errors/oserror" - "github.com/cockroachdb/pebble/internal/arenaskl" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/cache" - "github.com/cockroachdb/pebble/internal/constants" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/manifest" - "github.com/cockroachdb/pebble/internal/manual" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider" - "github.com/cockroachdb/pebble/record" - "github.com/cockroachdb/pebble/sstable" - "github.com/cockroachdb/pebble/vfs" - "github.com/prometheus/client_golang/prometheus" -) - -const ( - initialMemTableSize = 256 << 10 // 256 KB - - // The max batch size is limited by the uint32 offsets stored in - // internal/batchskl.node, DeferredBatchOp, and flushableBatchEntry. - // - // We limit the size to MaxUint32 (just short of 4GB) so that the exclusive - // end of an allocation fits in uint32. - // - // On 32-bit systems, slices are naturally limited to MaxInt (just short of - // 2GB). - maxBatchSize = constants.MaxUint32OrInt - - // The max memtable size is limited by the uint32 offsets stored in - // internal/arenaskl.node, DeferredBatchOp, and flushableBatchEntry. - // - // We limit the size to MaxUint32 (just short of 4GB) so that the exclusive - // end of an allocation fits in uint32. - // - // On 32-bit systems, slices are naturally limited to MaxInt (just short of - // 2GB). - maxMemTableSize = constants.MaxUint32OrInt -) - -// TableCacheSize can be used to determine the table -// cache size for a single db, given the maximum open -// files which can be used by a table cache which is -// only used by a single db. 
-func TableCacheSize(maxOpenFiles int) int { - tableCacheSize := maxOpenFiles - numNonTableCacheFiles - if tableCacheSize < minTableCacheSize { - tableCacheSize = minTableCacheSize - } - return tableCacheSize -} - -// Open opens a DB whose files live in the given directory. -func Open(dirname string, opts *Options) (db *DB, _ error) { - // Make a copy of the options so that we don't mutate the passed in options. - opts = opts.Clone() - opts = opts.EnsureDefaults() - if err := opts.Validate(); err != nil { - return nil, err - } - if opts.LoggerAndTracer == nil { - opts.LoggerAndTracer = &base.LoggerWithNoopTracer{Logger: opts.Logger} - } else { - opts.Logger = opts.LoggerAndTracer - } - - // In all error cases, we return db = nil; this is used by various - // deferred cleanups. - - // Open the database and WAL directories first. - walDirname, dataDir, walDir, err := prepareAndOpenDirs(dirname, opts) - if err != nil { - return nil, errors.Wrapf(err, "error opening database at %q", dirname) - } - defer func() { - if db == nil { - if walDir != dataDir { - walDir.Close() - } - dataDir.Close() - } - }() - - // Lock the database directory. - var fileLock *Lock - if opts.Lock != nil { - // The caller already acquired the database lock. Ensure that the - // directory matches. - if dirname != opts.Lock.dirname { - return nil, errors.Newf("pebble: opts.Lock acquired in %q not %q", opts.Lock.dirname, dirname) - } - if err := opts.Lock.refForOpen(); err != nil { - return nil, err - } - fileLock = opts.Lock - } else { - fileLock, err = LockDirectory(dirname, opts.FS) - if err != nil { - return nil, err - } - } - defer func() { - if db == nil { - fileLock.Close() - } - }() - - // Establish the format major version. - formatVersion, formatVersionMarker, err := lookupFormatMajorVersion(opts.FS, dirname) - if err != nil { - return nil, err - } - defer func() { - if db == nil { - formatVersionMarker.Close() - } - }() - - // Find the currently active manifest, if there is one. 
- manifestMarker, manifestFileNum, manifestExists, err := findCurrentManifest(formatVersion, opts.FS, dirname) - if err != nil { - return nil, errors.Wrapf(err, "pebble: database %q", dirname) - } - defer func() { - if db == nil { - manifestMarker.Close() - } - }() - - // Atomic markers may leave behind obsolete files if there's a crash - // mid-update. Clean these up if we're not in read-only mode. - if !opts.ReadOnly { - if err := formatVersionMarker.RemoveObsolete(); err != nil { - return nil, err - } - if err := manifestMarker.RemoveObsolete(); err != nil { - return nil, err - } - } - - if opts.Cache == nil { - opts.Cache = cache.New(cacheDefaultSize) - } else { - opts.Cache.Ref() - } - - d := &DB{ - cacheID: opts.Cache.NewID(), - dirname: dirname, - walDirname: walDirname, - opts: opts, - cmp: opts.Comparer.Compare, - equal: opts.equal(), - merge: opts.Merger.Merge, - split: opts.Comparer.Split, - abbreviatedKey: opts.Comparer.AbbreviatedKey, - largeBatchThreshold: (opts.MemTableSize - uint64(memTableEmptySize)) / 2, - fileLock: fileLock, - dataDir: dataDir, - walDir: walDir, - logRecycler: logRecycler{limit: opts.MemTableStopWritesThreshold + 1}, - closed: new(atomic.Value), - closedCh: make(chan struct{}), - } - d.mu.versions = &versionSet{} - d.diskAvailBytes.Store(math.MaxUint64) - - defer func() { - // If an error or panic occurs during open, attempt to release the manually - // allocated memory resources. Note that rather than look for an error, we - // look for the return of a nil DB pointer. - if r := recover(); db == nil { - // Release our references to the Cache. Note that both the DB, and - // tableCache have a reference. When we release the reference to - // the tableCache, and if there are no other references to - // the tableCache, then the tableCache will also release its - // reference to the cache. 
- opts.Cache.Unref() - - if d.tableCache != nil { - _ = d.tableCache.close() - } - - for _, mem := range d.mu.mem.queue { - switch t := mem.flushable.(type) { - case *memTable: - manual.Free(t.arenaBuf) - t.arenaBuf = nil - } - } - if d.cleanupManager != nil { - d.cleanupManager.Close() - } - if d.objProvider != nil { - d.objProvider.Close() - } - if r != nil { - panic(r) - } - } - }() - - d.commit = newCommitPipeline(commitEnv{ - logSeqNum: &d.mu.versions.logSeqNum, - visibleSeqNum: &d.mu.versions.visibleSeqNum, - apply: d.commitApply, - write: d.commitWrite, - }) - d.mu.nextJobID = 1 - d.mu.mem.nextSize = opts.MemTableSize - if d.mu.mem.nextSize > initialMemTableSize { - d.mu.mem.nextSize = initialMemTableSize - } - d.mu.compact.cond.L = &d.mu.Mutex - d.mu.compact.inProgress = make(map[*compaction]struct{}) - d.mu.compact.noOngoingFlushStartTime = time.Now() - d.mu.snapshots.init() - // logSeqNum is the next sequence number that will be assigned. - // Start assigning sequence numbers from base.SeqNumStart to leave - // room for reserved sequence numbers (see comments around - // SeqNumStart). - d.mu.versions.logSeqNum.Store(base.SeqNumStart) - d.mu.formatVers.vers.Store(uint64(formatVersion)) - d.mu.formatVers.marker = formatVersionMarker - - d.timeNow = time.Now - d.openedAt = d.timeNow() - - d.mu.Lock() - defer d.mu.Unlock() - - jobID := d.mu.nextJobID - d.mu.nextJobID++ - - setCurrent := setCurrentFunc(d.FormatMajorVersion(), manifestMarker, opts.FS, dirname, d.dataDir) - - if !manifestExists { - // DB does not exist. - if d.opts.ErrorIfNotExists || d.opts.ReadOnly { - return nil, errors.Wrapf(ErrDBDoesNotExist, "dirname=%q", dirname) - } - - // Create the DB. - if err := d.mu.versions.create(jobID, dirname, opts, manifestMarker, setCurrent, d.FormatMajorVersion, &d.mu.Mutex); err != nil { - return nil, err - } - } else { - if opts.ErrorIfExists { - return nil, errors.Wrapf(ErrDBAlreadyExists, "dirname=%q", dirname) - } - // Load the version set. 
- if err := d.mu.versions.load(dirname, opts, manifestFileNum.FileNum(), manifestMarker, setCurrent, d.FormatMajorVersion, &d.mu.Mutex); err != nil { - return nil, err - } - if opts.ErrorIfNotPristine { - liveFileNums := make(map[base.DiskFileNum]struct{}) - d.mu.versions.addLiveFileNums(liveFileNums) - if len(liveFileNums) != 0 { - return nil, errors.Wrapf(ErrDBNotPristine, "dirname=%q", dirname) - } - } - } - - // In read-only mode, we replay directly into the mutable memtable but never - // flush it. We need to delay creation of the memtable until we know the - // sequence number of the first batch that will be inserted. - if !d.opts.ReadOnly { - var entry *flushableEntry - d.mu.mem.mutable, entry = d.newMemTable(0 /* logNum */, d.mu.versions.logSeqNum.Load()) - d.mu.mem.queue = append(d.mu.mem.queue, entry) - } - - // List the objects - ls, err := opts.FS.List(d.walDirname) - if err != nil { - return nil, err - } - if d.dirname != d.walDirname { - ls2, err := opts.FS.List(d.dirname) - if err != nil { - return nil, err - } - ls = append(ls, ls2...) 
- } - providerSettings := objstorageprovider.Settings{ - Logger: opts.Logger, - FS: opts.FS, - FSDirName: dirname, - FSDirInitialListing: ls, - FSCleaner: opts.Cleaner, - NoSyncOnClose: opts.NoSyncOnClose, - BytesPerSync: opts.BytesPerSync, - } - providerSettings.Local.ReadaheadConfigFn = opts.Local.ReadaheadConfigFn - providerSettings.Remote.StorageFactory = opts.Experimental.RemoteStorage - providerSettings.Remote.CreateOnShared = opts.Experimental.CreateOnShared - providerSettings.Remote.CreateOnSharedLocator = opts.Experimental.CreateOnSharedLocator - providerSettings.Remote.CacheSizeBytes = opts.Experimental.SecondaryCacheSizeBytes - - d.objProvider, err = objstorageprovider.Open(providerSettings) - if err != nil { - return nil, err - } - - d.cleanupManager = openCleanupManager(opts, d.objProvider, d.onObsoleteTableDelete, d.getDeletionPacerInfo) - - if manifestExists { - curVersion := d.mu.versions.currentVersion() - if err := checkConsistency(curVersion, dirname, d.objProvider); err != nil { - return nil, err - } - } - - tableCacheSize := TableCacheSize(opts.MaxOpenFiles) - d.tableCache = newTableCacheContainer(opts.TableCache, d.cacheID, d.objProvider, d.opts, tableCacheSize) - d.newIters = d.tableCache.newIters - d.tableNewRangeKeyIter = d.tableCache.newRangeKeyIter - - // Replay any newer log files than the ones named in the manifest. - type fileNumAndName struct { - num FileNum - name string - } - var logFiles []fileNumAndName - var previousOptionsFileNum FileNum - var previousOptionsFilename string - for _, filename := range ls { - ft, fn, ok := base.ParseFilename(opts.FS, filename) - if !ok { - continue - } - - // Don't reuse any obsolete file numbers to avoid modifying an - // ingested sstable's original external file. 
- if d.mu.versions.nextFileNum <= fn.FileNum() { - d.mu.versions.nextFileNum = fn.FileNum() + 1 - } - - switch ft { - case fileTypeLog: - if fn.FileNum() >= d.mu.versions.minUnflushedLogNum { - logFiles = append(logFiles, fileNumAndName{fn.FileNum(), filename}) - } - if d.logRecycler.minRecycleLogNum <= fn.FileNum() { - d.logRecycler.minRecycleLogNum = fn.FileNum() + 1 - } - case fileTypeOptions: - if previousOptionsFileNum < fn.FileNum() { - previousOptionsFileNum = fn.FileNum() - previousOptionsFilename = filename - } - case fileTypeTemp, fileTypeOldTemp: - if !d.opts.ReadOnly { - // Some codepaths write to a temporary file and then - // rename it to its final location when complete. A - // temp file is leftover if a process exits before the - // rename. Remove it. - err := opts.FS.Remove(opts.FS.PathJoin(dirname, filename)) - if err != nil { - return nil, err - } - } - } - } - - // Ratchet d.mu.versions.nextFileNum ahead of all known objects in the - // objProvider. This avoids FileNum collisions with obsolete sstables. - objects := d.objProvider.List() - for _, obj := range objects { - if d.mu.versions.nextFileNum <= obj.DiskFileNum.FileNum() { - d.mu.versions.nextFileNum = obj.DiskFileNum.FileNum() + 1 - } - } - - // Validate the most-recent OPTIONS file, if there is one. - var strictWALTail bool - if previousOptionsFilename != "" { - path := opts.FS.PathJoin(dirname, previousOptionsFilename) - strictWALTail, err = checkOptions(opts, path) - if err != nil { - return nil, err - } - } - - sort.Slice(logFiles, func(i, j int) bool { - return logFiles[i].num < logFiles[j].num - }) - - var ve versionEdit - var toFlush flushableList - for i, lf := range logFiles { - lastWAL := i == len(logFiles)-1 - flush, maxSeqNum, err := d.replayWAL(jobID, &ve, opts.FS, - opts.FS.PathJoin(d.walDirname, lf.name), lf.num, strictWALTail && !lastWAL) - if err != nil { - return nil, err - } - toFlush = append(toFlush, flush...) 
- d.mu.versions.markFileNumUsed(lf.num) - if d.mu.versions.logSeqNum.Load() < maxSeqNum { - d.mu.versions.logSeqNum.Store(maxSeqNum) - } - } - d.mu.versions.visibleSeqNum.Store(d.mu.versions.logSeqNum.Load()) - - if !d.opts.ReadOnly { - // Create an empty .log file. - newLogNum := d.mu.versions.getNextFileNum() - - // This logic is slightly different than RocksDB's. Specifically, RocksDB - // sets MinUnflushedLogNum to max-recovered-log-num + 1. We set it to the - // newLogNum. There should be no difference in using either value. - ve.MinUnflushedLogNum = newLogNum - - // Create the manifest with the updated MinUnflushedLogNum before - // creating the new log file. If we created the log file first, a - // crash before the manifest is synced could leave two WALs with - // unclean tails. - d.mu.versions.logLock() - if err := d.mu.versions.logAndApply(jobID, &ve, newFileMetrics(ve.NewFiles), false /* forceRotation */, func() []compactionInfo { - return nil - }); err != nil { - return nil, err - } - - for _, entry := range toFlush { - entry.readerUnrefLocked(true) - } - - newLogName := base.MakeFilepath(opts.FS, d.walDirname, fileTypeLog, newLogNum.DiskFileNum()) - d.mu.log.queue = append(d.mu.log.queue, fileInfo{fileNum: newLogNum.DiskFileNum(), fileSize: 0}) - logFile, err := opts.FS.Create(newLogName) - if err != nil { - return nil, err - } - if err := d.walDir.Sync(); err != nil { - return nil, err - } - d.opts.EventListener.WALCreated(WALCreateInfo{ - JobID: jobID, - Path: newLogName, - FileNum: newLogNum, - }) - // This isn't strictly necessary as we don't use the log number for - // memtables being flushed, only for the next unflushed memtable. 
- d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum = newLogNum - - logFile = vfs.NewSyncingFile(logFile, vfs.SyncingFileOptions{ - NoSyncOnClose: d.opts.NoSyncOnClose, - BytesPerSync: d.opts.WALBytesPerSync, - PreallocateSize: d.walPreallocateSize(), - }) - d.mu.log.metrics.fsyncLatency = prometheus.NewHistogram(prometheus.HistogramOpts{ - Buckets: FsyncLatencyBuckets, - }) - - logWriterConfig := record.LogWriterConfig{ - WALMinSyncInterval: d.opts.WALMinSyncInterval, - WALFsyncLatency: d.mu.log.metrics.fsyncLatency, - QueueSemChan: d.commit.logSyncQSem, - } - d.mu.log.LogWriter = record.NewLogWriter(logFile, newLogNum, logWriterConfig) - d.mu.versions.metrics.WAL.Files++ - } - d.updateReadStateLocked(d.opts.DebugCheck) - - // If the Options specify a format major version higher than the - // loaded database's, upgrade it. If this is a new database, this - // code path also performs an initial upgrade from the starting - // implicit MostCompatible version. - // - // We ratchet the version this far into Open so that migrations have a read - // state available. - if !d.opts.ReadOnly && opts.FormatMajorVersion > d.FormatMajorVersion() { - if err := d.ratchetFormatMajorVersionLocked(opts.FormatMajorVersion); err != nil { - return nil, err - } - } - - if !d.opts.ReadOnly { - // Write the current options to disk. - d.optionsFileNum = d.mu.versions.getNextFileNum().DiskFileNum() - tmpPath := base.MakeFilepath(opts.FS, dirname, fileTypeTemp, d.optionsFileNum) - optionsPath := base.MakeFilepath(opts.FS, dirname, fileTypeOptions, d.optionsFileNum) - - // Write them to a temporary file first, in case we crash before - // we're done. A corrupt options file prevents opening the - // database. 
- optionsFile, err := opts.FS.Create(tmpPath) - if err != nil { - return nil, err - } - serializedOpts := []byte(opts.String()) - if _, err := optionsFile.Write(serializedOpts); err != nil { - return nil, errors.CombineErrors(err, optionsFile.Close()) - } - d.optionsFileSize = uint64(len(serializedOpts)) - if err := optionsFile.Sync(); err != nil { - return nil, errors.CombineErrors(err, optionsFile.Close()) - } - if err := optionsFile.Close(); err != nil { - return nil, err - } - // Atomically rename to the OPTIONS-XXXXXX path. This rename is - // guaranteed to be atomic because the destination path does not - // exist. - if err := opts.FS.Rename(tmpPath, optionsPath); err != nil { - return nil, err - } - if err := d.dataDir.Sync(); err != nil { - return nil, err - } - } - - if !d.opts.ReadOnly { - d.scanObsoleteFiles(ls) - d.deleteObsoleteFiles(jobID) - } else { - // All the log files are obsolete. - d.mu.versions.metrics.WAL.Files = int64(len(logFiles)) - } - d.mu.tableStats.cond.L = &d.mu.Mutex - d.mu.tableValidation.cond.L = &d.mu.Mutex - if !d.opts.ReadOnly { - d.maybeCollectTableStatsLocked() - } - d.calculateDiskAvailableBytes() - - d.maybeScheduleFlush() - d.maybeScheduleCompaction() - - // Note: this is a no-op if invariants are disabled or race is enabled. - // - // Setting a finalizer on *DB causes *DB to never be reclaimed and the - // finalizer to never be run. The problem is due to this limitation of - // finalizers mention in the SetFinalizer docs: - // - // If a cyclic structure includes a block with a finalizer, that cycle is - // not guaranteed to be garbage collected and the finalizer is not - // guaranteed to run, because there is no ordering that respects the - // dependencies. - // - // DB has cycles with several of its internal structures: readState, - // newIters, tableCache, versions, etc. Each of this individually cause a - // cycle and prevent the finalizer from being run. 
But we can workaround this - // finializer limitation by setting a finalizer on another object that is - // tied to the lifetime of DB: the DB.closed atomic.Value. - dPtr := fmt.Sprintf("%p", d) - invariants.SetFinalizer(d.closed, func(obj interface{}) { - v := obj.(*atomic.Value) - if err := v.Load(); err == nil { - fmt.Fprintf(os.Stderr, "%s: unreferenced DB not closed\n", dPtr) - os.Exit(1) - } - }) - - return d, nil -} - -// prepareAndOpenDirs opens the directories for the store (and creates them if -// necessary). -// -// Returns an error if ReadOnly is set and the directories don't exist. -func prepareAndOpenDirs( - dirname string, opts *Options, -) (walDirname string, dataDir vfs.File, walDir vfs.File, err error) { - walDirname = opts.WALDir - if opts.WALDir == "" { - walDirname = dirname - } - - // Create directories if needed. - if !opts.ReadOnly { - if err := opts.FS.MkdirAll(dirname, 0755); err != nil { - return "", nil, nil, err - } - if walDirname != dirname { - if err := opts.FS.MkdirAll(walDirname, 0755); err != nil { - return "", nil, nil, err - } - } - } - - dataDir, err = opts.FS.OpenDir(dirname) - if err != nil { - if opts.ReadOnly && oserror.IsNotExist(err) { - return "", nil, nil, errors.Errorf("pebble: database %q does not exist", dirname) - } - return "", nil, nil, err - } - - if walDirname == dirname { - walDir = dataDir - } else { - walDir, err = opts.FS.OpenDir(walDirname) - if err != nil { - dataDir.Close() - return "", nil, nil, err - } - } - return walDirname, dataDir, walDir, nil -} - -// GetVersion returns the engine version string from the latest options -// file present in dir. Used to check what Pebble or RocksDB version was last -// used to write to the database stored in this directory. An empty string is -// returned if no valid OPTIONS file with a version key was found. 
-func GetVersion(dir string, fs vfs.FS) (string, error) { - ls, err := fs.List(dir) - if err != nil { - return "", err - } - var version string - lastOptionsSeen := FileNum(0) - for _, filename := range ls { - ft, fn, ok := base.ParseFilename(fs, filename) - if !ok { - continue - } - switch ft { - case fileTypeOptions: - // If this file has a higher number than the last options file - // processed, reset version. This is because rocksdb often - // writes multiple options files without deleting previous ones. - // Otherwise, skip parsing this options file. - if fn.FileNum() > lastOptionsSeen { - version = "" - lastOptionsSeen = fn.FileNum() - } else { - continue - } - f, err := fs.Open(fs.PathJoin(dir, filename)) - if err != nil { - return "", err - } - data, err := io.ReadAll(f) - f.Close() - - if err != nil { - return "", err - } - err = parseOptions(string(data), func(section, key, value string) error { - switch { - case section == "Version": - switch key { - case "pebble_version": - version = value - case "rocksdb_version": - version = fmt.Sprintf("rocksdb v%s", value) - } - } - return nil - }) - if err != nil { - return "", err - } - } - } - return version, nil -} - -// replayWAL replays the edits in the specified log file. If the DB is in -// read only mode, then the WALs are replayed into memtables and not flushed. If -// the DB is not in read only mode, then the contents of the WAL are guaranteed -// to be flushed. -// -// The toFlush return value is a list of flushables associated with the WAL -// being replayed which will be flushed. Once the version edit has been applied -// to the manifest, it is up to the caller of replayWAL to unreference the -// toFlush flushables returned by replayWAL. -// -// d.mu must be held when calling this, but the mutex may be dropped and -// re-acquired during the course of this method. 
-func (d *DB) replayWAL( - jobID int, ve *versionEdit, fs vfs.FS, filename string, logNum FileNum, strictWALTail bool, -) (toFlush flushableList, maxSeqNum uint64, err error) { - file, err := fs.Open(filename) - if err != nil { - return nil, 0, err - } - defer file.Close() - var ( - b Batch - buf bytes.Buffer - mem *memTable - entry *flushableEntry - rr = record.NewReader(file, logNum) - offset int64 // byte offset in rr - lastFlushOffset int64 - keysReplayed int64 // number of keys replayed - batchesReplayed int64 // number of batches replayed - ) - - // TODO(jackson): This function is interspersed with panics, in addition to - // corruption error propagation. Audit them to ensure we're truly only - // panicking where the error points to Pebble bug and not user or - // hardware-induced corruption. - - if d.opts.ReadOnly { - // In read-only mode, we replay directly into the mutable memtable which will - // never be flushed. - mem = d.mu.mem.mutable - if mem != nil { - entry = d.mu.mem.queue[len(d.mu.mem.queue)-1] - } - } - - // Flushes the current memtable, if not nil. - flushMem := func() { - if mem == nil { - return - } - var logSize uint64 - if offset >= lastFlushOffset { - logSize = uint64(offset - lastFlushOffset) - } - // Else, this was the initial memtable in the read-only case which must have - // been empty, but we need to flush it since we don't want to add to it later. - lastFlushOffset = offset - entry.logSize = logSize - if !d.opts.ReadOnly { - toFlush = append(toFlush, entry) - } - mem, entry = nil, nil - } - // Creates a new memtable if there is no current memtable. - ensureMem := func(seqNum uint64) { - if mem != nil { - return - } - mem, entry = d.newMemTable(logNum, seqNum) - if d.opts.ReadOnly { - d.mu.mem.mutable = mem - d.mu.mem.queue = append(d.mu.mem.queue, entry) - } - } - - // updateVE is used to update ve with information about new files created - // during the flush of any flushable not of type ingestedFlushable. 
For the - // flushable of type ingestedFlushable we use custom handling below. - updateVE := func() error { - // TODO(bananabrick): See if we can use the actual base level here, - // instead of using 1. - c := newFlush(d.opts, d.mu.versions.currentVersion(), - 1 /* base level */, toFlush, d.timeNow()) - newVE, _, _, err := d.runCompaction(jobID, c) - if err != nil { - return errors.Wrapf(err, "running compaction during WAL replay") - } - ve.NewFiles = append(ve.NewFiles, newVE.NewFiles...) - return nil - } - defer func() { - if err != nil { - err = errors.WithDetailf(err, "replaying log %s, offset %d", logNum, offset) - } - }() - - for { - offset = rr.Offset() - r, err := rr.Next() - if err == nil { - _, err = io.Copy(&buf, r) - } - if err != nil { - // It is common to encounter a zeroed or invalid chunk due to WAL - // preallocation and WAL recycling. We need to distinguish these - // errors from EOF in order to recognize that the record was - // truncated and to avoid replaying subsequent WALs, but want - // to otherwise treat them like EOF. - if err == io.EOF { - break - } else if record.IsInvalidRecord(err) && !strictWALTail { - break - } - return nil, 0, errors.Wrap(err, "pebble: error when replaying WAL") - } - - if buf.Len() < batchHeaderLen { - return nil, 0, base.CorruptionErrorf("pebble: corrupt log file %q (num %s)", - filename, errors.Safe(logNum)) - } - - if d.opts.ErrorIfNotPristine { - return nil, 0, errors.WithDetailf(ErrDBNotPristine, "location: %q", d.dirname) - } - - // Specify Batch.db so that Batch.SetRepr will compute Batch.memTableSize - // which is used below. 
- b = Batch{} - b.db = d - b.SetRepr(buf.Bytes()) - seqNum := b.SeqNum() - maxSeqNum = seqNum + uint64(b.Count()) - keysReplayed += int64(b.Count()) - batchesReplayed++ - { - br := b.Reader() - if kind, encodedFileNum, _, ok, err := br.Next(); err != nil { - return nil, 0, err - } else if ok && kind == InternalKeyKindIngestSST { - fileNums := make([]base.DiskFileNum, 0, b.Count()) - addFileNum := func(encodedFileNum []byte) { - fileNum, n := binary.Uvarint(encodedFileNum) - if n <= 0 { - panic("pebble: ingest sstable file num is invalid.") - } - fileNums = append(fileNums, base.FileNum(fileNum).DiskFileNum()) - } - addFileNum(encodedFileNum) - - for i := 1; i < int(b.Count()); i++ { - kind, encodedFileNum, _, ok, err := br.Next() - if err != nil { - return nil, 0, err - } - if kind != InternalKeyKindIngestSST { - panic("pebble: invalid batch key kind.") - } - if !ok { - panic("pebble: invalid batch count.") - } - addFileNum(encodedFileNum) - } - - if _, _, _, ok, err := br.Next(); err != nil { - return nil, 0, err - } else if ok { - panic("pebble: invalid number of entries in batch.") - } - - meta := make([]*fileMetadata, len(fileNums)) - for i, n := range fileNums { - var readable objstorage.Readable - objMeta, err := d.objProvider.Lookup(fileTypeTable, n) - if err != nil { - return nil, 0, errors.Wrap(err, "pebble: error when looking up ingested SSTs") - } - if objMeta.IsRemote() { - readable, err = d.objProvider.OpenForReading(context.TODO(), fileTypeTable, n, objstorage.OpenOptions{MustExist: true}) - if err != nil { - return nil, 0, errors.Wrap(err, "pebble: error when opening flushable ingest files") - } - } else { - path := base.MakeFilepath(d.opts.FS, d.dirname, fileTypeTable, n) - f, err := d.opts.FS.Open(path) - if err != nil { - return nil, 0, err - } - - readable, err = sstable.NewSimpleReadable(f) - if err != nil { - return nil, 0, err - } - } - // NB: ingestLoad1 will close readable. 
- meta[i], err = ingestLoad1(d.opts, d.FormatMajorVersion(), readable, d.cacheID, n) - if err != nil { - return nil, 0, errors.Wrap(err, "pebble: error when loading flushable ingest files") - } - } - - if uint32(len(meta)) != b.Count() { - panic("pebble: couldn't load all files in WAL entry.") - } - - entry, err = d.newIngestedFlushableEntry( - meta, seqNum, logNum, - ) - if err != nil { - return nil, 0, err - } - - if d.opts.ReadOnly { - d.mu.mem.queue = append(d.mu.mem.queue, entry) - // We added the IngestSST flushable to the queue. But there - // must be at least one WAL entry waiting to be replayed. We - // have to ensure this newer WAL entry isn't replayed into - // the current value of d.mu.mem.mutable because the current - // mutable memtable exists before this flushable entry in - // the memtable queue. To ensure this, we just need to unset - // d.mu.mem.mutable. When a newer WAL is replayed, we will - // set d.mu.mem.mutable to a newer value. - d.mu.mem.mutable = nil - } else { - toFlush = append(toFlush, entry) - // During WAL replay, the lsm only has L0, hence, the - // baseLevel is 1. For the sake of simplicity, we place the - // ingested files in L0 here, instead of finding their - // target levels. This is a simplification for the sake of - // simpler code. It is expected that WAL replay should be - // rare, and that flushables of type ingestedFlushable - // should also be rare. So, placing the ingested files in L0 - // is alright. - // - // TODO(bananabrick): Maybe refactor this function to allow - // us to easily place ingested files in levels as low as - // possible during WAL replay. It would require breaking up - // the application of ve to the manifest into chunks and is - // not pretty w/o a refactor to this function and how it's - // used. 
- c := newFlush( - d.opts, d.mu.versions.currentVersion(), - 1, /* base level */ - []*flushableEntry{entry}, - d.timeNow(), - ) - for _, file := range c.flushing[0].flushable.(*ingestedFlushable).files { - ve.NewFiles = append(ve.NewFiles, newFileEntry{Level: 0, Meta: file.FileMetadata}) - } - } - return toFlush, maxSeqNum, nil - } - } - - if b.memTableSize >= uint64(d.largeBatchThreshold) { - flushMem() - // Make a copy of the data slice since it is currently owned by buf and will - // be reused in the next iteration. - b.data = append([]byte(nil), b.data...) - b.flushable, err = newFlushableBatch(&b, d.opts.Comparer) - if err != nil { - return nil, 0, err - } - entry := d.newFlushableEntry(b.flushable, logNum, b.SeqNum()) - // Disable memory accounting by adding a reader ref that will never be - // removed. - entry.readerRefs.Add(1) - if d.opts.ReadOnly { - d.mu.mem.queue = append(d.mu.mem.queue, entry) - // We added the flushable batch to the flushable to the queue. - // But there must be at least one WAL entry waiting to be - // replayed. We have to ensure this newer WAL entry isn't - // replayed into the current value of d.mu.mem.mutable because - // the current mutable memtable exists before this flushable - // entry in the memtable queue. To ensure this, we just need to - // unset d.mu.mem.mutable. When a newer WAL is replayed, we will - // set d.mu.mem.mutable to a newer value. - d.mu.mem.mutable = nil - } else { - toFlush = append(toFlush, entry) - } - } else { - ensureMem(seqNum) - if err = mem.prepare(&b); err != nil && err != arenaskl.ErrArenaFull { - return nil, 0, err - } - // We loop since DB.newMemTable() slowly grows the size of allocated memtables, so the - // batch may not initially fit, but will eventually fit (since it is smaller than - // largeBatchThreshold). 
- for err == arenaskl.ErrArenaFull { - flushMem() - ensureMem(seqNum) - err = mem.prepare(&b) - if err != nil && err != arenaskl.ErrArenaFull { - return nil, 0, err - } - } - if err = mem.apply(&b, seqNum); err != nil { - return nil, 0, err - } - mem.writerUnref() - } - buf.Reset() - } - - d.opts.Logger.Infof("[JOB %d] WAL file %s with log number %s stopped reading at offset: %d; replayed %d keys in %d batches", jobID, filename, logNum.String(), offset, keysReplayed, batchesReplayed) - flushMem() - - // mem is nil here. - if !d.opts.ReadOnly { - err = updateVE() - if err != nil { - return nil, 0, err - } - } - return toFlush, maxSeqNum, err -} - -func checkOptions(opts *Options, path string) (strictWALTail bool, err error) { - f, err := opts.FS.Open(path) - if err != nil { - return false, err - } - defer f.Close() - - data, err := io.ReadAll(f) - if err != nil { - return false, err - } - return opts.checkOptions(string(data)) -} - -// DBDesc briefly describes high-level state about a database. -type DBDesc struct { - // Exists is true if an existing database was found. - Exists bool - // FormatMajorVersion indicates the database's current format - // version. - FormatMajorVersion FormatMajorVersion - // ManifestFilename is the filename of the current active manifest, - // if the database exists. - ManifestFilename string -} - -// Peek looks for an existing database in dirname on the provided FS. It -// returns a brief description of the database. Peek is read-only and -// does not open the database -func Peek(dirname string, fs vfs.FS) (*DBDesc, error) { - vers, versMarker, err := lookupFormatMajorVersion(fs, dirname) - if err != nil { - return nil, err - } - // TODO(jackson): Immediately closing the marker is clunky. Add a - // PeekMarker variant that avoids opening the directory. - if err := versMarker.Close(); err != nil { - return nil, err - } - - // Find the currently active manifest, if there is one. 
- manifestMarker, manifestFileNum, exists, err := findCurrentManifest(vers, fs, dirname) - if err != nil { - return nil, err - } - // TODO(jackson): Immediately closing the marker is clunky. Add a - // PeekMarker variant that avoids opening the directory. - if err := manifestMarker.Close(); err != nil { - return nil, err - } - - desc := &DBDesc{ - Exists: exists, - FormatMajorVersion: vers, - } - if exists { - desc.ManifestFilename = base.MakeFilepath(fs, dirname, fileTypeManifest, manifestFileNum) - } - return desc, nil -} - -// LockDirectory acquires the database directory lock in the named directory, -// preventing another process from opening the database. LockDirectory returns a -// handle to the held lock that may be passed to Open through Options.Lock to -// subsequently open the database, skipping lock acquistion during Open. -// -// LockDirectory may be used to expand the critical section protected by the -// database lock to include setup before the call to Open. -func LockDirectory(dirname string, fs vfs.FS) (*Lock, error) { - fileLock, err := fs.Lock(base.MakeFilepath(fs, dirname, fileTypeLock, base.FileNum(0).DiskFileNum())) - if err != nil { - return nil, err - } - l := &Lock{dirname: dirname, fileLock: fileLock} - l.refs.Store(1) - invariants.SetFinalizer(l, func(obj interface{}) { - if refs := obj.(*Lock).refs.Load(); refs > 0 { - panic(errors.AssertionFailedf("lock for %q finalized with %d refs", dirname, refs)) - } - }) - return l, nil -} - -// Lock represents a file lock on a directory. It may be passed to Open through -// Options.Lock to elide lock aquisition during Open. -type Lock struct { - dirname string - fileLock io.Closer - // refs is a count of the number of handles on the lock. refs must be 0, 1 - // or 2. - // - // When acquired by the client and passed to Open, refs = 1 and the Open - // call increments it to 2. When the database is closed, it's decremented to - // 1. 
Finally when the original caller, calls Close on the Lock, it's - // drecemented to zero and the underlying file lock is released. - // - // When Open acquires the file lock, refs remains at 1 until the database is - // closed. - refs atomic.Int32 -} - -func (l *Lock) refForOpen() error { - // During Open, when a user passed in a lock, the reference count must be - // exactly 1. If it's zero, the lock is no longer held and is invalid. If - // it's 2, the lock is already in use by another database within the - // process. - if !l.refs.CompareAndSwap(1, 2) { - return errors.Errorf("pebble: unexpected Lock reference count; is the lock already in use?") - } - return nil -} - -// Close releases the lock, permitting another process to lock and open the -// database. Close must not be called until after a database using the Lock has -// been closed. -func (l *Lock) Close() error { - if l.refs.Add(-1) > 0 { - return nil - } - defer func() { l.fileLock = nil }() - return l.fileLock.Close() -} - -// ErrDBDoesNotExist is generated when ErrorIfNotExists is set and the database -// does not exist. -// -// Note that errors can be wrapped with more details; use errors.Is(). -var ErrDBDoesNotExist = errors.New("pebble: database does not exist") - -// ErrDBAlreadyExists is generated when ErrorIfExists is set and the database -// already exists. -// -// Note that errors can be wrapped with more details; use errors.Is(). -var ErrDBAlreadyExists = errors.New("pebble: database already exists") - -// ErrDBNotPristine is generated when ErrorIfNotPristine is set and the database -// already exists and is not pristine. -// -// Note that errors can be wrapped with more details; use errors.Is(). -var ErrDBNotPristine = errors.New("pebble: database already exists and is not pristine") - -// IsCorruptionError returns true if the given error indicates database -// corruption. 
-func IsCorruptionError(err error) bool { - return errors.Is(err, base.ErrCorruption) -} - -func checkConsistency(v *manifest.Version, dirname string, objProvider objstorage.Provider) error { - var buf bytes.Buffer - var args []interface{} - - dedup := make(map[base.DiskFileNum]struct{}) - for level, files := range v.Levels { - iter := files.Iter() - for f := iter.First(); f != nil; f = iter.Next() { - backingState := f.FileBacking - if _, ok := dedup[backingState.DiskFileNum]; ok { - continue - } - dedup[backingState.DiskFileNum] = struct{}{} - fileNum := backingState.DiskFileNum - fileSize := backingState.Size - // We allow foreign objects to have a mismatch between sizes. This is - // because we might skew the backing size stored by our objprovider - // to prevent us from over-prioritizing this file for compaction. - meta, err := objProvider.Lookup(base.FileTypeTable, fileNum) - var size int64 - if err == nil { - if objProvider.IsSharedForeign(meta) { - continue - } - size, err = objProvider.Size(meta) - } - if err != nil { - buf.WriteString("L%d: %s: %v\n") - args = append(args, errors.Safe(level), errors.Safe(fileNum), err) - continue - } - - if size != int64(fileSize) { - buf.WriteString("L%d: %s: object size mismatch (%s): %d (disk) != %d (MANIFEST)\n") - args = append(args, errors.Safe(level), errors.Safe(fileNum), objProvider.Path(meta), - errors.Safe(size), errors.Safe(fileSize)) - continue - } - } - } - - if buf.Len() == 0 { - return nil - } - return errors.Errorf(buf.String(), args...) -} diff --git a/vendor/github.com/cockroachdb/pebble/record/record.go b/vendor/github.com/cockroachdb/pebble/record/record.go deleted file mode 100644 index 9b42a4c..0000000 --- a/vendor/github.com/cockroachdb/pebble/record/record.go +++ /dev/null @@ -1,644 +0,0 @@ -// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. 
- -// Package record reads and writes sequences of records. Each record is a stream -// of bytes that completes before the next record starts. -// -// When reading, call Next to obtain an io.Reader for the next record. Next will -// return io.EOF when there are no more records. It is valid to call Next -// without reading the current record to exhaustion. -// -// When writing, call Next to obtain an io.Writer for the next record. Calling -// Next finishes the current record. Call Close to finish the final record. -// -// Optionally, call Flush to finish the current record and flush the underlying -// writer without starting a new record. To start a new record after flushing, -// call Next. -// -// Neither Readers or Writers are safe to use concurrently. -// -// Example code: -// -// func read(r io.Reader) ([]string, error) { -// var ss []string -// records := record.NewReader(r) -// for { -// rec, err := records.Next() -// if err == io.EOF { -// break -// } -// if err != nil { -// log.Printf("recovering from %v", err) -// r.Recover() -// continue -// } -// s, err := io.ReadAll(rec) -// if err != nil { -// log.Printf("recovering from %v", err) -// r.Recover() -// continue -// } -// ss = append(ss, string(s)) -// } -// return ss, nil -// } -// -// func write(w io.Writer, ss []string) error { -// records := record.NewWriter(w) -// for _, s := range ss { -// rec, err := records.Next() -// if err != nil { -// return err -// } -// if _, err := rec.Write([]byte(s)), err != nil { -// return err -// } -// } -// return records.Close() -// } -// -// The wire format is that the stream is divided into 32KiB blocks, and each -// block contains a number of tightly packed chunks. Chunks cannot cross block -// boundaries. The last block may be shorter than 32 KiB. Any unused bytes in a -// block must be zero. -// -// A record maps to one or more chunks. There are two chunk formats: legacy and -// recyclable. 
The legacy chunk format: -// -// +----------+-----------+-----------+--- ... ---+ -// | CRC (4B) | Size (2B) | Type (1B) | Payload | -// +----------+-----------+-----------+--- ... ---+ -// -// CRC is computed over the type and payload -// Size is the length of the payload in bytes -// Type is the chunk type -// -// There are four chunk types: whether the chunk is the full record, or the -// first, middle or last chunk of a multi-chunk record. A multi-chunk record -// has one first chunk, zero or more middle chunks, and one last chunk. -// -// The recyclyable chunk format is similar to the legacy format, but extends -// the chunk header with an additional log number field. This allows reuse -// (recycling) of log files which can provide significantly better performance -// when syncing frequently as it avoids needing to update the file -// metadata. Additionally, recycling log files is a prequisite for using direct -// IO with log writing. The recyclyable format is: -// -// +----------+-----------+-----------+----------------+--- ... ---+ -// | CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload | -// +----------+-----------+-----------+----------------+--- ... ---+ -// -// Recyclable chunks are distinguished from legacy chunks by the addition of 4 -// extra "recyclable" chunk types that map directly to the legacy chunk types -// (i.e. full, first, middle, last). The CRC is computed over the type, log -// number, and payload. -// -// The wire format allows for limited recovery in the face of data corruption: -// on a format error (such as a checksum mismatch), the reader moves to the -// next block and looks for the next full or first chunk. -package record - -// The C++ Level-DB code calls this the log, but it has been renamed to record -// to avoid clashing with the standard log package, and because it is generally -// useful outside of logging. 
The C++ code also uses the term "physical record" -// instead of "chunk", but "chunk" is shorter and less confusing. - -import ( - "encoding/binary" - "io" - - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/crc" -) - -// These constants are part of the wire format and should not be changed. -const ( - fullChunkType = 1 - firstChunkType = 2 - middleChunkType = 3 - lastChunkType = 4 - - recyclableFullChunkType = 5 - recyclableFirstChunkType = 6 - recyclableMiddleChunkType = 7 - recyclableLastChunkType = 8 -) - -const ( - blockSize = 32 * 1024 - blockSizeMask = blockSize - 1 - legacyHeaderSize = 7 - recyclableHeaderSize = legacyHeaderSize + 4 -) - -var ( - // ErrNotAnIOSeeker is returned if the io.Reader underlying a Reader does not implement io.Seeker. - ErrNotAnIOSeeker = errors.New("pebble/record: reader does not implement io.Seeker") - - // ErrNoLastRecord is returned if LastRecordOffset is called and there is no previous record. - ErrNoLastRecord = errors.New("pebble/record: no last record exists") - - // ErrZeroedChunk is returned if a chunk is encountered that is zeroed. This - // usually occurs due to log file preallocation. - ErrZeroedChunk = base.CorruptionErrorf("pebble/record: zeroed chunk") - - // ErrInvalidChunk is returned if a chunk is encountered with an invalid - // header, length, or checksum. This usually occurs when a log is recycled, - // but can also occur due to corruption. - ErrInvalidChunk = base.CorruptionErrorf("pebble/record: invalid chunk") -) - -// IsInvalidRecord returns true if the error matches one of the error types -// returned for invalid records. These are treated in a way similar to io.EOF -// in recovery code. -func IsInvalidRecord(err error) bool { - return err == ErrZeroedChunk || err == ErrInvalidChunk || err == io.ErrUnexpectedEOF -} - -// Reader reads records from an underlying io.Reader. -type Reader struct { - // r is the underlying reader. 
- r io.Reader - // logNum is the low 32-bits of the log's file number. May be zero when used - // with log files that do not have a file number (e.g. the MANIFEST). - logNum uint32 - // blockNum is the zero based block number currently held in buf. - blockNum int64 - // seq is the sequence number of the current record. - seq int - // buf[begin:end] is the unread portion of the current chunk's payload. The - // low bound, begin, excludes the chunk header. - begin, end int - // n is the number of bytes of buf that are valid. Once reading has started, - // only the final block can have n < blockSize. - n int - // recovering is true when recovering from corruption. - recovering bool - // last is whether the current chunk is the last chunk of the record. - last bool - // err is any accumulated error. - err error - // buf is the buffer. - buf [blockSize]byte -} - -// NewReader returns a new reader. If the file contains records encoded using -// the recyclable record format, then the log number in those records must -// match the specified logNum. -func NewReader(r io.Reader, logNum base.FileNum) *Reader { - return &Reader{ - r: r, - logNum: uint32(logNum), - blockNum: -1, - } -} - -// nextChunk sets r.buf[r.i:r.j] to hold the next chunk's payload, reading the -// next block into the buffer if necessary. -func (r *Reader) nextChunk(wantFirst bool) error { - for { - if r.end+legacyHeaderSize <= r.n { - checksum := binary.LittleEndian.Uint32(r.buf[r.end+0 : r.end+4]) - length := binary.LittleEndian.Uint16(r.buf[r.end+4 : r.end+6]) - chunkType := r.buf[r.end+6] - - if checksum == 0 && length == 0 && chunkType == 0 { - if r.end+recyclableHeaderSize > r.n { - // Skip the rest of the block if the recyclable header size does not - // fit within it. - r.end = r.n - continue - } - if r.recovering { - // Skip the rest of the block, if it looks like it is all - // zeroes. This is common with WAL preallocation. - // - // Set r.err to be an error so r.recover actually recovers. 
- r.err = ErrZeroedChunk - r.recover() - continue - } - return ErrZeroedChunk - } - - headerSize := legacyHeaderSize - if chunkType >= recyclableFullChunkType && chunkType <= recyclableLastChunkType { - headerSize = recyclableHeaderSize - if r.end+headerSize > r.n { - return ErrInvalidChunk - } - - logNum := binary.LittleEndian.Uint32(r.buf[r.end+7 : r.end+11]) - if logNum != r.logNum { - if wantFirst { - // If we're looking for the first chunk of a record, we can treat a - // previous instance of the log as EOF. - return io.EOF - } - // Otherwise, treat this chunk as invalid in order to prevent reading - // of a partial record. - return ErrInvalidChunk - } - - chunkType -= (recyclableFullChunkType - 1) - } - - r.begin = r.end + headerSize - r.end = r.begin + int(length) - if r.end > r.n { - // The chunk straddles a 32KB boundary (or the end of file). - if r.recovering { - r.recover() - continue - } - return ErrInvalidChunk - } - if checksum != crc.New(r.buf[r.begin-headerSize+6:r.end]).Value() { - if r.recovering { - r.recover() - continue - } - return ErrInvalidChunk - } - if wantFirst { - if chunkType != fullChunkType && chunkType != firstChunkType { - continue - } - } - r.last = chunkType == fullChunkType || chunkType == lastChunkType - r.recovering = false - return nil - } - if r.n < blockSize && r.blockNum >= 0 { - if !wantFirst || r.end != r.n { - // This can happen if the previous instance of the log ended with a - // partial block at the same blockNum as the new log but extended - // beyond the partial block of the new log. - return ErrInvalidChunk - } - return io.EOF - } - n, err := io.ReadFull(r.r, r.buf[:]) - if err != nil && err != io.ErrUnexpectedEOF { - if err == io.EOF && !wantFirst { - return io.ErrUnexpectedEOF - } - return err - } - r.begin, r.end, r.n = 0, 0, n - r.blockNum++ - } -} - -// Next returns a reader for the next record. It returns io.EOF if there are no -// more records. 
The reader returned becomes stale after the next Next call, -// and should no longer be used. -func (r *Reader) Next() (io.Reader, error) { - r.seq++ - if r.err != nil { - return nil, r.err - } - r.begin = r.end - r.err = r.nextChunk(true) - if r.err != nil { - return nil, r.err - } - return singleReader{r, r.seq}, nil -} - -// Offset returns the current offset within the file. If called immediately -// before a call to Next(), Offset() will return the record offset. -func (r *Reader) Offset() int64 { - if r.blockNum < 0 { - return 0 - } - return int64(r.blockNum)*blockSize + int64(r.end) -} - -// recover clears any errors read so far, so that calling Next will start -// reading from the next good 32KiB block. If there are no such blocks, Next -// will return io.EOF. recover also marks the current reader, the one most -// recently returned by Next, as stale. If recover is called without any -// prior error, then recover is a no-op. -func (r *Reader) recover() { - if r.err == nil { - return - } - r.recovering = true - r.err = nil - // Discard the rest of the current block. - r.begin, r.end, r.last = r.n, r.n, false - // Invalidate any outstanding singleReader. - r.seq++ -} - -// seekRecord seeks in the underlying io.Reader such that calling r.Next -// returns the record whose first chunk header starts at the provided offset. -// Its behavior is undefined if the argument given is not such an offset, as -// the bytes at that offset may coincidentally appear to be a valid header. -// -// It returns ErrNotAnIOSeeker if the underlying io.Reader does not implement -// io.Seeker. -// -// seekRecord will fail and return an error if the Reader previously -// encountered an error, including io.EOF. Such errors can be cleared by -// calling Recover. Calling seekRecord after Recover will make calling Next -// return the record at the given offset, instead of the record at the next -// good 32KiB block as Recover normally would. 
Calling seekRecord before -// Recover has no effect on Recover's semantics other than changing the -// starting point for determining the next good 32KiB block. -// -// The offset is always relative to the start of the underlying io.Reader, so -// negative values will result in an error as per io.Seeker. -func (r *Reader) seekRecord(offset int64) error { - r.seq++ - if r.err != nil { - return r.err - } - - s, ok := r.r.(io.Seeker) - if !ok { - return ErrNotAnIOSeeker - } - - // Only seek to an exact block offset. - c := int(offset & blockSizeMask) - if _, r.err = s.Seek(offset&^blockSizeMask, io.SeekStart); r.err != nil { - return r.err - } - - // Clear the state of the internal reader. - r.begin, r.end, r.n = 0, 0, 0 - r.blockNum, r.recovering, r.last = -1, false, false - if r.err = r.nextChunk(false); r.err != nil { - return r.err - } - - // Now skip to the offset requested within the block. A subsequent - // call to Next will return the block at the requested offset. - r.begin, r.end = c, c - - return nil -} - -type singleReader struct { - r *Reader - seq int -} - -func (x singleReader) Read(p []byte) (int, error) { - r := x.r - if r.seq != x.seq { - return 0, errors.New("pebble/record: stale reader") - } - if r.err != nil { - return 0, r.err - } - for r.begin == r.end { - if r.last { - return 0, io.EOF - } - if r.err = r.nextChunk(false); r.err != nil { - return 0, r.err - } - } - n := copy(p, r.buf[r.begin:r.end]) - r.begin += n - return n, nil -} - -// Writer writes records to an underlying io.Writer. -type Writer struct { - // w is the underlying writer. - w io.Writer - // seq is the sequence number of the current record. - seq int - // f is w as a flusher. - f flusher - // buf[i:j] is the bytes that will become the current chunk. - // The low bound, i, includes the chunk header. - i, j int - // buf[:written] has already been written to w. - // written is zero unless Flush has been called. 
- written int - // baseOffset is the base offset in w at which writing started. If - // w implements io.Seeker, it's relative to the start of w, 0 otherwise. - baseOffset int64 - // blockNumber is the zero based block number currently held in buf. - blockNumber int64 - // lastRecordOffset is the offset in w where the last record was - // written (including the chunk header). It is a relative offset to - // baseOffset, thus the absolute offset of the last record is - // baseOffset + lastRecordOffset. - lastRecordOffset int64 - // first is whether the current chunk is the first chunk of the record. - first bool - // pending is whether a chunk is buffered but not yet written. - pending bool - // err is any accumulated error. - err error - // buf is the buffer. - buf [blockSize]byte -} - -// NewWriter returns a new Writer. -func NewWriter(w io.Writer) *Writer { - f, _ := w.(flusher) - - var o int64 - if s, ok := w.(io.Seeker); ok { - var err error - if o, err = s.Seek(0, io.SeekCurrent); err != nil { - o = 0 - } - } - return &Writer{ - w: w, - f: f, - baseOffset: o, - lastRecordOffset: -1, - } -} - -// fillHeader fills in the header for the pending chunk. -func (w *Writer) fillHeader(last bool) { - if w.i+legacyHeaderSize > w.j || w.j > blockSize { - panic("pebble/record: bad writer state") - } - if last { - if w.first { - w.buf[w.i+6] = fullChunkType - } else { - w.buf[w.i+6] = lastChunkType - } - } else { - if w.first { - w.buf[w.i+6] = firstChunkType - } else { - w.buf[w.i+6] = middleChunkType - } - } - binary.LittleEndian.PutUint32(w.buf[w.i+0:w.i+4], crc.New(w.buf[w.i+6:w.j]).Value()) - binary.LittleEndian.PutUint16(w.buf[w.i+4:w.i+6], uint16(w.j-w.i-legacyHeaderSize)) -} - -// writeBlock writes the buffered block to the underlying writer, and reserves -// space for the next chunk's header. 
-func (w *Writer) writeBlock() { - _, w.err = w.w.Write(w.buf[w.written:]) - w.i = 0 - w.j = legacyHeaderSize - w.written = 0 - w.blockNumber++ -} - -// writePending finishes the current record and writes the buffer to the -// underlying writer. -func (w *Writer) writePending() { - if w.err != nil { - return - } - if w.pending { - w.fillHeader(true) - w.pending = false - } - _, w.err = w.w.Write(w.buf[w.written:w.j]) - w.written = w.j -} - -// Close finishes the current record and closes the writer. -func (w *Writer) Close() error { - w.seq++ - w.writePending() - if w.err != nil { - return w.err - } - w.err = errors.New("pebble/record: closed Writer") - return nil -} - -// Flush finishes the current record, writes to the underlying writer, and -// flushes it if that writer implements interface{ Flush() error }. -func (w *Writer) Flush() error { - w.seq++ - w.writePending() - if w.err != nil { - return w.err - } - if w.f != nil { - w.err = w.f.Flush() - return w.err - } - return nil -} - -// Next returns a writer for the next record. The writer returned becomes stale -// after the next Close, Flush or Next call, and should no longer be used. -func (w *Writer) Next() (io.Writer, error) { - w.seq++ - if w.err != nil { - return nil, w.err - } - if w.pending { - w.fillHeader(true) - } - w.i = w.j - w.j = w.j + legacyHeaderSize - // Check if there is room in the block for the header. - if w.j > blockSize { - // Fill in the rest of the block with zeroes. - for k := w.i; k < blockSize; k++ { - w.buf[k] = 0 - } - w.writeBlock() - if w.err != nil { - return nil, w.err - } - } - w.lastRecordOffset = w.baseOffset + w.blockNumber*blockSize + int64(w.i) - w.first = true - w.pending = true - return singleWriter{w, w.seq}, nil -} - -// WriteRecord writes a complete record. Returns the offset just past the end -// of the record. 
-func (w *Writer) WriteRecord(p []byte) (int64, error) { - if w.err != nil { - return -1, w.err - } - t, err := w.Next() - if err != nil { - return -1, err - } - if _, err := t.Write(p); err != nil { - return -1, err - } - w.writePending() - offset := w.blockNumber*blockSize + int64(w.j) - return offset, w.err -} - -// Size returns the current size of the file. -func (w *Writer) Size() int64 { - if w == nil { - return 0 - } - return w.blockNumber*blockSize + int64(w.j) -} - -// LastRecordOffset returns the offset in the underlying io.Writer of the last -// record so far - the one created by the most recent Next call. It is the -// offset of the first chunk header, suitable to pass to Reader.SeekRecord. -// -// If that io.Writer also implements io.Seeker, the return value is an absolute -// offset, in the sense of io.SeekStart, regardless of whether the io.Writer -// was initially at the zero position when passed to NewWriter. Otherwise, the -// return value is a relative offset, being the number of bytes written between -// the NewWriter call and any records written prior to the last record. -// -// If there is no last record, i.e. nothing was written, LastRecordOffset will -// return ErrNoLastRecord. -func (w *Writer) LastRecordOffset() (int64, error) { - if w.err != nil { - return 0, w.err - } - if w.lastRecordOffset < 0 { - return 0, ErrNoLastRecord - } - return w.lastRecordOffset, nil -} - -type singleWriter struct { - w *Writer - seq int -} - -func (x singleWriter) Write(p []byte) (int, error) { - w := x.w - if w.seq != x.seq { - return 0, errors.New("pebble/record: stale writer") - } - if w.err != nil { - return 0, w.err - } - n0 := len(p) - for len(p) > 0 { - // Write a block, if it is full. - if w.j == blockSize { - w.fillHeader(false) - w.writeBlock() - if w.err != nil { - return 0, w.err - } - w.first = false - } - // Copy bytes into the buffer. 
- n := copy(w.buf[w.j:], p) - w.j += n - p = p[n:] - } - return n0, nil -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/block.go b/vendor/github.com/cockroachdb/pebble/sstable/block.go deleted file mode 100644 index c6345ea..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/block.go +++ /dev/null @@ -1,1860 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package sstable - -import ( - "encoding/binary" - "unsafe" - - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manual" - "github.com/cockroachdb/pebble/internal/rangedel" - "github.com/cockroachdb/pebble/internal/rangekey" -) - -func uvarintLen(v uint32) int { - i := 0 - for v >= 0x80 { - v >>= 7 - i++ - } - return i + 1 -} - -type blockWriter struct { - restartInterval int - nEntries int - nextRestart int - buf []byte - // For datablocks in TableFormatPebblev3, we steal the most significant bit - // in restarts for encoding setHasSameKeyPrefixSinceLastRestart. This leaves - // us with 31 bits, which is more than enough (no one needs > 2GB blocks). - // Typically, restarts occur every 16 keys, and by storing this bit with the - // restart, we can optimize for the case where a user wants to skip to the - // next prefix which happens to be in the same data block, but is > 16 keys - // away. We have seen production situations with 100+ versions per MVCC key - // (which share the same prefix). 
Additionally, for such writers, the prefix - // compression of the key, that shares the key with the preceding key, is - // limited to the prefix part of the preceding key -- this ensures that when - // doing NPrefix (see blockIter) we don't need to assemble the full key - // for each step since by limiting the length of the shared key we are - // ensuring that any of the keys with the same prefix can be used to - // assemble the full key when the prefix does change. - restarts []uint32 - // Do not read curKey directly from outside blockWriter since it can have - // the InternalKeyKindSSTableInternalObsoleteBit set. Use getCurKey() or - // getCurUserKey() instead. - curKey []byte - // curValue excludes the optional prefix provided to - // storeWithOptionalValuePrefix. - curValue []byte - prevKey []byte - tmp [4]byte - // We don't know the state of the sets that were at the end of the previous - // block, so this is initially 0. It may be true for the second and later - // restarts in a block. Not having inter-block information is fine since we - // will optimize by stepping through restarts only within the same block. - // Note that the first restart is the first key in the block. - setHasSameKeyPrefixSinceLastRestart bool -} - -func (w *blockWriter) clear() { - *w = blockWriter{ - buf: w.buf[:0], - restarts: w.restarts[:0], - curKey: w.curKey[:0], - curValue: w.curValue[:0], - prevKey: w.prevKey[:0], - } -} - -// MaximumBlockSize is an extremely generous maximum block size of 256MiB. We -// explicitly place this limit to reserve a few bits in the restart for -// internal use. 
-const MaximumBlockSize = 1 << 28 -const setHasSameKeyPrefixRestartMask uint32 = 1 << 31 -const restartMaskLittleEndianHighByteWithoutSetHasSamePrefix byte = 0b0111_1111 -const restartMaskLittleEndianHighByteOnlySetHasSamePrefix byte = 0b1000_0000 - -func (w *blockWriter) getCurKey() InternalKey { - k := base.DecodeInternalKey(w.curKey) - k.Trailer = k.Trailer & trailerObsoleteMask - return k -} - -func (w *blockWriter) getCurUserKey() []byte { - n := len(w.curKey) - base.InternalTrailerLen - if n < 0 { - panic(errors.AssertionFailedf("corrupt key in blockWriter buffer")) - } - return w.curKey[:n:n] -} - -// If !addValuePrefix, the valuePrefix is ignored. -func (w *blockWriter) storeWithOptionalValuePrefix( - keySize int, - value []byte, - maxSharedKeyLen int, - addValuePrefix bool, - valuePrefix valuePrefix, - setHasSameKeyPrefix bool, -) { - shared := 0 - if !setHasSameKeyPrefix { - w.setHasSameKeyPrefixSinceLastRestart = false - } - if w.nEntries == w.nextRestart { - w.nextRestart = w.nEntries + w.restartInterval - restart := uint32(len(w.buf)) - if w.setHasSameKeyPrefixSinceLastRestart { - restart = restart | setHasSameKeyPrefixRestartMask - } - w.setHasSameKeyPrefixSinceLastRestart = true - w.restarts = append(w.restarts, restart) - } else { - // TODO(peter): Manually inlined version of base.SharedPrefixLen(). This - // is 3% faster on BenchmarkWriter on go1.16. Remove if future versions - // show this to not be a performance win. For now, functions that use of - // unsafe cannot be inlined. 
- n := maxSharedKeyLen - if n > len(w.prevKey) { - n = len(w.prevKey) - } - asUint64 := func(b []byte, i int) uint64 { - return binary.LittleEndian.Uint64(b[i:]) - } - for shared < n-7 && asUint64(w.curKey, shared) == asUint64(w.prevKey, shared) { - shared += 8 - } - for shared < n && w.curKey[shared] == w.prevKey[shared] { - shared++ - } - } - - lenValuePlusOptionalPrefix := len(value) - if addValuePrefix { - lenValuePlusOptionalPrefix++ - } - needed := 3*binary.MaxVarintLen32 + len(w.curKey[shared:]) + lenValuePlusOptionalPrefix - n := len(w.buf) - if cap(w.buf) < n+needed { - newCap := 2 * cap(w.buf) - if newCap == 0 { - newCap = 1024 - } - for newCap < n+needed { - newCap *= 2 - } - newBuf := make([]byte, n, newCap) - copy(newBuf, w.buf) - w.buf = newBuf - } - w.buf = w.buf[:n+needed] - - // TODO(peter): Manually inlined versions of binary.PutUvarint(). This is 15% - // faster on BenchmarkWriter on go1.13. Remove if go1.14 or future versions - // show this to not be a performance win. - { - x := uint32(shared) - for x >= 0x80 { - w.buf[n] = byte(x) | 0x80 - x >>= 7 - n++ - } - w.buf[n] = byte(x) - n++ - } - - { - x := uint32(keySize - shared) - for x >= 0x80 { - w.buf[n] = byte(x) | 0x80 - x >>= 7 - n++ - } - w.buf[n] = byte(x) - n++ - } - - { - x := uint32(lenValuePlusOptionalPrefix) - for x >= 0x80 { - w.buf[n] = byte(x) | 0x80 - x >>= 7 - n++ - } - w.buf[n] = byte(x) - n++ - } - - n += copy(w.buf[n:], w.curKey[shared:]) - if addValuePrefix { - w.buf[n : n+1][0] = byte(valuePrefix) - n++ - } - n += copy(w.buf[n:], value) - w.buf = w.buf[:n] - - w.curValue = w.buf[n-len(value):] - - w.nEntries++ -} - -func (w *blockWriter) add(key InternalKey, value []byte) { - w.addWithOptionalValuePrefix( - key, false, value, len(key.UserKey), false, 0, false) -} - -// Callers that always set addValuePrefix to false should use add() instead. 
-// -// isObsolete indicates whether this key-value pair is obsolete in this -// sstable (only applicable when writing data blocks) -- see the comment in -// table.go and the longer one in format.go. addValuePrefix adds a 1 byte -// prefix to the value, specified in valuePrefix -- this is used for data -// blocks in TableFormatPebblev3 onwards for SETs (see the comment in -// format.go, with more details in value_block.go). setHasSameKeyPrefix is -// also used in TableFormatPebblev3 onwards for SETs. -func (w *blockWriter) addWithOptionalValuePrefix( - key InternalKey, - isObsolete bool, - value []byte, - maxSharedKeyLen int, - addValuePrefix bool, - valuePrefix valuePrefix, - setHasSameKeyPrefix bool, -) { - w.curKey, w.prevKey = w.prevKey, w.curKey - - size := key.Size() - if cap(w.curKey) < size { - w.curKey = make([]byte, 0, size*2) - } - w.curKey = w.curKey[:size] - if isObsolete { - key.Trailer = key.Trailer | trailerObsoleteBit - } - key.Encode(w.curKey) - - w.storeWithOptionalValuePrefix( - size, value, maxSharedKeyLen, addValuePrefix, valuePrefix, setHasSameKeyPrefix) -} - -func (w *blockWriter) finish() []byte { - // Write the restart points to the buffer. - if w.nEntries == 0 { - // Every block must have at least one restart point. - if cap(w.restarts) > 0 { - w.restarts = w.restarts[:1] - w.restarts[0] = 0 - } else { - w.restarts = append(w.restarts, 0) - } - } - tmp4 := w.tmp[:4] - for _, x := range w.restarts { - binary.LittleEndian.PutUint32(tmp4, x) - w.buf = append(w.buf, tmp4...) - } - binary.LittleEndian.PutUint32(tmp4, uint32(len(w.restarts))) - w.buf = append(w.buf, tmp4...) - result := w.buf - - // Reset the block state. - w.nEntries = 0 - w.nextRestart = 0 - w.buf = w.buf[:0] - w.restarts = w.restarts[:0] - return result -} - -// emptyBlockSize holds the size of an empty block. Every block ends -// in a uint32 trailer encoding the number of restart points within the -// block. 
-const emptyBlockSize = 4 - -func (w *blockWriter) estimatedSize() int { - return len(w.buf) + 4*len(w.restarts) + emptyBlockSize -} - -type blockEntry struct { - offset int32 - keyStart int32 - keyEnd int32 - valStart int32 - valSize int32 -} - -// blockIter is an iterator over a single block of data. -// -// A blockIter provides an additional guarantee around key stability when a -// block has a restart interval of 1 (i.e. when there is no prefix -// compression). Key stability refers to whether the InternalKey.UserKey bytes -// returned by a positioning call will remain stable after a subsequent -// positioning call. The normal case is that a positioning call will invalidate -// any previously returned InternalKey.UserKey. If a block has a restart -// interval of 1 (no prefix compression), blockIter guarantees that -// InternalKey.UserKey will point to the key as stored in the block itself -// which will remain valid until the blockIter is closed. The key stability -// guarantee is used by the range tombstone and range key code, which knows that -// the respective blocks are always encoded with a restart interval of 1. This -// per-block key stability guarantee is sufficient for range tombstones and -// range deletes as they are always encoded in a single block. -// -// A blockIter also provides a value stability guarantee for range deletions and -// range keys since there is only a single range deletion and range key block -// per sstable and the blockIter will not release the bytes for the block until -// it is closed. -// -// Note on why blockIter knows about lazyValueHandling: -// -// blockIter's positioning functions (that return a LazyValue), are too -// complex to inline even prior to lazyValueHandling. blockIter.Next and -// blockIter.First were by far the cheapest and had costs 195 and 180 -// respectively, which exceeds the budget of 80. 
We initially tried to keep -// the lazyValueHandling logic out of blockIter by wrapping it with a -// lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this -// wrapped iter. The functions in lazyValueDataBlockIter were simple, in that -// they called the corresponding blockIter func and then decided whether the -// value was in fact in-place (so return immediately) or needed further -// handling. But these also turned out too costly for mid-stack inlining since -// simple calls like the following have a high cost that is barely under the -// budget of 80 -// -// k, v := i.data.SeekGE(key, flags) // cost 74 -// k, v := i.data.Next() // cost 72 -// -// We have 2 options for minimizing performance regressions: -// - Include the lazyValueHandling logic in the already non-inlineable -// blockIter functions: Since most of the time is spent in data block iters, -// it is acceptable to take the small hit of unnecessary branching (which -// hopefully branch prediction will predict correctly) for other kinds of -// blocks. -// - Duplicate the logic of singleLevelIterator and twoLevelIterator for the -// v3 sstable and only use the aforementioned lazyValueDataBlockIter for a -// v3 sstable. We would want to manage these copies via code generation. -// -// We have picked the first option here. -type blockIter struct { - cmp Compare - // offset is the byte index that marks where the current key/value is - // encoded in the block. - offset int32 - // nextOffset is the byte index where the next key/value is encoded in the - // block. - nextOffset int32 - // A "restart point" in a block is a point where the full key is encoded, - // instead of just having a suffix of the key encoded. See readEntry() for - // how prefix compression of keys works. Keys in between two restart points - // only have a suffix encoded in the block. When restart interval is 1, no - // prefix compression of keys happens. This is the case with range tombstone - // blocks. 
- // - // All restart offsets are listed in increasing order in - // i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last - // 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can - // therefore be seen as the point where data in the block ends, and a list - // of offsets of all restart points begins. - restarts int32 - // Number of restart points in this block. Encoded at the end of the block - // as a uint32. - numRestarts int32 - globalSeqNum uint64 - ptr unsafe.Pointer - data []byte - // key contains the raw key the iterator is currently pointed at. This may - // point directly to data stored in the block (for a key which has no prefix - // compression), to fullKey (for a prefix compressed key), or to a slice of - // data stored in cachedBuf (during reverse iteration). - key []byte - // fullKey is a buffer used for key prefix decompression. - fullKey []byte - // val contains the value the iterator is currently pointed at. If non-nil, - // this points to a slice of the block data. - val []byte - // lazyValue is val turned into a LazyValue, whenever a positioning method - // returns a non-nil key-value pair. - lazyValue base.LazyValue - // ikey contains the decoded InternalKey the iterator is currently pointed - // at. Note that the memory backing ikey.UserKey is either data stored - // directly in the block, fullKey, or cachedBuf. The key stability guarantee - // for blocks built with a restart interval of 1 is achieved by having - // ikey.UserKey always point to data stored directly in the block. - ikey InternalKey - // cached and cachedBuf are used during reverse iteration. They are needed - // because we can't perform prefix decoding in reverse, only in the forward - // direction. In order to iterate in reverse, we decode and cache the entries - // between two restart points. - // - // Note that cached[len(cached)-1] contains the previous entry to the one the - // blockIter is currently pointed at. 
As usual, nextOffset will contain the - // offset of the next entry. During reverse iteration, nextOffset will be - // updated to point to offset, and we'll set the blockIter to point at the - // entry cached[len(cached)-1]. See Prev() for more details. - // - // For a block encoded with a restart interval of 1, cached and cachedBuf - // will not be used as there are no prefix compressed entries between the - // restart points. - cached []blockEntry - cachedBuf []byte - handle bufferHandle - // for block iteration for already loaded blocks. - firstUserKey []byte - lazyValueHandling struct { - vbr *valueBlockReader - hasValuePrefix bool - } - hideObsoletePoints bool -} - -// blockIter implements the base.InternalIterator interface. -var _ base.InternalIterator = (*blockIter)(nil) - -func newBlockIter(cmp Compare, block block) (*blockIter, error) { - i := &blockIter{} - return i, i.init(cmp, block, 0, false) -} - -func (i *blockIter) String() string { - return "block" -} - -func (i *blockIter) init( - cmp Compare, block block, globalSeqNum uint64, hideObsoletePoints bool, -) error { - numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:])) - if numRestarts == 0 { - return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)") - } - i.cmp = cmp - i.restarts = int32(len(block)) - 4*(1+numRestarts) - i.numRestarts = numRestarts - i.globalSeqNum = globalSeqNum - i.ptr = unsafe.Pointer(&block[0]) - i.data = block - i.fullKey = i.fullKey[:0] - i.val = nil - i.hideObsoletePoints = hideObsoletePoints - i.clearCache() - if i.restarts > 0 { - if err := i.readFirstKey(); err != nil { - return err - } - } else { - // Block is empty. - i.firstUserKey = nil - } - return nil -} - -// NB: two cases of hideObsoletePoints: -// - Local sstable iteration: globalSeqNum will be set iff the sstable was -// ingested. -// - Foreign sstable iteration: globalSeqNum is always set. 
-func (i *blockIter) initHandle( - cmp Compare, block bufferHandle, globalSeqNum uint64, hideObsoletePoints bool, -) error { - i.handle.Release() - i.handle = block - return i.init(cmp, block.Get(), globalSeqNum, hideObsoletePoints) -} - -func (i *blockIter) invalidate() { - i.clearCache() - i.offset = 0 - i.nextOffset = 0 - i.restarts = 0 - i.numRestarts = 0 - i.data = nil -} - -// isDataInvalidated returns true when the blockIter has been invalidated -// using an invalidate call. NB: this is different from blockIter.Valid -// which is part of the InternalIterator implementation. -func (i *blockIter) isDataInvalidated() bool { - return i.data == nil -} - -func (i *blockIter) resetForReuse() blockIter { - return blockIter{ - fullKey: i.fullKey[:0], - cached: i.cached[:0], - cachedBuf: i.cachedBuf[:0], - data: nil, - } -} - -func (i *blockIter) readEntry() { - ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset)) - - // This is an ugly performance hack. Reading entries from blocks is one of - // the inner-most routines and decoding the 3 varints per-entry takes - // significant time. Neither go1.11 or go1.12 will inline decodeVarint for - // us, so we do it manually. This provides a 10-15% performance improvement - // on blockIter benchmarks on both go1.11 and go1.12. - // - // TODO(peter): remove this hack if go:inline is ever supported. 
- - var shared uint32 - if a := *((*uint8)(ptr)); a < 128 { - shared = uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 1) - } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { - shared = uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 2) - } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { - shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 3) - } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { - shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 4) - } else { - d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) - shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 5) - } - - var unshared uint32 - if a := *((*uint8)(ptr)); a < 128 { - unshared = uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 1) - } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { - unshared = uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 2) - } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { - unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 3) - } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { - unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 4) - } else { - d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) - unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 5) - } - - var value uint32 - if a := *((*uint8)(ptr)); a < 128 { - value = uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 1) - } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { - value = uint32(b)<<7 | uint32(a) - ptr = 
unsafe.Pointer(uintptr(ptr) + 2) - } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { - value = uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 3) - } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { - value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 4) - } else { - d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) - value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 5) - } - - unsharedKey := getBytes(ptr, int(unshared)) - // TODO(sumeer): move this into the else block below. - i.fullKey = append(i.fullKey[:shared], unsharedKey...) - if shared == 0 { - // Provide stability for the key across positioning calls if the key - // doesn't share a prefix with the previous key. This removes requiring the - // key to be copied if the caller knows the block has a restart interval of - // 1. An important example of this is range-del blocks. - i.key = unsharedKey - } else { - i.key = i.fullKey - } - ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared)) - i.val = getBytes(ptr, int(value)) - i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value) -} - -func (i *blockIter) readFirstKey() error { - ptr := i.ptr - - // This is an ugly performance hack. Reading entries from blocks is one of - // the inner-most routines and decoding the 3 varints per-entry takes - // significant time. Neither go1.11 or go1.12 will inline decodeVarint for - // us, so we do it manually. This provides a 10-15% performance improvement - // on blockIter benchmarks on both go1.11 and go1.12. - // - // TODO(peter): remove this hack if go:inline is ever supported. - - if shared := *((*uint8)(ptr)); shared == 0 { - ptr = unsafe.Pointer(uintptr(ptr) + 1) - } else { - // The shared length is != 0, which is invalid. 
- panic("first key in block must have zero shared length") - } - - var unshared uint32 - if a := *((*uint8)(ptr)); a < 128 { - unshared = uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 1) - } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { - unshared = uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 2) - } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { - unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 3) - } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { - unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 4) - } else { - d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) - unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 5) - } - - // Skip the value length. - if a := *((*uint8)(ptr)); a < 128 { - ptr = unsafe.Pointer(uintptr(ptr) + 1) - } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); a < 128 { - ptr = unsafe.Pointer(uintptr(ptr) + 2) - } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); a < 128 { - ptr = unsafe.Pointer(uintptr(ptr) + 3) - } else if a := *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); a < 128 { - ptr = unsafe.Pointer(uintptr(ptr) + 4) - } else { - ptr = unsafe.Pointer(uintptr(ptr) + 5) - } - - firstKey := getBytes(ptr, int(unshared)) - // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on - // BlockIter benchmarks. - if n := len(firstKey) - 8; n >= 0 { - i.firstUserKey = firstKey[:n:n] - } else { - i.firstUserKey = nil - return base.CorruptionErrorf("pebble/table: invalid firstKey in block") - } - return nil -} - -// The sstable internal obsolete bit is set when writing a block and unset by -// blockIter, so no code outside block writing/reading code ever sees it. 
-const trailerObsoleteBit = uint64(base.InternalKeyKindSSTableInternalObsoleteBit) -const trailerObsoleteMask = (InternalKeySeqNumMax << 8) | uint64(base.InternalKeyKindSSTableInternalObsoleteMask) - -func (i *blockIter) decodeInternalKey(key []byte) (hiddenPoint bool) { - // Manually inlining base.DecodeInternalKey provides a 5-10% speedup on - // BlockIter benchmarks. - if n := len(key) - 8; n >= 0 { - trailer := binary.LittleEndian.Uint64(key[n:]) - hiddenPoint = i.hideObsoletePoints && - (trailer&trailerObsoleteBit != 0) - i.ikey.Trailer = trailer & trailerObsoleteMask - i.ikey.UserKey = key[:n:n] - if i.globalSeqNum != 0 { - i.ikey.SetSeqNum(i.globalSeqNum) - } - } else { - i.ikey.Trailer = uint64(InternalKeyKindInvalid) - i.ikey.UserKey = nil - } - return hiddenPoint -} - -func (i *blockIter) clearCache() { - i.cached = i.cached[:0] - i.cachedBuf = i.cachedBuf[:0] -} - -func (i *blockIter) cacheEntry() { - var valStart int32 - valSize := int32(len(i.val)) - if valSize > 0 { - valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr)) - } - - i.cached = append(i.cached, blockEntry{ - offset: i.offset, - keyStart: int32(len(i.cachedBuf)), - keyEnd: int32(len(i.cachedBuf) + len(i.key)), - valStart: valStart, - valSize: valSize, - }) - i.cachedBuf = append(i.cachedBuf, i.key...) -} - -func (i *blockIter) getFirstUserKey() []byte { - return i.firstUserKey -} - -// SeekGE implements internalIterator.SeekGE, as documented in the pebble -// package. -func (i *blockIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { - if invariants.Enabled && i.isDataInvalidated() { - panic(errors.AssertionFailedf("invalidated blockIter used")) - } - - i.clearCache() - // Find the index of the smallest restart point whose key is > the key - // sought; index will be numRestarts if there is no such restart point. - i.offset = 0 - var index int32 - - { - // NB: manually inlined sort.Seach is ~5% faster. 
- // - // Define f(-1) == false and f(n) == true. - // Invariant: f(index-1) == false, f(upper) == true. - upper := i.numRestarts - for index < upper { - h := int32(uint(index+upper) >> 1) // avoid overflow when computing h - // index ≤ h < upper - offset := decodeRestart(i.data[i.restarts+4*h:]) - // For a restart point, there are 0 bytes shared with the previous key. - // The varint encoding of 0 occupies 1 byte. - ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1)) - - // Decode the key at that restart point, and compare it to the key - // sought. See the comment in readEntry for why we manually inline the - // varint decoding. - var v1 uint32 - if a := *((*uint8)(ptr)); a < 128 { - v1 = uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 1) - } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { - v1 = uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 2) - } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { - v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 3) - } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { - v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 4) - } else { - d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) - v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 5) - } - - if *((*uint8)(ptr)) < 128 { - ptr = unsafe.Pointer(uintptr(ptr) + 1) - } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 { - ptr = unsafe.Pointer(uintptr(ptr) + 2) - } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 { - ptr = unsafe.Pointer(uintptr(ptr) + 3) - } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 { - ptr = unsafe.Pointer(uintptr(ptr) + 4) - } else { - ptr = unsafe.Pointer(uintptr(ptr) + 5) - } - - // Manually inlining part of base.DecodeInternalKey provides a 
5-10% - // speedup on BlockIter benchmarks. - s := getBytes(ptr, int(v1)) - var k []byte - if n := len(s) - 8; n >= 0 { - k = s[:n:n] - } - // Else k is invalid, and left as nil - - if i.cmp(key, k) > 0 { - // The search key is greater than the user key at this restart point. - // Search beyond this restart point, since we are trying to find the - // first restart point with a user key >= the search key. - index = h + 1 // preserves f(i-1) == false - } else { - // k >= search key, so prune everything after index (since index - // satisfies the property we are looking for). - upper = h // preserves f(j) == true - } - } - // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true - // => answer is index. - } - - // index is the first restart point with key >= search key. Define the keys - // between a restart point and the next restart point as belonging to that - // restart point. - // - // Since keys are strictly increasing, if index > 0 then the restart point - // at index-1 will be the first one that has some keys belonging to it that - // could be equal to the search key. If index == 0, then all keys in this - // block are larger than the key sought, and offset remains at zero. - if index > 0 { - i.offset = decodeRestart(i.data[i.restarts+4*(index-1):]) - } - i.readEntry() - hiddenPoint := i.decodeInternalKey(i.key) - - // Iterate from that restart point to somewhere >= the key sought. 
- if !i.valid() { - return nil, base.LazyValue{} - } - if !hiddenPoint && i.cmp(i.ikey.UserKey, key) >= 0 { - // Initialize i.lazyValue - if !i.lazyValueHandling.hasValuePrefix || - base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { - i.lazyValue = base.MakeInPlaceValue(i.val) - } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { - i.lazyValue = base.MakeInPlaceValue(i.val[1:]) - } else { - i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) - } - return &i.ikey, i.lazyValue - } - for i.Next(); i.valid(); i.Next() { - if i.cmp(i.ikey.UserKey, key) >= 0 { - // i.Next() has already initialized i.lazyValue. - return &i.ikey, i.lazyValue - } - } - return nil, base.LazyValue{} -} - -// SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the -// pebble package. -func (i *blockIter) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - // This should never be called as prefix iteration is handled by sstable.Iterator. - panic("pebble: SeekPrefixGE unimplemented") -} - -// SeekLT implements internalIterator.SeekLT, as documented in the pebble -// package. -func (i *blockIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { - if invariants.Enabled && i.isDataInvalidated() { - panic(errors.AssertionFailedf("invalidated blockIter used")) - } - - i.clearCache() - // Find the index of the smallest restart point whose key is >= the key - // sought; index will be numRestarts if there is no such restart point. - i.offset = 0 - var index int32 - - { - // NB: manually inlined sort.Search is ~5% faster. - // - // Define f(-1) == false and f(n) == true. - // Invariant: f(index-1) == false, f(upper) == true. 
- upper := i.numRestarts - for index < upper { - h := int32(uint(index+upper) >> 1) // avoid overflow when computing h - // index ≤ h < upper - offset := decodeRestart(i.data[i.restarts+4*h:]) - // For a restart point, there are 0 bytes shared with the previous key. - // The varint encoding of 0 occupies 1 byte. - ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1)) - - // Decode the key at that restart point, and compare it to the key - // sought. See the comment in readEntry for why we manually inline the - // varint decoding. - var v1 uint32 - if a := *((*uint8)(ptr)); a < 128 { - v1 = uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 1) - } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { - v1 = uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 2) - } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { - v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 3) - } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { - v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 4) - } else { - d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) - v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 5) - } - - if *((*uint8)(ptr)) < 128 { - ptr = unsafe.Pointer(uintptr(ptr) + 1) - } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))) < 128 { - ptr = unsafe.Pointer(uintptr(ptr) + 2) - } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))) < 128 { - ptr = unsafe.Pointer(uintptr(ptr) + 3) - } else if *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))) < 128 { - ptr = unsafe.Pointer(uintptr(ptr) + 4) - } else { - ptr = unsafe.Pointer(uintptr(ptr) + 5) - } - - // Manually inlining part of base.DecodeInternalKey provides a 5-10% - // speedup on BlockIter benchmarks. 
- s := getBytes(ptr, int(v1)) - var k []byte - if n := len(s) - 8; n >= 0 { - k = s[:n:n] - } - // Else k is invalid, and left as nil - - if i.cmp(key, k) > 0 { - // The search key is greater than the user key at this restart point. - // Search beyond this restart point, since we are trying to find the - // first restart point with a user key >= the search key. - index = h + 1 // preserves f(i-1) == false - } else { - // k >= search key, so prune everything after index (since index - // satisfies the property we are looking for). - upper = h // preserves f(j) == true - } - } - // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true - // => answer is index. - } - - // index is the first restart point with key >= search key. Define the keys - // between a restart point and the next restart point as belonging to that - // restart point. Note that index could be equal to i.numRestarts, i.e., we - // are past the last restart. - // - // Since keys are strictly increasing, if index > 0 then the restart point - // at index-1 will be the first one that has some keys belonging to it that - // are less than the search key. If index == 0, then all keys in this block - // are larger than the search key, so there is no match. - targetOffset := i.restarts - if index > 0 { - i.offset = decodeRestart(i.data[i.restarts+4*(index-1):]) - if index < i.numRestarts { - targetOffset = decodeRestart(i.data[i.restarts+4*(index):]) - } - } else if index == 0 { - // If index == 0 then all keys in this block are larger than the key - // sought. - i.offset = -1 - i.nextOffset = 0 - return nil, base.LazyValue{} - } - - // Iterate from that restart point to somewhere >= the key sought, then back - // up to the previous entry. The expectation is that we'll be performing - // reverse iteration, so we cache the entries as we advance forward. 
- i.nextOffset = i.offset - - for { - i.offset = i.nextOffset - i.readEntry() - // When hidden keys are common, there is additional optimization possible - // by not caching entries that are hidden (note that some calls to - // cacheEntry don't decode the internal key before caching, but checking - // whether a key is hidden does not require full decoding). However, we do - // need to use the blockEntry.offset in the cache for the first entry at - // the reset point to do the binary search when the cache is empty -- so - // we would need to cache that first entry (though not the key) even if - // was hidden. Our current assumption is that if there are large numbers - // of hidden keys we will be able to skip whole blocks (using block - // property filters) so we don't bother optimizing. - hiddenPoint := i.decodeInternalKey(i.key) - - // NB: we don't use the hiddenPoint return value of decodeInternalKey - // since we want to stop as soon as we reach a key >= ikey.UserKey, so - // that we can reverse. - if i.cmp(i.ikey.UserKey, key) >= 0 { - // The current key is greater than or equal to our search key. Back up to - // the previous key which was less than our search key. Note that this for - // loop will execute at least once with this if-block not being true, so - // the key we are backing up to is the last one this loop cached. - return i.Prev() - } - - if i.nextOffset >= targetOffset { - // We've reached the end of the current restart block. Return the - // current key if not hidden, else call Prev(). - // - // When the restart interval is 1, the first iteration of the for loop - // will bring us here. In that case ikey is backed by the block so we - // get the desired key stability guarantee for the lifetime of the - // blockIter. That is, we never cache anything and therefore never - // return a key backed by cachedBuf. 
- if hiddenPoint { - return i.Prev() - } - break - } - - i.cacheEntry() - } - - if !i.valid() { - return nil, base.LazyValue{} - } - if !i.lazyValueHandling.hasValuePrefix || - base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { - i.lazyValue = base.MakeInPlaceValue(i.val) - } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { - i.lazyValue = base.MakeInPlaceValue(i.val[1:]) - } else { - i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) - } - return &i.ikey, i.lazyValue -} - -// First implements internalIterator.First, as documented in the pebble -// package. -func (i *blockIter) First() (*InternalKey, base.LazyValue) { - if invariants.Enabled && i.isDataInvalidated() { - panic(errors.AssertionFailedf("invalidated blockIter used")) - } - - i.offset = 0 - if !i.valid() { - return nil, base.LazyValue{} - } - i.clearCache() - i.readEntry() - hiddenPoint := i.decodeInternalKey(i.key) - if hiddenPoint { - return i.Next() - } - if !i.lazyValueHandling.hasValuePrefix || - base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { - i.lazyValue = base.MakeInPlaceValue(i.val) - } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { - i.lazyValue = base.MakeInPlaceValue(i.val[1:]) - } else { - i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) - } - return &i.ikey, i.lazyValue -} - -func decodeRestart(b []byte) int32 { - _ = b[3] // bounds check hint to compiler; see golang.org/issue/14808 - return int32(uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | - uint32(b[3]&restartMaskLittleEndianHighByteWithoutSetHasSamePrefix)<<24) -} - -// Last implements internalIterator.Last, as documented in the pebble package. -func (i *blockIter) Last() (*InternalKey, base.LazyValue) { - if invariants.Enabled && i.isDataInvalidated() { - panic(errors.AssertionFailedf("invalidated blockIter used")) - } - - // Seek forward from the last restart point. 
- i.offset = decodeRestart(i.data[i.restarts+4*(i.numRestarts-1):]) - if !i.valid() { - return nil, base.LazyValue{} - } - - i.readEntry() - i.clearCache() - - for i.nextOffset < i.restarts { - i.cacheEntry() - i.offset = i.nextOffset - i.readEntry() - } - - hiddenPoint := i.decodeInternalKey(i.key) - if hiddenPoint { - return i.Prev() - } - if !i.lazyValueHandling.hasValuePrefix || - base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { - i.lazyValue = base.MakeInPlaceValue(i.val) - } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { - i.lazyValue = base.MakeInPlaceValue(i.val[1:]) - } else { - i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) - } - return &i.ikey, i.lazyValue -} - -// Next implements internalIterator.Next, as documented in the pebble -// package. -func (i *blockIter) Next() (*InternalKey, base.LazyValue) { - if len(i.cachedBuf) > 0 { - // We're switching from reverse iteration to forward iteration. We need to - // populate i.fullKey with the current key we're positioned at so that - // readEntry() can use i.fullKey for key prefix decompression. Note that we - // don't know whether i.key is backed by i.cachedBuf or i.fullKey (if - // SeekLT was the previous call, i.key may be backed by i.fullKey), but - // copying into i.fullKey works for both cases. - // - // TODO(peter): Rather than clearing the cache, we could instead use the - // cache until it is exhausted. This would likely be faster than falling - // through to the normal forward iteration code below. - i.fullKey = append(i.fullKey[:0], i.key...) - i.clearCache() - } - -start: - i.offset = i.nextOffset - if !i.valid() { - return nil, base.LazyValue{} - } - i.readEntry() - // Manually inlined version of i.decodeInternalKey(i.key). 
- if n := len(i.key) - 8; n >= 0 { - trailer := binary.LittleEndian.Uint64(i.key[n:]) - hiddenPoint := i.hideObsoletePoints && - (trailer&trailerObsoleteBit != 0) - i.ikey.Trailer = trailer & trailerObsoleteMask - i.ikey.UserKey = i.key[:n:n] - if i.globalSeqNum != 0 { - i.ikey.SetSeqNum(i.globalSeqNum) - } - if hiddenPoint { - goto start - } - } else { - i.ikey.Trailer = uint64(InternalKeyKindInvalid) - i.ikey.UserKey = nil - } - if !i.lazyValueHandling.hasValuePrefix || - base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { - i.lazyValue = base.MakeInPlaceValue(i.val) - } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { - i.lazyValue = base.MakeInPlaceValue(i.val[1:]) - } else { - i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) - } - return &i.ikey, i.lazyValue -} - -// NextPrefix implements (base.InternalIterator).NextPrefix. -func (i *blockIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { - if i.lazyValueHandling.hasValuePrefix { - return i.nextPrefixV3(succKey) - } - const nextsBeforeSeek = 3 - k, v := i.Next() - for j := 1; k != nil && i.cmp(k.UserKey, succKey) < 0; j++ { - if j >= nextsBeforeSeek { - return i.SeekGE(succKey, base.SeekGEFlagsNone) - } - k, v = i.Next() - } - return k, v -} - -func (i *blockIter) nextPrefixV3(succKey []byte) (*InternalKey, base.LazyValue) { - // Doing nexts that involve a key comparison can be expensive (and the cost - // depends on the key length), so we use the same threshold of 3 that we use - // for TableFormatPebblev2 in blockIter.nextPrefix above. The next fast path - // that looks at setHasSamePrefix takes ~5ns per key, which is ~150x faster - // than doing a SeekGE within the block, so we do this 16 times - // (~5ns*16=80ns), and then switch to looking at restarts. Doing the binary - // search for the restart consumes > 100ns. 
If the number of versions is > - // 17, we will increment nextFastCount to 17, then do a binary search, and - // on average need to find a key between two restarts, so another 8 steps - // corresponding to nextFastCount, for a mean total of 17 + 8 = 25 such - // steps. - // - // TODO(sumeer): use the configured restartInterval for the sstable when it - // was written (which we don't currently store) instead of the default value - // of 16. - const nextCmpThresholdBeforeSeek = 3 - const nextFastThresholdBeforeRestarts = 16 - nextCmpCount := 0 - nextFastCount := 0 - usedRestarts := false - // INVARIANT: blockIter is valid. - if invariants.Enabled && !i.valid() { - panic(errors.AssertionFailedf("nextPrefixV3 called on invalid blockIter")) - } - prevKeyIsSet := i.ikey.Kind() == InternalKeyKindSet - for { - i.offset = i.nextOffset - if !i.valid() { - return nil, base.LazyValue{} - } - // Need to decode the length integers, so we can compute nextOffset. - ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset)) - // This is an ugly performance hack. Reading entries from blocks is one of - // the inner-most routines and decoding the 3 varints per-entry takes - // significant time. Neither go1.11 or go1.12 will inline decodeVarint for - // us, so we do it manually. This provides a 10-15% performance improvement - // on blockIter benchmarks on both go1.11 and go1.12. - // - // TODO(peter): remove this hack if go:inline is ever supported. - - // Decode the shared key length integer. 
- var shared uint32 - if a := *((*uint8)(ptr)); a < 128 { - shared = uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 1) - } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { - shared = uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 2) - } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { - shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 3) - } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { - shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 4) - } else { - d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) - shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 5) - } - // Decode the unshared key length integer. - var unshared uint32 - if a := *((*uint8)(ptr)); a < 128 { - unshared = uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 1) - } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { - unshared = uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 2) - } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { - unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 3) - } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { - unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 4) - } else { - d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) - unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 5) - } - // Decode the value length integer. 
- var value uint32 - if a := *((*uint8)(ptr)); a < 128 { - value = uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 1) - } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { - value = uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 2) - } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { - value = uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 3) - } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { - value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 4) - } else { - d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) - value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 5) - } - // The starting position of the value. - valuePtr := unsafe.Pointer(uintptr(ptr) + uintptr(unshared)) - i.nextOffset = int32(uintptr(valuePtr)-uintptr(i.ptr)) + int32(value) - if invariants.Enabled && unshared < 8 { - // This should not happen since only the key prefix is shared, so even - // if the prefix length is the same as the user key length, the unshared - // will include the trailer. - panic(errors.AssertionFailedf("unshared %d is too small", unshared)) - } - // The trailer is written in little endian, so the key kind is the first - // byte in the trailer that is encoded in the slice [unshared-8:unshared]. - keyKind := InternalKeyKind((*[manual.MaxArrayLen]byte)(ptr)[unshared-8]) - keyKind = keyKind & base.InternalKeyKindSSTableInternalObsoleteMask - prefixChanged := false - if keyKind == InternalKeyKindSet { - if invariants.Enabled && value == 0 { - panic(errors.AssertionFailedf("value is of length 0, but we expect a valuePrefix")) - } - valPrefix := *((*valuePrefix)(valuePtr)) - if setHasSamePrefix(valPrefix) { - // Fast-path. No need to assemble i.fullKey, or update i.key. 
We know - // that subsequent keys will not have a shared length that is greater - // than the prefix of the current key, which is also the prefix of - // i.key. Since we are continuing to iterate, we don't need to - // initialize i.ikey and i.lazyValue (these are initialized before - // returning). - nextFastCount++ - if nextFastCount > nextFastThresholdBeforeRestarts { - if usedRestarts { - // Exhausted iteration budget. This will never happen unless - // someone is using a restart interval > 16. It is just to guard - // against long restart intervals causing too much iteration. - break - } - // Haven't used restarts yet, so find the first restart at or beyond - // the current offset. - targetOffset := i.offset - var index int32 - { - // NB: manually inlined sort.Sort is ~5% faster. - // - // f defined for a restart point is true iff the offset >= - // targetOffset. - // Define f(-1) == false and f(i.numRestarts) == true. - // Invariant: f(index-1) == false, f(upper) == true. - upper := i.numRestarts - for index < upper { - h := int32(uint(index+upper) >> 1) // avoid overflow when computing h - // index ≤ h < upper - offset := decodeRestart(i.data[i.restarts+4*h:]) - if offset < targetOffset { - index = h + 1 // preserves f(index-1) == false - } else { - upper = h // preserves f(upper) == true - } - } - // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true - // => answer is index. - } - usedRestarts = true - nextFastCount = 0 - if index == i.numRestarts { - // Already past the last real restart, so iterate a bit more until - // we are done with the block. - continue - } - // Have some real restarts after index. NB: index is the first - // restart at or beyond the current offset. - startingIndex := index - for index != i.numRestarts && - // The restart at index is 4 bytes written in little endian format - // starting at i.restart+4*index. The 0th byte is the least - // significant and the 3rd byte is the most significant. 
Since the - // most significant bit of the 3rd byte is what we use for - // encoding the set-has-same-prefix information, the indexing - // below has +3. - i.data[i.restarts+4*index+3]&restartMaskLittleEndianHighByteOnlySetHasSamePrefix != 0 { - // We still have the same prefix, so move to the next restart. - index++ - } - // index is the first restart that did not have the same prefix. - if index != startingIndex { - // Managed to skip past at least one restart. Resume iteration - // from index-1. Since nextFastCount has been reset to 0, we - // should be able to iterate to the next prefix. - i.offset = decodeRestart(i.data[i.restarts+4*(index-1):]) - i.readEntry() - } - // Else, unable to skip past any restart. Resume iteration. Since - // nextFastCount has been reset to 0, we should be able to iterate - // to the next prefix. - continue - } - continue - } else if prevKeyIsSet { - prefixChanged = true - } - } else { - prevKeyIsSet = false - } - // Slow-path cases: - // - (Likely) The prefix has changed. - // - (Unlikely) The prefix has not changed. - // We assemble the key etc. under the assumption that it is the likely - // case. - unsharedKey := getBytes(ptr, int(unshared)) - // TODO(sumeer): move this into the else block below. This is a bit tricky - // since the current logic assumes we have always copied the latest key - // into fullKey, which is why when we get to the next key we can (a) - // access i.fullKey[:shared], (b) append only the unsharedKey to - // i.fullKey. For (a), we can access i.key[:shared] since that memory is - // valid (even if unshared). For (b), we will need to remember whether - // i.key refers to i.fullKey or not, and can append the unsharedKey only - // in the former case and for the latter case need to copy the shared part - // too. This same comment applies to the other place where we can do this - // optimization, in readEntry(). - i.fullKey = append(i.fullKey[:shared], unsharedKey...) 
- i.val = getBytes(valuePtr, int(value)) - if shared == 0 { - // Provide stability for the key across positioning calls if the key - // doesn't share a prefix with the previous key. This removes requiring the - // key to be copied if the caller knows the block has a restart interval of - // 1. An important example of this is range-del blocks. - i.key = unsharedKey - } else { - i.key = i.fullKey - } - // Manually inlined version of i.decodeInternalKey(i.key). - hiddenPoint := false - if n := len(i.key) - 8; n >= 0 { - trailer := binary.LittleEndian.Uint64(i.key[n:]) - hiddenPoint = i.hideObsoletePoints && - (trailer&trailerObsoleteBit != 0) - i.ikey.Trailer = trailer & trailerObsoleteMask - i.ikey.UserKey = i.key[:n:n] - if i.globalSeqNum != 0 { - i.ikey.SetSeqNum(i.globalSeqNum) - } - } else { - i.ikey.Trailer = uint64(InternalKeyKindInvalid) - i.ikey.UserKey = nil - } - nextCmpCount++ - if invariants.Enabled && prefixChanged && i.cmp(i.ikey.UserKey, succKey) < 0 { - panic(errors.AssertionFailedf("prefix should have changed but %x < %x", - i.ikey.UserKey, succKey)) - } - if prefixChanged || i.cmp(i.ikey.UserKey, succKey) >= 0 { - // Prefix has changed. - if hiddenPoint { - return i.Next() - } - if invariants.Enabled && !i.lazyValueHandling.hasValuePrefix { - panic(errors.AssertionFailedf("nextPrefixV3 being run for non-v3 sstable")) - } - if base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { - i.lazyValue = base.MakeInPlaceValue(i.val) - } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { - i.lazyValue = base.MakeInPlaceValue(i.val[1:]) - } else { - i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) - } - return &i.ikey, i.lazyValue - } - // Else prefix has not changed. - - if nextCmpCount >= nextCmpThresholdBeforeSeek { - break - } - } - return i.SeekGE(succKey, base.SeekGEFlagsNone) -} - -// Prev implements internalIterator.Prev, as documented in the pebble -// package. 
-func (i *blockIter) Prev() (*InternalKey, base.LazyValue) { -start: - for n := len(i.cached) - 1; n >= 0; n-- { - i.nextOffset = i.offset - e := &i.cached[n] - i.offset = e.offset - i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize)) - // Manually inlined version of i.decodeInternalKey(i.key). - i.key = i.cachedBuf[e.keyStart:e.keyEnd] - if n := len(i.key) - 8; n >= 0 { - trailer := binary.LittleEndian.Uint64(i.key[n:]) - hiddenPoint := i.hideObsoletePoints && - (trailer&trailerObsoleteBit != 0) - if hiddenPoint { - continue - } - i.ikey.Trailer = trailer & trailerObsoleteMask - i.ikey.UserKey = i.key[:n:n] - if i.globalSeqNum != 0 { - i.ikey.SetSeqNum(i.globalSeqNum) - } - } else { - i.ikey.Trailer = uint64(InternalKeyKindInvalid) - i.ikey.UserKey = nil - } - i.cached = i.cached[:n] - if !i.lazyValueHandling.hasValuePrefix || - base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { - i.lazyValue = base.MakeInPlaceValue(i.val) - } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { - i.lazyValue = base.MakeInPlaceValue(i.val[1:]) - } else { - i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) - } - return &i.ikey, i.lazyValue - } - - i.clearCache() - if i.offset <= 0 { - i.offset = -1 - i.nextOffset = 0 - return nil, base.LazyValue{} - } - - targetOffset := i.offset - var index int32 - - { - // NB: manually inlined sort.Sort is ~5% faster. - // - // Define f(-1) == false and f(n) == true. - // Invariant: f(index-1) == false, f(upper) == true. - upper := i.numRestarts - for index < upper { - h := int32(uint(index+upper) >> 1) // avoid overflow when computing h - // index ≤ h < upper - offset := decodeRestart(i.data[i.restarts+4*h:]) - if offset < targetOffset { - // Looking for the first restart that has offset >= targetOffset, so - // ignore h and earlier. 
- index = h + 1 // preserves f(i-1) == false - } else { - upper = h // preserves f(j) == true - } - } - // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true - // => answer is index. - } - - // index is first restart with offset >= targetOffset. Note that - // targetOffset may not be at a restart point since one can call Prev() - // after Next() (so the cache was not populated) and targetOffset refers to - // the current entry. index-1 must have an offset < targetOffset (it can't - // be equal to targetOffset since the binary search would have selected that - // as the index). - i.offset = 0 - if index > 0 { - i.offset = decodeRestart(i.data[i.restarts+4*(index-1):]) - } - // TODO(sumeer): why is the else case not an error given targetOffset is a - // valid offset. - - i.readEntry() - - // We stop when i.nextOffset == targetOffset since the targetOffset is the - // entry we are stepping back from, and we don't need to cache the entry - // before it, since it is the candidate to return. - for i.nextOffset < targetOffset { - i.cacheEntry() - i.offset = i.nextOffset - i.readEntry() - } - - hiddenPoint := i.decodeInternalKey(i.key) - if hiddenPoint { - // Use the cache. - goto start - } - if !i.lazyValueHandling.hasValuePrefix || - base.TrailerKind(i.ikey.Trailer) != InternalKeyKindSet { - i.lazyValue = base.MakeInPlaceValue(i.val) - } else if i.lazyValueHandling.vbr == nil || !isValueHandle(valuePrefix(i.val[0])) { - i.lazyValue = base.MakeInPlaceValue(i.val[1:]) - } else { - i.lazyValue = i.lazyValueHandling.vbr.getLazyValueForPrefixAndValueHandle(i.val) - } - return &i.ikey, i.lazyValue -} - -// Key implements internalIterator.Key, as documented in the pebble package. -func (i *blockIter) Key() *InternalKey { - return &i.ikey -} - -func (i *blockIter) value() base.LazyValue { - return i.lazyValue -} - -// Error implements internalIterator.Error, as documented in the pebble -// package. 
-func (i *blockIter) Error() error { - return nil // infallible -} - -// Close implements internalIterator.Close, as documented in the pebble -// package. -func (i *blockIter) Close() error { - i.handle.Release() - i.handle = bufferHandle{} - i.val = nil - i.lazyValue = base.LazyValue{} - i.lazyValueHandling.vbr = nil - return nil -} - -func (i *blockIter) SetBounds(lower, upper []byte) { - // This should never be called as bounds are handled by sstable.Iterator. - panic("pebble: SetBounds unimplemented") -} - -func (i *blockIter) valid() bool { - return i.offset >= 0 && i.offset < i.restarts -} - -// fragmentBlockIter wraps a blockIter, implementing the -// keyspan.FragmentIterator interface. It's used for reading range deletion and -// range key blocks. -// -// Range deletions and range keys are fragmented before they're persisted to the -// block. Overlapping fragments have identical bounds. The fragmentBlockIter -// gathers all the fragments with identical bounds within a block and returns a -// single keyspan.Span describing all the keys defined over the span. -// -// # Memory lifetime -// -// A Span returned by fragmentBlockIter is only guaranteed to be stable until -// the next fragmentBlockIter iteration positioning method. A Span's Keys slice -// may be reused, so the user must not assume it's stable. -// -// Blocks holding range deletions and range keys are configured to use a restart -// interval of 1. This provides key stability. The caller may treat the various -// byte slices (start, end, suffix, value) as stable for the lifetime of the -// iterator. -type fragmentBlockIter struct { - blockIter blockIter - keyBuf [2]keyspan.Key - span keyspan.Span - err error - dir int8 - closeHook func(i keyspan.FragmentIterator) error - - // elideSameSeqnum, if true, returns only the first-occurring (in forward - // order) Key for each sequence number. 
- elideSameSeqnum bool -} - -func (i *fragmentBlockIter) resetForReuse() fragmentBlockIter { - return fragmentBlockIter{blockIter: i.blockIter.resetForReuse()} -} - -func (i *fragmentBlockIter) decodeSpanKeys(k *InternalKey, internalValue []byte) { - // TODO(jackson): The use of i.span.Keys to accumulate keys across multiple - // calls to Decode is too confusing and subtle. Refactor to make it - // explicit. - - // decode the contents of the fragment's value. This always includes at - // least the end key: RANGEDELs store the end key directly as the value, - // whereas the various range key kinds store are more complicated. The - // details of the range key internal value format are documented within the - // internal/rangekey package. - switch k.Kind() { - case base.InternalKeyKindRangeDelete: - i.span = rangedel.Decode(*k, internalValue, i.span.Keys) - i.err = nil - case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete: - i.span, i.err = rangekey.Decode(*k, internalValue, i.span.Keys) - default: - i.span = keyspan.Span{} - i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragment of kind %d", k.Kind()) - } -} - -func (i *fragmentBlockIter) elideKeysOfSameSeqNum() { - if invariants.Enabled { - if !i.elideSameSeqnum || len(i.span.Keys) == 0 { - panic("elideKeysOfSameSeqNum called when it should not be") - } - } - lastSeqNum := i.span.Keys[0].SeqNum() - k := 1 - for j := 1; j < len(i.span.Keys); j++ { - if lastSeqNum != i.span.Keys[j].SeqNum() { - lastSeqNum = i.span.Keys[j].SeqNum() - i.span.Keys[k] = i.span.Keys[j] - k++ - } - } - i.span.Keys = i.span.Keys[:k] -} - -// gatherForward gathers internal keys with identical bounds. Keys defined over -// spans of the keyspace are fragmented such that any overlapping key spans have -// identical bounds. 
When these spans are persisted to a range deletion or range -// key block, they may be persisted as multiple internal keys in order to encode -// multiple sequence numbers or key kinds. -// -// gatherForward iterates forward, re-combining the fragmented internal keys to -// reconstruct a keyspan.Span that holds all the keys defined over the span. -func (i *fragmentBlockIter) gatherForward(k *InternalKey, lazyValue base.LazyValue) *keyspan.Span { - i.span = keyspan.Span{} - if k == nil || !i.blockIter.valid() { - return nil - } - i.err = nil - // Use the i.keyBuf array to back the Keys slice to prevent an allocation - // when a span contains few keys. - i.span.Keys = i.keyBuf[:0] - - // Decode the span's end key and individual keys from the value. - internalValue := lazyValue.InPlaceValue() - i.decodeSpanKeys(k, internalValue) - if i.err != nil { - return nil - } - prevEnd := i.span.End - - // There might exist additional internal keys with identical bounds encoded - // within the block. Iterate forward, accumulating all the keys with - // identical bounds to s. - k, lazyValue = i.blockIter.Next() - internalValue = lazyValue.InPlaceValue() - for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 { - i.decodeSpanKeys(k, internalValue) - if i.err != nil { - return nil - } - - // Since k indicates an equal start key, the encoded end key must - // exactly equal the original end key from the first internal key. - // Overlapping fragments are required to have exactly equal start and - // end bounds. - if i.blockIter.cmp(prevEnd, i.span.End) != 0 { - i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragmentation") - i.span = keyspan.Span{} - return nil - } - k, lazyValue = i.blockIter.Next() - internalValue = lazyValue.InPlaceValue() - } - if i.elideSameSeqnum && len(i.span.Keys) > 0 { - i.elideKeysOfSameSeqNum() - } - // i.blockIter is positioned over the first internal key for the next span. 
- return &i.span -} - -// gatherBackward gathers internal keys with identical bounds. Keys defined over -// spans of the keyspace are fragmented such that any overlapping key spans have -// identical bounds. When these spans are persisted to a range deletion or range -// key block, they may be persisted as multiple internal keys in order to encode -// multiple sequence numbers or key kinds. -// -// gatherBackward iterates backwards, re-combining the fragmented internal keys -// to reconstruct a keyspan.Span that holds all the keys defined over the span. -func (i *fragmentBlockIter) gatherBackward(k *InternalKey, lazyValue base.LazyValue) *keyspan.Span { - i.span = keyspan.Span{} - if k == nil || !i.blockIter.valid() { - return nil - } - i.err = nil - // Use the i.keyBuf array to back the Keys slice to prevent an allocation - // when a span contains few keys. - i.span.Keys = i.keyBuf[:0] - - // Decode the span's end key and individual keys from the value. - internalValue := lazyValue.InPlaceValue() - i.decodeSpanKeys(k, internalValue) - if i.err != nil { - return nil - } - prevEnd := i.span.End - - // There might exist additional internal keys with identical bounds encoded - // within the block. Iterate backward, accumulating all the keys with - // identical bounds to s. - k, lazyValue = i.blockIter.Prev() - internalValue = lazyValue.InPlaceValue() - for k != nil && i.blockIter.cmp(k.UserKey, i.span.Start) == 0 { - i.decodeSpanKeys(k, internalValue) - if i.err != nil { - return nil - } - - // Since k indicates an equal start key, the encoded end key must - // exactly equal the original end key from the first internal key. - // Overlapping fragments are required to have exactly equal start and - // end bounds. 
- if i.blockIter.cmp(prevEnd, i.span.End) != 0 { - i.err = base.CorruptionErrorf("pebble: corrupt keyspan fragmentation") - i.span = keyspan.Span{} - return nil - } - k, lazyValue = i.blockIter.Prev() - internalValue = lazyValue.InPlaceValue() - } - // i.blockIter is positioned over the last internal key for the previous - // span. - - // Backwards iteration encounters internal keys in the wrong order. - keyspan.SortKeysByTrailer(&i.span.Keys) - - if i.elideSameSeqnum && len(i.span.Keys) > 0 { - i.elideKeysOfSameSeqNum() - } - return &i.span -} - -// Error implements (keyspan.FragmentIterator).Error. -func (i *fragmentBlockIter) Error() error { - return i.err -} - -// Close implements (keyspan.FragmentIterator).Close. -func (i *fragmentBlockIter) Close() error { - var err error - if i.closeHook != nil { - err = i.closeHook(i) - } - err = firstError(err, i.blockIter.Close()) - return err -} - -// First implements (keyspan.FragmentIterator).First -func (i *fragmentBlockIter) First() *keyspan.Span { - i.dir = +1 - return i.gatherForward(i.blockIter.First()) -} - -// Last implements (keyspan.FragmentIterator).Last. -func (i *fragmentBlockIter) Last() *keyspan.Span { - i.dir = -1 - return i.gatherBackward(i.blockIter.Last()) -} - -// Next implements (keyspan.FragmentIterator).Next. -func (i *fragmentBlockIter) Next() *keyspan.Span { - switch { - case i.dir == -1 && !i.span.Valid(): - // Switching directions. - // - // i.blockIter is exhausted, before the first key. Move onto the first. - i.blockIter.First() - i.dir = +1 - case i.dir == -1 && i.span.Valid(): - // Switching directions. - // - // i.blockIter is currently positioned over the last internal key for - // the previous span. Next it once to move to the first internal key - // that makes up the current span, and gatherForwaad to land on the - // first internal key making up the next span. 
- // - // In the diagram below, if the last span returned to the user during - // reverse iteration was [b,c), i.blockIter is currently positioned at - // [a,b). The block iter must be positioned over [d,e) to gather the - // next span's fragments. - // - // ... [a,b) [b,c) [b,c) [b,c) [d,e) ... - // ^ ^ - // i.blockIter want - if x := i.gatherForward(i.blockIter.Next()); invariants.Enabled && !x.Valid() { - panic("pebble: invariant violation: next entry unexpectedly invalid") - } - i.dir = +1 - } - // We know that this blockIter has in-place values. - return i.gatherForward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val)) -} - -// Prev implements (keyspan.FragmentIterator).Prev. -func (i *fragmentBlockIter) Prev() *keyspan.Span { - switch { - case i.dir == +1 && !i.span.Valid(): - // Switching directions. - // - // i.blockIter is exhausted, after the last key. Move onto the last. - i.blockIter.Last() - i.dir = -1 - case i.dir == +1 && i.span.Valid(): - // Switching directions. - // - // i.blockIter is currently positioned over the first internal key for - // the next span. Prev it once to move to the last internal key that - // makes up the current span, and gatherBackward to land on the last - // internal key making up the previous span. - // - // In the diagram below, if the last span returned to the user during - // forward iteration was [b,c), i.blockIter is currently positioned at - // [d,e). The block iter must be positioned over [a,b) to gather the - // previous span's fragments. - // - // ... [a,b) [b,c) [b,c) [b,c) [d,e) ... - // ^ ^ - // want i.blockIter - if x := i.gatherBackward(i.blockIter.Prev()); invariants.Enabled && !x.Valid() { - panic("pebble: invariant violation: previous entry unexpectedly invalid") - } - i.dir = -1 - } - // We know that this blockIter has in-place values. - return i.gatherBackward(&i.blockIter.ikey, base.MakeInPlaceValue(i.blockIter.val)) -} - -// SeekGE implements (keyspan.FragmentIterator).SeekGE. 
-func (i *fragmentBlockIter) SeekGE(k []byte) *keyspan.Span { - if s := i.SeekLT(k); s != nil && i.blockIter.cmp(k, s.End) < 0 { - return s - } - // TODO(jackson): If the above i.SeekLT(k) discovers a span but the span - // doesn't meet the k < s.End comparison, then there's no need for the - // SeekLT to gatherBackward. - return i.Next() -} - -// SeekLT implements (keyspan.FragmentIterator).SeekLT. -func (i *fragmentBlockIter) SeekLT(k []byte) *keyspan.Span { - i.dir = -1 - return i.gatherBackward(i.blockIter.SeekLT(k, base.SeekLTFlagsNone)) -} - -// String implements fmt.Stringer. -func (i *fragmentBlockIter) String() string { - return "fragment-block-iter" -} - -// SetCloseHook implements sstable.FragmentIterator. -func (i *fragmentBlockIter) SetCloseHook(fn func(i keyspan.FragmentIterator) error) { - i.closeHook = fn -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/buffer_pool.go b/vendor/github.com/cockroachdb/pebble/sstable/buffer_pool.go deleted file mode 100644 index 2e98d44..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/buffer_pool.go +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package sstable - -import ( - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/cache" -) - -// A bufferHandle is a handle to manually-managed memory. The handle may point -// to a block in the block cache (h.Get() != nil), or a buffer that exists -// outside the block cache allocated from a BufferPool (b.Valid()). -type bufferHandle struct { - h cache.Handle - b Buf -} - -// Get retrieves the underlying buffer referenced by the handle. 
-func (bh bufferHandle) Get() []byte { - if v := bh.h.Get(); v != nil { - return v - } else if bh.b.p != nil { - return bh.b.p.pool[bh.b.i].b - } - return nil -} - -// Release releases the buffer, either back to the block cache or BufferPool. -func (bh bufferHandle) Release() { - bh.h.Release() - bh.b.Release() -} - -// A BufferPool holds a pool of buffers for holding sstable blocks. An initial -// size of the pool is provided on Init, but a BufferPool will grow to meet the -// largest working set size. It'll never shrink. When a buffer is released, the -// BufferPool recycles the buffer for future allocations. -// -// A BufferPool should only be used for short-lived allocations with -// well-understood working set sizes to avoid excessive memory consumption. -// -// BufferPool is not thread-safe. -type BufferPool struct { - // pool contains all the buffers held by the pool, including buffers that - // are in-use. For every i < len(pool): pool[i].v is non-nil. - pool []allocedBuffer -} - -type allocedBuffer struct { - v *cache.Value - // b holds the current byte slice. It's backed by v, but may be a subslice - // of v's memory while the buffer is in-use [ len(b) ≤ len(v.Buf()) ]. - // - // If the buffer is not currently in-use, b is nil. When being recycled, the - // BufferPool.Alloc will reset b to be a subslice of v.Buf(). - b []byte -} - -// Init initializes the pool with an initial working set buffer size of -// `initialSize`. -func (p *BufferPool) Init(initialSize int) { - *p = BufferPool{ - pool: make([]allocedBuffer, 0, initialSize), - } -} - -// initPreallocated is like Init but for internal sstable package use in -// instances where a pre-allocated slice of []allocedBuffer already exists. It's -// used to avoid an extra allocation initializing BufferPool.pool. 
-func (p *BufferPool) initPreallocated(pool []allocedBuffer) { - *p = BufferPool{ - pool: pool[:0], - } -} - -// Release releases all buffers held by the pool and resets the pool to an -// uninitialized state. -func (p *BufferPool) Release() { - for i := range p.pool { - if p.pool[i].b != nil { - panic(errors.AssertionFailedf("Release called on a BufferPool with in-use buffers")) - } - cache.Free(p.pool[i].v) - } - *p = BufferPool{} -} - -// Alloc allocates a new buffer of size n. If the pool already holds a buffer at -// least as large as n, the pooled buffer is used instead. -// -// Alloc is O(MAX(N,M)) where N is the largest number of concurrently in-use -// buffers allocated and M is the initialSize passed to Init. -func (p *BufferPool) Alloc(n int) Buf { - unusableBufferIdx := -1 - for i := 0; i < len(p.pool); i++ { - if p.pool[i].b == nil { - if len(p.pool[i].v.Buf()) >= n { - p.pool[i].b = p.pool[i].v.Buf()[:n] - return Buf{p: p, i: i} - } - unusableBufferIdx = i - } - } - - // If we would need to grow the size of the pool to allocate another buffer, - // but there was a slot available occupied by a buffer that's just too - // small, replace the too-small buffer. - if len(p.pool) == cap(p.pool) && unusableBufferIdx >= 0 { - i := unusableBufferIdx - cache.Free(p.pool[i].v) - p.pool[i].v = cache.Alloc(n) - p.pool[i].b = p.pool[i].v.Buf() - return Buf{p: p, i: i} - } - - // Allocate a new buffer. - v := cache.Alloc(n) - p.pool = append(p.pool, allocedBuffer{v: v, b: v.Buf()[:n]}) - return Buf{p: p, i: len(p.pool) - 1} -} - -// A Buf holds a reference to a manually-managed, pooled byte buffer. -type Buf struct { - p *BufferPool - // i holds the index into p.pool where the buffer may be found. This scheme - // avoids needing to allocate the handle to the buffer on the heap at the - // cost of copying two words instead of one. - i int -} - -// Valid returns true if the buf holds a valid buffer. 
-func (b Buf) Valid() bool { - return b.p != nil -} - -// Release releases the buffer back to the pool. -func (b *Buf) Release() { - if b.p == nil { - return - } - // Clear the allocedBuffer's byte slice. This signals the allocated buffer - // is no longer in use and a future call to BufferPool.Alloc may reuse this - // buffer. - b.p.pool[b.i].b = nil - b.p = nil -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/compression.go b/vendor/github.com/cockroachdb/pebble/sstable/compression.go deleted file mode 100644 index 0db70c8..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/compression.go +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package sstable - -import ( - "encoding/binary" - - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/cache" - "github.com/golang/snappy" -) - -func decompressedLen(blockType blockType, b []byte) (int, int, error) { - switch blockType { - case noCompressionBlockType: - return 0, 0, nil - case snappyCompressionBlockType: - l, err := snappy.DecodedLen(b) - return l, 0, err - case zstdCompressionBlockType: - // This will also be used by zlib, bzip2 and lz4 to retrieve the decodedLen - // if we implement these algorithms in the future. 
- decodedLenU64, varIntLen := binary.Uvarint(b) - if varIntLen <= 0 { - return 0, 0, base.CorruptionErrorf("pebble/table: compression block has invalid length") - } - return int(decodedLenU64), varIntLen, nil - default: - return 0, 0, base.CorruptionErrorf("pebble/table: unknown block compression: %d", errors.Safe(blockType)) - } -} - -func decompressInto(blockType blockType, compressed []byte, buf []byte) ([]byte, error) { - var result []byte - var err error - switch blockType { - case snappyCompressionBlockType: - result, err = snappy.Decode(buf, compressed) - case zstdCompressionBlockType: - result, err = decodeZstd(buf, compressed) - } - if err != nil { - return nil, base.MarkCorruptionError(err) - } - if len(result) != 0 && (len(result) != len(buf) || &result[0] != &buf[0]) { - return nil, base.CorruptionErrorf("pebble/table: decompressed into unexpected buffer: %p != %p", - errors.Safe(result), errors.Safe(buf)) - } - return result, nil -} - -// decompressBlock decompresses an SST block, with manually-allocated space. -// NB: If decompressBlock returns (nil, nil), no decompression was necessary and -// the caller may use `b` directly. -func decompressBlock(blockType blockType, b []byte) (*cache.Value, error) { - if blockType == noCompressionBlockType { - return nil, nil - } - // first obtain the decoded length. - decodedLen, prefixLen, err := decompressedLen(blockType, b) - if err != nil { - return nil, err - } - b = b[prefixLen:] - // Allocate sufficient space from the cache. - decoded := cache.Alloc(decodedLen) - decodedBuf := decoded.Buf() - if _, err := decompressInto(blockType, b, decodedBuf); err != nil { - cache.Free(decoded) - return nil, err - } - return decoded, nil -} - -// compressBlock compresses an SST block, using compressBuf as the desired destination. 
-func compressBlock( - compression Compression, b []byte, compressedBuf []byte, -) (blockType blockType, compressed []byte) { - switch compression { - case SnappyCompression: - return snappyCompressionBlockType, snappy.Encode(compressedBuf, b) - case NoCompression: - return noCompressionBlockType, b - } - - if len(compressedBuf) < binary.MaxVarintLen64 { - compressedBuf = append(compressedBuf, make([]byte, binary.MaxVarintLen64-len(compressedBuf))...) - } - varIntLen := binary.PutUvarint(compressedBuf, uint64(len(b))) - switch compression { - case ZstdCompression: - return zstdCompressionBlockType, encodeZstd(compressedBuf, varIntLen, b) - default: - return noCompressionBlockType, b - } -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/compression_cgo.go b/vendor/github.com/cockroachdb/pebble/sstable/compression_cgo.go deleted file mode 100644 index ad7d844..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/compression_cgo.go +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -//go:build cgo -// +build cgo - -package sstable - -import ( - "bytes" - - "github.com/DataDog/zstd" -) - -// decodeZstd decompresses b with the Zstandard algorithm. -// It reuses the preallocated capacity of decodedBuf if it is sufficient. -// On success, it returns the decoded byte slice. -func decodeZstd(decodedBuf, b []byte) ([]byte, error) { - return zstd.Decompress(decodedBuf, b) -} - -// encodeZstd compresses b with the Zstandard algorithm at default compression -// level (level 3). It reuses the preallocated capacity of compressedBuf if it -// is sufficient. The subslice `compressedBuf[:varIntLen]` should already encode -// the length of `b` before calling encodeZstd. It returns the encoded byte -// slice, including the `compressedBuf[:varIntLen]` prefix. 
-func encodeZstd(compressedBuf []byte, varIntLen int, b []byte) []byte { - buf := bytes.NewBuffer(compressedBuf[:varIntLen]) - writer := zstd.NewWriterLevel(buf, 3) - writer.Write(b) - writer.Close() - return buf.Bytes() -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/compression_nocgo.go b/vendor/github.com/cockroachdb/pebble/sstable/compression_nocgo.go deleted file mode 100644 index 42c34fb..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/compression_nocgo.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -//go:build !cgo -// +build !cgo - -package sstable - -import "github.com/klauspost/compress/zstd" - -// decodeZstd decompresses b with the Zstandard algorithm. -// It reuses the preallocated capacity of decodedBuf if it is sufficient. -// On success, it returns the decoded byte slice. -func decodeZstd(decodedBuf, b []byte) ([]byte, error) { - decoder, _ := zstd.NewReader(nil) - defer decoder.Close() - return decoder.DecodeAll(b, decodedBuf[:0]) -} - -// encodeZstd compresses b with the Zstandard algorithm at default compression -// level (level 3). It reuses the preallocated capacity of compressedBuf if it -// is sufficient. The subslice `compressedBuf[:varIntLen]` should already encode -// the length of `b` before calling encodeZstd. It returns the encoded byte -// slice, including the `compressedBuf[:varIntLen]` prefix. 
-func encodeZstd(compressedBuf []byte, varIntLen int, b []byte) []byte { - encoder, _ := zstd.NewWriter(nil) - defer encoder.Close() - return encoder.EncodeAll(b, compressedBuf[:varIntLen]) -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/internal.go b/vendor/github.com/cockroachdb/pebble/sstable/internal.go deleted file mode 100644 index 0fe7c99..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/internal.go +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package sstable - -import "github.com/cockroachdb/pebble/internal/base" - -// InternalKeyKind exports the base.InternalKeyKind type. -type InternalKeyKind = base.InternalKeyKind - -// SeekGEFlags exports base.SeekGEFlags. -type SeekGEFlags = base.SeekGEFlags - -// SeekLTFlags exports base.SeekLTFlags. -type SeekLTFlags = base.SeekLTFlags - -// These constants are part of the file format, and should not be changed. -const ( - InternalKeyKindDelete = base.InternalKeyKindDelete - InternalKeyKindSet = base.InternalKeyKindSet - InternalKeyKindMerge = base.InternalKeyKindMerge - InternalKeyKindLogData = base.InternalKeyKindLogData - InternalKeyKindSingleDelete = base.InternalKeyKindSingleDelete - InternalKeyKindRangeDelete = base.InternalKeyKindRangeDelete - InternalKeyKindSetWithDelete = base.InternalKeyKindSetWithDelete - InternalKeyKindDeleteSized = base.InternalKeyKindDeleteSized - InternalKeyKindMax = base.InternalKeyKindMax - InternalKeyKindInvalid = base.InternalKeyKindInvalid - InternalKeySeqNumBatch = base.InternalKeySeqNumBatch - InternalKeySeqNumMax = base.InternalKeySeqNumMax - InternalKeyRangeDeleteSentinel = base.InternalKeyRangeDeleteSentinel -) - -// InternalKey exports the base.InternalKey type. 
-type InternalKey = base.InternalKey diff --git a/vendor/github.com/cockroachdb/pebble/sstable/layout.go b/vendor/github.com/cockroachdb/pebble/sstable/layout.go deleted file mode 100644 index bff1e30..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/layout.go +++ /dev/null @@ -1,306 +0,0 @@ -// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package sstable - -import ( - "bytes" - "context" - "encoding/binary" - "fmt" - "io" - "sort" - "unsafe" - - "github.com/cockroachdb/pebble/internal/base" -) - -// Layout describes the block organization of an sstable. -type Layout struct { - // NOTE: changes to fields in this struct should also be reflected in - // ValidateBlockChecksums, which validates a static list of BlockHandles - // referenced in this struct. - - Data []BlockHandleWithProperties - Index []BlockHandle - TopIndex BlockHandle - Filter BlockHandle - RangeDel BlockHandle - RangeKey BlockHandle - ValueBlock []BlockHandle - ValueIndex BlockHandle - Properties BlockHandle - MetaIndex BlockHandle - Footer BlockHandle - Format TableFormat -} - -// Describe returns a description of the layout. If the verbose parameter is -// true, details of the structure of each block are returned as well. 
-func (l *Layout) Describe( - w io.Writer, verbose bool, r *Reader, fmtRecord func(key *base.InternalKey, value []byte), -) { - ctx := context.TODO() - type block struct { - BlockHandle - name string - } - var blocks []block - - for i := range l.Data { - blocks = append(blocks, block{l.Data[i].BlockHandle, "data"}) - } - for i := range l.Index { - blocks = append(blocks, block{l.Index[i], "index"}) - } - if l.TopIndex.Length != 0 { - blocks = append(blocks, block{l.TopIndex, "top-index"}) - } - if l.Filter.Length != 0 { - blocks = append(blocks, block{l.Filter, "filter"}) - } - if l.RangeDel.Length != 0 { - blocks = append(blocks, block{l.RangeDel, "range-del"}) - } - if l.RangeKey.Length != 0 { - blocks = append(blocks, block{l.RangeKey, "range-key"}) - } - for i := range l.ValueBlock { - blocks = append(blocks, block{l.ValueBlock[i], "value-block"}) - } - if l.ValueIndex.Length != 0 { - blocks = append(blocks, block{l.ValueIndex, "value-index"}) - } - if l.Properties.Length != 0 { - blocks = append(blocks, block{l.Properties, "properties"}) - } - if l.MetaIndex.Length != 0 { - blocks = append(blocks, block{l.MetaIndex, "meta-index"}) - } - if l.Footer.Length != 0 { - if l.Footer.Length == levelDBFooterLen { - blocks = append(blocks, block{l.Footer, "leveldb-footer"}) - } else { - blocks = append(blocks, block{l.Footer, "footer"}) - } - } - - sort.Slice(blocks, func(i, j int) bool { - return blocks[i].Offset < blocks[j].Offset - }) - - for i := range blocks { - b := &blocks[i] - fmt.Fprintf(w, "%10d %s (%d)\n", b.Offset, b.name, b.Length) - - if !verbose { - continue - } - if b.name == "filter" { - continue - } - - if b.name == "footer" || b.name == "leveldb-footer" { - trailer, offset := make([]byte, b.Length), b.Offset - _ = r.readable.ReadAt(ctx, trailer, int64(offset)) - - if b.name == "footer" { - checksumType := ChecksumType(trailer[0]) - fmt.Fprintf(w, "%10d checksum type: %s\n", offset, checksumType) - trailer, offset = trailer[1:], offset+1 - } - - 
metaHandle, n := binary.Uvarint(trailer) - metaLen, m := binary.Uvarint(trailer[n:]) - fmt.Fprintf(w, "%10d meta: offset=%d, length=%d\n", offset, metaHandle, metaLen) - trailer, offset = trailer[n+m:], offset+uint64(n+m) - - indexHandle, n := binary.Uvarint(trailer) - indexLen, m := binary.Uvarint(trailer[n:]) - fmt.Fprintf(w, "%10d index: offset=%d, length=%d\n", offset, indexHandle, indexLen) - trailer, offset = trailer[n+m:], offset+uint64(n+m) - - fmt.Fprintf(w, "%10d [padding]\n", offset) - - trailing := 12 - if b.name == "leveldb-footer" { - trailing = 8 - } - - offset += uint64(len(trailer) - trailing) - trailer = trailer[len(trailer)-trailing:] - - if b.name == "footer" { - version := trailer[:4] - fmt.Fprintf(w, "%10d version: %d\n", offset, binary.LittleEndian.Uint32(version)) - trailer, offset = trailer[4:], offset+4 - } - - magicNumber := trailer - fmt.Fprintf(w, "%10d magic number: 0x%x\n", offset, magicNumber) - - continue - } - - h, err := r.readBlock( - context.Background(), b.BlockHandle, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */) - if err != nil { - fmt.Fprintf(w, " [err: %s]\n", err) - continue - } - - getRestart := func(data []byte, restarts, i int32) int32 { - return decodeRestart(data[restarts+4*i:]) - } - - formatIsRestart := func(data []byte, restarts, numRestarts, offset int32) { - i := sort.Search(int(numRestarts), func(i int) bool { - return getRestart(data, restarts, int32(i)) >= offset - }) - if i < int(numRestarts) && getRestart(data, restarts, int32(i)) == offset { - fmt.Fprintf(w, " [restart]\n") - } else { - fmt.Fprintf(w, "\n") - } - } - - formatRestarts := func(data []byte, restarts, numRestarts int32) { - for i := int32(0); i < numRestarts; i++ { - offset := getRestart(data, restarts, i) - fmt.Fprintf(w, "%10d [restart %d]\n", - b.Offset+uint64(restarts+4*i), b.Offset+uint64(offset)) - } - } - - formatTrailer := func() { - trailer := make([]byte, blockTrailerLen) - offset := int64(b.Offset 
+ b.Length) - _ = r.readable.ReadAt(ctx, trailer, offset) - bt := blockType(trailer[0]) - checksum := binary.LittleEndian.Uint32(trailer[1:]) - fmt.Fprintf(w, "%10d [trailer compression=%s checksum=0x%04x]\n", offset, bt, checksum) - } - - var lastKey InternalKey - switch b.name { - case "data", "range-del", "range-key": - iter, _ := newBlockIter(r.Compare, h.Get()) - for key, value := iter.First(); key != nil; key, value = iter.Next() { - ptr := unsafe.Pointer(uintptr(iter.ptr) + uintptr(iter.offset)) - shared, ptr := decodeVarint(ptr) - unshared, ptr := decodeVarint(ptr) - value2, _ := decodeVarint(ptr) - - total := iter.nextOffset - iter.offset - // The format of the numbers in the record line is: - // - // ( = [] + + ) - // - // is the total number of bytes for the record. - // is the size of the 3 varint encoded integers for , - // , and . - // is the number of key bytes shared with the previous key. - // is the number of unshared key bytes. - // is the number of value bytes. - fmt.Fprintf(w, "%10d record (%d = %d [%d] + %d + %d)", - b.Offset+uint64(iter.offset), total, - total-int32(unshared+value2), shared, unshared, value2) - formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) - if fmtRecord != nil { - fmt.Fprintf(w, " ") - if l.Format < TableFormatPebblev3 { - fmtRecord(key, value.InPlaceValue()) - } else { - // InPlaceValue() will succeed even for data blocks where the - // actual value is in a different location, since this value was - // fetched from a blockIter which does not know about value - // blocks. 
- v := value.InPlaceValue() - if base.TrailerKind(key.Trailer) != InternalKeyKindSet { - fmtRecord(key, v) - } else if !isValueHandle(valuePrefix(v[0])) { - fmtRecord(key, v[1:]) - } else { - vh := decodeValueHandle(v[1:]) - fmtRecord(key, []byte(fmt.Sprintf("value handle %+v", vh))) - } - } - } - - if base.InternalCompare(r.Compare, lastKey, *key) >= 0 { - fmt.Fprintf(w, " WARNING: OUT OF ORDER KEYS!\n") - } - lastKey.Trailer = key.Trailer - lastKey.UserKey = append(lastKey.UserKey[:0], key.UserKey...) - } - formatRestarts(iter.data, iter.restarts, iter.numRestarts) - formatTrailer() - case "index", "top-index": - iter, _ := newBlockIter(r.Compare, h.Get()) - for key, value := iter.First(); key != nil; key, value = iter.Next() { - bh, err := decodeBlockHandleWithProperties(value.InPlaceValue()) - if err != nil { - fmt.Fprintf(w, "%10d [err: %s]\n", b.Offset+uint64(iter.offset), err) - continue - } - fmt.Fprintf(w, "%10d block:%d/%d", - b.Offset+uint64(iter.offset), bh.Offset, bh.Length) - formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) - } - formatRestarts(iter.data, iter.restarts, iter.numRestarts) - formatTrailer() - case "properties": - iter, _ := newRawBlockIter(r.Compare, h.Get()) - for valid := iter.First(); valid; valid = iter.Next() { - fmt.Fprintf(w, "%10d %s (%d)", - b.Offset+uint64(iter.offset), iter.Key().UserKey, iter.nextOffset-iter.offset) - formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) - } - formatRestarts(iter.data, iter.restarts, iter.numRestarts) - formatTrailer() - case "meta-index": - iter, _ := newRawBlockIter(r.Compare, h.Get()) - for valid := iter.First(); valid; valid = iter.Next() { - value := iter.Value() - var bh BlockHandle - var n int - var vbih valueBlocksIndexHandle - isValueBlocksIndexHandle := false - if bytes.Equal(iter.Key().UserKey, []byte(metaValueIndexName)) { - vbih, n, err = decodeValueBlocksIndexHandle(value) - bh = vbih.h - isValueBlocksIndexHandle = true - } else { - 
bh, n = decodeBlockHandle(value) - } - if n == 0 || n != len(value) { - fmt.Fprintf(w, "%10d [err: %s]\n", b.Offset+uint64(iter.offset), err) - continue - } - var vbihStr string - if isValueBlocksIndexHandle { - vbihStr = fmt.Sprintf(" value-blocks-index-lengths: %d(num), %d(offset), %d(length)", - vbih.blockNumByteLength, vbih.blockOffsetByteLength, vbih.blockLengthByteLength) - } - fmt.Fprintf(w, "%10d %s block:%d/%d%s", - b.Offset+uint64(iter.offset), iter.Key().UserKey, - bh.Offset, bh.Length, vbihStr) - formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset) - } - formatRestarts(iter.data, iter.restarts, iter.numRestarts) - formatTrailer() - case "value-block": - // We don't peer into the value-block since it can't be interpreted - // without the valueHandles. - case "value-index": - // We have already read the value-index to construct the list of - // value-blocks, so no need to do it again. - } - - h.Release() - } - - last := blocks[len(blocks)-1] - fmt.Fprintf(w, "%10d EOF\n", last.Offset+last.Length) -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/options.go b/vendor/github.com/cockroachdb/pebble/sstable/options.go deleted file mode 100644 index 2654f70..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/options.go +++ /dev/null @@ -1,311 +0,0 @@ -// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package sstable - -import ( - "github.com/cockroachdb/fifo" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/cache" -) - -// Compression is the per-block compression algorithm to use. -type Compression int - -// The available compression types. 
-const ( - DefaultCompression Compression = iota - NoCompression - SnappyCompression - ZstdCompression - NCompression -) - -var ignoredInternalProperties = map[string]struct{}{ - "rocksdb.column.family.id": {}, - "rocksdb.fixed.key.length": {}, - "rocksdb.index.key.is.user.key": {}, - "rocksdb.index.value.is.delta.encoded": {}, - "rocksdb.oldest.key.time": {}, - "rocksdb.creation.time": {}, - "rocksdb.file.creation.time": {}, - "rocksdb.format.version": {}, -} - -func (c Compression) String() string { - switch c { - case DefaultCompression: - return "Default" - case NoCompression: - return "NoCompression" - case SnappyCompression: - return "Snappy" - case ZstdCompression: - return "ZSTD" - default: - return "Unknown" - } -} - -// FilterType exports the base.FilterType type. -type FilterType = base.FilterType - -// Exported TableFilter constants. -const ( - TableFilter = base.TableFilter -) - -// FilterWriter exports the base.FilterWriter type. -type FilterWriter = base.FilterWriter - -// FilterPolicy exports the base.FilterPolicy type. -type FilterPolicy = base.FilterPolicy - -// TablePropertyCollector provides a hook for collecting user-defined -// properties based on the keys and values stored in an sstable. A new -// TablePropertyCollector is created for an sstable when the sstable is being -// written. -type TablePropertyCollector interface { - // Add is called with each new entry added to the sstable. While the sstable - // is itself sorted by key, do not assume that the entries are added in any - // order. In particular, the ordering of point entries and range tombstones - // is unspecified. - Add(key InternalKey, value []byte) error - - // Finish is called when all entries have been added to the sstable. The - // collected properties (if any) should be added to the specified map. Note - // that in case of an error during sstable construction, Finish may not be - // called. - Finish(userProps map[string]string) error - - // The name of the property collector. 
- Name() string -} - -// SuffixReplaceableTableCollector is an extension to the TablePropertyCollector -// interface that allows a table property collector to indicate that it supports -// being *updated* during suffix replacement, i.e. when an existing SST in which -// all keys have the same key suffix is updated to have a new suffix. -// -// A collector which supports being updated in such cases must be able to derive -// its updated value from its old value and the change being made to the suffix, -// without needing to be passed each updated K/V. -// -// For example, a collector that only inspects values can simply copy its -// previously computed property as-is, since key-suffix replacement does not -// change values, while a collector that depends only on key suffixes, like one -// which collected mvcc-timestamp bounds from timestamp-suffixed keys, can just -// set its new bounds from the new suffix, as it is common to all keys, without -// needing to recompute it from every key. -type SuffixReplaceableTableCollector interface { - // UpdateKeySuffixes is called when a table is updated to change the suffix of - // all keys in the table, and is passed the old value for that prop, if any, - // for that table as well as the old and new suffix. - UpdateKeySuffixes(oldProps map[string]string, oldSuffix, newSuffix []byte) error -} - -// ReaderOptions holds the parameters needed for reading an sstable. -type ReaderOptions struct { - // Cache is used to cache uncompressed blocks from sstables. - // - // The default cache size is a zero-size cache. - Cache *cache.Cache - - // LoadBlockSema, if set, is used to limit the number of blocks that can be - // loaded (i.e. read from the filesystem) in parallel. Each load acquires one - // unit from the semaphore for the duration of the read. - LoadBlockSema *fifo.Semaphore - - // User properties specified in this map will not be added to sst.Properties.UserProperties. 
- DeniedUserProperties map[string]struct{} - - // Comparer defines a total ordering over the space of []byte keys: a 'less - // than' relationship. The same comparison algorithm must be used for reads - // and writes over the lifetime of the DB. - // - // The default value uses the same ordering as bytes.Compare. - Comparer *Comparer - - // Merge defines the Merge function in use for this keyspace. - Merge base.Merge - - // Filters is a map from filter policy name to filter policy. It is used for - // debugging tools which may be used on multiple databases configured with - // different filter policies. It is not necessary to populate this filters - // map during normal usage of a DB. - Filters map[string]FilterPolicy - - // Merger defines the associative merge operation to use for merging values - // written with {Batch,DB}.Merge. The MergerName is checked for consistency - // with the value stored in the sstable when it was written. - MergerName string - - // Logger is an optional logger and tracer. - LoggerAndTracer base.LoggerAndTracer -} - -func (o ReaderOptions) ensureDefaults() ReaderOptions { - if o.Comparer == nil { - o.Comparer = base.DefaultComparer - } - if o.Merge == nil { - o.Merge = base.DefaultMerger.Merge - } - if o.MergerName == "" { - o.MergerName = base.DefaultMerger.Name - } - if o.LoggerAndTracer == nil { - o.LoggerAndTracer = base.NoopLoggerAndTracer{} - } - if o.DeniedUserProperties == nil { - o.DeniedUserProperties = ignoredInternalProperties - } - return o -} - -// WriterOptions holds the parameters used to control building an sstable. -type WriterOptions struct { - // BlockRestartInterval is the number of keys between restart points - // for delta encoding of keys. - // - // The default value is 16. - BlockRestartInterval int - - // BlockSize is the target uncompressed size in bytes of each table block. - // - // The default value is 4096. 
- BlockSize int - - // BlockSizeThreshold finishes a block if the block size is larger than the - // specified percentage of the target block size and adding the next entry - // would cause the block to be larger than the target block size. - // - // The default value is 90 - BlockSizeThreshold int - - // Cache is used to cache uncompressed blocks from sstables. - // - // The default is a nil cache. - Cache *cache.Cache - - // Comparer defines a total ordering over the space of []byte keys: a 'less - // than' relationship. The same comparison algorithm must be used for reads - // and writes over the lifetime of the DB. - // - // The default value uses the same ordering as bytes.Compare. - Comparer *Comparer - - // Compression defines the per-block compression to use. - // - // The default value (DefaultCompression) uses snappy compression. - Compression Compression - - // FilterPolicy defines a filter algorithm (such as a Bloom filter) that can - // reduce disk reads for Get calls. - // - // One such implementation is bloom.FilterPolicy(10) from the pebble/bloom - // package. - // - // The default value means to use no filter. - FilterPolicy FilterPolicy - - // FilterType defines whether an existing filter policy is applied at a - // block-level or table-level. Block-level filters use less memory to create, - // but are slower to access as a check for the key in the index must first be - // performed to locate the filter block. A table-level filter will require - // memory proportional to the number of keys in an sstable to create, but - // avoids the index lookup when determining if a key is present. Table-level - // filters should be preferred except under constrained memory situations. - FilterType FilterType - - // IndexBlockSize is the target uncompressed size in bytes of each index - // block. When the index block size is larger than this target, two-level - // indexes are automatically enabled. 
Setting this option to a large value - // (such as math.MaxInt32) disables the automatic creation of two-level - // indexes. - // - // The default value is the value of BlockSize. - IndexBlockSize int - - // Merger defines the associative merge operation to use for merging values - // written with {Batch,DB}.Merge. The MergerName is checked for consistency - // with the value stored in the sstable when it was written. - MergerName string - - // TableFormat specifies the format version for writing sstables. The default - // is TableFormatRocksDBv2 which creates RocksDB compatible sstables. Use - // TableFormatLevelDB to create LevelDB compatible sstable which can be used - // by a wider range of tools and libraries. - TableFormat TableFormat - - // IsStrictObsolete is only relevant for >= TableFormatPebblev4. See comment - // in format.go. Must be false if format < TableFormatPebblev4. - // - // TODO(bilal): set this when writing shared ssts. - IsStrictObsolete bool - - // WritingToLowestLevel is only relevant for >= TableFormatPebblev4. It is - // used to set the obsolete bit on DEL/DELSIZED/SINGLEDEL if they are the - // youngest for a userkey. - WritingToLowestLevel bool - - // TablePropertyCollectors is a list of TablePropertyCollector creation - // functions. A new TablePropertyCollector is created for each sstable built - // and lives for the lifetime of the table. - TablePropertyCollectors []func() TablePropertyCollector - - // BlockPropertyCollectors is a list of BlockPropertyCollector creation - // functions. A new BlockPropertyCollector is created for each sstable - // built and lives for the lifetime of writing that table. - BlockPropertyCollectors []func() BlockPropertyCollector - - // Checksum specifies which checksum to use. - Checksum ChecksumType - - // Parallelism is used to indicate that the sstable Writer is allowed to - // compress data blocks and write datablocks to disk in parallel with the - // Writer client goroutine. 
- Parallelism bool - - // ShortAttributeExtractor mirrors - // Options.Experimental.ShortAttributeExtractor. - ShortAttributeExtractor base.ShortAttributeExtractor - - // RequiredInPlaceValueBound mirrors - // Options.Experimental.RequiredInPlaceValueBound. - RequiredInPlaceValueBound UserKeyPrefixBound -} - -func (o WriterOptions) ensureDefaults() WriterOptions { - if o.BlockRestartInterval <= 0 { - o.BlockRestartInterval = base.DefaultBlockRestartInterval - } - if o.BlockSize <= 0 { - o.BlockSize = base.DefaultBlockSize - } - if o.BlockSizeThreshold <= 0 { - o.BlockSizeThreshold = base.DefaultBlockSizeThreshold - } - if o.Comparer == nil { - o.Comparer = base.DefaultComparer - } - if o.Compression <= DefaultCompression || o.Compression >= NCompression { - o.Compression = SnappyCompression - } - if o.IndexBlockSize <= 0 { - o.IndexBlockSize = o.BlockSize - } - if o.MergerName == "" { - o.MergerName = base.DefaultMerger.Name - } - if o.Checksum == ChecksumTypeNone { - o.Checksum = ChecksumTypeCRC32c - } - // By default, if the table format is not specified, fall back to using the - // most compatible format. - if o.TableFormat == TableFormatUnspecified { - o.TableFormat = TableFormatRocksDBv2 - } - return o -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/raw_block.go b/vendor/github.com/cockroachdb/pebble/sstable/raw_block.go deleted file mode 100644 index d33b51a..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/raw_block.go +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. 
- -package sstable - -import ( - "encoding/binary" - "sort" - "unsafe" - - "github.com/cockroachdb/pebble/internal/base" -) - -type rawBlockWriter struct { - blockWriter -} - -func (w *rawBlockWriter) add(key InternalKey, value []byte) { - w.curKey, w.prevKey = w.prevKey, w.curKey - - size := len(key.UserKey) - if cap(w.curKey) < size { - w.curKey = make([]byte, 0, size*2) - } - w.curKey = w.curKey[:size] - copy(w.curKey, key.UserKey) - - w.storeWithOptionalValuePrefix( - size, value, len(key.UserKey), false, 0, false) -} - -// rawBlockIter is an iterator over a single block of data. Unlike blockIter, -// keys are stored in "raw" format (i.e. not as internal keys). Note that there -// is significant similarity between this code and the code in blockIter. Yet -// reducing duplication is difficult due to the blockIter being performance -// critical. rawBlockIter must only be used for blocks where the value is -// stored together with the key. -type rawBlockIter struct { - cmp Compare - offset int32 - nextOffset int32 - restarts int32 - numRestarts int32 - ptr unsafe.Pointer - data []byte - key, val []byte - ikey InternalKey - cached []blockEntry - cachedBuf []byte -} - -func newRawBlockIter(cmp Compare, block block) (*rawBlockIter, error) { - i := &rawBlockIter{} - return i, i.init(cmp, block) -} - -func (i *rawBlockIter) init(cmp Compare, block block) error { - numRestarts := int32(binary.LittleEndian.Uint32(block[len(block)-4:])) - if numRestarts == 0 { - return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)") - } - i.cmp = cmp - i.restarts = int32(len(block)) - 4*(1+numRestarts) - i.numRestarts = numRestarts - i.ptr = unsafe.Pointer(&block[0]) - i.data = block - if i.key == nil { - i.key = make([]byte, 0, 256) - } else { - i.key = i.key[:0] - } - i.val = nil - i.clearCache() - return nil -} - -func (i *rawBlockIter) readEntry() { - ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset)) - shared, ptr := decodeVarint(ptr) - 
unshared, ptr := decodeVarint(ptr) - value, ptr := decodeVarint(ptr) - i.key = append(i.key[:shared], getBytes(ptr, int(unshared))...) - i.key = i.key[:len(i.key):len(i.key)] - ptr = unsafe.Pointer(uintptr(ptr) + uintptr(unshared)) - i.val = getBytes(ptr, int(value)) - i.nextOffset = int32(uintptr(ptr)-uintptr(i.ptr)) + int32(value) -} - -func (i *rawBlockIter) loadEntry() { - i.readEntry() - i.ikey.UserKey = i.key -} - -func (i *rawBlockIter) clearCache() { - i.cached = i.cached[:0] - i.cachedBuf = i.cachedBuf[:0] -} - -func (i *rawBlockIter) cacheEntry() { - var valStart int32 - valSize := int32(len(i.val)) - if valSize > 0 { - valStart = int32(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr)) - } - - i.cached = append(i.cached, blockEntry{ - offset: i.offset, - keyStart: int32(len(i.cachedBuf)), - keyEnd: int32(len(i.cachedBuf) + len(i.key)), - valStart: valStart, - valSize: valSize, - }) - i.cachedBuf = append(i.cachedBuf, i.key...) -} - -// SeekGE implements internalIterator.SeekGE, as documented in the pebble -// package. -func (i *rawBlockIter) SeekGE(key []byte) bool { - // Find the index of the smallest restart point whose key is > the key - // sought; index will be numRestarts if there is no such restart point. - i.offset = 0 - index := sort.Search(int(i.numRestarts), func(j int) bool { - offset := int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*j:])) - // For a restart point, there are 0 bytes shared with the previous key. - // The varint encoding of 0 occupies 1 byte. - ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1)) - // Decode the key at that restart point, and compare it to the key sought. - v1, ptr := decodeVarint(ptr) - _, ptr = decodeVarint(ptr) - s := getBytes(ptr, int(v1)) - return i.cmp(key, s) < 0 - }) - - // Since keys are strictly increasing, if index > 0 then the restart point at - // index-1 will be the largest whose key is <= the key sought. 
If index == - // 0, then all keys in this block are larger than the key sought, and offset - // remains at zero. - if index > 0 { - i.offset = int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*(index-1):])) - } - i.loadEntry() - - // Iterate from that restart point to somewhere >= the key sought. - for valid := i.Valid(); valid; valid = i.Next() { - if i.cmp(key, i.key) <= 0 { - break - } - } - return i.Valid() -} - -// First implements internalIterator.First, as documented in the pebble -// package. -func (i *rawBlockIter) First() bool { - i.offset = 0 - i.loadEntry() - return i.Valid() -} - -// Last implements internalIterator.Last, as documented in the pebble package. -func (i *rawBlockIter) Last() bool { - // Seek forward from the last restart point. - i.offset = int32(binary.LittleEndian.Uint32(i.data[i.restarts+4*(i.numRestarts-1):])) - - i.readEntry() - i.clearCache() - i.cacheEntry() - - for i.nextOffset < i.restarts { - i.offset = i.nextOffset - i.readEntry() - i.cacheEntry() - } - - i.ikey.UserKey = i.key - return i.Valid() -} - -// Next implements internalIterator.Next, as documented in the pebble -// package. -func (i *rawBlockIter) Next() bool { - i.offset = i.nextOffset - if !i.Valid() { - return false - } - i.loadEntry() - return true -} - -// Prev implements internalIterator.Prev, as documented in the pebble -// package. 
-func (i *rawBlockIter) Prev() bool { - if n := len(i.cached) - 1; n > 0 && i.cached[n].offset == i.offset { - i.nextOffset = i.offset - e := &i.cached[n-1] - i.offset = e.offset - i.val = getBytes(unsafe.Pointer(uintptr(i.ptr)+uintptr(e.valStart)), int(e.valSize)) - i.ikey.UserKey = i.cachedBuf[e.keyStart:e.keyEnd] - i.cached = i.cached[:n] - return true - } - - if i.offset == 0 { - i.offset = -1 - i.nextOffset = 0 - return false - } - - targetOffset := i.offset - index := sort.Search(int(i.numRestarts), func(j int) bool { - offset := int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*j:])) - return offset >= targetOffset - }) - i.offset = 0 - if index > 0 { - i.offset = int32(binary.LittleEndian.Uint32(i.data[int(i.restarts)+4*(index-1):])) - } - - i.readEntry() - i.clearCache() - i.cacheEntry() - - for i.nextOffset < targetOffset { - i.offset = i.nextOffset - i.readEntry() - i.cacheEntry() - } - - i.ikey.UserKey = i.key - return true -} - -// Key implements internalIterator.Key, as documented in the pebble package. -func (i *rawBlockIter) Key() InternalKey { - return i.ikey -} - -// Value implements internalIterator.Value, as documented in the pebble -// package. -func (i *rawBlockIter) Value() []byte { - return i.val -} - -// Valid implements internalIterator.Valid, as documented in the pebble -// package. -func (i *rawBlockIter) Valid() bool { - return i.offset >= 0 && i.offset < i.restarts -} - -// Error implements internalIterator.Error, as documented in the pebble -// package. -func (i *rawBlockIter) Error() error { - return nil -} - -// Close implements internalIterator.Close, as documented in the pebble -// package. 
-func (i *rawBlockIter) Close() error { - i.val = nil - return nil -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/reader.go b/vendor/github.com/cockroachdb/pebble/sstable/reader.go deleted file mode 100644 index 39c7a69..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/reader.go +++ /dev/null @@ -1,1241 +0,0 @@ -// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package sstable - -import ( - "bytes" - "context" - "encoding/binary" - "io" - "os" - "sort" - "time" - - "github.com/cespare/xxhash/v2" - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/bytealloc" - "github.com/cockroachdb/pebble/internal/cache" - "github.com/cockroachdb/pebble/internal/crc" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/private" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing" -) - -var errCorruptIndexEntry = base.CorruptionErrorf("pebble/table: corrupt index entry") -var errReaderClosed = errors.New("pebble/table: reader is closed") - -// decodeBlockHandle returns the block handle encoded at the start of src, as -// well as the number of bytes it occupies. It returns zero if given invalid -// input. A block handle for a data block or a first/lower level index block -// should not be decoded using decodeBlockHandle since the caller may validate -// that the number of bytes decoded is equal to the length of src, which will -// be false if the properties are not decoded. In those cases the caller -// should use decodeBlockHandleWithProperties. 
-func decodeBlockHandle(src []byte) (BlockHandle, int) { - offset, n := binary.Uvarint(src) - length, m := binary.Uvarint(src[n:]) - if n == 0 || m == 0 { - return BlockHandle{}, 0 - } - return BlockHandle{offset, length}, n + m -} - -// decodeBlockHandleWithProperties returns the block handle and properties -// encoded in src. src needs to be exactly the length that was encoded. This -// method must be used for data block and first/lower level index blocks. The -// properties in the block handle point to the bytes in src. -func decodeBlockHandleWithProperties(src []byte) (BlockHandleWithProperties, error) { - bh, n := decodeBlockHandle(src) - if n == 0 { - return BlockHandleWithProperties{}, errors.Errorf("invalid BlockHandle") - } - return BlockHandleWithProperties{ - BlockHandle: bh, - Props: src[n:], - }, nil -} - -func encodeBlockHandle(dst []byte, b BlockHandle) int { - n := binary.PutUvarint(dst, b.Offset) - m := binary.PutUvarint(dst[n:], b.Length) - return n + m -} - -func encodeBlockHandleWithProperties(dst []byte, b BlockHandleWithProperties) []byte { - n := encodeBlockHandle(dst, b.BlockHandle) - dst = append(dst[:n], b.Props...) - return dst -} - -// block is a []byte that holds a sequence of key/value pairs plus an index -// over those pairs. -type block []byte - -type loadBlockResult int8 - -const ( - loadBlockOK loadBlockResult = iota - // Could be due to error or because no block left to load. - loadBlockFailed - loadBlockIrrelevant -) - -type blockTransform func([]byte) ([]byte, error) - -// ReaderOption provide an interface to do work on Reader while it is being -// opened. -type ReaderOption interface { - // readerApply is called on the reader during opening in order to set internal - // parameters. - readerApply(*Reader) -} - -// Comparers is a map from comparer name to comparer. It is used for debugging -// tools which may be used on multiple databases configured with different -// comparers. 
Comparers implements the OpenOption interface and can be passed -// as a parameter to NewReader. -type Comparers map[string]*Comparer - -func (c Comparers) readerApply(r *Reader) { - if r.Compare != nil || r.Properties.ComparerName == "" { - return - } - if comparer, ok := c[r.Properties.ComparerName]; ok { - r.Compare = comparer.Compare - r.FormatKey = comparer.FormatKey - r.Split = comparer.Split - } -} - -// Mergers is a map from merger name to merger. It is used for debugging tools -// which may be used on multiple databases configured with different -// mergers. Mergers implements the OpenOption interface and can be passed as -// a parameter to NewReader. -type Mergers map[string]*Merger - -func (m Mergers) readerApply(r *Reader) { - if r.mergerOK || r.Properties.MergerName == "" { - return - } - _, r.mergerOK = m[r.Properties.MergerName] -} - -// cacheOpts is a Reader open option for specifying the cache ID and sstable file -// number. If not specified, a unique cache ID will be used. -type cacheOpts struct { - cacheID uint64 - fileNum base.DiskFileNum -} - -// Marker function to indicate the option should be applied before reading the -// sstable properties and, in the write path, before writing the default -// sstable properties. -func (c *cacheOpts) preApply() {} - -func (c *cacheOpts) readerApply(r *Reader) { - if r.cacheID == 0 { - r.cacheID = c.cacheID - } - if r.fileNum.FileNum() == 0 { - r.fileNum = c.fileNum - } -} - -func (c *cacheOpts) writerApply(w *Writer) { - if w.cacheID == 0 { - w.cacheID = c.cacheID - } - if w.fileNum.FileNum() == 0 { - w.fileNum = c.fileNum - } -} - -// rawTombstonesOpt is a Reader open option for specifying that range -// tombstones returned by Reader.NewRangeDelIter() should not be -// fragmented. Used by debug tools to get a raw view of the tombstones -// contained in an sstable. 
-type rawTombstonesOpt struct{} - -func (rawTombstonesOpt) preApply() {} - -func (rawTombstonesOpt) readerApply(r *Reader) { - r.rawTombstones = true -} - -func init() { - private.SSTableCacheOpts = func(cacheID uint64, fileNum base.DiskFileNum) interface{} { - return &cacheOpts{cacheID, fileNum} - } - private.SSTableRawTombstonesOpt = rawTombstonesOpt{} -} - -// CommonReader abstracts functionality over a Reader or a VirtualReader. This -// can be used by code which doesn't care to distinguish between a reader and a -// virtual reader. -type CommonReader interface { - NewRawRangeKeyIter() (keyspan.FragmentIterator, error) - NewRawRangeDelIter() (keyspan.FragmentIterator, error) - NewIterWithBlockPropertyFiltersAndContextEtc( - ctx context.Context, lower, upper []byte, - filterer *BlockPropertiesFilterer, - hideObsoletePoints, useFilterBlock bool, - stats *base.InternalIteratorStats, - rp ReaderProvider, - ) (Iterator, error) - NewCompactionIter( - bytesIterated *uint64, - rp ReaderProvider, - bufferPool *BufferPool, - ) (Iterator, error) - EstimateDiskUsage(start, end []byte) (uint64, error) - CommonProperties() *CommonProperties -} - -// Reader is a table reader. -type Reader struct { - readable objstorage.Readable - cacheID uint64 - fileNum base.DiskFileNum - err error - indexBH BlockHandle - filterBH BlockHandle - rangeDelBH BlockHandle - rangeKeyBH BlockHandle - rangeDelTransform blockTransform - valueBIH valueBlocksIndexHandle - propertiesBH BlockHandle - metaIndexBH BlockHandle - footerBH BlockHandle - opts ReaderOptions - Compare Compare - FormatKey base.FormatKey - Split Split - tableFilter *tableFilterReader - // Keep types that are not multiples of 8 bytes at the end and with - // decreasing size. - Properties Properties - tableFormat TableFormat - rawTombstones bool - mergerOK bool - checksumType ChecksumType - // metaBufferPool is a buffer pool used exclusively when opening a table and - // loading its meta blocks. 
metaBufferPoolAlloc is used to batch-allocate - // the BufferPool.pool slice as a part of the Reader allocation. It's - // capacity 3 to accommodate the meta block (1), and both the compressed - // properties block (1) and decompressed properties block (1) - // simultaneously. - metaBufferPool BufferPool - metaBufferPoolAlloc [3]allocedBuffer -} - -// Close implements DB.Close, as documented in the pebble package. -func (r *Reader) Close() error { - r.opts.Cache.Unref() - - if r.readable != nil { - r.err = firstError(r.err, r.readable.Close()) - r.readable = nil - } - - if r.err != nil { - return r.err - } - // Make any future calls to Get, NewIter or Close return an error. - r.err = errReaderClosed - return nil -} - -// NewIterWithBlockPropertyFilters returns an iterator for the contents of the -// table. If an error occurs, NewIterWithBlockPropertyFilters cleans up after -// itself and returns a nil iterator. -func (r *Reader) NewIterWithBlockPropertyFilters( - lower, upper []byte, - filterer *BlockPropertiesFilterer, - useFilterBlock bool, - stats *base.InternalIteratorStats, - rp ReaderProvider, -) (Iterator, error) { - return r.newIterWithBlockPropertyFiltersAndContext( - context.Background(), - lower, upper, filterer, false, useFilterBlock, stats, rp, nil, - ) -} - -// NewIterWithBlockPropertyFiltersAndContextEtc is similar to -// NewIterWithBlockPropertyFilters and additionally accepts a context for -// tracing. -// -// If hideObsoletePoints, the callee assumes that filterer already includes -// obsoleteKeyBlockPropertyFilter. The caller can satisfy this contract by -// first calling TryAddBlockPropertyFilterForHideObsoletePoints. 
-func (r *Reader) NewIterWithBlockPropertyFiltersAndContextEtc( - ctx context.Context, - lower, upper []byte, - filterer *BlockPropertiesFilterer, - hideObsoletePoints, useFilterBlock bool, - stats *base.InternalIteratorStats, - rp ReaderProvider, -) (Iterator, error) { - return r.newIterWithBlockPropertyFiltersAndContext( - ctx, lower, upper, filterer, hideObsoletePoints, useFilterBlock, stats, rp, nil, - ) -} - -// TryAddBlockPropertyFilterForHideObsoletePoints is expected to be called -// before the call to NewIterWithBlockPropertyFiltersAndContextEtc, to get the -// value of hideObsoletePoints and potentially add a block property filter. -func (r *Reader) TryAddBlockPropertyFilterForHideObsoletePoints( - snapshotForHideObsoletePoints uint64, - fileLargestSeqNum uint64, - pointKeyFilters []BlockPropertyFilter, -) (hideObsoletePoints bool, filters []BlockPropertyFilter) { - hideObsoletePoints = r.tableFormat >= TableFormatPebblev4 && - snapshotForHideObsoletePoints > fileLargestSeqNum - if hideObsoletePoints { - pointKeyFilters = append(pointKeyFilters, obsoleteKeyBlockPropertyFilter{}) - } - return hideObsoletePoints, pointKeyFilters -} - -func (r *Reader) newIterWithBlockPropertyFiltersAndContext( - ctx context.Context, - lower, upper []byte, - filterer *BlockPropertiesFilterer, - hideObsoletePoints bool, - useFilterBlock bool, - stats *base.InternalIteratorStats, - rp ReaderProvider, - v *virtualState, -) (Iterator, error) { - // NB: pebble.tableCache wraps the returned iterator with one which performs - // reference counting on the Reader, preventing the Reader from being closed - // until the final iterator closes. 
- if r.Properties.IndexType == twoLevelIndex { - i := twoLevelIterPool.Get().(*twoLevelIterator) - err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, rp, nil /* bufferPool */) - if err != nil { - return nil, err - } - return i, nil - } - - i := singleLevelIterPool.Get().(*singleLevelIterator) - err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, rp, nil /* bufferPool */) - if err != nil { - return nil, err - } - return i, nil -} - -// NewIter returns an iterator for the contents of the table. If an error -// occurs, NewIter cleans up after itself and returns a nil iterator. NewIter -// must only be used when the Reader is guaranteed to outlive any LazyValues -// returned from the iter. -func (r *Reader) NewIter(lower, upper []byte) (Iterator, error) { - return r.NewIterWithBlockPropertyFilters( - lower, upper, nil, true /* useFilterBlock */, nil, /* stats */ - TrivialReaderProvider{Reader: r}) -} - -// NewCompactionIter returns an iterator similar to NewIter but it also increments -// the number of bytes iterated. If an error occurs, NewCompactionIter cleans up -// after itself and returns a nil iterator. 
-func (r *Reader) NewCompactionIter( - bytesIterated *uint64, rp ReaderProvider, bufferPool *BufferPool, -) (Iterator, error) { - return r.newCompactionIter(bytesIterated, rp, nil, bufferPool) -} - -func (r *Reader) newCompactionIter( - bytesIterated *uint64, rp ReaderProvider, v *virtualState, bufferPool *BufferPool, -) (Iterator, error) { - if r.Properties.IndexType == twoLevelIndex { - i := twoLevelIterPool.Get().(*twoLevelIterator) - err := i.init( - context.Background(), - r, v, nil /* lower */, nil /* upper */, nil, - false /* useFilter */, v != nil && v.isForeign, /* hideObsoletePoints */ - nil /* stats */, rp, bufferPool, - ) - if err != nil { - return nil, err - } - i.setupForCompaction() - return &twoLevelCompactionIterator{ - twoLevelIterator: i, - bytesIterated: bytesIterated, - }, nil - } - i := singleLevelIterPool.Get().(*singleLevelIterator) - err := i.init( - context.Background(), r, v, nil /* lower */, nil, /* upper */ - nil, false /* useFilter */, v != nil && v.isForeign, /* hideObsoletePoints */ - nil /* stats */, rp, bufferPool, - ) - if err != nil { - return nil, err - } - i.setupForCompaction() - return &compactionIterator{ - singleLevelIterator: i, - bytesIterated: bytesIterated, - }, nil -} - -// NewRawRangeDelIter returns an internal iterator for the contents of the -// range-del block for the table. Returns nil if the table does not contain -// any range deletions. -// -// TODO(sumeer): plumb context.Context since this path is relevant in the user-facing -// iterator. Add WithContext methods since the existing ones are public. 
-func (r *Reader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) { - if r.rangeDelBH.Length == 0 { - return nil, nil - } - h, err := r.readRangeDel(nil /* stats */) - if err != nil { - return nil, err - } - i := &fragmentBlockIter{elideSameSeqnum: true} - if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil { - return nil, err - } - return i, nil -} - -// NewRawRangeKeyIter returns an internal iterator for the contents of the -// range-key block for the table. Returns nil if the table does not contain any -// range keys. -// -// TODO(sumeer): plumb context.Context since this path is relevant in the user-facing -// iterator. Add WithContext methods since the existing ones are public. -func (r *Reader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) { - if r.rangeKeyBH.Length == 0 { - return nil, nil - } - h, err := r.readRangeKey(nil /* stats */) - if err != nil { - return nil, err - } - i := rangeKeyFragmentBlockIterPool.Get().(*rangeKeyFragmentBlockIter) - if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil { - return nil, err - } - return i, nil -} - -type rangeKeyFragmentBlockIter struct { - fragmentBlockIter -} - -func (i *rangeKeyFragmentBlockIter) Close() error { - err := i.fragmentBlockIter.Close() - i.fragmentBlockIter = i.fragmentBlockIter.resetForReuse() - rangeKeyFragmentBlockIterPool.Put(i) - return err -} - -func (r *Reader) readIndex( - ctx context.Context, stats *base.InternalIteratorStats, -) (bufferHandle, error) { - ctx = objiotracing.WithBlockType(ctx, objiotracing.MetadataBlock) - return r.readBlock(ctx, r.indexBH, nil, nil, stats, nil /* buffer pool */) -} - -func (r *Reader) readFilter( - ctx context.Context, stats *base.InternalIteratorStats, -) (bufferHandle, error) { - ctx = objiotracing.WithBlockType(ctx, objiotracing.FilterBlock) - return r.readBlock(ctx, r.filterBH, nil /* transform */, nil /* readHandle */, stats, nil /* buffer pool */) -} - 
-func (r *Reader) readRangeDel(stats *base.InternalIteratorStats) (bufferHandle, error) { - ctx := objiotracing.WithBlockType(context.Background(), objiotracing.MetadataBlock) - return r.readBlock(ctx, r.rangeDelBH, r.rangeDelTransform, nil /* readHandle */, stats, nil /* buffer pool */) -} - -func (r *Reader) readRangeKey(stats *base.InternalIteratorStats) (bufferHandle, error) { - ctx := objiotracing.WithBlockType(context.Background(), objiotracing.MetadataBlock) - return r.readBlock(ctx, r.rangeKeyBH, nil /* transform */, nil /* readHandle */, stats, nil /* buffer pool */) -} - -func checkChecksum( - checksumType ChecksumType, b []byte, bh BlockHandle, fileNum base.FileNum, -) error { - expectedChecksum := binary.LittleEndian.Uint32(b[bh.Length+1:]) - var computedChecksum uint32 - switch checksumType { - case ChecksumTypeCRC32c: - computedChecksum = crc.New(b[:bh.Length+1]).Value() - case ChecksumTypeXXHash64: - computedChecksum = uint32(xxhash.Sum64(b[:bh.Length+1])) - default: - return errors.Errorf("unsupported checksum type: %d", checksumType) - } - - if expectedChecksum != computedChecksum { - return base.CorruptionErrorf( - "pebble/table: invalid table %s (checksum mismatch at %d/%d)", - errors.Safe(fileNum), errors.Safe(bh.Offset), errors.Safe(bh.Length)) - } - return nil -} - -type cacheValueOrBuf struct { - // buf.Valid() returns true if backed by a BufferPool. - buf Buf - // v is non-nil if backed by the block cache. 
- v *cache.Value -} - -func (b cacheValueOrBuf) get() []byte { - if b.buf.Valid() { - return b.buf.p.pool[b.buf.i].b - } - return b.v.Buf() -} - -func (b cacheValueOrBuf) release() { - if b.buf.Valid() { - b.buf.Release() - } else { - cache.Free(b.v) - } -} - -func (b cacheValueOrBuf) truncate(n int) { - if b.buf.Valid() { - b.buf.p.pool[b.buf.i].b = b.buf.p.pool[b.buf.i].b[:n] - } else { - b.v.Truncate(n) - } -} - -func (r *Reader) readBlock( - ctx context.Context, - bh BlockHandle, - transform blockTransform, - readHandle objstorage.ReadHandle, - stats *base.InternalIteratorStats, - bufferPool *BufferPool, -) (handle bufferHandle, _ error) { - if h := r.opts.Cache.Get(r.cacheID, r.fileNum, bh.Offset); h.Get() != nil { - // Cache hit. - if readHandle != nil { - readHandle.RecordCacheHit(ctx, int64(bh.Offset), int64(bh.Length+blockTrailerLen)) - } - if stats != nil { - stats.BlockBytes += bh.Length - stats.BlockBytesInCache += bh.Length - } - // This block is already in the cache; return a handle to existing vlaue - // in the cache. - return bufferHandle{h: h}, nil - } - - // Cache miss. - - if sema := r.opts.LoadBlockSema; sema != nil { - if err := sema.Acquire(ctx, 1); err != nil { - // An error here can only come from the context. - return bufferHandle{}, err - } - defer sema.Release(1) - } - - var compressed cacheValueOrBuf - if bufferPool != nil { - compressed = cacheValueOrBuf{ - buf: bufferPool.Alloc(int(bh.Length + blockTrailerLen)), - } - } else { - compressed = cacheValueOrBuf{ - v: cache.Alloc(int(bh.Length + blockTrailerLen)), - } - } - - readStartTime := time.Now() - var err error - if readHandle != nil { - err = readHandle.ReadAt(ctx, compressed.get(), int64(bh.Offset)) - } else { - err = r.readable.ReadAt(ctx, compressed.get(), int64(bh.Offset)) - } - readDuration := time.Since(readStartTime) - // TODO(sumeer): should the threshold be configurable. 
- const slowReadTracingThreshold = 5 * time.Millisecond - // The invariants.Enabled path is for deterministic testing. - if invariants.Enabled { - readDuration = slowReadTracingThreshold - } - // Call IsTracingEnabled to avoid the allocations of boxing integers into an - // interface{}, unless necessary. - if readDuration >= slowReadTracingThreshold && r.opts.LoggerAndTracer.IsTracingEnabled(ctx) { - r.opts.LoggerAndTracer.Eventf(ctx, "reading %d bytes took %s", - int(bh.Length+blockTrailerLen), readDuration.String()) - } - if stats != nil { - stats.BlockBytes += bh.Length - stats.BlockReadDuration += readDuration - } - if err != nil { - compressed.release() - return bufferHandle{}, err - } - if err := checkChecksum(r.checksumType, compressed.get(), bh, r.fileNum.FileNum()); err != nil { - compressed.release() - return bufferHandle{}, err - } - - typ := blockType(compressed.get()[bh.Length]) - compressed.truncate(int(bh.Length)) - - var decompressed cacheValueOrBuf - if typ == noCompressionBlockType { - decompressed = compressed - } else { - // Decode the length of the decompressed value. - decodedLen, prefixLen, err := decompressedLen(typ, compressed.get()) - if err != nil { - compressed.release() - return bufferHandle{}, err - } - - if bufferPool != nil { - decompressed = cacheValueOrBuf{buf: bufferPool.Alloc(decodedLen)} - } else { - decompressed = cacheValueOrBuf{v: cache.Alloc(decodedLen)} - } - if _, err := decompressInto(typ, compressed.get()[prefixLen:], decompressed.get()); err != nil { - compressed.release() - return bufferHandle{}, err - } - compressed.release() - } - - if transform != nil { - // Transforming blocks is very rare, so the extra copy of the - // transformed data is not problematic. 
- tmpTransformed, err := transform(decompressed.get()) - if err != nil { - decompressed.release() - return bufferHandle{}, err - } - - var transformed cacheValueOrBuf - if bufferPool != nil { - transformed = cacheValueOrBuf{buf: bufferPool.Alloc(len(tmpTransformed))} - } else { - transformed = cacheValueOrBuf{v: cache.Alloc(len(tmpTransformed))} - } - copy(transformed.get(), tmpTransformed) - decompressed.release() - decompressed = transformed - } - - if decompressed.buf.Valid() { - return bufferHandle{b: decompressed.buf}, nil - } - h := r.opts.Cache.Set(r.cacheID, r.fileNum, bh.Offset, decompressed.v) - return bufferHandle{h: h}, nil -} - -func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) { - // Convert v1 (RocksDB format) range-del blocks to v2 blocks on the fly. The - // v1 format range-del blocks have unfragmented and unsorted range - // tombstones. We need properly fragmented and sorted range tombstones in - // order to serve from them directly. - iter := &blockIter{} - if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum, false); err != nil { - return nil, err - } - var tombstones []keyspan.Span - for key, value := iter.First(); key != nil; key, value = iter.Next() { - t := keyspan.Span{ - Start: key.UserKey, - End: value.InPlaceValue(), - Keys: []keyspan.Key{{Trailer: key.Trailer}}, - } - tombstones = append(tombstones, t) - } - keyspan.Sort(r.Compare, tombstones) - - // Fragment the tombstones, outputting them directly to a block writer. - rangeDelBlock := blockWriter{ - restartInterval: 1, - } - frag := keyspan.Fragmenter{ - Cmp: r.Compare, - Format: r.FormatKey, - Emit: func(s keyspan.Span) { - for _, k := range s.Keys { - startIK := InternalKey{UserKey: s.Start, Trailer: k.Trailer} - rangeDelBlock.add(startIK, s.End) - } - }, - } - for i := range tombstones { - frag.Add(tombstones[i]) - } - frag.Finish() - - // Return the contents of the constructed v2 format range-del block. 
- return rangeDelBlock.finish(), nil -} - -func (r *Reader) readMetaindex(metaindexBH BlockHandle) error { - // We use a BufferPool when reading metaindex blocks in order to avoid - // populating the block cache with these blocks. In heavy-write workloads, - // especially with high compaction concurrency, new tables may be created - // frequently. Populating the block cache with these metaindex blocks adds - // additional contention on the block cache mutexes (see #1997). - // Additionally, these blocks are exceedingly unlikely to be read again - // while they're still in the block cache except in misconfigurations with - // excessive sstables counts or a table cache that's far too small. - r.metaBufferPool.initPreallocated(r.metaBufferPoolAlloc[:0]) - // When we're finished, release the buffers we've allocated back to memory - // allocator. We don't expect to use metaBufferPool again. - defer r.metaBufferPool.Release() - - b, err := r.readBlock( - context.Background(), metaindexBH, nil /* transform */, nil /* readHandle */, nil /* stats */, &r.metaBufferPool) - if err != nil { - return err - } - data := b.Get() - defer b.Release() - - if uint64(len(data)) != metaindexBH.Length { - return base.CorruptionErrorf("pebble/table: unexpected metaindex block size: %d vs %d", - errors.Safe(len(data)), errors.Safe(metaindexBH.Length)) - } - - i, err := newRawBlockIter(bytes.Compare, data) - if err != nil { - return err - } - - meta := map[string]BlockHandle{} - for valid := i.First(); valid; valid = i.Next() { - value := i.Value() - if bytes.Equal(i.Key().UserKey, []byte(metaValueIndexName)) { - vbih, n, err := decodeValueBlocksIndexHandle(i.Value()) - if err != nil { - return err - } - if n == 0 || n != len(value) { - return base.CorruptionErrorf("pebble/table: invalid table (bad value blocks index handle)") - } - r.valueBIH = vbih - } else { - bh, n := decodeBlockHandle(value) - if n == 0 || n != len(value) { - return base.CorruptionErrorf("pebble/table: invalid table 
(bad block handle)") - } - meta[string(i.Key().UserKey)] = bh - } - } - if err := i.Close(); err != nil { - return err - } - - if bh, ok := meta[metaPropertiesName]; ok { - b, err = r.readBlock( - context.Background(), bh, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */) - if err != nil { - return err - } - r.propertiesBH = bh - err := r.Properties.load(b.Get(), bh.Offset, r.opts.DeniedUserProperties) - b.Release() - if err != nil { - return err - } - } - - if bh, ok := meta[metaRangeDelV2Name]; ok { - r.rangeDelBH = bh - } else if bh, ok := meta[metaRangeDelName]; ok { - r.rangeDelBH = bh - if !r.rawTombstones { - r.rangeDelTransform = r.transformRangeDelV1 - } - } - - if bh, ok := meta[metaRangeKeyName]; ok { - r.rangeKeyBH = bh - } - - for name, fp := range r.opts.Filters { - types := []struct { - ftype FilterType - prefix string - }{ - {TableFilter, "fullfilter."}, - } - var done bool - for _, t := range types { - if bh, ok := meta[t.prefix+name]; ok { - r.filterBH = bh - - switch t.ftype { - case TableFilter: - r.tableFilter = newTableFilterReader(fp) - default: - return base.CorruptionErrorf("unknown filter type: %v", errors.Safe(t.ftype)) - } - - done = true - break - } - } - if done { - break - } - } - return nil -} - -// Layout returns the layout (block organization) for an sstable. 
-func (r *Reader) Layout() (*Layout, error) { - if r.err != nil { - return nil, r.err - } - - l := &Layout{ - Data: make([]BlockHandleWithProperties, 0, r.Properties.NumDataBlocks), - Filter: r.filterBH, - RangeDel: r.rangeDelBH, - RangeKey: r.rangeKeyBH, - ValueIndex: r.valueBIH.h, - Properties: r.propertiesBH, - MetaIndex: r.metaIndexBH, - Footer: r.footerBH, - Format: r.tableFormat, - } - - indexH, err := r.readIndex(context.Background(), nil) - if err != nil { - return nil, err - } - defer indexH.Release() - - var alloc bytealloc.A - - if r.Properties.IndexPartitions == 0 { - l.Index = append(l.Index, r.indexBH) - iter, _ := newBlockIter(r.Compare, indexH.Get()) - for key, value := iter.First(); key != nil; key, value = iter.Next() { - dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue()) - if err != nil { - return nil, errCorruptIndexEntry - } - if len(dataBH.Props) > 0 { - alloc, dataBH.Props = alloc.Copy(dataBH.Props) - } - l.Data = append(l.Data, dataBH) - } - } else { - l.TopIndex = r.indexBH - topIter, _ := newBlockIter(r.Compare, indexH.Get()) - iter := &blockIter{} - for key, value := topIter.First(); key != nil; key, value = topIter.Next() { - indexBH, err := decodeBlockHandleWithProperties(value.InPlaceValue()) - if err != nil { - return nil, errCorruptIndexEntry - } - l.Index = append(l.Index, indexBH.BlockHandle) - - subIndex, err := r.readBlock(context.Background(), indexBH.BlockHandle, - nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */) - if err != nil { - return nil, err - } - if err := iter.init(r.Compare, subIndex.Get(), 0, /* globalSeqNum */ - false /* hideObsoletePoints */); err != nil { - return nil, err - } - for key, value := iter.First(); key != nil; key, value = iter.Next() { - dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue()) - if len(dataBH.Props) > 0 { - alloc, dataBH.Props = alloc.Copy(dataBH.Props) - } - if err != nil { - return nil, errCorruptIndexEntry - } - l.Data 
= append(l.Data, dataBH) - } - subIndex.Release() - *iter = iter.resetForReuse() - } - } - if r.valueBIH.h.Length != 0 { - vbiH, err := r.readBlock(context.Background(), r.valueBIH.h, nil, nil, nil, nil /* buffer pool */) - if err != nil { - return nil, err - } - defer vbiH.Release() - vbiBlock := vbiH.Get() - indexEntryLen := int(r.valueBIH.blockNumByteLength + r.valueBIH.blockOffsetByteLength + - r.valueBIH.blockLengthByteLength) - i := 0 - for len(vbiBlock) != 0 { - if len(vbiBlock) < indexEntryLen { - return nil, errors.Errorf( - "remaining value index block %d does not contain a full entry of length %d", - len(vbiBlock), indexEntryLen) - } - n := int(r.valueBIH.blockNumByteLength) - bn := int(littleEndianGet(vbiBlock, n)) - if bn != i { - return nil, errors.Errorf("unexpected block num %d, expected %d", - bn, i) - } - i++ - vbiBlock = vbiBlock[n:] - n = int(r.valueBIH.blockOffsetByteLength) - blockOffset := littleEndianGet(vbiBlock, n) - vbiBlock = vbiBlock[n:] - n = int(r.valueBIH.blockLengthByteLength) - blockLen := littleEndianGet(vbiBlock, n) - vbiBlock = vbiBlock[n:] - l.ValueBlock = append(l.ValueBlock, BlockHandle{Offset: blockOffset, Length: blockLen}) - } - } - - return l, nil -} - -// ValidateBlockChecksums validates the checksums for each block in the SSTable. -func (r *Reader) ValidateBlockChecksums() error { - // Pre-compute the BlockHandles for the underlying file. - l, err := r.Layout() - if err != nil { - return err - } - - // Construct the set of blocks to check. Note that the footer is not checked - // as it is not a block with a checksum. - blocks := make([]BlockHandle, len(l.Data)) - for i := range l.Data { - blocks[i] = l.Data[i].BlockHandle - } - blocks = append(blocks, l.Index...) - blocks = append(blocks, l.TopIndex, l.Filter, l.RangeDel, l.RangeKey, l.Properties, l.MetaIndex) - - // Sorting by offset ensures we are performing a sequential scan of the - // file. 
- sort.Slice(blocks, func(i, j int) bool { - return blocks[i].Offset < blocks[j].Offset - }) - - // Check all blocks sequentially. Make use of read-ahead, given we are - // scanning the entire file from start to end. - rh := r.readable.NewReadHandle(context.TODO()) - defer rh.Close() - - for _, bh := range blocks { - // Certain blocks may not be present, in which case we skip them. - if bh.Length == 0 { - continue - } - - // Read the block, which validates the checksum. - h, err := r.readBlock(context.Background(), bh, nil, rh, nil, nil /* buffer pool */) - if err != nil { - return err - } - h.Release() - } - - return nil -} - -// CommonProperties implemented the CommonReader interface. -func (r *Reader) CommonProperties() *CommonProperties { - return &r.Properties.CommonProperties -} - -// EstimateDiskUsage returns the total size of data blocks overlapping the range -// `[start, end]`. Even if a data block partially overlaps, or we cannot -// determine overlap due to abbreviated index keys, the full data block size is -// included in the estimation. -// -// This function does not account for any metablock space usage. Assumes there -// is at least partial overlap, i.e., `[start, end]` falls neither completely -// before nor completely after the file's range. -// -// Only blocks containing point keys are considered. Range deletion and range -// key blocks are not considered. -// -// TODO(ajkr): account for metablock space usage. Perhaps look at the fraction of -// data blocks overlapped and add that same fraction of the metadata blocks to the -// estimate. -func (r *Reader) EstimateDiskUsage(start, end []byte) (uint64, error) { - if r.err != nil { - return 0, r.err - } - - indexH, err := r.readIndex(context.Background(), nil) - if err != nil { - return 0, err - } - defer indexH.Release() - - // Iterators over the bottom-level index blocks containing start and end. 
- // These may be different in case of partitioned index but will both point - // to the same blockIter over the single index in the unpartitioned case. - var startIdxIter, endIdxIter *blockIter - if r.Properties.IndexPartitions == 0 { - iter, err := newBlockIter(r.Compare, indexH.Get()) - if err != nil { - return 0, err - } - startIdxIter = iter - endIdxIter = iter - } else { - topIter, err := newBlockIter(r.Compare, indexH.Get()) - if err != nil { - return 0, err - } - - key, val := topIter.SeekGE(start, base.SeekGEFlagsNone) - if key == nil { - // The range falls completely after this file, or an error occurred. - return 0, topIter.Error() - } - startIdxBH, err := decodeBlockHandleWithProperties(val.InPlaceValue()) - if err != nil { - return 0, errCorruptIndexEntry - } - startIdxBlock, err := r.readBlock(context.Background(), startIdxBH.BlockHandle, - nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */) - if err != nil { - return 0, err - } - defer startIdxBlock.Release() - startIdxIter, err = newBlockIter(r.Compare, startIdxBlock.Get()) - if err != nil { - return 0, err - } - - key, val = topIter.SeekGE(end, base.SeekGEFlagsNone) - if key == nil { - if err := topIter.Error(); err != nil { - return 0, err - } - } else { - endIdxBH, err := decodeBlockHandleWithProperties(val.InPlaceValue()) - if err != nil { - return 0, errCorruptIndexEntry - } - endIdxBlock, err := r.readBlock(context.Background(), - endIdxBH.BlockHandle, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */) - if err != nil { - return 0, err - } - defer endIdxBlock.Release() - endIdxIter, err = newBlockIter(r.Compare, endIdxBlock.Get()) - if err != nil { - return 0, err - } - } - } - // startIdxIter should not be nil at this point, while endIdxIter can be if the - // range spans past the end of the file. 
- - key, val := startIdxIter.SeekGE(start, base.SeekGEFlagsNone) - if key == nil { - // The range falls completely after this file, or an error occurred. - return 0, startIdxIter.Error() - } - startBH, err := decodeBlockHandleWithProperties(val.InPlaceValue()) - if err != nil { - return 0, errCorruptIndexEntry - } - - includeInterpolatedValueBlocksSize := func(dataBlockSize uint64) uint64 { - // INVARIANT: r.Properties.DataSize > 0 since startIdxIter is not nil. - // Linearly interpolate what is stored in value blocks. - // - // TODO(sumeer): if we need more accuracy, without loading any data blocks - // (which contain the value handles, and which may also be insufficient if - // the values are in separate files), we will need to accumulate the - // logical size of the key-value pairs and store the cumulative value for - // each data block in the index block entry. This increases the size of - // the BlockHandle, so wait until this becomes necessary. - return dataBlockSize + - uint64((float64(dataBlockSize)/float64(r.Properties.DataSize))* - float64(r.Properties.ValueBlocksSize)) - } - if endIdxIter == nil { - // The range spans beyond this file. Include data blocks through the last. - return includeInterpolatedValueBlocksSize(r.Properties.DataSize - startBH.Offset), nil - } - key, val = endIdxIter.SeekGE(end, base.SeekGEFlagsNone) - if key == nil { - if err := endIdxIter.Error(); err != nil { - return 0, err - } - // The range spans beyond this file. Include data blocks through the last. - return includeInterpolatedValueBlocksSize(r.Properties.DataSize - startBH.Offset), nil - } - endBH, err := decodeBlockHandleWithProperties(val.InPlaceValue()) - if err != nil { - return 0, errCorruptIndexEntry - } - return includeInterpolatedValueBlocksSize( - endBH.Offset + endBH.Length + blockTrailerLen - startBH.Offset), nil -} - -// TableFormat returns the format version for the table. 
-func (r *Reader) TableFormat() (TableFormat, error) { - if r.err != nil { - return TableFormatUnspecified, r.err - } - return r.tableFormat, nil -} - -// NewReader returns a new table reader for the file. Closing the reader will -// close the file. -func NewReader(f objstorage.Readable, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error) { - o = o.ensureDefaults() - r := &Reader{ - readable: f, - opts: o, - } - if r.opts.Cache == nil { - r.opts.Cache = cache.New(0) - } else { - r.opts.Cache.Ref() - } - - if f == nil { - r.err = errors.New("pebble/table: nil file") - return nil, r.Close() - } - - // Note that the extra options are applied twice. First here for pre-apply - // options, and then below for post-apply options. Pre and post refer to - // before and after reading the metaindex and properties. - type preApply interface{ preApply() } - for _, opt := range extraOpts { - if _, ok := opt.(preApply); ok { - opt.readerApply(r) - } - } - if r.cacheID == 0 { - r.cacheID = r.opts.Cache.NewID() - } - - footer, err := readFooter(f) - if err != nil { - r.err = err - return nil, r.Close() - } - r.checksumType = footer.checksum - r.tableFormat = footer.format - // Read the metaindex. - if err := r.readMetaindex(footer.metaindexBH); err != nil { - r.err = err - return nil, r.Close() - } - r.indexBH = footer.indexBH - r.metaIndexBH = footer.metaindexBH - r.footerBH = footer.footerBH - - if r.Properties.ComparerName == "" || o.Comparer.Name == r.Properties.ComparerName { - r.Compare = o.Comparer.Compare - r.FormatKey = o.Comparer.FormatKey - r.Split = o.Comparer.Split - } - - if o.MergerName == r.Properties.MergerName { - r.mergerOK = true - } - - // Apply the extra options again now that the comparer and merger names are - // known. 
- for _, opt := range extraOpts { - if _, ok := opt.(preApply); !ok { - opt.readerApply(r) - } - } - - if r.Compare == nil { - r.err = errors.Errorf("pebble/table: %d: unknown comparer %s", - errors.Safe(r.fileNum), errors.Safe(r.Properties.ComparerName)) - } - if !r.mergerOK { - if name := r.Properties.MergerName; name != "" && name != "nullptr" { - r.err = errors.Errorf("pebble/table: %d: unknown merger %s", - errors.Safe(r.fileNum), errors.Safe(r.Properties.MergerName)) - } - } - if r.err != nil { - return nil, r.Close() - } - - return r, nil -} - -// ReadableFile describes the smallest subset of vfs.File that is required for -// reading SSTs. -type ReadableFile interface { - io.ReaderAt - io.Closer - Stat() (os.FileInfo, error) -} - -// NewSimpleReadable wraps a ReadableFile in a objstorage.Readable -// implementation (which does not support read-ahead) -func NewSimpleReadable(r ReadableFile) (objstorage.Readable, error) { - info, err := r.Stat() - if err != nil { - return nil, err - } - res := &simpleReadable{ - f: r, - size: info.Size(), - } - res.rh = objstorage.MakeNoopReadHandle(res) - return res, nil -} - -// simpleReadable wraps a ReadableFile to implement objstorage.Readable. -type simpleReadable struct { - f ReadableFile - size int64 - rh objstorage.NoopReadHandle -} - -var _ objstorage.Readable = (*simpleReadable)(nil) - -// ReadAt is part of the objstorage.Readable interface. -func (s *simpleReadable) ReadAt(_ context.Context, p []byte, off int64) error { - n, err := s.f.ReadAt(p, off) - if invariants.Enabled && err == nil && n != len(p) { - panic("short read") - } - return err -} - -// Close is part of the objstorage.Readable interface. -func (s *simpleReadable) Close() error { - return s.f.Close() -} - -// Size is part of the objstorage.Readable interface. -func (s *simpleReadable) Size() int64 { - return s.size -} - -// NewReaddHandle is part of the objstorage.Readable interface. 
-func (s *simpleReadable) NewReadHandle(_ context.Context) objstorage.ReadHandle { - return &s.rh -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/reader_iter.go b/vendor/github.com/cockroachdb/pebble/sstable/reader_iter.go deleted file mode 100644 index 2b5a267..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/reader_iter.go +++ /dev/null @@ -1,291 +0,0 @@ -// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package sstable - -import ( - "fmt" - "os" - "sync" - - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" -) - -// Iterator iterates over an entire table of data. -type Iterator interface { - base.InternalIterator - - // NextPrefix implements (base.InternalIterator).NextPrefix. - NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) - - // MaybeFilteredKeys may be called when an iterator is exhausted to indicate - // whether or not the last positioning method may have skipped any keys due - // to block-property filters. This is used by the Pebble levelIter to - // control when an iterator steps to the next sstable. - // - // MaybeFilteredKeys may always return false positives, that is it may - // return true when no keys were filtered. It should only be called when the - // iterator is exhausted. It must never return false negatives when the - // iterator is exhausted. - MaybeFilteredKeys() bool - - SetCloseHook(fn func(i Iterator) error) -} - -// Iterator positioning optimizations and singleLevelIterator and -// twoLevelIterator: -// -// An iterator is absolute positioned using one of the Seek or First or Last -// calls. After absolute positioning, there can be relative positioning done -// by stepping using Prev or Next. 
-// -// We implement optimizations below where an absolute positioning call can in -// some cases use the current position to do less work. To understand these, -// we first define some terms. An iterator is bounds-exhausted if the bounds -// (upper of lower) have been reached. An iterator is data-exhausted if it has -// the reached the end of the data (forward or reverse) in the sstable. A -// singleLevelIterator only knows a local-data-exhausted property since when -// it is used as part of a twoLevelIterator, the twoLevelIterator can step to -// the next lower-level index block. -// -// The bounds-exhausted property is tracked by -// singleLevelIterator.exhaustedBounds being +1 (upper bound reached) or -1 -// (lower bound reached). The same field is reused by twoLevelIterator. Either -// may notice the exhaustion of the bound and set it. Note that if -// singleLevelIterator sets this property, it is not a local property (since -// the bound has been reached regardless of whether this is in the context of -// the twoLevelIterator or not). -// -// The data-exhausted property is tracked in a more subtle manner. We define -// two predicates: -// - partial-local-data-exhausted (PLDE): -// i.data.isDataInvalidated() || !i.data.valid() -// - partial-global-data-exhausted (PGDE): -// i.index.isDataInvalidated() || !i.index.valid() || i.data.isDataInvalidated() || -// !i.data.valid() -// -// PLDE is defined for a singleLevelIterator. PGDE is defined for a -// twoLevelIterator. Oddly, in our code below the singleLevelIterator does not -// know when it is part of a twoLevelIterator so it does not know when its -// property is local or global. -// -// Now to define data-exhausted: -// - Prerequisite: we must know that the iterator has been positioned and -// i.err is nil. 
-// - bounds-exhausted must not be true: -// If bounds-exhausted is true, we have incomplete knowledge of -// data-exhausted since PLDE or PGDE could be true because we could have -// chosen not to load index block or data block and figured out that the -// bound is exhausted (due to block property filters filtering out index and -// data blocks and going past the bound on the top level index block). Note -// that if we tried to separate out the BPF case from others we could -// develop more knowledge here. -// - PGDE is true for twoLevelIterator. PLDE is true if it is a standalone -// singleLevelIterator. !PLDE or !PGDE of course imply that data-exhausted -// is not true. -// -// An implication of the above is that if we are going to somehow utilize -// knowledge of data-exhausted in an optimization, we must not forget the -// existing value of bounds-exhausted since by forgetting the latter we can -// erroneously think that data-exhausted is true. Bug #2036 was due to this -// forgetting. -// -// Now to the two categories of optimizations we currently have: -// - Monotonic bounds optimization that reuse prior iterator position when -// doing seek: These only work with !data-exhausted. We could choose to make -// these work with data-exhausted but have not bothered because in the -// context of a DB if data-exhausted were true, the DB would move to the -// next file in the level. Note that this behavior of moving to the next -// file is not necessarily true for L0 files, so there could be some benefit -// in the future in this optimization. See the WARNING-data-exhausted -// comments if trying to optimize this in the future. -// - TrySeekUsingNext optimizations: these work regardless of exhaustion -// state. -// -// Implementation detail: In the code PLDE only checks that -// i.data.isDataInvalidated(). This narrower check is safe, since this is a -// subset of the set expressed by the OR expression. 
Also, it is not a -// de-optimization since whenever we exhaust the iterator we explicitly call -// i.data.invalidate(). PGDE checks i.index.isDataInvalidated() && -// i.data.isDataInvalidated(). Again, this narrower check is safe, and not a -// de-optimization since whenever we exhaust the iterator we explicitly call -// i.index.invalidate() and i.data.invalidate(). The && is questionable -- for -// now this is a bit of defensive code. We should seriously consider removing -// it, since defensive code suggests we are not confident about our invariants -// (and if we are not confident, we need more invariant assertions, not -// defensive code). -// -// TODO(sumeer): remove the aforementioned defensive code. - -var singleLevelIterPool = sync.Pool{ - New: func() interface{} { - i := &singleLevelIterator{} - // Note: this is a no-op if invariants are disabled or race is enabled. - invariants.SetFinalizer(i, checkSingleLevelIterator) - return i - }, -} - -var twoLevelIterPool = sync.Pool{ - New: func() interface{} { - i := &twoLevelIterator{} - // Note: this is a no-op if invariants are disabled or race is enabled. - invariants.SetFinalizer(i, checkTwoLevelIterator) - return i - }, -} - -// TODO(jackson): rangedel fragmentBlockIters can't be pooled because of some -// code paths that double Close the iters. Fix the double close and pool the -// *fragmentBlockIter type directly. - -var rangeKeyFragmentBlockIterPool = sync.Pool{ - New: func() interface{} { - i := &rangeKeyFragmentBlockIter{} - // Note: this is a no-op if invariants are disabled or race is enabled. 
- invariants.SetFinalizer(i, checkRangeKeyFragmentBlockIterator) - return i - }, -} - -func checkSingleLevelIterator(obj interface{}) { - i := obj.(*singleLevelIterator) - if p := i.data.handle.Get(); p != nil { - fmt.Fprintf(os.Stderr, "singleLevelIterator.data.handle is not nil: %p\n", p) - os.Exit(1) - } - if p := i.index.handle.Get(); p != nil { - fmt.Fprintf(os.Stderr, "singleLevelIterator.index.handle is not nil: %p\n", p) - os.Exit(1) - } -} - -func checkTwoLevelIterator(obj interface{}) { - i := obj.(*twoLevelIterator) - if p := i.data.handle.Get(); p != nil { - fmt.Fprintf(os.Stderr, "singleLevelIterator.data.handle is not nil: %p\n", p) - os.Exit(1) - } - if p := i.index.handle.Get(); p != nil { - fmt.Fprintf(os.Stderr, "singleLevelIterator.index.handle is not nil: %p\n", p) - os.Exit(1) - } -} - -func checkRangeKeyFragmentBlockIterator(obj interface{}) { - i := obj.(*rangeKeyFragmentBlockIter) - if p := i.blockIter.handle.Get(); p != nil { - fmt.Fprintf(os.Stderr, "fragmentBlockIter.blockIter.handle is not nil: %p\n", p) - os.Exit(1) - } -} - -// compactionIterator is similar to Iterator but it increments the number of -// bytes that have been iterated through. -type compactionIterator struct { - *singleLevelIterator - bytesIterated *uint64 - prevOffset uint64 -} - -// compactionIterator implements the base.InternalIterator interface. 
-var _ base.InternalIterator = (*compactionIterator)(nil) - -func (i *compactionIterator) String() string { - if i.vState != nil { - return i.vState.fileNum.String() - } - return i.reader.fileNum.String() -} - -func (i *compactionIterator) SeekGE( - key []byte, flags base.SeekGEFlags, -) (*InternalKey, base.LazyValue) { - panic("pebble: SeekGE unimplemented") -} - -func (i *compactionIterator) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - panic("pebble: SeekPrefixGE unimplemented") -} - -func (i *compactionIterator) SeekLT( - key []byte, flags base.SeekLTFlags, -) (*InternalKey, base.LazyValue) { - panic("pebble: SeekLT unimplemented") -} - -func (i *compactionIterator) First() (*InternalKey, base.LazyValue) { - i.err = nil // clear cached iteration error - return i.skipForward(i.singleLevelIterator.First()) -} - -func (i *compactionIterator) Last() (*InternalKey, base.LazyValue) { - panic("pebble: Last unimplemented") -} - -// Note: compactionIterator.Next mirrors the implementation of Iterator.Next -// due to performance. Keep the two in sync. 
-func (i *compactionIterator) Next() (*InternalKey, base.LazyValue) { - if i.err != nil { - return nil, base.LazyValue{} - } - return i.skipForward(i.data.Next()) -} - -func (i *compactionIterator) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { - panic("pebble: NextPrefix unimplemented") -} - -func (i *compactionIterator) Prev() (*InternalKey, base.LazyValue) { - panic("pebble: Prev unimplemented") -} - -func (i *compactionIterator) skipForward( - key *InternalKey, val base.LazyValue, -) (*InternalKey, base.LazyValue) { - if key == nil { - for { - if key, _ := i.index.Next(); key == nil { - break - } - result := i.loadBlock(+1) - if result != loadBlockOK { - if i.err != nil { - break - } - switch result { - case loadBlockFailed: - // We checked that i.index was at a valid entry, so - // loadBlockFailed could not have happened due to to i.index - // being exhausted, and must be due to an error. - panic("loadBlock should not have failed with no error") - case loadBlockIrrelevant: - panic("compactionIter should not be using block intervals for skipping") - default: - panic(fmt.Sprintf("unexpected case %d", result)) - } - } - // result == loadBlockOK - if key, val = i.data.First(); key != nil { - break - } - } - } - - curOffset := i.recordOffset() - *i.bytesIterated += uint64(curOffset - i.prevOffset) - i.prevOffset = curOffset - - if i.vState != nil && key != nil { - cmp := i.cmp(key.UserKey, i.vState.upper.UserKey) - if cmp > 0 || (i.vState.upper.IsExclusiveSentinel() && cmp == 0) { - return nil, base.LazyValue{} - } - } - - return key, val -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/reader_iter_two_lvl.go b/vendor/github.com/cockroachdb/pebble/sstable/reader_iter_two_lvl.go deleted file mode 100644 index c090a3d..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/reader_iter_two_lvl.go +++ /dev/null @@ -1,1085 +0,0 @@ -// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package sstable - -import ( - "context" - "fmt" - - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing" -) - -type twoLevelIterator struct { - singleLevelIterator - // maybeFilteredKeysSingleLevel indicates whether the last iterator - // positioning operation may have skipped any index blocks due to - // block-property filters when positioning the top-level-index. - maybeFilteredKeysTwoLevel bool - topLevelIndex blockIter -} - -// twoLevelIterator implements the base.InternalIterator interface. -var _ base.InternalIterator = (*twoLevelIterator)(nil) - -// loadIndex loads the index block at the current top level index position and -// leaves i.index unpositioned. If unsuccessful, it gets i.err to any error -// encountered, which may be nil if we have simply exhausted the entire table. -// This is used for two level indexes. -func (i *twoLevelIterator) loadIndex(dir int8) loadBlockResult { - // Ensure the index data block iterators are invalidated even if loading of - // the index fails. 
- i.data.invalidate() - i.index.invalidate() - if !i.topLevelIndex.valid() { - i.index.offset = 0 - i.index.restarts = 0 - return loadBlockFailed - } - v := i.topLevelIndex.value() - bhp, err := decodeBlockHandleWithProperties(v.InPlaceValue()) - if err != nil { - i.err = base.CorruptionErrorf("pebble/table: corrupt top level index entry") - return loadBlockFailed - } - if i.bpfs != nil { - intersects, err := i.bpfs.intersects(bhp.Props) - if err != nil { - i.err = errCorruptIndexEntry - return loadBlockFailed - } - if intersects == blockMaybeExcluded { - intersects = i.resolveMaybeExcluded(dir) - } - if intersects == blockExcluded { - i.maybeFilteredKeysTwoLevel = true - return loadBlockIrrelevant - } - // blockIntersects - } - ctx := objiotracing.WithBlockType(i.ctx, objiotracing.MetadataBlock) - indexBlock, err := i.reader.readBlock(ctx, bhp.BlockHandle, nil /* transform */, nil /* readHandle */, i.stats, i.bufferPool) - if err != nil { - i.err = err - return loadBlockFailed - } - if i.err = i.index.initHandle(i.cmp, indexBlock, i.reader.Properties.GlobalSeqNum, false); i.err == nil { - return loadBlockOK - } - return loadBlockFailed -} - -// resolveMaybeExcluded is invoked when the block-property filterer has found -// that an index block is excluded according to its properties but only if its -// bounds fall within the filter's current bounds. This function consults the -// apprioriate bound, depending on the iteration direction, and returns either -// `blockIntersects` or -// `blockMaybeExcluded`. -func (i *twoLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult { - // This iterator is configured with a bound-limited block property filter. - // The bpf determined this entire index block could be excluded from - // iteration based on the property encoded in the block handle. However, we - // still need to determine if the index block is wholly contained within the - // filter's key bounds. 
- // - // External guarantees ensure all its data blocks' keys are ≥ the filter's - // lower bound during forward iteration, and that all its data blocks' keys - // are < the filter's upper bound during backward iteration. We only need to - // determine if the opposite bound is also met. - // - // The index separator in topLevelIndex.Key() provides an inclusive - // upper-bound for the index block's keys, guaranteeing that all its keys - // are ≤ topLevelIndex.Key(). For forward iteration, this is all we need. - if dir > 0 { - // Forward iteration. - if i.bpfs.boundLimitedFilter.KeyIsWithinUpperBound(i.topLevelIndex.Key().UserKey) { - return blockExcluded - } - return blockIntersects - } - - // Reverse iteration. - // - // Because we're iterating in the reverse direction, we don't yet have - // enough context available to determine if the block is wholly contained - // within its bounds. This case arises only during backward iteration, - // because of the way the index is structured. - // - // Consider a bound-limited bpf limited to the bounds [b,d), loading the - // block with separator `c`. During reverse iteration, the guarantee that - // all the block's keys are < `d` is externally provided, but no guarantee - // is made on the bpf's lower bound. The separator `c` only provides an - // inclusive upper bound on the block's keys, indicating that the - // corresponding block handle points to a block containing only keys ≤ `c`. - // - // To establish a lower bound, we step the top-level index backwards to read - // the previous block's separator, which provides an inclusive lower bound - // on the original index block's keys. Afterwards, we step forward to - // restore our top-level index position. - if peekKey, _ := i.topLevelIndex.Prev(); peekKey == nil { - // The original block points to the first index block of this table. 
If - // we knew the lower bound for the entire table, it could provide a - // lower bound, but the code refactoring necessary to read it doesn't - // seem worth the payoff. We fall through to loading the block. - } else if i.bpfs.boundLimitedFilter.KeyIsWithinLowerBound(peekKey.UserKey) { - // The lower-bound on the original index block falls within the filter's - // bounds, and we can skip the block (after restoring our current - // top-level index position). - _, _ = i.topLevelIndex.Next() - return blockExcluded - } - _, _ = i.topLevelIndex.Next() - return blockIntersects -} - -// Note that lower, upper passed into init has nothing to do with virtual sstable -// bounds. If the virtualState passed in is not nil, then virtual sstable bounds -// will be enforced. -func (i *twoLevelIterator) init( - ctx context.Context, - r *Reader, - v *virtualState, - lower, upper []byte, - filterer *BlockPropertiesFilterer, - useFilter, hideObsoletePoints bool, - stats *base.InternalIteratorStats, - rp ReaderProvider, - bufferPool *BufferPool, -) error { - if r.err != nil { - return r.err - } - topLevelIndexH, err := r.readIndex(ctx, stats) - if err != nil { - return err - } - if v != nil { - i.vState = v - // Note that upper is exclusive here. 
- i.endKeyInclusive, lower, upper = v.constrainBounds(lower, upper, false /* endInclusive */) - } - - i.ctx = ctx - i.lower = lower - i.upper = upper - i.bpfs = filterer - i.useFilter = useFilter - i.reader = r - i.cmp = r.Compare - i.stats = stats - i.hideObsoletePoints = hideObsoletePoints - i.bufferPool = bufferPool - err = i.topLevelIndex.initHandle(i.cmp, topLevelIndexH, r.Properties.GlobalSeqNum, false) - if err != nil { - // blockIter.Close releases topLevelIndexH and always returns a nil error - _ = i.topLevelIndex.Close() - return err - } - i.dataRH = objstorageprovider.UsePreallocatedReadHandle(ctx, r.readable, &i.dataRHPrealloc) - if r.tableFormat >= TableFormatPebblev3 { - if r.Properties.NumValueBlocks > 0 { - i.vbReader = &valueBlockReader{ - ctx: ctx, - bpOpen: i, - rp: rp, - vbih: r.valueBIH, - stats: stats, - } - i.data.lazyValueHandling.vbr = i.vbReader - i.vbRH = r.readable.NewReadHandle(ctx) - } - i.data.lazyValueHandling.hasValuePrefix = true - } - return nil -} - -func (i *twoLevelIterator) String() string { - if i.vState != nil { - return i.vState.fileNum.String() - } - return i.reader.fileNum.String() -} - -// MaybeFilteredKeys may be called when an iterator is exhausted to indicate -// whether or not the last positioning method may have skipped any keys due to -// block-property filters. -func (i *twoLevelIterator) MaybeFilteredKeys() bool { - // While reading sstables with two-level indexes, knowledge of whether we've - // filtered keys is tracked separately for each index level. The - // seek-using-next optimizations have different criteria. We can only reset - // maybeFilteredKeys back to false during a seek when NOT using the - // fast-path that uses the current iterator position. - // - // If either level might have filtered keys to arrive at the current - // iterator position, return MaybeFilteredKeys=true. 
- return i.maybeFilteredKeysTwoLevel || i.maybeFilteredKeysSingleLevel -} - -// SeekGE implements internalIterator.SeekGE, as documented in the pebble -// package. Note that SeekGE only checks the upper bound. It is up to the -// caller to ensure that key is greater than or equal to the lower bound. -func (i *twoLevelIterator) SeekGE( - key []byte, flags base.SeekGEFlags, -) (*InternalKey, base.LazyValue) { - if i.vState != nil { - // Callers of SeekGE don't know about virtual sstable bounds, so we may - // have to internally restrict the bounds. - // - // TODO(bananabrick): We can optimize away this check for the level iter - // if necessary. - if i.cmp(key, i.lower) < 0 { - key = i.lower - } - } - - err := i.err - i.err = nil // clear cached iteration error - - // The twoLevelIterator could be already exhausted. Utilize that when - // trySeekUsingNext is true. See the comment about data-exhausted, PGDE, and - // bounds-exhausted near the top of the file. - if flags.TrySeekUsingNext() && - (i.exhaustedBounds == +1 || (i.data.isDataInvalidated() && i.index.isDataInvalidated())) && - err == nil { - // Already exhausted, so return nil. - return nil, base.LazyValue{} - } - - // SeekGE performs various step-instead-of-seeking optimizations: eg enabled - // by trySeekUsingNext, or by monotonically increasing bounds (i.boundsCmp). - // Care must be taken to ensure that when performing these optimizations and - // the iterator becomes exhausted, i.maybeFilteredKeys is set appropriately. - // Consider a previous SeekGE that filtered keys from k until the current - // iterator position. - // - // If the previous SeekGE exhausted the iterator while seeking within the - // two-level index, it's possible keys greater than or equal to the current - // search key were filtered through skipped index blocks. We must not reuse - // the position of the two-level index iterator without remembering the - // previous value of maybeFilteredKeys. 
- - // We fall into the slow path if i.index.isDataInvalidated() even if the - // top-level iterator is already positioned correctly and all other - // conditions are met. An alternative structure could reuse topLevelIndex's - // current position and reload the index block to which it points. Arguably, - // an index block load is expensive and the index block may still be earlier - // than the index block containing the sought key, resulting in a wasteful - // block load. - - var dontSeekWithinSingleLevelIter bool - if i.topLevelIndex.isDataInvalidated() || !i.topLevelIndex.valid() || i.index.isDataInvalidated() || err != nil || - (i.boundsCmp <= 0 && !flags.TrySeekUsingNext()) || i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 { - // Slow-path: need to position the topLevelIndex. - - // The previous exhausted state of singleLevelIterator is no longer - // relevant, since we may be moving to a different index block. - i.exhaustedBounds = 0 - i.maybeFilteredKeysTwoLevel = false - flags = flags.DisableTrySeekUsingNext() - var ikey *InternalKey - if ikey, _ = i.topLevelIndex.SeekGE(key, flags); ikey == nil { - i.data.invalidate() - i.index.invalidate() - return nil, base.LazyValue{} - } - - result := i.loadIndex(+1) - if result == loadBlockFailed { - i.boundsCmp = 0 - return nil, base.LazyValue{} - } - if result == loadBlockIrrelevant { - // Enforce the upper bound here since don't want to bother moving - // to the next entry in the top level index if upper bound is - // already exceeded. Note that the next entry starts with keys >= - // ikey.UserKey since even though this is the block separator, the - // same user key can span multiple index blocks. If upper is - // exclusive we use >= below, else we use >. - if i.upper != nil { - cmp := i.cmp(ikey.UserKey, i.upper) - if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { - i.exhaustedBounds = +1 - } - } - // Fall through to skipForward. - dontSeekWithinSingleLevelIter = true - // Clear boundsCmp. 
- // - // In the typical cases where dontSeekWithinSingleLevelIter=false, - // the singleLevelIterator.SeekGE call will clear boundsCmp. - // However, in this case where dontSeekWithinSingleLevelIter=true, - // we never seek on the single-level iterator. This call will fall - // through to skipForward, which may improperly leave boundsCmp=+1 - // unless we clear it here. - i.boundsCmp = 0 - } - } else { - // INVARIANT: err == nil. - // - // Else fast-path: There are two possible cases, from - // (i.boundsCmp > 0 || flags.TrySeekUsingNext()): - // - // 1) The bounds have moved forward (i.boundsCmp > 0) and this SeekGE is - // respecting the lower bound (guaranteed by Iterator). We know that the - // iterator must already be positioned within or just outside the previous - // bounds. Therefore, the topLevelIndex iter cannot be positioned at an - // entry ahead of the seek position (though it can be positioned behind). - // The !i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 confirms that it is - // not behind. Since it is not ahead and not behind it must be at the - // right position. - // - // 2) This SeekGE will land on a key that is greater than the key we are - // currently at (guaranteed by trySeekUsingNext), but since i.cmp(key, - // i.topLevelIndex.Key().UserKey) <= 0, we are at the correct lower level - // index block. No need to reset the state of singleLevelIterator. - // - // Note that cases 1 and 2 never overlap, and one of them must be true, - // but we have some test code (TestIterRandomizedMaybeFilteredKeys) that - // sets both to true, so we fix things here and then do an invariant - // check. - // - // This invariant checking is important enough that we do not gate it - // behind invariants.Enabled. - if i.boundsCmp > 0 { - // TODO(sumeer): fix TestIterRandomizedMaybeFilteredKeys so as to not - // need this behavior. 
- flags = flags.DisableTrySeekUsingNext() - } - if i.boundsCmp > 0 == flags.TrySeekUsingNext() { - panic(fmt.Sprintf("inconsistency in optimization case 1 %t and case 2 %t", - i.boundsCmp > 0, flags.TrySeekUsingNext())) - } - - if !flags.TrySeekUsingNext() { - // Case 1. Bounds have changed so the previous exhausted bounds state is - // irrelevant. - // WARNING-data-exhausted: this is safe to do only because the monotonic - // bounds optimizations only work when !data-exhausted. If they also - // worked with data-exhausted, we have made it unclear whether - // data-exhausted is actually true. See the comment at the top of the - // file. - i.exhaustedBounds = 0 - } - // Else flags.TrySeekUsingNext(). The i.exhaustedBounds is important to - // preserve for singleLevelIterator, and twoLevelIterator.skipForward. See - // bug https://github.com/cockroachdb/pebble/issues/2036. - } - - if !dontSeekWithinSingleLevelIter { - // Note that while trySeekUsingNext could be false here, singleLevelIterator - // could do its own boundsCmp-based optimization to seek using next. - if ikey, val := i.singleLevelIterator.SeekGE(key, flags); ikey != nil { - return ikey, val - } - } - return i.skipForward() -} - -// SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the -// pebble package. Note that SeekPrefixGE only checks the upper bound. It is up -// to the caller to ensure that key is greater than or equal to the lower bound. -func (i *twoLevelIterator) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - if i.vState != nil { - // Callers of SeekGE don't know about virtual sstable bounds, so we may - // have to internally restrict the bounds. - // - // TODO(bananabrick): We can optimize away this check for the level iter - // if necessary. - if i.cmp(key, i.lower) < 0 { - key = i.lower - } - } - - // NOTE: prefix is only used for bloom filter checking and not later work in - // this method. 
Hence, we can use the existing iterator position if the last - // SeekPrefixGE did not fail bloom filter matching. - - err := i.err - i.err = nil // clear cached iteration error - - // The twoLevelIterator could be already exhausted. Utilize that when - // trySeekUsingNext is true. See the comment about data-exhausted, PGDE, and - // bounds-exhausted near the top of the file. - filterUsedAndDidNotMatch := - i.reader.tableFilter != nil && i.useFilter && !i.lastBloomFilterMatched - if flags.TrySeekUsingNext() && !filterUsedAndDidNotMatch && - (i.exhaustedBounds == +1 || (i.data.isDataInvalidated() && i.index.isDataInvalidated())) && - err == nil { - // Already exhausted, so return nil. - return nil, base.LazyValue{} - } - - // Check prefix bloom filter. - if i.reader.tableFilter != nil && i.useFilter { - if !i.lastBloomFilterMatched { - // Iterator is not positioned based on last seek. - flags = flags.DisableTrySeekUsingNext() - } - i.lastBloomFilterMatched = false - var dataH bufferHandle - dataH, i.err = i.reader.readFilter(i.ctx, i.stats) - if i.err != nil { - i.data.invalidate() - return nil, base.LazyValue{} - } - mayContain := i.reader.tableFilter.mayContain(dataH.Get(), prefix) - dataH.Release() - if !mayContain { - // This invalidation may not be necessary for correctness, and may - // be a place to optimize later by reusing the already loaded - // block. It was necessary in earlier versions of the code since - // the caller was allowed to call Next when SeekPrefixGE returned - // nil. This is no longer allowed. - i.data.invalidate() - return nil, base.LazyValue{} - } - i.lastBloomFilterMatched = true - } - - // Bloom filter matches. - - // SeekPrefixGE performs various step-instead-of-seeking optimizations: eg - // enabled by trySeekUsingNext, or by monotonically increasing bounds - // (i.boundsCmp). 
Care must be taken to ensure that when performing these - // optimizations and the iterator becomes exhausted, - // i.maybeFilteredKeysTwoLevel is set appropriately. Consider a previous - // SeekPrefixGE that filtered keys from k until the current iterator - // position. - // - // If the previous SeekPrefixGE exhausted the iterator while seeking within - // the two-level index, it's possible keys greater than or equal to the - // current search key were filtered through skipped index blocks. We must - // not reuse the position of the two-level index iterator without - // remembering the previous value of maybeFilteredKeysTwoLevel. - - // We fall into the slow path if i.index.isDataInvalidated() even if the - // top-level iterator is already positioned correctly and all other - // conditions are met. An alternative structure could reuse topLevelIndex's - // current position and reload the index block to which it points. Arguably, - // an index block load is expensive and the index block may still be earlier - // than the index block containing the sought key, resulting in a wasteful - // block load. - - var dontSeekWithinSingleLevelIter bool - if i.topLevelIndex.isDataInvalidated() || !i.topLevelIndex.valid() || i.index.isDataInvalidated() || err != nil || - (i.boundsCmp <= 0 && !flags.TrySeekUsingNext()) || i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 { - // Slow-path: need to position the topLevelIndex. - - // The previous exhausted state of singleLevelIterator is no longer - // relevant, since we may be moving to a different index block. 
- i.exhaustedBounds = 0 - i.maybeFilteredKeysTwoLevel = false - flags = flags.DisableTrySeekUsingNext() - var ikey *InternalKey - if ikey, _ = i.topLevelIndex.SeekGE(key, flags); ikey == nil { - i.data.invalidate() - i.index.invalidate() - return nil, base.LazyValue{} - } - - result := i.loadIndex(+1) - if result == loadBlockFailed { - i.boundsCmp = 0 - return nil, base.LazyValue{} - } - if result == loadBlockIrrelevant { - // Enforce the upper bound here since don't want to bother moving - // to the next entry in the top level index if upper bound is - // already exceeded. Note that the next entry starts with keys >= - // ikey.UserKey since even though this is the block separator, the - // same user key can span multiple index blocks. If upper is - // exclusive we use >= below, else we use >. - if i.upper != nil { - cmp := i.cmp(ikey.UserKey, i.upper) - if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { - i.exhaustedBounds = +1 - } - } - // Fall through to skipForward. - dontSeekWithinSingleLevelIter = true - // Clear boundsCmp. - // - // In the typical cases where dontSeekWithinSingleLevelIter=false, - // the singleLevelIterator.SeekPrefixGE call will clear boundsCmp. - // However, in this case where dontSeekWithinSingleLevelIter=true, - // we never seek on the single-level iterator. This call will fall - // through to skipForward, which may improperly leave boundsCmp=+1 - // unless we clear it here. - i.boundsCmp = 0 - } - } else { - // INVARIANT: err == nil. - // - // Else fast-path: There are two possible cases, from - // (i.boundsCmp > 0 || flags.TrySeekUsingNext()): - // - // 1) The bounds have moved forward (i.boundsCmp > 0) and this - // SeekPrefixGE is respecting the lower bound (guaranteed by Iterator). We - // know that the iterator must already be positioned within or just - // outside the previous bounds. Therefore, the topLevelIndex iter cannot - // be positioned at an entry ahead of the seek position (though it can be - // positioned behind). 
The !i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 - // confirms that it is not behind. Since it is not ahead and not behind it - // must be at the right position. - // - // 2) This SeekPrefixGE will land on a key that is greater than the key we - // are currently at (guaranteed by trySeekUsingNext), but since i.cmp(key, - // i.topLevelIndex.Key().UserKey) <= 0, we are at the correct lower level - // index block. No need to reset the state of singleLevelIterator. - // - // Note that cases 1 and 2 never overlap, and one of them must be true. - // This invariant checking is important enough that we do not gate it - // behind invariants.Enabled. - if i.boundsCmp > 0 == flags.TrySeekUsingNext() { - panic(fmt.Sprintf("inconsistency in optimization case 1 %t and case 2 %t", - i.boundsCmp > 0, flags.TrySeekUsingNext())) - } - - if !flags.TrySeekUsingNext() { - // Case 1. Bounds have changed so the previous exhausted bounds state is - // irrelevant. - // WARNING-data-exhausted: this is safe to do only because the monotonic - // bounds optimizations only work when !data-exhausted. If they also - // worked with data-exhausted, we have made it unclear whether - // data-exhausted is actually true. See the comment at the top of the - // file. - i.exhaustedBounds = 0 - } - // Else flags.TrySeekUsingNext(). The i.exhaustedBounds is important to - // preserve for singleLevelIterator, and twoLevelIterator.skipForward. See - // bug https://github.com/cockroachdb/pebble/issues/2036. - } - - if !dontSeekWithinSingleLevelIter { - if ikey, val := i.singleLevelIterator.seekPrefixGE( - prefix, key, flags, false /* checkFilter */); ikey != nil { - return ikey, val - } - } - // NB: skipForward checks whether exhaustedBounds is already +1. - return i.skipForward() -} - -// virtualLast should only be called if i.vReader != nil and i.endKeyInclusive -// is true. 
-func (i *twoLevelIterator) virtualLast() (*InternalKey, base.LazyValue) { - if i.vState == nil { - panic("pebble: invalid call to virtualLast") - } - - // Seek to the first internal key. - ikey, _ := i.SeekGE(i.upper, base.SeekGEFlagsNone) - if i.endKeyInclusive { - // Let's say the virtual sstable upper bound is c#1, with the keys c#3, c#2, - // c#1, d, e, ... in the sstable. So, the last key in the virtual sstable is - // c#1. We can perform SeekGE(i.upper) and then keep nexting until we find - // the last key with userkey == i.upper. - // - // TODO(bananabrick): Think about how to improve this. If many internal keys - // with the same user key at the upper bound then this could be slow, but - // maybe the odds of having many internal keys with the same user key at the - // upper bound are low. - for ikey != nil && i.cmp(ikey.UserKey, i.upper) == 0 { - ikey, _ = i.Next() - } - return i.Prev() - } - // We seeked to the first key >= i.upper. - return i.Prev() -} - -// SeekLT implements internalIterator.SeekLT, as documented in the pebble -// package. Note that SeekLT only checks the lower bound. It is up to the -// caller to ensure that key is less than the upper bound. -func (i *twoLevelIterator) SeekLT( - key []byte, flags base.SeekLTFlags, -) (*InternalKey, base.LazyValue) { - if i.vState != nil { - // Might have to fix upper bound since virtual sstable bounds are not - // known to callers of SeekLT. - // - // TODO(bananabrick): We can optimize away this check for the level iter - // if necessary. - cmp := i.cmp(key, i.upper) - // key == i.upper is fine. We'll do the right thing and return the - // first internal key with user key < key. - if cmp > 0 { - return i.virtualLast() - } - } - - i.exhaustedBounds = 0 - i.err = nil // clear cached iteration error - // Seek optimization only applies until iterator is first positioned after SetBounds. 
- i.boundsCmp = 0 - - var result loadBlockResult - var ikey *InternalKey - // NB: Unlike SeekGE, we don't have a fast-path here since we don't know - // whether the topLevelIndex is positioned after the position that would - // be returned by doing i.topLevelIndex.SeekGE(). To know this we would - // need to know the index key preceding the current one. - // NB: If a bound-limited block property filter is configured, it's - // externally ensured that the filter is disabled (through returning - // Intersects=false irrespective of the block props provided) during seeks. - i.maybeFilteredKeysTwoLevel = false - if ikey, _ = i.topLevelIndex.SeekGE(key, base.SeekGEFlagsNone); ikey == nil { - if ikey, _ = i.topLevelIndex.Last(); ikey == nil { - i.data.invalidate() - i.index.invalidate() - return nil, base.LazyValue{} - } - - result = i.loadIndex(-1) - if result == loadBlockFailed { - return nil, base.LazyValue{} - } - if result == loadBlockOK { - if ikey, val := i.singleLevelIterator.lastInternal(); ikey != nil { - return i.maybeVerifyKey(ikey, val) - } - // Fall through to skipBackward since the singleLevelIterator did - // not have any blocks that satisfy the block interval - // constraints, or the lower bound was reached. - } - // Else loadBlockIrrelevant, so fall through. - } else { - result = i.loadIndex(-1) - if result == loadBlockFailed { - return nil, base.LazyValue{} - } - if result == loadBlockOK { - if ikey, val := i.singleLevelIterator.SeekLT(key, flags); ikey != nil { - return i.maybeVerifyKey(ikey, val) - } - // Fall through to skipBackward since the singleLevelIterator did - // not have any blocks that satisfy the block interval - // constraint, or the lower bound was reached. - } - // Else loadBlockIrrelevant, so fall through. - } - if result == loadBlockIrrelevant { - // Enforce the lower bound here since don't want to bother moving to - // the previous entry in the top level index if lower bound is already - // exceeded. 
Note that the previous entry starts with keys <= - // ikey.UserKey since even though this is the current block's - // separator, the same user key can span multiple index blocks. - if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { - i.exhaustedBounds = -1 - } - } - // NB: skipBackward checks whether exhaustedBounds is already -1. - return i.skipBackward() -} - -// First implements internalIterator.First, as documented in the pebble -// package. Note that First only checks the upper bound. It is up to the caller -// to ensure that key is greater than or equal to the lower bound (e.g. via a -// call to SeekGE(lower)). -func (i *twoLevelIterator) First() (*InternalKey, base.LazyValue) { - // If the iterator was created on a virtual sstable, we will SeekGE to the - // lower bound instead of using First, because First does not respect - // bounds. - if i.vState != nil { - return i.SeekGE(i.lower, base.SeekGEFlagsNone) - } - - if i.lower != nil { - panic("twoLevelIterator.First() used despite lower bound") - } - i.exhaustedBounds = 0 - i.maybeFilteredKeysTwoLevel = false - i.err = nil // clear cached iteration error - // Seek optimization only applies until iterator is first positioned after SetBounds. - i.boundsCmp = 0 - - var ikey *InternalKey - if ikey, _ = i.topLevelIndex.First(); ikey == nil { - return nil, base.LazyValue{} - } - - result := i.loadIndex(+1) - if result == loadBlockFailed { - return nil, base.LazyValue{} - } - if result == loadBlockOK { - if ikey, val := i.singleLevelIterator.First(); ikey != nil { - return ikey, val - } - // Else fall through to skipForward. - } else { - // result == loadBlockIrrelevant. Enforce the upper bound here since - // don't want to bother moving to the next entry in the top level - // index if upper bound is already exceeded. Note that the next entry - // starts with keys >= ikey.UserKey since even though this is the - // block separator, the same user key can span multiple index blocks. 
- // If upper is exclusive we use >= below, else we use >. - if i.upper != nil { - cmp := i.cmp(ikey.UserKey, i.upper) - if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { - i.exhaustedBounds = +1 - } - } - } - // NB: skipForward checks whether exhaustedBounds is already +1. - return i.skipForward() -} - -// Last implements internalIterator.Last, as documented in the pebble -// package. Note that Last only checks the lower bound. It is up to the caller -// to ensure that key is less than the upper bound (e.g. via a call to -// SeekLT(upper)) -func (i *twoLevelIterator) Last() (*InternalKey, base.LazyValue) { - if i.vState != nil { - if i.endKeyInclusive { - return i.virtualLast() - } - return i.SeekLT(i.upper, base.SeekLTFlagsNone) - } - - if i.upper != nil { - panic("twoLevelIterator.Last() used despite upper bound") - } - i.exhaustedBounds = 0 - i.maybeFilteredKeysTwoLevel = false - i.err = nil // clear cached iteration error - // Seek optimization only applies until iterator is first positioned after SetBounds. - i.boundsCmp = 0 - - var ikey *InternalKey - if ikey, _ = i.topLevelIndex.Last(); ikey == nil { - return nil, base.LazyValue{} - } - - result := i.loadIndex(-1) - if result == loadBlockFailed { - return nil, base.LazyValue{} - } - if result == loadBlockOK { - if ikey, val := i.singleLevelIterator.Last(); ikey != nil { - return ikey, val - } - // Else fall through to skipBackward. - } else { - // result == loadBlockIrrelevant. Enforce the lower bound here - // since don't want to bother moving to the previous entry in the - // top level index if lower bound is already exceeded. Note that - // the previous entry starts with keys <= ikey.UserKey since even - // though this is the current block's separator, the same user key - // can span multiple index blocks. - if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { - i.exhaustedBounds = -1 - } - } - // NB: skipBackward checks whether exhaustedBounds is already -1. 
- return i.skipBackward() -} - -// Next implements internalIterator.Next, as documented in the pebble -// package. -// Note: twoLevelCompactionIterator.Next mirrors the implementation of -// twoLevelIterator.Next due to performance. Keep the two in sync. -func (i *twoLevelIterator) Next() (*InternalKey, base.LazyValue) { - // Seek optimization only applies until iterator is first positioned after SetBounds. - i.boundsCmp = 0 - i.maybeFilteredKeysTwoLevel = false - if i.err != nil { - return nil, base.LazyValue{} - } - if key, val := i.singleLevelIterator.Next(); key != nil { - return key, val - } - return i.skipForward() -} - -// NextPrefix implements (base.InternalIterator).NextPrefix. -func (i *twoLevelIterator) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { - if i.exhaustedBounds == +1 { - panic("Next called even though exhausted upper bound") - } - // Seek optimization only applies until iterator is first positioned after SetBounds. - i.boundsCmp = 0 - i.maybeFilteredKeysTwoLevel = false - if i.err != nil { - return nil, base.LazyValue{} - } - if key, val := i.singleLevelIterator.NextPrefix(succKey); key != nil { - return key, val - } - // key == nil - if i.err != nil { - return nil, base.LazyValue{} - } - - // Did not find prefix in the existing second-level index block. This is the - // slow-path where we seek the iterator. - var ikey *InternalKey - if ikey, _ = i.topLevelIndex.SeekGE(succKey, base.SeekGEFlagsNone); ikey == nil { - i.data.invalidate() - i.index.invalidate() - return nil, base.LazyValue{} - } - result := i.loadIndex(+1) - if result == loadBlockFailed { - return nil, base.LazyValue{} - } - if result == loadBlockIrrelevant { - // Enforce the upper bound here since don't want to bother moving to the - // next entry in the top level index if upper bound is already exceeded. 
- // Note that the next entry starts with keys >= ikey.UserKey since even - // though this is the block separator, the same user key can span multiple - // index blocks. If upper is exclusive we use >= below, else we use >. - if i.upper != nil { - cmp := i.cmp(ikey.UserKey, i.upper) - if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { - i.exhaustedBounds = +1 - } - } - } else if key, val := i.singleLevelIterator.SeekGE(succKey, base.SeekGEFlagsNone); key != nil { - return i.maybeVerifyKey(key, val) - } - return i.skipForward() -} - -// Prev implements internalIterator.Prev, as documented in the pebble -// package. -func (i *twoLevelIterator) Prev() (*InternalKey, base.LazyValue) { - // Seek optimization only applies until iterator is first positioned after SetBounds. - i.boundsCmp = 0 - i.maybeFilteredKeysTwoLevel = false - if i.err != nil { - return nil, base.LazyValue{} - } - if key, val := i.singleLevelIterator.Prev(); key != nil { - return key, val - } - return i.skipBackward() -} - -func (i *twoLevelIterator) skipForward() (*InternalKey, base.LazyValue) { - for { - if i.err != nil || i.exhaustedBounds > 0 { - return nil, base.LazyValue{} - } - i.exhaustedBounds = 0 - var ikey *InternalKey - if ikey, _ = i.topLevelIndex.Next(); ikey == nil { - i.data.invalidate() - i.index.invalidate() - return nil, base.LazyValue{} - } - result := i.loadIndex(+1) - if result == loadBlockFailed { - return nil, base.LazyValue{} - } - if result == loadBlockOK { - if ikey, val := i.singleLevelIterator.firstInternal(); ikey != nil { - return i.maybeVerifyKey(ikey, val) - } - // Next iteration will return if singleLevelIterator set - // exhaustedBounds = +1. - } else { - // result == loadBlockIrrelevant. Enforce the upper bound here - // since don't want to bother moving to the next entry in the top - // level index if upper bound is already exceeded. 
Note that the - // next entry starts with keys >= ikey.UserKey since even though - // this is the block separator, the same user key can span - // multiple index blocks. If upper is exclusive we use >= - // below, else we use >. - if i.upper != nil { - cmp := i.cmp(ikey.UserKey, i.upper) - if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { - i.exhaustedBounds = +1 - // Next iteration will return. - } - } - } - } -} - -func (i *twoLevelIterator) skipBackward() (*InternalKey, base.LazyValue) { - for { - if i.err != nil || i.exhaustedBounds < 0 { - return nil, base.LazyValue{} - } - i.exhaustedBounds = 0 - var ikey *InternalKey - if ikey, _ = i.topLevelIndex.Prev(); ikey == nil { - i.data.invalidate() - i.index.invalidate() - return nil, base.LazyValue{} - } - result := i.loadIndex(-1) - if result == loadBlockFailed { - return nil, base.LazyValue{} - } - if result == loadBlockOK { - if ikey, val := i.singleLevelIterator.lastInternal(); ikey != nil { - return i.maybeVerifyKey(ikey, val) - } - // Next iteration will return if singleLevelIterator set - // exhaustedBounds = -1. - } else { - // result == loadBlockIrrelevant. Enforce the lower bound here - // since don't want to bother moving to the previous entry in the - // top level index if lower bound is already exceeded. Note that - // the previous entry starts with keys <= ikey.UserKey since even - // though this is the current block's separator, the same user key - // can span multiple index blocks. - if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { - i.exhaustedBounds = -1 - // Next iteration will return. - } - } - } -} - -// Close implements internalIterator.Close, as documented in the pebble -// package. 
-func (i *twoLevelIterator) Close() error { - var err error - if i.closeHook != nil { - err = firstError(err, i.closeHook(i)) - } - err = firstError(err, i.data.Close()) - err = firstError(err, i.index.Close()) - err = firstError(err, i.topLevelIndex.Close()) - if i.dataRH != nil { - err = firstError(err, i.dataRH.Close()) - i.dataRH = nil - } - err = firstError(err, i.err) - if i.bpfs != nil { - releaseBlockPropertiesFilterer(i.bpfs) - } - if i.vbReader != nil { - i.vbReader.close() - } - if i.vbRH != nil { - err = firstError(err, i.vbRH.Close()) - i.vbRH = nil - } - *i = twoLevelIterator{ - singleLevelIterator: i.singleLevelIterator.resetForReuse(), - topLevelIndex: i.topLevelIndex.resetForReuse(), - } - twoLevelIterPool.Put(i) - return err -} - -// Note: twoLevelCompactionIterator and compactionIterator are very similar but -// were separated due to performance. -type twoLevelCompactionIterator struct { - *twoLevelIterator - bytesIterated *uint64 - prevOffset uint64 -} - -// twoLevelCompactionIterator implements the base.InternalIterator interface. 
-var _ base.InternalIterator = (*twoLevelCompactionIterator)(nil) - -func (i *twoLevelCompactionIterator) Close() error { - return i.twoLevelIterator.Close() -} - -func (i *twoLevelCompactionIterator) SeekGE( - key []byte, flags base.SeekGEFlags, -) (*InternalKey, base.LazyValue) { - panic("pebble: SeekGE unimplemented") -} - -func (i *twoLevelCompactionIterator) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - panic("pebble: SeekPrefixGE unimplemented") -} - -func (i *twoLevelCompactionIterator) SeekLT( - key []byte, flags base.SeekLTFlags, -) (*InternalKey, base.LazyValue) { - panic("pebble: SeekLT unimplemented") -} - -func (i *twoLevelCompactionIterator) First() (*InternalKey, base.LazyValue) { - i.err = nil // clear cached iteration error - return i.skipForward(i.twoLevelIterator.First()) -} - -func (i *twoLevelCompactionIterator) Last() (*InternalKey, base.LazyValue) { - panic("pebble: Last unimplemented") -} - -// Note: twoLevelCompactionIterator.Next mirrors the implementation of -// twoLevelIterator.Next due to performance. Keep the two in sync. 
-func (i *twoLevelCompactionIterator) Next() (*InternalKey, base.LazyValue) { - if i.err != nil { - return nil, base.LazyValue{} - } - return i.skipForward(i.singleLevelIterator.Next()) -} - -func (i *twoLevelCompactionIterator) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { - panic("pebble: NextPrefix unimplemented") -} - -func (i *twoLevelCompactionIterator) Prev() (*InternalKey, base.LazyValue) { - panic("pebble: Prev unimplemented") -} - -func (i *twoLevelCompactionIterator) String() string { - if i.vState != nil { - return i.vState.fileNum.String() - } - return i.reader.fileNum.String() -} - -func (i *twoLevelCompactionIterator) skipForward( - key *InternalKey, val base.LazyValue, -) (*InternalKey, base.LazyValue) { - if key == nil { - for { - if key, _ := i.topLevelIndex.Next(); key == nil { - break - } - result := i.loadIndex(+1) - if result != loadBlockOK { - if i.err != nil { - break - } - switch result { - case loadBlockFailed: - // We checked that i.index was at a valid entry, so - // loadBlockFailed could not have happened due to to i.index - // being exhausted, and must be due to an error. 
- panic("loadBlock should not have failed with no error") - case loadBlockIrrelevant: - panic("compactionIter should not be using block intervals for skipping") - default: - panic(fmt.Sprintf("unexpected case %d", result)) - } - } - // result == loadBlockOK - if key, val = i.singleLevelIterator.First(); key != nil { - break - } - } - } - - curOffset := i.recordOffset() - *i.bytesIterated += uint64(curOffset - i.prevOffset) - i.prevOffset = curOffset - - if i.vState != nil && key != nil { - cmp := i.cmp(key.UserKey, i.vState.upper.UserKey) - if cmp > 0 || (i.vState.upper.IsExclusiveSentinel() && cmp == 0) { - return nil, base.LazyValue{} - } - } - - return key, val -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/reader_virtual.go b/vendor/github.com/cockroachdb/pebble/sstable/reader_virtual.go deleted file mode 100644 index b044d19..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/reader_virtual.go +++ /dev/null @@ -1,206 +0,0 @@ -// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package sstable - -import ( - "context" - - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" -) - -// VirtualReader wraps Reader. Its purpose is to restrict functionality of the -// Reader which should be inaccessible to virtual sstables, and enforce bounds -// invariants associated with virtual sstables. All reads on virtual sstables -// should go through a VirtualReader. -// -// INVARIANT: Any iterators created through a virtual reader will guarantee that -// they don't expose keys outside the virtual sstable bounds. -type VirtualReader struct { - vState virtualState - reader *Reader - Properties CommonProperties -} - -// Lightweight virtual sstable state which can be passed to sstable iterators. 
-type virtualState struct { - lower InternalKey - upper InternalKey - fileNum base.FileNum - Compare Compare - isForeign bool -} - -func ceilDiv(a, b uint64) uint64 { - return (a + b - 1) / b -} - -// MakeVirtualReader is used to contruct a reader which can read from virtual -// sstables. -func MakeVirtualReader( - reader *Reader, meta manifest.VirtualFileMeta, isForeign bool, -) VirtualReader { - if reader.fileNum != meta.FileBacking.DiskFileNum { - panic("pebble: invalid call to MakeVirtualReader") - } - - vState := virtualState{ - lower: meta.Smallest, - upper: meta.Largest, - fileNum: meta.FileNum, - Compare: reader.Compare, - isForeign: isForeign, - } - v := VirtualReader{ - vState: vState, - reader: reader, - } - - v.Properties.RawKeySize = ceilDiv(reader.Properties.RawKeySize*meta.Size, meta.FileBacking.Size) - v.Properties.RawValueSize = ceilDiv(reader.Properties.RawValueSize*meta.Size, meta.FileBacking.Size) - v.Properties.NumEntries = ceilDiv(reader.Properties.NumEntries*meta.Size, meta.FileBacking.Size) - v.Properties.NumDeletions = ceilDiv(reader.Properties.NumDeletions*meta.Size, meta.FileBacking.Size) - v.Properties.NumRangeDeletions = ceilDiv(reader.Properties.NumRangeDeletions*meta.Size, meta.FileBacking.Size) - v.Properties.NumRangeKeyDels = ceilDiv(reader.Properties.NumRangeKeyDels*meta.Size, meta.FileBacking.Size) - - // Note that we rely on NumRangeKeySets for correctness. If the sstable may - // contain range keys, then NumRangeKeySets must be > 0. ceilDiv works because - // meta.Size will not be 0 for virtual sstables. 
- v.Properties.NumRangeKeySets = ceilDiv(reader.Properties.NumRangeKeySets*meta.Size, meta.FileBacking.Size) - v.Properties.ValueBlocksSize = ceilDiv(reader.Properties.ValueBlocksSize*meta.Size, meta.FileBacking.Size) - v.Properties.NumSizedDeletions = ceilDiv(reader.Properties.NumSizedDeletions*meta.Size, meta.FileBacking.Size) - v.Properties.RawPointTombstoneKeySize = ceilDiv(reader.Properties.RawPointTombstoneKeySize*meta.Size, meta.FileBacking.Size) - v.Properties.RawPointTombstoneValueSize = ceilDiv(reader.Properties.RawPointTombstoneValueSize*meta.Size, meta.FileBacking.Size) - return v -} - -// NewCompactionIter is the compaction iterator function for virtual readers. -func (v *VirtualReader) NewCompactionIter( - bytesIterated *uint64, rp ReaderProvider, bufferPool *BufferPool, -) (Iterator, error) { - return v.reader.newCompactionIter(bytesIterated, rp, &v.vState, bufferPool) -} - -// NewIterWithBlockPropertyFiltersAndContextEtc wraps -// Reader.NewIterWithBlockPropertyFiltersAndContext. We assume that the passed -// in [lower, upper) bounds will have at least some overlap with the virtual -// sstable bounds. No overlap is not currently supported in the iterator. -func (v *VirtualReader) NewIterWithBlockPropertyFiltersAndContextEtc( - ctx context.Context, - lower, upper []byte, - filterer *BlockPropertiesFilterer, - hideObsoletePoints, useFilterBlock bool, - stats *base.InternalIteratorStats, - rp ReaderProvider, -) (Iterator, error) { - return v.reader.newIterWithBlockPropertyFiltersAndContext( - ctx, lower, upper, filterer, hideObsoletePoints, useFilterBlock, stats, rp, &v.vState, - ) -} - -// ValidateBlockChecksumsOnBacking will call ValidateBlockChecksumsOnBacking on the underlying reader. -// Note that block checksum validation is NOT restricted to virtual sstable bounds. -func (v *VirtualReader) ValidateBlockChecksumsOnBacking() error { - return v.reader.ValidateBlockChecksums() -} - -// NewRawRangeDelIter wraps Reader.NewRawRangeDelIter. 
-func (v *VirtualReader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) { - iter, err := v.reader.NewRawRangeDelIter() - if err != nil { - return nil, err - } - if iter == nil { - return nil, nil - } - - // Truncation of spans isn't allowed at a user key that also contains points - // in the same virtual sstable, as it would lead to covered points getting - // uncovered. Set panicOnUpperTruncate to true if the file's upper bound - // is not an exclusive sentinel. - // - // As an example, if an sstable contains a rangedel a-c and point keys at - // a.SET.2 and b.SET.3, the file bounds [a#2,SET-b#RANGEDELSENTINEL] are - // allowed (as they exclude b.SET.3), or [a#2,SET-c#RANGEDELSENTINEL] (as it - // includes both point keys), but not [a#2,SET-b#3,SET] (as it would truncate - // the rangedel at b and lead to the point being uncovered). - return keyspan.Truncate( - v.reader.Compare, iter, v.vState.lower.UserKey, v.vState.upper.UserKey, - &v.vState.lower, &v.vState.upper, !v.vState.upper.IsExclusiveSentinel(), /* panicOnUpperTruncate */ - ), nil -} - -// NewRawRangeKeyIter wraps Reader.NewRawRangeKeyIter. -func (v *VirtualReader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) { - iter, err := v.reader.NewRawRangeKeyIter() - if err != nil { - return nil, err - } - if iter == nil { - return nil, nil - } - - // Truncation of spans isn't allowed at a user key that also contains points - // in the same virtual sstable, as it would lead to covered points getting - // uncovered. Set panicOnUpperTruncate to true if the file's upper bound - // is not an exclusive sentinel. - // - // As an example, if an sstable contains a range key a-c and point keys at - // a.SET.2 and b.SET.3, the file bounds [a#2,SET-b#RANGEKEYSENTINEL] are - // allowed (as they exclude b.SET.3), or [a#2,SET-c#RANGEKEYSENTINEL] (as it - // includes both point keys), but not [a#2,SET-b#3,SET] (as it would truncate - // the range key at b and lead to the point being uncovered). 
- return keyspan.Truncate( - v.reader.Compare, iter, v.vState.lower.UserKey, v.vState.upper.UserKey, - &v.vState.lower, &v.vState.upper, !v.vState.upper.IsExclusiveSentinel(), /* panicOnUpperTruncate */ - ), nil -} - -// Constrain bounds will narrow the start, end bounds if they do not fit within -// the virtual sstable. The function will return if the new end key is -// inclusive. -func (v *virtualState) constrainBounds( - start, end []byte, endInclusive bool, -) (lastKeyInclusive bool, first []byte, last []byte) { - first = start - if start == nil || v.Compare(start, v.lower.UserKey) < 0 { - first = v.lower.UserKey - } - - // Note that we assume that start, end has some overlap with the virtual - // sstable bounds. - last = v.upper.UserKey - lastKeyInclusive = !v.upper.IsExclusiveSentinel() - if end != nil { - cmp := v.Compare(end, v.upper.UserKey) - switch { - case cmp == 0: - lastKeyInclusive = !v.upper.IsExclusiveSentinel() && endInclusive - last = v.upper.UserKey - case cmp > 0: - lastKeyInclusive = !v.upper.IsExclusiveSentinel() - last = v.upper.UserKey - default: - lastKeyInclusive = endInclusive - last = end - } - } - // TODO(bananabrick): What if someone passes in bounds completely outside of - // virtual sstable bounds? - return lastKeyInclusive, first, last -} - -// EstimateDiskUsage just calls VirtualReader.reader.EstimateDiskUsage after -// enforcing the virtual sstable bounds. -func (v *VirtualReader) EstimateDiskUsage(start, end []byte) (uint64, error) { - _, f, l := v.vState.constrainBounds(start, end, true /* endInclusive */) - return v.reader.EstimateDiskUsage(f, l) -} - -// CommonProperties implements the CommonReader interface. 
-func (v *VirtualReader) CommonProperties() *CommonProperties { - return &v.Properties -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/suffix_rewriter.go b/vendor/github.com/cockroachdb/pebble/sstable/suffix_rewriter.go deleted file mode 100644 index 9672ded..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/suffix_rewriter.go +++ /dev/null @@ -1,589 +0,0 @@ -package sstable - -import ( - "bytes" - "context" - "math" - "sync" - - "github.com/cespare/xxhash/v2" - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/bytealloc" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/rangekey" - "github.com/cockroachdb/pebble/objstorage" -) - -// RewriteKeySuffixes is deprecated. -// -// TODO(sumeer): remove after switching CockroachDB to RewriteKeySuffixesAndReturnFormat. -func RewriteKeySuffixes( - sst []byte, - rOpts ReaderOptions, - out objstorage.Writable, - o WriterOptions, - from, to []byte, - concurrency int, -) (*WriterMetadata, error) { - meta, _, err := RewriteKeySuffixesAndReturnFormat(sst, rOpts, out, o, from, to, concurrency) - return meta, err -} - -// RewriteKeySuffixesAndReturnFormat copies the content of the passed SSTable -// bytes to a new sstable, written to `out`, in which the suffix `from` has is -// replaced with `to` in every key. The input sstable must consist of only -// Sets or RangeKeySets and every key must have `from` as its suffix as -// determined by the Split function of the Comparer in the passed -// WriterOptions. Range deletes must not exist in this sstable, as they will -// be ignored. -// -// Data blocks are rewritten in parallel by `concurrency` workers and then -// assembled into a final SST. Filters are copied from the original SST without -// modification as they are not affected by the suffix, while block and table -// properties are only minimally recomputed. 
-// -// TODO(sumeer): document limitations, if any, due to this limited -// re-computation of properties (is there any loss of fidelity?). -// -// Any block and table property collectors configured in the WriterOptions must -// implement SuffixReplaceableTableCollector/SuffixReplaceableBlockCollector. -// -// The WriterOptions.TableFormat is ignored, and the output sstable has the -// same TableFormat as the input, which is returned in case the caller wants -// to do some error checking. Suffix rewriting is meant to be efficient, and -// allowing changes in the TableFormat detracts from that efficiency. -// -// Any obsolete bits that key-value pairs may be annotated with are ignored -// and lost during the rewrite. Additionally, the output sstable has the -// pebble.obsolete.is_strict property set to false. These limitations could be -// removed if needed. The current use case for -// RewriteKeySuffixesAndReturnFormat in CockroachDB is for MVCC-compliant file -// ingestion, where these files do not contain RANGEDELs and have one -// key-value pair per userkey -- so they trivially satisfy the strict -// criteria, and we don't need the obsolete bit as a performance optimization. -// For disaggregated storage, strict obsolete sstables are needed for L5 and -// L6, but at the time of writing, we expect such MVCC-compliant file -// ingestion to only ingest into levels L4 and higher. If this changes, we can -// do one of two things to get rid of this limitation: -// - Validate that there are no duplicate userkeys and no RANGEDELs/MERGEs -// in the sstable to be rewritten. Validating no duplicate userkeys is -// non-trivial when rewriting blocks in parallel, so we could encode the -// pre-existing condition in the (existing) SnapshotPinnedKeys property -- -// we need to update the external sst writer to calculate and encode this -// property. -// - Preserve the obsolete bit (with changes to the blockIter). 
-func RewriteKeySuffixesAndReturnFormat( - sst []byte, - rOpts ReaderOptions, - out objstorage.Writable, - o WriterOptions, - from, to []byte, - concurrency int, -) (*WriterMetadata, TableFormat, error) { - r, err := NewMemReader(sst, rOpts) - if err != nil { - return nil, TableFormatUnspecified, err - } - defer r.Close() - return rewriteKeySuffixesInBlocks(r, out, o, from, to, concurrency) -} - -func rewriteKeySuffixesInBlocks( - r *Reader, out objstorage.Writable, o WriterOptions, from, to []byte, concurrency int, -) (*WriterMetadata, TableFormat, error) { - if o.Comparer == nil || o.Comparer.Split == nil { - return nil, TableFormatUnspecified, - errors.New("a valid splitter is required to rewrite suffixes") - } - if concurrency < 1 { - return nil, TableFormatUnspecified, errors.New("concurrency must be >= 1") - } - // Even though NumValueBlocks = 0 => NumValuesInValueBlocks = 0, check both - // as a defensive measure. - if r.Properties.NumValueBlocks > 0 || r.Properties.NumValuesInValueBlocks > 0 { - return nil, TableFormatUnspecified, - errors.New("sstable with a single suffix should not have value blocks") - } - - tableFormat := r.tableFormat - o.TableFormat = tableFormat - w := NewWriter(out, o) - defer func() { - if w != nil { - w.Close() - } - }() - - for _, c := range w.propCollectors { - if _, ok := c.(SuffixReplaceableTableCollector); !ok { - return nil, TableFormatUnspecified, - errors.Errorf("property collector %s does not support suffix replacement", c.Name()) - } - } - for _, c := range w.blockPropCollectors { - if _, ok := c.(SuffixReplaceableBlockCollector); !ok { - return nil, TableFormatUnspecified, - errors.Errorf("block property collector %s does not support suffix replacement", c.Name()) - } - } - - l, err := r.Layout() - if err != nil { - return nil, TableFormatUnspecified, errors.Wrap(err, "reading layout") - } - - if err := rewriteDataBlocksToWriter(r, w, l.Data, from, to, w.split, concurrency); err != nil { - return nil, 
TableFormatUnspecified, errors.Wrap(err, "rewriting data blocks") - } - - // Copy over the range key block and replace suffixes in it if it exists. - if err := rewriteRangeKeyBlockToWriter(r, w, from, to); err != nil { - return nil, TableFormatUnspecified, errors.Wrap(err, "rewriting range key blocks") - } - - // Copy over the filter block if it exists (rewriteDataBlocksToWriter will - // already have ensured this is valid if it exists). - if w.filter != nil && l.Filter.Length > 0 { - filterBlock, _, err := readBlockBuf(r, l.Filter, nil) - if err != nil { - return nil, TableFormatUnspecified, errors.Wrap(err, "reading filter") - } - w.filter = copyFilterWriter{ - origPolicyName: w.filter.policyName(), origMetaName: w.filter.metaName(), data: filterBlock, - } - } - - if err := w.Close(); err != nil { - w = nil - return nil, TableFormatUnspecified, err - } - writerMeta, err := w.Metadata() - w = nil - return writerMeta, tableFormat, err -} - -var errBadKind = errors.New("key does not have expected kind (set)") - -type blockWithSpan struct { - start, end InternalKey - data []byte -} - -func rewriteBlocks( - r *Reader, - restartInterval int, - checksumType ChecksumType, - compression Compression, - input []BlockHandleWithProperties, - output []blockWithSpan, - totalWorkers, worker int, - from, to []byte, - split Split, -) error { - bw := blockWriter{ - restartInterval: restartInterval, - } - buf := blockBuf{checksummer: checksummer{checksumType: checksumType}} - if checksumType == ChecksumTypeXXHash { - buf.checksummer.xxHasher = xxhash.New() - } - - var blockAlloc bytealloc.A - var keyAlloc bytealloc.A - var scratch InternalKey - - var inputBlock, inputBlockBuf []byte - - iter := &blockIter{} - - // We'll assume all blocks are _roughly_ equal so round-robin static partition - // of each worker doing every ith block is probably enough. 
- for i := worker; i < len(input); i += totalWorkers { - bh := input[i] - - var err error - inputBlock, inputBlockBuf, err = readBlockBuf(r, bh.BlockHandle, inputBlockBuf) - if err != nil { - return err - } - if err := iter.init(r.Compare, inputBlock, r.Properties.GlobalSeqNum, false); err != nil { - return err - } - - if cap(bw.restarts) < int(iter.restarts) { - bw.restarts = make([]uint32, 0, iter.restarts) - } - if cap(bw.buf) == 0 { - bw.buf = make([]byte, 0, len(inputBlock)) - } - if cap(bw.restarts) < int(iter.numRestarts) { - bw.restarts = make([]uint32, 0, iter.numRestarts) - } - - for key, val := iter.First(); key != nil; key, val = iter.Next() { - if key.Kind() != InternalKeyKindSet { - return errBadKind - } - si := split(key.UserKey) - oldSuffix := key.UserKey[si:] - if !bytes.Equal(oldSuffix, from) { - err := errors.Errorf("key has suffix %q, expected %q", oldSuffix, from) - return err - } - newLen := si + len(to) - if cap(scratch.UserKey) < newLen { - scratch.UserKey = make([]byte, 0, len(key.UserKey)*2+len(to)-len(from)) - } - - scratch.Trailer = key.Trailer - scratch.UserKey = scratch.UserKey[:newLen] - copy(scratch.UserKey, key.UserKey[:si]) - copy(scratch.UserKey[si:], to) - - // NB: for TableFormatPebblev3 and higher, since - // !iter.lazyValueHandling.hasValuePrefix, it will return the raw value - // in the block, which includes the 1-byte prefix. This is fine since bw - // also does not know about the prefix and will preserve it in bw.add. 
- v := val.InPlaceValue() - if invariants.Enabled && r.tableFormat >= TableFormatPebblev3 && - key.Kind() == InternalKeyKindSet { - if len(v) < 1 { - return errors.Errorf("value has no prefix") - } - prefix := valuePrefix(v[0]) - if isValueHandle(prefix) { - return errors.Errorf("value prefix is incorrect") - } - if setHasSamePrefix(prefix) { - return errors.Errorf("multiple keys with same key prefix") - } - } - bw.add(scratch, v) - if output[i].start.UserKey == nil { - keyAlloc, output[i].start = cloneKeyWithBuf(scratch, keyAlloc) - } - } - *iter = iter.resetForReuse() - - keyAlloc, output[i].end = cloneKeyWithBuf(scratch, keyAlloc) - - finished := compressAndChecksum(bw.finish(), compression, &buf) - - // copy our finished block into the output buffer. - blockAlloc, output[i].data = blockAlloc.Alloc(len(finished) + blockTrailerLen) - copy(output[i].data, finished) - copy(output[i].data[len(finished):], buf.tmp[:blockTrailerLen]) - } - return nil -} - -func rewriteDataBlocksToWriter( - r *Reader, - w *Writer, - data []BlockHandleWithProperties, - from, to []byte, - split Split, - concurrency int, -) error { - if r.Properties.NumEntries == 0 { - // No point keys. 
- return nil - } - blocks := make([]blockWithSpan, len(data)) - - if w.filter != nil { - if r.Properties.FilterPolicyName != w.filter.policyName() { - return errors.New("mismatched filters") - } - if was, is := r.Properties.ComparerName, w.props.ComparerName; was != is { - return errors.Errorf("mismatched Comparer %s vs %s, replacement requires same splitter to copy filters", was, is) - } - } - - g := &sync.WaitGroup{} - g.Add(concurrency) - errCh := make(chan error, concurrency) - for i := 0; i < concurrency; i++ { - worker := i - go func() { - defer g.Done() - err := rewriteBlocks( - r, - w.dataBlockBuf.dataBlock.restartInterval, - w.blockBuf.checksummer.checksumType, - w.compression, - data, - blocks, - concurrency, - worker, - from, to, - split, - ) - if err != nil { - errCh <- err - } - }() - } - g.Wait() - close(errCh) - if err, ok := <-errCh; ok { - return err - } - - for _, p := range w.propCollectors { - if err := p.(SuffixReplaceableTableCollector).UpdateKeySuffixes(r.Properties.UserProperties, from, to); err != nil { - return err - } - } - - var decoder blockPropertiesDecoder - var oldShortIDs []shortID - var oldProps [][]byte - if len(w.blockPropCollectors) > 0 { - oldProps = make([][]byte, len(w.blockPropCollectors)) - oldShortIDs = make([]shortID, math.MaxUint8) - for i, p := range w.blockPropCollectors { - if prop, ok := r.Properties.UserProperties[p.Name()]; ok { - was, is := shortID(byte(prop[0])), shortID(i) - oldShortIDs[was] = is - } - } - } - - for i := range blocks { - // Write the rewritten block to the file. - if err := w.writable.Write(blocks[i].data); err != nil { - return err - } - - n := len(blocks[i].data) - bh := BlockHandle{Offset: w.meta.Size, Length: uint64(n) - blockTrailerLen} - // Update the overall size. - w.meta.Size += uint64(n) - - // Load any previous values for our prop collectors into oldProps. 
- for i := range oldProps { - oldProps[i] = nil - } - decoder.props = data[i].Props - for !decoder.done() { - id, val, err := decoder.next() - if err != nil { - return err - } - oldProps[oldShortIDs[id]] = val - } - - for i, p := range w.blockPropCollectors { - if err := p.(SuffixReplaceableBlockCollector).UpdateKeySuffixes(oldProps[i], from, to); err != nil { - return err - } - } - - bhp, err := w.maybeAddBlockPropertiesToBlockHandle(bh) - if err != nil { - return err - } - var nextKey InternalKey - if i+1 < len(blocks) { - nextKey = blocks[i+1].start - } - if err = w.addIndexEntrySync(blocks[i].end, nextKey, bhp, w.dataBlockBuf.tmp[:]); err != nil { - return err - } - } - - w.meta.updateSeqNum(blocks[0].start.SeqNum()) - w.props.NumEntries = r.Properties.NumEntries - w.props.RawKeySize = r.Properties.RawKeySize - w.props.RawValueSize = r.Properties.RawValueSize - w.meta.SetSmallestPointKey(blocks[0].start) - w.meta.SetLargestPointKey(blocks[len(blocks)-1].end) - return nil -} - -func rewriteRangeKeyBlockToWriter(r *Reader, w *Writer, from, to []byte) error { - iter, err := r.NewRawRangeKeyIter() - if err != nil { - return err - } - if iter == nil { - // No range keys. - return nil - } - defer iter.Close() - - for s := iter.First(); s != nil; s = iter.Next() { - if !s.Valid() { - break - } - for i := range s.Keys { - if s.Keys[i].Kind() != base.InternalKeyKindRangeKeySet { - return errBadKind - } - if !bytes.Equal(s.Keys[i].Suffix, from) { - return errors.Errorf("key has suffix %q, expected %q", s.Keys[i].Suffix, from) - } - s.Keys[i].Suffix = to - } - - err := rangekey.Encode(s, func(k base.InternalKey, v []byte) error { - // Calling AddRangeKey instead of addRangeKeySpan bypasses the fragmenter. - // This is okay because the raw fragments off of `iter` are already - // fragmented, and suffix replacement should not affect fragmentation. 
- return w.AddRangeKey(k, v) - }) - if err != nil { - return err - } - } - - return nil -} - -type copyFilterWriter struct { - origMetaName string - origPolicyName string - data []byte -} - -func (copyFilterWriter) addKey(key []byte) { panic("unimplemented") } -func (c copyFilterWriter) finish() ([]byte, error) { return c.data, nil } -func (c copyFilterWriter) metaName() string { return c.origMetaName } -func (c copyFilterWriter) policyName() string { return c.origPolicyName } - -// RewriteKeySuffixesViaWriter is similar to RewriteKeySuffixes but uses just a -// single loop over the Reader that writes each key to the Writer with the new -// suffix. The is significantly slower than the parallelized rewriter, and does -// more work to rederive filters, props, etc. -// -// Any obsolete bits that key-value pairs may be annotated with are ignored -// and lost during the rewrite. Some of the obsolete bits may be recreated -- -// specifically when there are multiple keys with the same user key. -// Additionally, the output sstable has the pebble.obsolete.is_strict property -// set to false. See the longer comment at RewriteKeySuffixesAndReturnFormat. 
-func RewriteKeySuffixesViaWriter( - r *Reader, out objstorage.Writable, o WriterOptions, from, to []byte, -) (*WriterMetadata, error) { - if o.Comparer == nil || o.Comparer.Split == nil { - return nil, errors.New("a valid splitter is required to rewrite suffixes") - } - - o.IsStrictObsolete = false - w := NewWriter(out, o) - defer func() { - if w != nil { - w.Close() - } - }() - i, err := r.NewIter(nil, nil) - if err != nil { - return nil, err - } - defer i.Close() - - k, v := i.First() - var scratch InternalKey - for k != nil { - if k.Kind() != InternalKeyKindSet { - return nil, errors.New("invalid key type") - } - oldSuffix := k.UserKey[r.Split(k.UserKey):] - if !bytes.Equal(oldSuffix, from) { - return nil, errors.Errorf("key has suffix %q, expected %q", oldSuffix, from) - } - scratch.UserKey = append(scratch.UserKey[:0], k.UserKey[:len(k.UserKey)-len(from)]...) - scratch.UserKey = append(scratch.UserKey, to...) - scratch.Trailer = k.Trailer - - val, _, err := v.Value(nil) - if err != nil { - return nil, err - } - if w.addPoint(scratch, val, false); err != nil { - return nil, err - } - k, v = i.Next() - } - if err := rewriteRangeKeyBlockToWriter(r, w, from, to); err != nil { - return nil, err - } - if err := w.Close(); err != nil { - w = nil - return nil, err - } - writerMeta, err := w.Metadata() - w = nil - return writerMeta, err -} - -// NewMemReader opens a reader over the SST stored in the passed []byte. 
-func NewMemReader(sst []byte, o ReaderOptions) (*Reader, error) { - return NewReader(newMemReader(sst), o) -} - -func readBlockBuf(r *Reader, bh BlockHandle, buf []byte) ([]byte, []byte, error) { - raw := r.readable.(*memReader).b[bh.Offset : bh.Offset+bh.Length+blockTrailerLen] - if err := checkChecksum(r.checksumType, raw, bh, 0); err != nil { - return nil, buf, err - } - typ := blockType(raw[bh.Length]) - raw = raw[:bh.Length] - if typ == noCompressionBlockType { - return raw, buf, nil - } - decompressedLen, prefix, err := decompressedLen(typ, raw) - if err != nil { - return nil, buf, err - } - if cap(buf) < decompressedLen { - buf = make([]byte, decompressedLen) - } - res, err := decompressInto(typ, raw[prefix:], buf[:decompressedLen]) - return res, buf, err -} - -// memReader is a thin wrapper around a []byte such that it can be passed to -// sstable.Reader. It supports concurrent use, and does so without locking in -// contrast to the heavier read/write vfs.MemFile. -type memReader struct { - b []byte - r *bytes.Reader - rh objstorage.NoopReadHandle -} - -var _ objstorage.Readable = (*memReader)(nil) - -func newMemReader(b []byte) *memReader { - r := &memReader{ - b: b, - r: bytes.NewReader(b), - } - r.rh = objstorage.MakeNoopReadHandle(r) - return r -} - -// ReadAt is part of objstorage.Readable. -func (m *memReader) ReadAt(_ context.Context, p []byte, off int64) error { - n, err := m.r.ReadAt(p, off) - if invariants.Enabled && err == nil && n != len(p) { - panic("short read") - } - return err -} - -// Close is part of objstorage.Readable. -func (*memReader) Close() error { - return nil -} - -// Stat is part of objstorage.Readable. -func (m *memReader) Size() int64 { - return int64(len(m.b)) -} - -// NewReadHandle is part of objstorage.Readable. 
-func (m *memReader) NewReadHandle(_ context.Context) objstorage.ReadHandle { - return &m.rh -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/unsafe.go b/vendor/github.com/cockroachdb/pebble/sstable/unsafe.go deleted file mode 100644 index 11ec068..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/unsafe.go +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package sstable - -import ( - "unsafe" - - "github.com/cockroachdb/pebble/internal/manual" -) - -func getBytes(ptr unsafe.Pointer, length int) []byte { - return (*[manual.MaxArrayLen]byte)(ptr)[:length:length] -} - -func decodeVarint(ptr unsafe.Pointer) (uint32, unsafe.Pointer) { - if a := *((*uint8)(ptr)); a < 128 { - return uint32(a), - unsafe.Pointer(uintptr(ptr) + 1) - } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { - return uint32(b)<<7 | uint32(a), - unsafe.Pointer(uintptr(ptr) + 2) - } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { - return uint32(c)<<14 | uint32(b)<<7 | uint32(a), - unsafe.Pointer(uintptr(ptr) + 3) - } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { - return uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a), - unsafe.Pointer(uintptr(ptr) + 4) - } else { - d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) - return uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a), - unsafe.Pointer(uintptr(ptr) + 5) - } -} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/value_block.go b/vendor/github.com/cockroachdb/pebble/sstable/value_block.go deleted file mode 100644 index 447348b..0000000 --- a/vendor/github.com/cockroachdb/pebble/sstable/value_block.go +++ /dev/null @@ -1,950 +0,0 @@ -// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package sstable - -import ( - "context" - "encoding/binary" - "io" - "sync" - "unsafe" - - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing" - "golang.org/x/exp/rand" -) - -// Value blocks are supported in TableFormatPebblev3. -// -// 1. Motivation and overview -// -// Value blocks are a mechanism designed for sstables storing MVCC data, where -// there can be many versions of a key that need to be kept, but only the -// latest value is typically read (see the documentation for Comparer.Split -// regarding MVCC keys). The goal is faster reads. Unlike Pebble versions, -// which can be eagerly thrown away (except when there are snapshots), MVCC -// versions are long-lived (e.g. default CockroachDB garbage collection -// threshold for older versions is 24 hours) and can significantly slow down -// reads. We have seen CockroachDB production workloads with very slow reads -// due to: -// - 100s of versions for each key in a table. -// -// - Tables with mostly MVCC garbage consisting of 2 versions per key -- a -// real key-value pair, followed by a key-value pair whose value (usually -// with zero byte length) indicates it is an MVCC tombstone. -// -// The value blocks mechanism attempts to improve read throughput in these -// cases when the key size is smaller than the value sizes of older versions. -// This is done by moving the value of an older version to a value block in a -// different part of the sstable. This improves spatial locality of the data -// being read by the workload, which increases caching effectiveness. -// -// Additionally, even when the key size is not smaller than the value of older -// versions (e.g. 
secondary indexes in CockroachDB), TableFormatPebblev3 -// stores the result of key comparisons done at write time inside the sstable, -// which makes stepping from one key prefix to the next prefix (i.e., skipping -// over older versions of a MVCC key) more efficient by avoiding key -// comparisons and key decoding. See the results in -// https://github.com/cockroachdb/pebble/pull/2149 and more details in the -// comment inside BenchmarkIteratorScanNextPrefix. These improvements are also -// visible in end-to-end CockroachDB tests, as outlined in -// https://github.com/cockroachdb/cockroach/pull/96652. -// -// In TableFormatPebblev3, each SET has a one byte value prefix that tells us -// whether the value is in-place or in a value block. This 1 byte prefix -// encodes additional information: -// -// - ShortAttribute: This is an attribute of the value. Currently, CockroachDB -// uses it to represent whether the value is a tombstone or not. This avoids -// the need to fetch a value from the value block if the caller only wants -// to figure out whether it is an MVCC tombstone. The length of the value is -// another attribute that the caller can be interested in, and it is also -// accessible without reading the value in the value block (see the value -// handle in the details section). -// -// - SET-same-prefix: this enables the aforementioned optimization when -// stepping from one key prefix to the next key prefix. -// -// We further optimize this iteration over prefixes by using the restart -// points in a block to encode whether the SET at a restart point has the same -// prefix since the last restart point. This allows us to skip over restart -// points within the same block. See the comment in blockWriter, and how both -// SET-same-prefix and the restart point information is used in -// blockIter.nextPrefixV3. -// -// This flexibility of values that are in-place or in value blocks requires -// flexibility in the iterator interface. 
The InternalIterator interface -// returns a LazyValue instead of a byte slice. Additionally, pebble.Iterator -// allows the caller to ask for a LazyValue. See lazy_value.go for details, -// including the memory lifetime management. -// -// For historical discussions about this feature, see the issue -// https://github.com/cockroachdb/pebble/issues/1170 and the prototype in -// https://github.com/cockroachdb/pebble/pull/1443. -// -// The code in this file mainly covers value block and related encodings. We -// discuss these in the next section. -// -// 2. Details -// -// Note that the notion of the latest value is local to the sstable. It is -// possible that that latest value has been deleted by a sstable in a higher -// level, and what is the latest value from the perspective of the whole LSM -// is an older MVCC version. This only affects performance and not -// correctness. This local knowledge is also why we continue to store these -// older versions in the same sstable -- we need to be able to conveniently -// read them. The code in this file is agnostic to the policy regarding what -// should be stored in value blocks -- it allows even the latest MVCC version -// to be stored in a value block. The policy decision in made in the -// sstable.Writer. See Writer.makeAddPointDecisionV3. -// -// Data blocks contain two kinds of SET keys: those with in-place values and -// those with a value handle. To distinguish these two cases we use a single -// byte prefix (valuePrefix). This single byte prefix is split into multiple -// parts, where nb represents information that is encoded in n bits. -// -// +---------------+--------------------+-----------+--------------------+ -// | value-kind 2b | SET-same-prefix 1b | unused 2b | short-attribute 3b | -// +---------------+--------------------+-----------+--------------------+ -// -// The 2 bit value-kind specifies whether this is an in-place value or a value -// handle pointing to a value block. 
We use 2 bits here for future -// representation of values that are in separate files. The 1 bit -// SET-same-prefix is true if this key is a SET and is immediately preceded by -// a SET that shares the same prefix. The 3 bit short-attribute is described -// in base.ShortAttribute -- it stores user-defined attributes about the -// value. It is unused for in-place values. -// -// Value Handle and Value Blocks: -// valueHandles refer to values in value blocks. Value blocks are simpler than -// normal data blocks (that contain key-value pairs, and allow for binary -// search), which makes them cheap for value retrieval purposes. A valueHandle -// is a tuple (valueLen, blockNum, offsetInBlock), where blockNum is the 0 -// indexed value block number and offsetInBlock is the byte offset in that -// block containing the value. The valueHandle.valueLen is included since -// there are multiple use cases in CockroachDB that need the value length but -// not the value, for which we can avoid reading the value in the value block -// (see -// https://github.com/cockroachdb/pebble/issues/1170#issuecomment-958203245). -// -// A value block has a checksum like other blocks, and is optionally -// compressed. An uncompressed value block is a sequence of values with no -// separator or length (we rely on the valueHandle to demarcate). The -// valueHandle.offsetInBlock points to the value, of length -// valueHandle.valueLen. While writing a sstable, all the (possibly -// compressed) value blocks need to be held in-memory until they can be -// written. Value blocks are placed after the "meta rangedel" and "meta range -// key" blocks since value blocks are considered less likely to be read. 
-// -// Meta Value Index Block: -// Since the (key, valueHandle) pair are written before there is any knowledge -// of the byte offset of the value block in the file, or its compressed -// length, we need another lookup to map the valueHandle.blockNum to the -// information needed to read it from the file. This information is provided -// by the "value index block". The "value index block" is referred to by the -// metaindex block. The design intentionally avoids making the "value index -// block" a general purpose key-value block, since each caller wants to lookup -// the information for a particular blockNum (there is no need for SeekGE -// etc.). Instead, this index block stores a sequence of (blockNum, -// blockOffset, blockLength) tuples, where the blockNums are consecutive -// integers, and the tuples are encoded with a fixed width encoding. This -// allows a reader to find the tuple for block K by looking at the offset -// K*fixed-width. The fixed width for each field is decided by looking at the -// maximum value of each of these fields. As a concrete example of a large -// sstable with many value blocks, we constructed a 100MB sstable with many -// versions and had 2475 value blocks (~32KB each). This sstable had this -// tuple encoded using 2+4+2=8 bytes, which means the uncompressed value index -// block was 2475*8=~19KB, which is modest. Therefore, we don't support more -// than one value index block. Consider the example of 2 byte blockNum, 4 byte -// blockOffset and 2 byte blockLen. 
The value index block will look like: -// -// +---------------+------------------+---------------+ -// | blockNum (2B) | blockOffset (4B) | blockLen (2B) | -// +---------------+------------------+---------------+ -// | 0 | 7,123,456 | 30,000 | -// +---------------+------------------+---------------+ -// | 1 | 7,153,456 | 20,000 | -// +---------------+------------------+---------------+ -// | 2 | 7,173,456 | 25,567 | -// +---------------+------------------+---------------+ -// | .... | ... | ... | -// -// -// The metaindex block contains the valueBlocksIndexHandle which in addition -// to the BlockHandle also specifies the widths of these tuple fields. In the -// above example, the -// valueBlockIndexHandle.{blockNumByteLength,blockOffsetByteLength,blockLengthByteLength} -// will be (2,4,2). - -// valueHandle is stored with a key when the value is in a value block. This -// handle is the pointer to that value. -type valueHandle struct { - valueLen uint32 - blockNum uint32 - offsetInBlock uint32 -} - -// valuePrefix is the single byte prefix for either the in-place value or the -// encoded valueHandle. It encoded multiple kinds of information. -type valuePrefix byte - -const ( - // 2 most-significant bits of valuePrefix encodes the value-kind. - valueKindMask valuePrefix = '\xC0' - valueKindIsValueHandle valuePrefix = '\x80' - valueKindIsInPlaceValue valuePrefix = '\x00' - - // 1 bit indicates SET has same key prefix as immediately preceding key that - // is also a SET. If the immediately preceding key in the same block is a - // SET, AND this bit is 0, the prefix must have changed. - // - // Note that the current policy of only storing older MVCC versions in value - // blocks means that valueKindIsValueHandle => SET has same prefix. But no - // code should rely on this behavior. Also, SET has same prefix does *not* - // imply valueKindIsValueHandle. 
- setHasSameKeyPrefixMask valuePrefix = '\x20' - - // 3 least-significant bits for the user-defined base.ShortAttribute. - // Undefined for valueKindIsInPlaceValue. - userDefinedShortAttributeMask valuePrefix = '\x07' -) - -// valueHandle fields are varint encoded, so maximum 5 bytes each, plus 1 byte -// for the valuePrefix. This could alternatively be group varint encoded, but -// experiments were inconclusive -// (https://github.com/cockroachdb/pebble/pull/1443#issuecomment-1270298802). -const valueHandleMaxLen = 5*3 + 1 - -// Assert blockHandleLikelyMaxLen >= valueHandleMaxLen. -const _ = uint(blockHandleLikelyMaxLen - valueHandleMaxLen) - -func encodeValueHandle(dst []byte, v valueHandle) int { - n := 0 - n += binary.PutUvarint(dst[n:], uint64(v.valueLen)) - n += binary.PutUvarint(dst[n:], uint64(v.blockNum)) - n += binary.PutUvarint(dst[n:], uint64(v.offsetInBlock)) - return n -} - -func makePrefixForValueHandle(setHasSameKeyPrefix bool, attribute base.ShortAttribute) valuePrefix { - prefix := valueKindIsValueHandle | valuePrefix(attribute) - if setHasSameKeyPrefix { - prefix = prefix | setHasSameKeyPrefixMask - } - return prefix -} - -func makePrefixForInPlaceValue(setHasSameKeyPrefix bool) valuePrefix { - prefix := valueKindIsInPlaceValue - if setHasSameKeyPrefix { - prefix = prefix | setHasSameKeyPrefixMask - } - return prefix -} - -func isValueHandle(b valuePrefix) bool { - return b&valueKindMask == valueKindIsValueHandle -} - -// REQUIRES: isValueHandle(b) -func getShortAttribute(b valuePrefix) base.ShortAttribute { - return base.ShortAttribute(b & userDefinedShortAttributeMask) -} - -func setHasSamePrefix(b valuePrefix) bool { - return b&setHasSameKeyPrefixMask == setHasSameKeyPrefixMask -} - -func decodeLenFromValueHandle(src []byte) (uint32, []byte) { - ptr := unsafe.Pointer(&src[0]) - var v uint32 - if a := *((*uint8)(ptr)); a < 128 { - v = uint32(a) - src = src[1:] - } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { 
- v = uint32(b)<<7 | uint32(a) - src = src[2:] - } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { - v = uint32(c)<<14 | uint32(b)<<7 | uint32(a) - src = src[3:] - } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { - v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - src = src[4:] - } else { - d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) - v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - src = src[5:] - } - return v, src -} - -func decodeRemainingValueHandle(src []byte) valueHandle { - var vh valueHandle - ptr := unsafe.Pointer(&src[0]) - // Manually inlined uvarint decoding. Saves ~25% in benchmarks. Unrolling - // a loop for i:=0; i<2; i++, saves ~6%. - var v uint32 - if a := *((*uint8)(ptr)); a < 128 { - v = uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 1) - } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { - v = uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 2) - } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { - v = uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 3) - } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { - v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 4) - } else { - d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) - v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - ptr = unsafe.Pointer(uintptr(ptr) + 5) - } - vh.blockNum = v - - if a := *((*uint8)(ptr)); a < 128 { - v = uint32(a) - } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { - v = uint32(b)<<7 | uint32(a) - } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { - v = uint32(c)<<14 | uint32(b)<<7 | uint32(a) - } else if c, d := c&0x7f, 
*((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { - v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - } else { - d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) - v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - } - vh.offsetInBlock = v - - return vh -} - -func decodeValueHandle(src []byte) valueHandle { - valLen, src := decodeLenFromValueHandle(src) - vh := decodeRemainingValueHandle(src) - vh.valueLen = valLen - return vh -} - -// valueBlocksIndexHandle is placed in the metaindex if there are any value -// blocks. If there are no value blocks, there is no value blocks index, and -// no entry in the metaindex. Note that the lack of entry in the metaindex -// should not be used to ascertain whether the values are prefixed, since the -// former is an emergent property of the data that was written and not known -// until all the key-value pairs in the sstable are written. -type valueBlocksIndexHandle struct { - h BlockHandle - blockNumByteLength uint8 - blockOffsetByteLength uint8 - blockLengthByteLength uint8 -} - -const valueBlocksIndexHandleMaxLen = blockHandleMaxLenWithoutProperties + 3 - -// Assert blockHandleLikelyMaxLen >= valueBlocksIndexHandleMaxLen. 
-const _ = uint(blockHandleLikelyMaxLen - valueBlocksIndexHandleMaxLen) - -func encodeValueBlocksIndexHandle(dst []byte, v valueBlocksIndexHandle) int { - n := encodeBlockHandle(dst, v.h) - dst[n] = v.blockNumByteLength - n++ - dst[n] = v.blockOffsetByteLength - n++ - dst[n] = v.blockLengthByteLength - n++ - return n -} - -func decodeValueBlocksIndexHandle(src []byte) (valueBlocksIndexHandle, int, error) { - var vbih valueBlocksIndexHandle - var n int - vbih.h, n = decodeBlockHandle(src) - if n <= 0 { - return vbih, 0, errors.Errorf("bad BlockHandle %x", src) - } - if len(src) != n+3 { - return vbih, 0, errors.Errorf("bad BlockHandle %x", src) - } - vbih.blockNumByteLength = src[n] - vbih.blockOffsetByteLength = src[n+1] - vbih.blockLengthByteLength = src[n+2] - return vbih, n + 3, nil -} - -type valueBlocksAndIndexStats struct { - numValueBlocks uint64 - numValuesInValueBlocks uint64 - // Includes both value blocks and value index block. - valueBlocksAndIndexSize uint64 -} - -// valueBlockWriter writes a sequence of value blocks, and the value blocks -// index, for a sstable. -type valueBlockWriter struct { - // The configured uncompressed block size and size threshold - blockSize, blockSizeThreshold int - // Configured compression. - compression Compression - // checksummer with configured checksum type. - checksummer checksummer - // Block finished callback. - blockFinishedFunc func(compressedSize int) - - // buf is the current block being written to (uncompressed). - buf *blockBuffer - // compressedBuf is used for compressing the block. - compressedBuf *blockBuffer - // Sequence of blocks that are finished. - blocks []blockAndHandle - // Cumulative value block bytes written so far. - totalBlockBytes uint64 - numValues uint64 -} - -type blockAndHandle struct { - block *blockBuffer - handle BlockHandle - compressed bool -} - -type blockBuffer struct { - b []byte -} - -// Pool of block buffers that should be roughly the blockSize. 
-var uncompressedValueBlockBufPool = sync.Pool{ - New: func() interface{} { - return &blockBuffer{} - }, -} - -// Pool of block buffers for compressed value blocks. These may widely vary in -// size based on compression ratios. -var compressedValueBlockBufPool = sync.Pool{ - New: func() interface{} { - return &blockBuffer{} - }, -} - -func releaseToValueBlockBufPool(pool *sync.Pool, b *blockBuffer) { - // Don't pool buffers larger than 128KB, in case we had some rare large - // values. - if len(b.b) > 128*1024 { - return - } - if invariants.Enabled { - // Set the bytes to a random value. Cap the number of bytes being - // randomized to prevent test timeouts. - length := cap(b.b) - if length > 1000 { - length = 1000 - } - b.b = b.b[:length:length] - rand.Read(b.b) - } - pool.Put(b) -} - -var valueBlockWriterPool = sync.Pool{ - New: func() interface{} { - return &valueBlockWriter{} - }, -} - -func newValueBlockWriter( - blockSize int, - blockSizeThreshold int, - compression Compression, - checksumType ChecksumType, - // compressedSize should exclude the block trailer. 
- blockFinishedFunc func(compressedSize int), -) *valueBlockWriter { - w := valueBlockWriterPool.Get().(*valueBlockWriter) - *w = valueBlockWriter{ - blockSize: blockSize, - blockSizeThreshold: blockSizeThreshold, - compression: compression, - checksummer: checksummer{ - checksumType: checksumType, - }, - blockFinishedFunc: blockFinishedFunc, - buf: uncompressedValueBlockBufPool.Get().(*blockBuffer), - compressedBuf: compressedValueBlockBufPool.Get().(*blockBuffer), - blocks: w.blocks[:0], - } - w.buf.b = w.buf.b[:0] - w.compressedBuf.b = w.compressedBuf.b[:0] - return w -} - -func releaseValueBlockWriter(w *valueBlockWriter) { - for i := range w.blocks { - if w.blocks[i].compressed { - releaseToValueBlockBufPool(&compressedValueBlockBufPool, w.blocks[i].block) - } else { - releaseToValueBlockBufPool(&uncompressedValueBlockBufPool, w.blocks[i].block) - } - w.blocks[i].block = nil - } - if w.buf != nil { - releaseToValueBlockBufPool(&uncompressedValueBlockBufPool, w.buf) - } - if w.compressedBuf != nil { - releaseToValueBlockBufPool(&compressedValueBlockBufPool, w.compressedBuf) - } - *w = valueBlockWriter{ - blocks: w.blocks[:0], - } - valueBlockWriterPool.Put(w) -} - -func (w *valueBlockWriter) addValue(v []byte) (valueHandle, error) { - if invariants.Enabled && len(v) == 0 { - return valueHandle{}, errors.Errorf("cannot write empty value to value block") - } - w.numValues++ - blockLen := len(w.buf.b) - valueLen := len(v) - if blockLen >= w.blockSize || - (blockLen > w.blockSizeThreshold && blockLen+valueLen > w.blockSize) { - // Block is not currently empty and adding this value will become too big, - // so finish this block. 
- w.compressAndFlush() - blockLen = len(w.buf.b) - if invariants.Enabled && blockLen != 0 { - panic("blockLen of new block should be 0") - } - } - vh := valueHandle{ - valueLen: uint32(valueLen), - blockNum: uint32(len(w.blocks)), - offsetInBlock: uint32(blockLen), - } - blockLen = int(vh.offsetInBlock + vh.valueLen) - if cap(w.buf.b) < blockLen { - size := 2 * cap(w.buf.b) - if size < 1024 { - size = 1024 - } - for size < blockLen { - size *= 2 - } - buf := make([]byte, blockLen, size) - _ = copy(buf, w.buf.b) - w.buf.b = buf - } else { - w.buf.b = w.buf.b[:blockLen] - } - buf := w.buf.b[vh.offsetInBlock:] - n := copy(buf, v) - if n != len(buf) { - panic("incorrect length computation") - } - return vh, nil -} - -func (w *valueBlockWriter) compressAndFlush() { - // Compress the buffer, discarding the result if the improvement isn't at - // least 12.5%. - blockType := noCompressionBlockType - b := w.buf - if w.compression != NoCompression { - blockType, w.compressedBuf.b = - compressBlock(w.compression, w.buf.b, w.compressedBuf.b[:cap(w.compressedBuf.b)]) - if len(w.compressedBuf.b) < len(w.buf.b)-len(w.buf.b)/8 { - b = w.compressedBuf - } else { - blockType = noCompressionBlockType - } - } - n := len(b.b) - if n+blockTrailerLen > cap(b.b) { - block := make([]byte, n+blockTrailerLen) - copy(block, b.b) - b.b = block - } else { - b.b = b.b[:n+blockTrailerLen] - } - b.b[n] = byte(blockType) - w.computeChecksum(b.b) - bh := BlockHandle{Offset: w.totalBlockBytes, Length: uint64(n)} - w.totalBlockBytes += uint64(len(b.b)) - // blockFinishedFunc length excludes the block trailer. - w.blockFinishedFunc(n) - compressed := blockType != noCompressionBlockType - w.blocks = append(w.blocks, blockAndHandle{ - block: b, - handle: bh, - compressed: compressed, - }) - // Handed off a buffer to w.blocks, so need get a new one. 
- if compressed { - w.compressedBuf = compressedValueBlockBufPool.Get().(*blockBuffer) - } else { - w.buf = uncompressedValueBlockBufPool.Get().(*blockBuffer) - } - w.buf.b = w.buf.b[:0] -} - -func (w *valueBlockWriter) computeChecksum(block []byte) { - n := len(block) - blockTrailerLen - checksum := w.checksummer.checksum(block[:n], block[n:n+1]) - binary.LittleEndian.PutUint32(block[n+1:], checksum) -} - -func (w *valueBlockWriter) finish( - writer io.Writer, fileOffset uint64, -) (valueBlocksIndexHandle, valueBlocksAndIndexStats, error) { - if len(w.buf.b) > 0 { - w.compressAndFlush() - } - n := len(w.blocks) - if n == 0 { - return valueBlocksIndexHandle{}, valueBlocksAndIndexStats{}, nil - } - largestOffset := uint64(0) - largestLength := uint64(0) - for i := range w.blocks { - _, err := writer.Write(w.blocks[i].block.b) - if err != nil { - return valueBlocksIndexHandle{}, valueBlocksAndIndexStats{}, err - } - w.blocks[i].handle.Offset += fileOffset - largestOffset = w.blocks[i].handle.Offset - if largestLength < w.blocks[i].handle.Length { - largestLength = w.blocks[i].handle.Length - } - } - vbihOffset := fileOffset + w.totalBlockBytes - - vbih := valueBlocksIndexHandle{ - h: BlockHandle{ - Offset: vbihOffset, - }, - blockNumByteLength: uint8(lenLittleEndian(uint64(n - 1))), - blockOffsetByteLength: uint8(lenLittleEndian(largestOffset)), - blockLengthByteLength: uint8(lenLittleEndian(largestLength)), - } - var err error - if vbih, err = w.writeValueBlocksIndex(writer, vbih); err != nil { - return valueBlocksIndexHandle{}, valueBlocksAndIndexStats{}, err - } - stats := valueBlocksAndIndexStats{ - numValueBlocks: uint64(n), - numValuesInValueBlocks: w.numValues, - valueBlocksAndIndexSize: w.totalBlockBytes + vbih.h.Length + blockTrailerLen, - } - return vbih, stats, err -} - -func (w *valueBlockWriter) writeValueBlocksIndex( - writer io.Writer, h valueBlocksIndexHandle, -) (valueBlocksIndexHandle, error) { - blockLen := - 
int(h.blockNumByteLength+h.blockOffsetByteLength+h.blockLengthByteLength) * len(w.blocks) - h.h.Length = uint64(blockLen) - blockLen += blockTrailerLen - var buf []byte - if cap(w.buf.b) < blockLen { - buf = make([]byte, blockLen) - w.buf.b = buf - } else { - buf = w.buf.b[:blockLen] - } - b := buf - for i := range w.blocks { - littleEndianPut(uint64(i), b, int(h.blockNumByteLength)) - b = b[int(h.blockNumByteLength):] - littleEndianPut(w.blocks[i].handle.Offset, b, int(h.blockOffsetByteLength)) - b = b[int(h.blockOffsetByteLength):] - littleEndianPut(w.blocks[i].handle.Length, b, int(h.blockLengthByteLength)) - b = b[int(h.blockLengthByteLength):] - } - if len(b) != blockTrailerLen { - panic("incorrect length calculation") - } - b[0] = byte(noCompressionBlockType) - w.computeChecksum(buf) - if _, err := writer.Write(buf); err != nil { - return valueBlocksIndexHandle{}, err - } - return h, nil -} - -// littleEndianPut writes v to b using little endian encoding, under the -// assumption that v can be represented using n bytes. -func littleEndianPut(v uint64, b []byte, n int) { - _ = b[n-1] // bounds check - for i := 0; i < n; i++ { - b[i] = byte(v) - v = v >> 8 - } -} - -// lenLittleEndian returns the minimum number of bytes needed to encode v -// using little endian encoding. -func lenLittleEndian(v uint64) int { - n := 0 - for i := 0; i < 8; i++ { - n++ - v = v >> 8 - if v == 0 { - break - } - } - return n -} - -func littleEndianGet(b []byte, n int) uint64 { - _ = b[n-1] // bounds check - v := uint64(b[0]) - for i := 1; i < n; i++ { - v |= uint64(b[i]) << (8 * i) - } - return v -} - -// UserKeyPrefixBound represents a [Lower,Upper) bound of user key prefixes. -// If both are nil, there is no bound specified. Else, Compare(Lower,Upper) -// must be < 0. -type UserKeyPrefixBound struct { - // Lower is a lower bound user key prefix. - Lower []byte - // Upper is an upper bound user key prefix. - Upper []byte -} - -// IsEmpty returns true iff the bound is empty. 
-func (ukb *UserKeyPrefixBound) IsEmpty() bool { - return len(ukb.Lower) == 0 && len(ukb.Upper) == 0 -} - -type blockProviderWhenOpen interface { - readBlockForVBR( - ctx context.Context, h BlockHandle, stats *base.InternalIteratorStats, - ) (bufferHandle, error) -} - -type blockProviderWhenClosed struct { - rp ReaderProvider - r *Reader -} - -func (bpwc *blockProviderWhenClosed) open() error { - var err error - bpwc.r, err = bpwc.rp.GetReader() - return err -} - -func (bpwc *blockProviderWhenClosed) close() { - bpwc.rp.Close() - bpwc.r = nil -} - -func (bpwc blockProviderWhenClosed) readBlockForVBR( - ctx context.Context, h BlockHandle, stats *base.InternalIteratorStats, -) (bufferHandle, error) { - ctx = objiotracing.WithBlockType(ctx, objiotracing.ValueBlock) - // TODO(jackson,sumeer): Consider whether to use a buffer pool in this case. - // The bpwc is not allowed to outlive the iterator tree, so it cannot - // outlive the buffer pool. - return bpwc.r.readBlock(ctx, h, nil, nil, stats, nil /* buffer pool */) -} - -// ReaderProvider supports the implementation of blockProviderWhenClosed. -// GetReader and Close can be called multiple times in pairs. -type ReaderProvider interface { - GetReader() (r *Reader, err error) - Close() -} - -// TrivialReaderProvider implements ReaderProvider for a Reader that will -// outlive the top-level iterator in the iterator tree. -type TrivialReaderProvider struct { - *Reader -} - -var _ ReaderProvider = TrivialReaderProvider{} - -// GetReader implements ReaderProvider. -func (trp TrivialReaderProvider) GetReader() (*Reader, error) { - return trp.Reader, nil -} - -// Close implements ReaderProvider. -func (trp TrivialReaderProvider) Close() {} - -// valueBlockReader is used to retrieve values in value -// blocks. It is used when the sstable was written with -// Properties.ValueBlocksAreEnabled. 
-type valueBlockReader struct { - ctx context.Context - bpOpen blockProviderWhenOpen - rp ReaderProvider - vbih valueBlocksIndexHandle - stats *base.InternalIteratorStats - - // The value blocks index is lazily retrieved the first time the reader - // needs to read a value that resides in a value block. - vbiBlock []byte - vbiCache bufferHandle - // When sequentially iterating through all key-value pairs, the cost of - // repeatedly getting a block that is already in the cache and releasing the - // bufferHandle can be ~40% of the cpu overhead. So the reader remembers the - // last value block it retrieved, in case there is locality of access, and - // this value block can be used for the next value retrieval. - valueBlockNum uint32 - valueBlock []byte - valueBlockPtr unsafe.Pointer - valueCache bufferHandle - lazyFetcher base.LazyFetcher - closed bool - bufToMangle []byte -} - -func (r *valueBlockReader) getLazyValueForPrefixAndValueHandle(handle []byte) base.LazyValue { - fetcher := &r.lazyFetcher - valLen, h := decodeLenFromValueHandle(handle[1:]) - *fetcher = base.LazyFetcher{ - Fetcher: r, - Attribute: base.AttributeAndLen{ - ValueLen: int32(valLen), - ShortAttribute: getShortAttribute(valuePrefix(handle[0])), - }, - } - if r.stats != nil { - r.stats.SeparatedPointValue.Count++ - r.stats.SeparatedPointValue.ValueBytes += uint64(valLen) - } - return base.LazyValue{ - ValueOrHandle: h, - Fetcher: fetcher, - } -} - -func (r *valueBlockReader) close() { - r.bpOpen = nil - r.vbiBlock = nil - r.vbiCache.Release() - // Set the handle to empty since Release does not nil the Handle.value. If - // we were to reopen this valueBlockReader and retrieve the same - // Handle.value from the cache, we don't want to accidentally unref it when - // attempting to unref the old handle. - r.vbiCache = bufferHandle{} - r.valueBlock = nil - r.valueBlockPtr = nil - r.valueCache.Release() - // See comment above. 
- r.valueCache = bufferHandle{} - r.closed = true - // rp, vbih, stats remain valid, so that LazyFetcher.ValueFetcher can be - // implemented. -} - -// Fetch implements base.ValueFetcher. -func (r *valueBlockReader) Fetch( - handle []byte, valLen int32, buf []byte, -) (val []byte, callerOwned bool, err error) { - if !r.closed { - val, err := r.getValueInternal(handle, valLen) - if invariants.Enabled { - val = r.doValueMangling(val) - } - return val, false, err - } - - bp := blockProviderWhenClosed{rp: r.rp} - err = bp.open() - if err != nil { - return nil, false, err - } - defer bp.close() - defer r.close() - r.bpOpen = bp - var v []byte - v, err = r.getValueInternal(handle, valLen) - if err != nil { - return nil, false, err - } - buf = append(buf[:0], v...) - return buf, true, nil -} - -// doValueMangling attempts to uncover violations of the contract listed in -// the declaration comment of LazyValue. It is expensive, hence only called -// when invariants.Enabled. -func (r *valueBlockReader) doValueMangling(v []byte) []byte { - // Randomly set the bytes in the previous retrieved value to 0, since - // property P1 only requires the valueBlockReader to maintain the memory of - // one fetched value. - if rand.Intn(2) == 0 { - for i := range r.bufToMangle { - r.bufToMangle[i] = 0 - } - } - // Store the current value in a new buffer for future mangling. - r.bufToMangle = append([]byte(nil), v...) 
- return r.bufToMangle -} - -func (r *valueBlockReader) getValueInternal(handle []byte, valLen int32) (val []byte, err error) { - vh := decodeRemainingValueHandle(handle) - vh.valueLen = uint32(valLen) - if r.vbiBlock == nil { - ch, err := r.bpOpen.readBlockForVBR(r.ctx, r.vbih.h, r.stats) - if err != nil { - return nil, err - } - r.vbiCache = ch - r.vbiBlock = ch.Get() - } - if r.valueBlock == nil || r.valueBlockNum != vh.blockNum { - vbh, err := r.getBlockHandle(vh.blockNum) - if err != nil { - return nil, err - } - vbCacheHandle, err := r.bpOpen.readBlockForVBR(r.ctx, vbh, r.stats) - if err != nil { - return nil, err - } - r.valueBlockNum = vh.blockNum - r.valueCache.Release() - r.valueCache = vbCacheHandle - r.valueBlock = vbCacheHandle.Get() - r.valueBlockPtr = unsafe.Pointer(&r.valueBlock[0]) - } - if r.stats != nil { - r.stats.SeparatedPointValue.ValueBytesFetched += uint64(valLen) - } - return r.valueBlock[vh.offsetInBlock : vh.offsetInBlock+vh.valueLen], nil -} - -func (r *valueBlockReader) getBlockHandle(blockNum uint32) (BlockHandle, error) { - indexEntryLen := - int(r.vbih.blockNumByteLength + r.vbih.blockOffsetByteLength + r.vbih.blockLengthByteLength) - offsetInIndex := indexEntryLen * int(blockNum) - if len(r.vbiBlock) < offsetInIndex+indexEntryLen { - return BlockHandle{}, errors.Errorf( - "cannot read at offset %d and length %d from block of length %d", - offsetInIndex, indexEntryLen, len(r.vbiBlock)) - } - b := r.vbiBlock[offsetInIndex : offsetInIndex+indexEntryLen] - n := int(r.vbih.blockNumByteLength) - bn := littleEndianGet(b, n) - if uint32(bn) != blockNum { - return BlockHandle{}, - errors.Errorf("expected block num %d but found %d", blockNum, bn) - } - b = b[n:] - n = int(r.vbih.blockOffsetByteLength) - blockOffset := littleEndianGet(b, n) - b = b[n:] - n = int(r.vbih.blockLengthByteLength) - blockLen := littleEndianGet(b, n) - return BlockHandle{Offset: blockOffset, Length: blockLen}, nil -} diff --git 
a/vendor/github.com/cockroachdb/pebble/table_cache.go b/vendor/github.com/cockroachdb/pebble/table_cache.go deleted file mode 100644 index 516e5e8..0000000 --- a/vendor/github.com/cockroachdb/pebble/table_cache.go +++ /dev/null @@ -1,1195 +0,0 @@ -// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use -// of this source code is governed by a BSD-style license that can be found in -// the LICENSE file. - -package pebble - -import ( - "bytes" - "context" - "fmt" - "io" - "runtime/debug" - "runtime/pprof" - "sync" - "sync/atomic" - "unsafe" - - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" - "github.com/cockroachdb/pebble/internal/private" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing" - "github.com/cockroachdb/pebble/sstable" -) - -var emptyIter = &errorIter{err: nil} -var emptyKeyspanIter = &errorKeyspanIter{err: nil} - -// filteredAll is a singleton internalIterator implementation used when an -// sstable does contain point keys, but all the keys are filtered by the active -// PointKeyFilters set in the iterator's IterOptions. -// -// filteredAll implements filteredIter, ensuring the level iterator recognizes -// when it may need to return file boundaries to keep the rangeDelIter open -// during mergingIter operation. -var filteredAll = &filteredAllKeysIter{errorIter: errorIter{err: nil}} - -var _ filteredIter = filteredAll - -type filteredAllKeysIter struct { - errorIter -} - -func (s *filteredAllKeysIter) MaybeFilteredKeys() bool { - return true -} - -var tableCacheLabels = pprof.Labels("pebble", "table-cache") - -// tableCacheOpts contains the db specific fields -// of a table cache. This is stored in the tableCacheContainer -// along with the table cache. 
-// NB: It is important to make sure that the fields in this -// struct are read-only. Since the fields here are shared -// by every single tableCacheShard, if non read-only fields -// are updated, we could have unnecessary evictions of those -// fields, and the surrounding fields from the CPU caches. -type tableCacheOpts struct { - // iterCount keeps track of how many iterators are open. It is used to keep - // track of leaked iterators on a per-db level. - iterCount *atomic.Int32 - - loggerAndTracer LoggerAndTracer - cacheID uint64 - objProvider objstorage.Provider - opts sstable.ReaderOptions - filterMetrics *sstable.FilterMetricsTracker -} - -// tableCacheContainer contains the table cache and -// fields which are unique to the DB. -type tableCacheContainer struct { - tableCache *TableCache - - // dbOpts contains fields relevant to the table cache - // which are unique to each DB. - dbOpts tableCacheOpts -} - -// newTableCacheContainer will panic if the underlying cache in the table cache -// doesn't match Options.Cache. -func newTableCacheContainer( - tc *TableCache, cacheID uint64, objProvider objstorage.Provider, opts *Options, size int, -) *tableCacheContainer { - // We will release a ref to table cache acquired here when tableCacheContainer.close is called. - if tc != nil { - if tc.cache != opts.Cache { - panic("pebble: underlying cache for the table cache and db are different") - } - tc.Ref() - } else { - // NewTableCache should create a ref to tc which the container should - // drop whenever it is closed. 
- tc = NewTableCache(opts.Cache, opts.Experimental.TableCacheShards, size) - } - - t := &tableCacheContainer{} - t.tableCache = tc - t.dbOpts.loggerAndTracer = opts.LoggerAndTracer - t.dbOpts.cacheID = cacheID - t.dbOpts.objProvider = objProvider - t.dbOpts.opts = opts.MakeReaderOptions() - t.dbOpts.filterMetrics = &sstable.FilterMetricsTracker{} - t.dbOpts.iterCount = new(atomic.Int32) - return t -} - -// Before calling close, make sure that there will be no further need -// to access any of the files associated with the store. -func (c *tableCacheContainer) close() error { - // We want to do some cleanup work here. Check for leaked iterators - // by the DB using this container. Note that we'll still perform cleanup - // below in the case that there are leaked iterators. - var err error - if v := c.dbOpts.iterCount.Load(); v > 0 { - err = errors.Errorf("leaked iterators: %d", errors.Safe(v)) - } - - // Release nodes here. - for _, shard := range c.tableCache.shards { - if shard != nil { - shard.removeDB(&c.dbOpts) - } - } - return firstError(err, c.tableCache.Unref()) -} - -func (c *tableCacheContainer) newIters( - ctx context.Context, - file *manifest.FileMetadata, - opts *IterOptions, - internalOpts internalIterOpts, -) (internalIterator, keyspan.FragmentIterator, error) { - return c.tableCache.getShard(file.FileBacking.DiskFileNum).newIters(ctx, file, opts, internalOpts, &c.dbOpts) -} - -func (c *tableCacheContainer) newRangeKeyIter( - file *manifest.FileMetadata, opts keyspan.SpanIterOptions, -) (keyspan.FragmentIterator, error) { - return c.tableCache.getShard(file.FileBacking.DiskFileNum).newRangeKeyIter(file, opts, &c.dbOpts) -} - -// getTableProperties returns the properties associated with the backing physical -// table if the input metadata belongs to a virtual sstable. 
-func (c *tableCacheContainer) getTableProperties(file *fileMetadata) (*sstable.Properties, error) { - return c.tableCache.getShard(file.FileBacking.DiskFileNum).getTableProperties(file, &c.dbOpts) -} - -func (c *tableCacheContainer) evict(fileNum base.DiskFileNum) { - c.tableCache.getShard(fileNum).evict(fileNum, &c.dbOpts, false) -} - -func (c *tableCacheContainer) metrics() (CacheMetrics, FilterMetrics) { - var m CacheMetrics - for i := range c.tableCache.shards { - s := c.tableCache.shards[i] - s.mu.RLock() - m.Count += int64(len(s.mu.nodes)) - s.mu.RUnlock() - m.Hits += s.hits.Load() - m.Misses += s.misses.Load() - } - m.Size = m.Count * int64(unsafe.Sizeof(sstable.Reader{})) - f := c.dbOpts.filterMetrics.Load() - return m, f -} - -func (c *tableCacheContainer) estimateSize( - meta *fileMetadata, lower, upper []byte, -) (size uint64, err error) { - if meta.Virtual { - err = c.withVirtualReader( - meta.VirtualMeta(), - func(r sstable.VirtualReader) (err error) { - size, err = r.EstimateDiskUsage(lower, upper) - return err - }, - ) - } else { - err = c.withReader( - meta.PhysicalMeta(), - func(r *sstable.Reader) (err error) { - size, err = r.EstimateDiskUsage(lower, upper) - return err - }, - ) - } - if err != nil { - return 0, err - } - return size, nil -} - -// createCommonReader creates a Reader for this file. isForeign, if true for -// virtual sstables, is passed into the vSSTable reader so its iterators can -// collapse obsolete points accordingly. -func createCommonReader( - v *tableCacheValue, file *fileMetadata, isForeign bool, -) sstable.CommonReader { - // TODO(bananabrick): We suffer an allocation if file is a virtual sstable. 
- var cr sstable.CommonReader = v.reader - if file.Virtual { - virtualReader := sstable.MakeVirtualReader( - v.reader, file.VirtualMeta(), isForeign, - ) - cr = &virtualReader - } - return cr -} - -func (c *tableCacheContainer) withCommonReader( - meta *fileMetadata, fn func(sstable.CommonReader) error, -) error { - s := c.tableCache.getShard(meta.FileBacking.DiskFileNum) - v := s.findNode(meta, &c.dbOpts) - defer s.unrefValue(v) - if v.err != nil { - return v.err - } - provider := c.dbOpts.objProvider - objMeta, err := provider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum) - if err != nil { - return err - } - return fn(createCommonReader(v, meta, provider.IsSharedForeign(objMeta))) -} - -func (c *tableCacheContainer) withReader(meta physicalMeta, fn func(*sstable.Reader) error) error { - s := c.tableCache.getShard(meta.FileBacking.DiskFileNum) - v := s.findNode(meta.FileMetadata, &c.dbOpts) - defer s.unrefValue(v) - if v.err != nil { - return v.err - } - return fn(v.reader) -} - -// withVirtualReader fetches a VirtualReader associated with a virtual sstable. -func (c *tableCacheContainer) withVirtualReader( - meta virtualMeta, fn func(sstable.VirtualReader) error, -) error { - s := c.tableCache.getShard(meta.FileBacking.DiskFileNum) - v := s.findNode(meta.FileMetadata, &c.dbOpts) - defer s.unrefValue(v) - if v.err != nil { - return v.err - } - provider := c.dbOpts.objProvider - objMeta, err := provider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum) - if err != nil { - return err - } - return fn(sstable.MakeVirtualReader(v.reader, meta, provider.IsSharedForeign(objMeta))) -} - -func (c *tableCacheContainer) iterCount() int64 { - return int64(c.dbOpts.iterCount.Load()) -} - -// TableCache is a shareable cache for open sstables. -type TableCache struct { - refs atomic.Int64 - - cache *Cache - shards []*tableCacheShard -} - -// Ref adds a reference to the table cache. 
Once tableCache.init returns, -// the table cache only remains valid if there is at least one reference -// to it. -func (c *TableCache) Ref() { - v := c.refs.Add(1) - // We don't want the reference count to ever go from 0 -> 1, - // cause a reference count of 0 implies that we've closed the cache. - if v <= 1 { - panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) - } -} - -// Unref removes a reference to the table cache. -func (c *TableCache) Unref() error { - v := c.refs.Add(-1) - switch { - case v < 0: - panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) - case v == 0: - var err error - for i := range c.shards { - // The cache shard is not allocated yet, nothing to close - if c.shards[i] == nil { - continue - } - err = firstError(err, c.shards[i].Close()) - } - - // Unref the cache which we create a reference to when the tableCache - // is first instantiated. - c.cache.Unref() - return err - } - return nil -} - -// NewTableCache will create a reference to the table cache. It is the callers responsibility -// to call tableCache.Unref if they will no longer hold a reference to the table cache. -func NewTableCache(cache *Cache, numShards int, size int) *TableCache { - if size == 0 { - panic("pebble: cannot create a table cache of size 0") - } else if numShards == 0 { - panic("pebble: cannot create a table cache with 0 shards") - } - - c := &TableCache{} - c.cache = cache - c.cache.Ref() - - c.shards = make([]*tableCacheShard, numShards) - for i := range c.shards { - c.shards[i] = &tableCacheShard{} - c.shards[i].init(size / len(c.shards)) - } - - // Hold a ref to the cache here. 
- c.refs.Store(1) - - return c -} - -func (c *TableCache) getShard(fileNum base.DiskFileNum) *tableCacheShard { - return c.shards[uint64(fileNum.FileNum())%uint64(len(c.shards))] -} - -type tableCacheKey struct { - cacheID uint64 - fileNum base.DiskFileNum -} - -type tableCacheShard struct { - hits atomic.Int64 - misses atomic.Int64 - iterCount atomic.Int32 - - size int - - mu struct { - sync.RWMutex - nodes map[tableCacheKey]*tableCacheNode - // The iters map is only created and populated in race builds. - iters map[io.Closer][]byte - - handHot *tableCacheNode - handCold *tableCacheNode - handTest *tableCacheNode - - coldTarget int - sizeHot int - sizeCold int - sizeTest int - } - releasing sync.WaitGroup - releasingCh chan *tableCacheValue - releaseLoopExit sync.WaitGroup -} - -func (c *tableCacheShard) init(size int) { - c.size = size - - c.mu.nodes = make(map[tableCacheKey]*tableCacheNode) - c.mu.coldTarget = size - c.releasingCh = make(chan *tableCacheValue, 100) - c.releaseLoopExit.Add(1) - go c.releaseLoop() - - if invariants.RaceEnabled { - c.mu.iters = make(map[io.Closer][]byte) - } -} - -func (c *tableCacheShard) releaseLoop() { - pprof.Do(context.Background(), tableCacheLabels, func(context.Context) { - defer c.releaseLoopExit.Done() - for v := range c.releasingCh { - v.release(c) - } - }) -} - -// checkAndIntersectFilters checks the specific table and block property filters -// for intersection with any available table and block-level properties. Returns -// true for ok if this table should be read by this iterator. 
-func (c *tableCacheShard) checkAndIntersectFilters( - v *tableCacheValue, - tableFilter func(userProps map[string]string) bool, - blockPropertyFilters []BlockPropertyFilter, - boundLimitedFilter sstable.BoundLimitedBlockPropertyFilter, -) (ok bool, filterer *sstable.BlockPropertiesFilterer, err error) { - if tableFilter != nil && - !tableFilter(v.reader.Properties.UserProperties) { - return false, nil, nil - } - - if boundLimitedFilter != nil || len(blockPropertyFilters) > 0 { - filterer, err = sstable.IntersectsTable( - blockPropertyFilters, - boundLimitedFilter, - v.reader.Properties.UserProperties, - ) - // NB: IntersectsTable will return a nil filterer if the table-level - // properties indicate there's no intersection with the provided filters. - if filterer == nil || err != nil { - return false, nil, err - } - } - return true, filterer, nil -} - -func (c *tableCacheShard) newIters( - ctx context.Context, - file *manifest.FileMetadata, - opts *IterOptions, - internalOpts internalIterOpts, - dbOpts *tableCacheOpts, -) (internalIterator, keyspan.FragmentIterator, error) { - // TODO(sumeer): constructing the Reader should also use a plumbed context, - // since parts of the sstable are read during the construction. The Reader - // should not remember that context since the Reader can be long-lived. - - // Calling findNode gives us the responsibility of decrementing v's - // refCount. If opening the underlying table resulted in error, then we - // decrement this straight away. Otherwise, we pass that responsibility to - // the sstable iterator, which decrements when it is closed. - v := c.findNode(file, dbOpts) - if v.err != nil { - defer c.unrefValue(v) - return nil, nil, v.err - } - - hideObsoletePoints := false - var pointKeyFilters []BlockPropertyFilter - if opts != nil { - // This code is appending (at most one filter) in-place to - // opts.PointKeyFilters even though the slice is shared for iterators in - // the same iterator tree. 
This is acceptable since all the following - // properties are true: - // - The iterator tree is single threaded, so the shared backing for the - // slice is being mutated in a single threaded manner. - // - Each shallow copy of the slice has its own notion of length. - // - The appended element is always the obsoleteKeyBlockPropertyFilter - // struct, which is stateless, so overwriting that struct when creating - // one sstable iterator is harmless to other sstable iterators that are - // relying on that struct. - // - // An alternative would be to have different slices for different sstable - // iterators, but that requires more work to avoid allocations. - hideObsoletePoints, pointKeyFilters = - v.reader.TryAddBlockPropertyFilterForHideObsoletePoints( - opts.snapshotForHideObsoletePoints, file.LargestSeqNum, opts.PointKeyFilters) - } - ok := true - var filterer *sstable.BlockPropertiesFilterer - var err error - if opts != nil { - ok, filterer, err = c.checkAndIntersectFilters(v, opts.TableFilter, - pointKeyFilters, internalOpts.boundLimitedFilter) - } - if err != nil { - c.unrefValue(v) - return nil, nil, err - } - - provider := dbOpts.objProvider - // Check if this file is a foreign file. - objMeta, err := provider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum) - if err != nil { - return nil, nil, err - } - - // Note: This suffers an allocation for virtual sstables. - cr := createCommonReader(v, file, provider.IsSharedForeign(objMeta)) - - // NB: range-del iterator does not maintain a reference to the table, nor - // does it need to read from it after creation. - rangeDelIter, err := cr.NewRawRangeDelIter() - if err != nil { - c.unrefValue(v) - return nil, nil, err - } - - if !ok { - c.unrefValue(v) - // Return an empty iterator. This iterator has no mutable state, so - // using a singleton is fine. - // NB: We still return the potentially non-empty rangeDelIter. 
This - // ensures the iterator observes the file's range deletions even if the - // block property filters exclude all the file's point keys. The range - // deletions may still delete keys lower in the LSM in files that DO - // match the active filters. - // - // The point iterator returned must implement the filteredIter - // interface, so that the level iterator surfaces file boundaries when - // range deletions are present. - return filteredAll, rangeDelIter, err - } - - var iter sstable.Iterator - useFilter := true - if opts != nil { - useFilter = manifest.LevelToInt(opts.level) != 6 || opts.UseL6Filters - ctx = objiotracing.WithLevel(ctx, manifest.LevelToInt(opts.level)) - } - tableFormat, err := v.reader.TableFormat() - if err != nil { - return nil, nil, err - } - var rp sstable.ReaderProvider - if tableFormat >= sstable.TableFormatPebblev3 && v.reader.Properties.NumValueBlocks > 0 { - rp = &tableCacheShardReaderProvider{c: c, file: file, dbOpts: dbOpts} - } - - if provider.IsSharedForeign(objMeta) { - if tableFormat < sstable.TableFormatPebblev4 { - return nil, nil, errors.New("pebble: shared foreign sstable has a lower table format than expected") - } - hideObsoletePoints = true - } - if internalOpts.bytesIterated != nil { - iter, err = cr.NewCompactionIter(internalOpts.bytesIterated, rp, internalOpts.bufferPool) - } else { - iter, err = cr.NewIterWithBlockPropertyFiltersAndContextEtc( - ctx, opts.GetLowerBound(), opts.GetUpperBound(), filterer, hideObsoletePoints, useFilter, - internalOpts.stats, rp) - } - if err != nil { - if rangeDelIter != nil { - _ = rangeDelIter.Close() - } - c.unrefValue(v) - return nil, nil, err - } - // NB: v.closeHook takes responsibility for calling unrefValue(v) here. Take - // care to avoid introducing an allocation here by adding a closure. 
- iter.SetCloseHook(v.closeHook) - - c.iterCount.Add(1) - dbOpts.iterCount.Add(1) - if invariants.RaceEnabled { - c.mu.Lock() - c.mu.iters[iter] = debug.Stack() - c.mu.Unlock() - } - return iter, rangeDelIter, nil -} - -func (c *tableCacheShard) newRangeKeyIter( - file *manifest.FileMetadata, opts keyspan.SpanIterOptions, dbOpts *tableCacheOpts, -) (keyspan.FragmentIterator, error) { - // Calling findNode gives us the responsibility of decrementing v's - // refCount. If opening the underlying table resulted in error, then we - // decrement this straight away. Otherwise, we pass that responsibility to - // the sstable iterator, which decrements when it is closed. - v := c.findNode(file, dbOpts) - if v.err != nil { - defer c.unrefValue(v) - return nil, v.err - } - - ok := true - var err error - // Don't filter a table's range keys if the file contains RANGEKEYDELs. - // The RANGEKEYDELs may delete range keys in other levels. Skipping the - // file's range key blocks may surface deleted range keys below. This is - // done here, rather than deferring to the block-property collector in order - // to maintain parity with point keys and the treatment of RANGEDELs. - if v.reader.Properties.NumRangeKeyDels == 0 { - ok, _, err = c.checkAndIntersectFilters(v, nil, opts.RangeKeyFilters, nil) - } - if err != nil { - c.unrefValue(v) - return nil, err - } - if !ok { - c.unrefValue(v) - // Return the empty iterator. This iterator has no mutable state, so - // using a singleton is fine. 
- return emptyKeyspanIter, err - } - - var iter keyspan.FragmentIterator - if file.Virtual { - provider := dbOpts.objProvider - var objMeta objstorage.ObjectMetadata - objMeta, err = provider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum) - if err == nil { - virtualReader := sstable.MakeVirtualReader( - v.reader, file.VirtualMeta(), provider.IsSharedForeign(objMeta), - ) - iter, err = virtualReader.NewRawRangeKeyIter() - } - } else { - iter, err = v.reader.NewRawRangeKeyIter() - } - - // iter is a block iter that holds the entire value of the block in memory. - // No need to hold onto a ref of the cache value. - c.unrefValue(v) - - if err != nil { - return nil, err - } - - if iter == nil { - // NewRawRangeKeyIter can return nil even if there's no error. However, - // the keyspan.LevelIter expects a non-nil iterator if err is nil. - return emptyKeyspanIter, nil - } - - return iter, nil -} - -type tableCacheShardReaderProvider struct { - c *tableCacheShard - file *manifest.FileMetadata - dbOpts *tableCacheOpts - v *tableCacheValue -} - -var _ sstable.ReaderProvider = &tableCacheShardReaderProvider{} - -// GetReader implements sstable.ReaderProvider. Note that it is not the -// responsibility of tableCacheShardReaderProvider to ensure that the file -// continues to exist. The ReaderProvider is used in iterators where the -// top-level iterator is pinning the read state and preventing the files from -// being deleted. -// -// The caller must call tableCacheShardReaderProvider.Close. -// -// Note that currently the Reader returned here is only used to read value -// blocks. This reader shouldn't be used for other purposes like reading keys -// outside of virtual sstable bounds. -// -// TODO(bananabrick): We could return a wrapper over the Reader to ensure -// that the reader isn't used for other purposes. 
-func (rp *tableCacheShardReaderProvider) GetReader() (*sstable.Reader, error) { - // Calling findNode gives us the responsibility of decrementing v's - // refCount. - v := rp.c.findNode(rp.file, rp.dbOpts) - if v.err != nil { - defer rp.c.unrefValue(v) - return nil, v.err - } - rp.v = v - return v.reader, nil -} - -// Close implements sstable.ReaderProvider. -func (rp *tableCacheShardReaderProvider) Close() { - rp.c.unrefValue(rp.v) - rp.v = nil -} - -// getTableProperties return sst table properties for target file -func (c *tableCacheShard) getTableProperties( - file *fileMetadata, dbOpts *tableCacheOpts, -) (*sstable.Properties, error) { - // Calling findNode gives us the responsibility of decrementing v's refCount here - v := c.findNode(file, dbOpts) - defer c.unrefValue(v) - - if v.err != nil { - return nil, v.err - } - return &v.reader.Properties, nil -} - -// releaseNode releases a node from the tableCacheShard. -// -// c.mu must be held when calling this. -func (c *tableCacheShard) releaseNode(n *tableCacheNode) { - c.unlinkNode(n) - c.clearNode(n) -} - -// unlinkNode removes a node from the tableCacheShard, leaving the shard -// reference in place. -// -// c.mu must be held when calling this. -func (c *tableCacheShard) unlinkNode(n *tableCacheNode) { - key := tableCacheKey{n.cacheID, n.fileNum} - delete(c.mu.nodes, key) - - switch n.ptype { - case tableCacheNodeHot: - c.mu.sizeHot-- - case tableCacheNodeCold: - c.mu.sizeCold-- - case tableCacheNodeTest: - c.mu.sizeTest-- - } - - if n == c.mu.handHot { - c.mu.handHot = c.mu.handHot.prev() - } - if n == c.mu.handCold { - c.mu.handCold = c.mu.handCold.prev() - } - if n == c.mu.handTest { - c.mu.handTest = c.mu.handTest.prev() - } - - if n.unlink() == n { - // This was the last entry in the cache. 
- c.mu.handHot = nil - c.mu.handCold = nil - c.mu.handTest = nil - } - - n.links.prev = nil - n.links.next = nil -} - -func (c *tableCacheShard) clearNode(n *tableCacheNode) { - if v := n.value; v != nil { - n.value = nil - c.unrefValue(v) - } -} - -// unrefValue decrements the reference count for the specified value, releasing -// it if the reference count fell to 0. Note that the value has a reference if -// it is present in tableCacheShard.mu.nodes, so a reference count of 0 means -// the node has already been removed from that map. -func (c *tableCacheShard) unrefValue(v *tableCacheValue) { - if v.refCount.Add(-1) == 0 { - c.releasing.Add(1) - c.releasingCh <- v - } -} - -// findNode returns the node for the table with the given file number, creating -// that node if it didn't already exist. The caller is responsible for -// decrementing the returned node's refCount. -func (c *tableCacheShard) findNode(meta *fileMetadata, dbOpts *tableCacheOpts) *tableCacheValue { - v := c.findNodeInternal(meta, dbOpts) - - // Loading a file before its global sequence number is known (eg, - // during ingest before entering the commit pipeline) can pollute - // the cache with incorrect state. In invariant builds, verify - // that the global sequence number of the returned reader matches. - if invariants.Enabled { - if v.reader != nil && meta.LargestSeqNum == meta.SmallestSeqNum && - v.reader.Properties.GlobalSeqNum != meta.SmallestSeqNum { - panic(errors.AssertionFailedf("file %s loaded from table cache with the wrong global sequence number %d", - meta, v.reader.Properties.GlobalSeqNum)) - } - } - return v -} - -func (c *tableCacheShard) findNodeInternal( - meta *fileMetadata, dbOpts *tableCacheOpts, -) *tableCacheValue { - if refs := meta.Refs(); refs <= 0 { - panic(errors.AssertionFailedf("attempting to load file %s with refs=%d from table cache", - meta, refs)) - } - // Fast-path for a hit in the cache. 
- c.mu.RLock() - key := tableCacheKey{dbOpts.cacheID, meta.FileBacking.DiskFileNum} - if n := c.mu.nodes[key]; n != nil && n.value != nil { - // Fast-path hit. - // - // The caller is responsible for decrementing the refCount. - v := n.value - v.refCount.Add(1) - c.mu.RUnlock() - n.referenced.Store(true) - c.hits.Add(1) - <-v.loaded - return v - } - c.mu.RUnlock() - - c.mu.Lock() - - n := c.mu.nodes[key] - switch { - case n == nil: - // Slow-path miss of a non-existent node. - n = &tableCacheNode{ - fileNum: meta.FileBacking.DiskFileNum, - ptype: tableCacheNodeCold, - } - c.addNode(n, dbOpts) - c.mu.sizeCold++ - - case n.value != nil: - // Slow-path hit of a hot or cold node. - // - // The caller is responsible for decrementing the refCount. - v := n.value - v.refCount.Add(1) - n.referenced.Store(true) - c.hits.Add(1) - c.mu.Unlock() - <-v.loaded - return v - - default: - // Slow-path miss of a test node. - c.unlinkNode(n) - c.mu.coldTarget++ - if c.mu.coldTarget > c.size { - c.mu.coldTarget = c.size - } - - n.referenced.Store(false) - n.ptype = tableCacheNodeHot - c.addNode(n, dbOpts) - c.mu.sizeHot++ - } - - c.misses.Add(1) - - v := &tableCacheValue{ - loaded: make(chan struct{}), - } - v.refCount.Store(2) - // Cache the closure invoked when an iterator is closed. This avoids an - // allocation on every call to newIters. - v.closeHook = func(i sstable.Iterator) error { - if invariants.RaceEnabled { - c.mu.Lock() - delete(c.mu.iters, i) - c.mu.Unlock() - } - c.unrefValue(v) - c.iterCount.Add(-1) - dbOpts.iterCount.Add(-1) - return nil - } - n.value = v - - c.mu.Unlock() - - // Note adding to the cache lists must complete before we begin loading the - // table as a failure during load will result in the node being unlinked. 
- pprof.Do(context.Background(), tableCacheLabels, func(context.Context) { - v.load( - loadInfo{ - backingFileNum: meta.FileBacking.DiskFileNum, - smallestSeqNum: meta.SmallestSeqNum, - largestSeqNum: meta.LargestSeqNum, - }, c, dbOpts) - }) - return v -} - -func (c *tableCacheShard) addNode(n *tableCacheNode, dbOpts *tableCacheOpts) { - c.evictNodes() - n.cacheID = dbOpts.cacheID - key := tableCacheKey{n.cacheID, n.fileNum} - c.mu.nodes[key] = n - - n.links.next = n - n.links.prev = n - if c.mu.handHot == nil { - // First element. - c.mu.handHot = n - c.mu.handCold = n - c.mu.handTest = n - } else { - c.mu.handHot.link(n) - } - - if c.mu.handCold == c.mu.handHot { - c.mu.handCold = c.mu.handCold.prev() - } -} - -func (c *tableCacheShard) evictNodes() { - for c.size <= c.mu.sizeHot+c.mu.sizeCold && c.mu.handCold != nil { - c.runHandCold() - } -} - -func (c *tableCacheShard) runHandCold() { - n := c.mu.handCold - if n.ptype == tableCacheNodeCold { - if n.referenced.Load() { - n.referenced.Store(false) - n.ptype = tableCacheNodeHot - c.mu.sizeCold-- - c.mu.sizeHot++ - } else { - c.clearNode(n) - n.ptype = tableCacheNodeTest - c.mu.sizeCold-- - c.mu.sizeTest++ - for c.size < c.mu.sizeTest && c.mu.handTest != nil { - c.runHandTest() - } - } - } - - c.mu.handCold = c.mu.handCold.next() - - for c.size-c.mu.coldTarget <= c.mu.sizeHot && c.mu.handHot != nil { - c.runHandHot() - } -} - -func (c *tableCacheShard) runHandHot() { - if c.mu.handHot == c.mu.handTest && c.mu.handTest != nil { - c.runHandTest() - if c.mu.handHot == nil { - return - } - } - - n := c.mu.handHot - if n.ptype == tableCacheNodeHot { - if n.referenced.Load() { - n.referenced.Store(false) - } else { - n.ptype = tableCacheNodeCold - c.mu.sizeHot-- - c.mu.sizeCold++ - } - } - - c.mu.handHot = c.mu.handHot.next() -} - -func (c *tableCacheShard) runHandTest() { - if c.mu.sizeCold > 0 && c.mu.handTest == c.mu.handCold && c.mu.handCold != nil { - c.runHandCold() - if c.mu.handTest == nil { - return - } - } - - 
n := c.mu.handTest - if n.ptype == tableCacheNodeTest { - c.mu.coldTarget-- - if c.mu.coldTarget < 0 { - c.mu.coldTarget = 0 - } - c.unlinkNode(n) - c.clearNode(n) - } - - c.mu.handTest = c.mu.handTest.next() -} - -func (c *tableCacheShard) evict(fileNum base.DiskFileNum, dbOpts *tableCacheOpts, allowLeak bool) { - c.mu.Lock() - key := tableCacheKey{dbOpts.cacheID, fileNum} - n := c.mu.nodes[key] - var v *tableCacheValue - if n != nil { - // NB: This is equivalent to tableCacheShard.releaseNode(), but we perform - // the tableCacheNode.release() call synchronously below to ensure the - // sstable file descriptor is closed before returning. Note that - // tableCacheShard.releasing needs to be incremented while holding - // tableCacheShard.mu in order to avoid a race with Close() - c.unlinkNode(n) - v = n.value - if v != nil { - if !allowLeak { - if t := v.refCount.Add(-1); t != 0 { - dbOpts.loggerAndTracer.Fatalf("sstable %s: refcount is not zero: %d\n%s", fileNum, t, debug.Stack()) - } - } - c.releasing.Add(1) - } - } - - c.mu.Unlock() - - if v != nil { - v.release(c) - } - - dbOpts.opts.Cache.EvictFile(dbOpts.cacheID, fileNum) -} - -// removeDB evicts any nodes which have a reference to the DB -// associated with dbOpts.cacheID. Make sure that there will -// be no more accesses to the files associated with the DB. -func (c *tableCacheShard) removeDB(dbOpts *tableCacheOpts) { - var fileNums []base.DiskFileNum - - c.mu.RLock() - // Collect the fileNums which need to be cleaned. - var firstNode *tableCacheNode - node := c.mu.handHot - for node != firstNode { - if firstNode == nil { - firstNode = node - } - - if node.cacheID == dbOpts.cacheID { - fileNums = append(fileNums, node.fileNum) - } - node = node.next() - } - c.mu.RUnlock() - - // Evict all the nodes associated with the DB. - // This should synchronously close all the files - // associated with the DB. 
- for _, fileNum := range fileNums { - c.evict(fileNum, dbOpts, true) - } -} - -func (c *tableCacheShard) Close() error { - c.mu.Lock() - defer c.mu.Unlock() - - // Check for leaked iterators. Note that we'll still perform cleanup below in - // the case that there are leaked iterators. - var err error - if v := c.iterCount.Load(); v > 0 { - if !invariants.RaceEnabled { - err = errors.Errorf("leaked iterators: %d", errors.Safe(v)) - } else { - var buf bytes.Buffer - for _, stack := range c.mu.iters { - fmt.Fprintf(&buf, "%s\n", stack) - } - err = errors.Errorf("leaked iterators: %d\n%s", errors.Safe(v), buf.String()) - } - } - - for c.mu.handHot != nil { - n := c.mu.handHot - if n.value != nil { - if n.value.refCount.Add(-1) == 0 { - c.releasing.Add(1) - c.releasingCh <- n.value - } - } - c.unlinkNode(n) - } - c.mu.nodes = nil - c.mu.handHot = nil - c.mu.handCold = nil - c.mu.handTest = nil - - // Only shutdown the releasing goroutine if there were no leaked - // iterators. If there were leaked iterators, we leave the goroutine running - // and the releasingCh open so that a subsequent iterator close can - // complete. This behavior is used by iterator leak tests. Leaking the - // goroutine for these tests is less bad not closing the iterator which - // triggers other warnings about block cache handles not being released. - if err != nil { - c.releasing.Wait() - return err - } - - close(c.releasingCh) - c.releasing.Wait() - c.releaseLoopExit.Wait() - return err -} - -type tableCacheValue struct { - closeHook func(i sstable.Iterator) error - reader *sstable.Reader - err error - loaded chan struct{} - // Reference count for the value. The reader is closed when the reference - // count drops to zero. - refCount atomic.Int32 -} - -type loadInfo struct { - backingFileNum base.DiskFileNum - largestSeqNum uint64 - smallestSeqNum uint64 -} - -func (v *tableCacheValue) load(loadInfo loadInfo, c *tableCacheShard, dbOpts *tableCacheOpts) { - // Try opening the file first. 
- var f objstorage.Readable - var err error - f, err = dbOpts.objProvider.OpenForReading( - context.TODO(), fileTypeTable, loadInfo.backingFileNum, objstorage.OpenOptions{MustExist: true}, - ) - if err == nil { - cacheOpts := private.SSTableCacheOpts(dbOpts.cacheID, loadInfo.backingFileNum).(sstable.ReaderOption) - v.reader, err = sstable.NewReader(f, dbOpts.opts, cacheOpts, dbOpts.filterMetrics) - } - if err != nil { - v.err = errors.Wrapf( - err, "pebble: backing file %s error", errors.Safe(loadInfo.backingFileNum.FileNum())) - } - if v.err == nil && loadInfo.smallestSeqNum == loadInfo.largestSeqNum { - v.reader.Properties.GlobalSeqNum = loadInfo.largestSeqNum - } - if v.err != nil { - c.mu.Lock() - defer c.mu.Unlock() - // Lookup the node in the cache again as it might have already been - // removed. - key := tableCacheKey{dbOpts.cacheID, loadInfo.backingFileNum} - n := c.mu.nodes[key] - if n != nil && n.value == v { - c.releaseNode(n) - } - } - close(v.loaded) -} - -func (v *tableCacheValue) release(c *tableCacheShard) { - <-v.loaded - // Nothing to be done about an error at this point. Close the reader if it is - // open. - if v.reader != nil { - _ = v.reader.Close() - } - c.releasing.Done() -} - -type tableCacheNodeType int8 - -const ( - tableCacheNodeTest tableCacheNodeType = iota - tableCacheNodeCold - tableCacheNodeHot -) - -func (p tableCacheNodeType) String() string { - switch p { - case tableCacheNodeTest: - return "test" - case tableCacheNodeCold: - return "cold" - case tableCacheNodeHot: - return "hot" - } - return "unknown" -} - -type tableCacheNode struct { - fileNum base.DiskFileNum - value *tableCacheValue - - links struct { - next *tableCacheNode - prev *tableCacheNode - } - ptype tableCacheNodeType - // referenced is atomically set to indicate that this entry has been accessed - // since the last time one of the clock hands swept it. 
- referenced atomic.Bool - - // Storing the cache id associated with the DB instance here - // avoids the need to thread the dbOpts struct through many functions. - cacheID uint64 -} - -func (n *tableCacheNode) next() *tableCacheNode { - if n == nil { - return nil - } - return n.links.next -} - -func (n *tableCacheNode) prev() *tableCacheNode { - if n == nil { - return nil - } - return n.links.prev -} - -func (n *tableCacheNode) link(s *tableCacheNode) { - s.links.prev = n.links.prev - s.links.prev.links.next = s - s.links.next = n - s.links.next.links.prev = s -} - -func (n *tableCacheNode) unlink() *tableCacheNode { - next := n.links.next - n.links.prev.links.next = n.links.next - n.links.next.links.prev = n.links.prev - n.links.prev = n - n.links.next = n - return next -} diff --git a/vendor/github.com/cockroachdb/pebble/.editorconfig b/vendor/github.com/cockroachdb/pebble/v2/.editorconfig similarity index 100% rename from vendor/github.com/cockroachdb/pebble/.editorconfig rename to vendor/github.com/cockroachdb/pebble/v2/.editorconfig diff --git a/vendor/github.com/cockroachdb/pebble/.gitignore b/vendor/github.com/cockroachdb/pebble/v2/.gitignore similarity index 65% rename from vendor/github.com/cockroachdb/pebble/.gitignore rename to vendor/github.com/cockroachdb/pebble/v2/.gitignore index 87ef192..7c353d2 100644 --- a/vendor/github.com/cockroachdb/pebble/.gitignore +++ b/vendor/github.com/cockroachdb/pebble/v2/.gitignore @@ -7,3 +7,9 @@ mutex.prof coverprofile.out # Testing artifacts meta.*.test + +# Bazel files, generated with 'make gen-bazel'. 
+/WORKSPACE +BUILD.bazel + + diff --git a/vendor/github.com/cockroachdb/pebble/LICENSE b/vendor/github.com/cockroachdb/pebble/v2/LICENSE similarity index 100% rename from vendor/github.com/cockroachdb/pebble/LICENSE rename to vendor/github.com/cockroachdb/pebble/v2/LICENSE diff --git a/vendor/github.com/cockroachdb/pebble/Makefile b/vendor/github.com/cockroachdb/pebble/v2/Makefile similarity index 60% rename from vendor/github.com/cockroachdb/pebble/Makefile rename to vendor/github.com/cockroachdb/pebble/v2/Makefile index 2cbf3ea..f3fe013 100644 --- a/vendor/github.com/cockroachdb/pebble/Makefile +++ b/vendor/github.com/cockroachdb/pebble/v2/Makefile @@ -4,7 +4,6 @@ GOFLAGS := STRESSFLAGS := TAGS := invariants TESTS := . -PREV_RELEASE := crl-release-23.1 COVER_PROFILE := coverprofile.out .PHONY: all @@ -18,6 +17,7 @@ all: @echo " make crossversion-meta" @echo " make testcoverage" @echo " make mod-update" + @echo " make gen-bazel" @echo " make generate" @echo " make generate-test-data" @echo " make clean" @@ -36,12 +36,18 @@ testrace: testflags += -race -timeout 20m testrace: test testasan: testflags += -asan -timeout 20m +testasan: TAGS += slowbuild testasan: test testmsan: export CC=clang testmsan: testflags += -msan -timeout 20m +testmsan: TAGS += slowbuild testmsan: test +.PHONY: testnocgo +testnocgo: + CGO_ENABLED=0 ${GO} test -tags '$(TAGS)' ${testflags} -run ${TESTS} ${PKG} + .PHONY: testobjiotracing testobjiotracing: ${GO} test -tags '$(TAGS) pebble_obj_io_tracing' ${testflags} -run ${TESTS} ./objstorage/objstorageprovider/objiotracing @@ -62,16 +68,48 @@ stressmeta: override TESTS = TestMeta$$ stressmeta: stress .PHONY: crossversion-meta +crossversion-meta: LATEST_RELEASE := crl-release-25.2 crossversion-meta: - git checkout ${PREV_RELEASE}; \ - ${GO} test -c ./internal/metamorphic -o './internal/metamorphic/crossversion/${PREV_RELEASE}.test'; \ + git checkout ${LATEST_RELEASE}; \ + ${GO} test -c ./internal/metamorphic -o 
'./internal/metamorphic/crossversion/${LATEST_RELEASE}.test'; \ git checkout -; \ ${GO} test -c ./internal/metamorphic -o './internal/metamorphic/crossversion/head.test'; \ - ${GO} test -tags '$(TAGS)' ${testflags} -v -run 'TestMetaCrossVersion' ./internal/metamorphic/crossversion --version '${PREV_RELEASE},${PREV_RELEASE},${PREV_RELEASE}.test' --version 'HEAD,HEAD,./head.test' + ${GO} test -tags '$(TAGS)' ${testflags} -v -timeout 20m -run 'TestMetaCrossVersion' ./internal/metamorphic/crossversion --version '${LATEST_RELEASE},${LATEST_RELEASE},${LATEST_RELEASE}.test' --version 'HEAD,HEAD,./head.test' .PHONY: stress-crossversion stress-crossversion: - STRESS=1 ./scripts/run-crossversion-meta.sh crl-release-21.2 crl-release-22.1 crl-release-22.2 crl-release-23.1 master + STRESS=1 ./scripts/run-crossversion-meta.sh crl-release-24.1 crl-release-24.3 crl-release-25.1 crl-release-25.2 crl-release-25.3 + +.PHONY: test-s390x-qemu +test-s390x-qemu: TAGS += slowbuild +test-s390x-qemu: S390X_GOVERSION := 1.23 +test-s390x-qemu: + @echo "Running tests on s390x using QEMU" + @echo "Requires a recent linux with docker and qemu-user-static installed" + @echo "(sudo apt-get install -y qemu-user-static)" + @echo "" + @qemu-s390x-static --version + @echo "" + @docker run --rm -v "$(CURDIR):/pebble" --platform=linux/s390x golang:${S390X_GOVERSION} \ + bash -c " \ + uname -a && \ + lscpu | grep Endian && \ + cd /pebble && \ + go version && \ + go test -tags '$(TAGS)' -timeout 30m ./..." + +.PHONY: gen-bazel +gen-bazel: + @echo "Generating WORKSPACE" + @echo 'workspace(name = "com_github_cockroachdb_pebble")' > WORKSPACE + @echo 'Running gazelle...' + ${GO} run github.com/bazelbuild/bazel-gazelle/cmd/gazelle@v0.37.0 update --go_prefix=github.com/cockroachdb/pebble/v2 --repo_root=. 
+ @echo 'You should now be able to build Cockroach using:' + @echo ' ./dev build short -- --override_repository=com_github_cockroachdb_pebble=${CURDIR}' + +.PHONY: clean-bazel +clean-bazel: + git clean -dxf WORKSPACE BUILD.bazel '**/BUILD.bazel' .PHONY: generate generate: @@ -88,6 +126,7 @@ generate-test-data: ${GO} run -tags make_test_sstables ./tool/make_test_sstables.go ${GO} run -tags make_test_remotecat ./tool/make_test_remotecat.go +.PHONY: mod-update mod-update: ${GO} get -u ${GO} mod tidy @@ -115,11 +154,9 @@ endif @${GO} mod tidy $(MAKE) git-clean-check -# TODO(radu): switch back to @latest once bogus doc changes are -# addressed; see https://github.com/cockroachdb/crlfmt/pull/44 .PHONY: format format: - go install github.com/cockroachdb/crlfmt@44a36ec7 && crlfmt -w -tab 2 . + go install -C internal/devtools github.com/cockroachdb/crlfmt && crlfmt -w -tab 2 . .PHONY: format-check format-check: diff --git a/vendor/github.com/cockroachdb/pebble/README.md b/vendor/github.com/cockroachdb/pebble/v2/README.md similarity index 57% rename from vendor/github.com/cockroachdb/pebble/README.md rename to vendor/github.com/cockroachdb/pebble/v2/README.md index e1aa479..d22b4fe 100644 --- a/vendor/github.com/cockroachdb/pebble/README.md +++ b/vendor/github.com/cockroachdb/pebble/v2/README.md @@ -1,4 +1,4 @@ -# Pebble [![Build Status](https://github.com/cockroachdb/pebble/actions/workflows/ci.yaml/badge.svg?branch=master)](https://github.com/cockroachdb/pebble/actions/workflows/ci.yaml) [![GoDoc](https://godoc.org/github.com/cockroachdb/pebble?status.svg)](https://godoc.org/github.com/cockroachdb/pebble) [Coverage](https://storage.googleapis.com/crl-codecover-public/pebble/index.html) +# Pebble [![Build Status](https://github.com/cockroachdb/pebble/v2/actions/workflows/ci.yaml/badge.svg?branch=master)](https://github.com/cockroachdb/pebble/v2/actions/workflows/ci.yaml) 
[![GoDoc](https://godoc.org/github.com/cockroachdb/pebble/v2?status.svg)](https://godoc.org/github.com/cockroachdb/pebble/v2) [Coverage](https://storage.googleapis.com/crl-codecover-public/pebble/index.html) #### [Nightly benchmarks](https://cockroachdb.github.io/pebble/) @@ -86,17 +86,22 @@ differences. ## RocksDB Compatibility -Pebble strives for forward compatibility with RocksDB 6.2.1 (the latest -version of RocksDB used by CockroachDB). Forward compatibility means -that a DB generated by RocksDB can be used by Pebble. Currently, Pebble -provides bidirectional compatibility with RocksDB (a Pebble generated DB -can be used by RocksDB) when using its FormatMostCompatible format. New -functionality that is backwards incompatible is gated behind new format -major versions. In general, Pebble only provides compatibility with the -subset of functionality and configuration used by CockroachDB. The scope -of RocksDB functionality and configuration is too large to adequately -test and document all the incompatibilities. The list below contains -known incompatibilities. +Pebble `v1` strives for forward compatibility with RocksDB 6.2.1 (the latest +version of RocksDB used by CockroachDB). Forward compatibility means that a DB +generated by RocksDB 6.2.1 can be upgraded for use by Pebble. Pebble versions in +the `v1` series may open DBs generated by RocksDB 6.2.1. Since its introduction, +Pebble has adopted various backwards-incompatible format changes that are gated +behind new 'format major versions'. Pebble `v2` and newer does not support +opening DBs generated by RocksDB. DBs generated by RocksDB may only be used with +recent versions of Pebble after migrating them through format major version +upgrades using previous versions of Pebble. See the below section of format +major versions. + +Even the RocksDB-compatible versions of Pebble only provide compatibility with +the subset of functionality and configuration used by CockroachDB. 
The scope of +RocksDB functionality and configuration is too large to adequately test and +document all the incompatibilities. The list below contains known +incompatibilities. * Pebble's use of WAL recycling is only compatible with RocksDB's `kTolerateCorruptedTailRecords` WAL recovery mode. Older versions of @@ -113,43 +118,52 @@ known incompatibilities. * SSTable format version 3 and 4. Pebble does not support version 3 and version 4 format sstables. The sstable format version is controlled by the `BlockBasedTableOptions::format_version` option. - See [#97](https://github.com/cockroachdb/pebble/issues/97). + See [#97](https://github.com/cockroachdb/pebble/v2/issues/97). ## Format major versions Over time Pebble has introduced new physical file formats. Backwards incompatible changes are made through the introduction of 'format major -versions'. By default, when Pebble opens a database, it defaults to -`FormatMostCompatible`. This version is bi-directionally compatible with RocksDB -6.2.1 (with the caveats described above). +versions'. By default, when Pebble opens a database, it defaults to the lowest +supported version. In `v1`, this is `FormatMostCompatible`, which is +bi-directionally compatible with RocksDB 6.2.1 (with the caveats described +above). + +Databases created by RocksDB or Pebble versions `v1` and earlier must be upgraded +to a compatible format major version before running newer Pebble versions. Newer +Pebble versions will refuse to open databases in no longer supported formats. 
To opt into new formats, a user may set `FormatMajorVersion` on the -[`Options`](https://pkg.go.dev/github.com/cockroachdb/pebble#Options) +[`Options`](https://pkg.go.dev/github.com/cockroachdb/pebble/v2#Options) supplied to -[`Open`](https://pkg.go.dev/github.com/cockroachdb/pebble#Open), or +[`Open`](https://pkg.go.dev/github.com/cockroachdb/pebble/v2#Open), or upgrade the format major version at runtime using -[`DB.RatchetFormatMajorVersion`](https://pkg.go.dev/github.com/cockroachdb/pebble#DB.RatchetFormatMajorVersion). +[`DB.RatchetFormatMajorVersion`](https://pkg.go.dev/github.com/cockroachdb/pebble/v2#DB.RatchetFormatMajorVersion). Format major version upgrades are permanent; There is no option to return to an earlier format. -The table below outlines the history of format major versions: - -| Name | Value | Migration | -|------------------------------------|-------|------------| -| FormatMostCompatible | 1 | No | -| FormatVersioned | 3 | No | -| FormatSetWithDelete | 4 | No | -| FormatBlockPropertyCollector | 5 | No | -| FormatSplitUserKeysMarked | 6 | Background | -| FormatSplitUserKeysMarkedCompacted | 7 | Blocking | -| FormatRangeKeys | 8 | No | -| FormatMinTableFormatPebblev1 | 9 | No | -| FormatPrePebblev1Marked | 10 | Background | -| FormatSSTableValueBlocks | 12 | No | -| FormatFlushableIngest | 13 | No | -| FormatPrePebblev1MarkedCompacted | 14 | Blocking | -| FormatDeleteSizedAndObsolete | 15 | No | -| FormatVirtualSSTables | 16 | No | +The table below outlines the history of format major versions, along with what +range of Pebble versions support that format. 
+ +| Name | Value | Migration | Pebble support | +|------------------------------------|-------|------------|----------------| +| FormatMostCompatible | 1 | No | v1 | +| FormatVersioned | 3 | No | v1 | +| FormatSetWithDelete | 4 | No | v1 | +| FormatBlockPropertyCollector | 5 | No | v1 | +| FormatSplitUserKeysMarked | 6 | Background | v1 | +| FormatSplitUserKeysMarkedCompacted | 7 | Blocking | v1 | +| FormatRangeKeys | 8 | No | v1 | +| FormatMinTableFormatPebblev1 | 9 | No | v1 | +| FormatPrePebblev1Marked | 10 | Background | v1 | +| FormatSSTableValueBlocks | 12 | No | v1 | +| FormatFlushableIngest | 13 | No | v1, v2, master | +| FormatPrePebblev1MarkedCompacted | 14 | Blocking | v1, v2, master | +| FormatDeleteSizedAndObsolete | 15 | No | v1, v2, master | +| FormatVirtualSSTables | 16 | No | v1, v2, master | +| FormatSyntheticPrefixSuffix | 17 | No | v2, master | +| FormatFlushableIngestExcises | 18 | No | v2, master | +| FormatColumnarBlocks | 19 | No | v2, master | Upgrading to a format major version with 'Background' in the migration column may trigger background activity to rewrite physical file @@ -160,18 +174,30 @@ writes if upgrading a live database through `RatchetFormatMajorVersion`, but the method call will not return until the migration is complete. +Upgrading existing stores can be performed via the `RatchetFormatMajorVersion` +method. If the database does not use a custom comparer, merger, or block +property collectors, the `pebble` tool can also be used, at the latest version +that supports the format. For example: +``` +# WARNING: only use if no custom comparer/merger/property collector are necessary. +go run github.com/cockroachdb/pebble/v2/cmd/pebble@v1.1.3 db upgrade +``` + For reference, the table below lists the range of supported Pebble format major versions for CockroachDB releases. 
-| CockroachDB release | Earliest supported | Latest supported | -|---------------------|------------------------------------|---------------------------| -| 20.1 through 21.1 | FormatMostCompatible | FormatMostCompatible | -| 21.2 | FormatMostCompatible | FormatSetWithDelete | -| 21.2 | FormatMostCompatible | FormatSetWithDelete | -| 22.1 | FormatMostCompatible | FormatSplitUserKeysMarked | -| 22.2 | FormatMostCompatible | FormatPrePebblev1Marked | -| 23.1 | FormatSplitUserKeysMarkedCompacted | FormatFlushableIngest | -| 23.2 | FormatSplitUserKeysMarkedCompacted | FormatVirtualSSTables | +| CockroachDB release | Earliest supported | Latest supported | +|---------------------|------------------------------------|-----------------------------| +| 20.1 through 21.1 | FormatMostCompatible | FormatMostCompatible | +| 21.2 | FormatMostCompatible | FormatSetWithDelete | +| 21.2 | FormatMostCompatible | FormatSetWithDelete | +| 22.1 | FormatMostCompatible | FormatSplitUserKeysMarked | +| 22.2 | FormatMostCompatible | FormatPrePebblev1Marked | +| 23.1 | FormatSplitUserKeysMarkedCompacted | FormatFlushableIngest | +| 23.2 | FormatPrePebblev1Marked | FormatVirtualSSTables | +| 24.1 | FormatFlushableIngest | FormatSyntheticPrefixSuffix | +| 24.2 | FormatVirtualSSTables | FormatSyntheticPrefixSuffix | +| 24.3 | FormatSyntheticPrefixSuffix | FormatColumnarBlocks | ## Pedigree @@ -198,7 +224,7 @@ import ( "fmt" "log" - "github.com/cockroachdb/pebble" + "github.com/cockroachdb/pebble/v2" ) func main() { diff --git a/vendor/github.com/cockroachdb/pebble/batch.go b/vendor/github.com/cockroachdb/pebble/v2/batch.go similarity index 72% rename from vendor/github.com/cockroachdb/pebble/batch.go rename to vendor/github.com/cockroachdb/pebble/v2/batch.go index 95c7e30..4554bd4 100644 --- a/vendor/github.com/cockroachdb/pebble/batch.go +++ b/vendor/github.com/cockroachdb/pebble/v2/batch.go @@ -5,6 +5,7 @@ package pebble import ( + "bytes" "context" "encoding/binary" "fmt" @@ -16,24 +17,27 
@@ import ( "time" "unsafe" + "github.com/cockroachdb/crlib/crtime" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/batchskl" - "github.com/cockroachdb/pebble/internal/humanize" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/private" - "github.com/cockroachdb/pebble/internal/rangedel" - "github.com/cockroachdb/pebble/internal/rangekey" - "github.com/cockroachdb/pebble/internal/rawalloc" + "github.com/cockroachdb/pebble/v2/batchrepr" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/batchskl" + "github.com/cockroachdb/pebble/v2/internal/humanize" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/private" + "github.com/cockroachdb/pebble/v2/internal/rangedel" + "github.com/cockroachdb/pebble/v2/internal/rangekey" + "github.com/cockroachdb/pebble/v2/internal/rawalloc" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" ) const ( - batchCountOffset = 8 - batchHeaderLen = 12 - batchInitialSize = 1 << 10 // 1 KB - batchMaxRetainedSize = 1 << 20 // 1 MB - invalidBatchCount = 1<<32 - 1 - maxVarintLen32 = 5 + invalidBatchCount = 1<<32 - 1 + maxVarintLen32 = 5 + + defaultBatchInitialSize = 1 << 10 // 1 KB + defaultBatchMaxRetainedSize = 1 << 20 // 1 MB ) // ErrNotIndexed means that a read operation on a batch failed because the @@ -41,9 +45,9 @@ const ( var ErrNotIndexed = errors.New("pebble: batch not indexed") // ErrInvalidBatch indicates that a batch is invalid or otherwise corrupted. -var ErrInvalidBatch = base.MarkCorruptionError(errors.New("pebble: invalid batch")) +var ErrInvalidBatch = batchrepr.ErrInvalidBatch -// ErrBatchTooLarge indicates that a batch is invalid or otherwise corrupted. +// ErrBatchTooLarge indicates that the size of this batch is over the limit of 4GB. 
var ErrBatchTooLarge = base.MarkCorruptionError(errors.Newf("pebble: batch too large: >= %s", humanize.Bytes.Uint64(maxBatchSize))) // DeferredBatchOp represents a batch operation (eg. set, merge, delete) that is @@ -119,14 +123,17 @@ func (d DeferredBatchOp) Finish() error { // // # Large batches // -// The size of a batch is limited only by available memory (be aware that -// indexed batches require considerably additional memory for the skiplist -// structure). A given WAL file has a single memtable associated with it (this -// restriction could be removed, but doing so is onerous and complex). And a -// memtable has a fixed size due to the underlying fixed size arena. Note that -// this differs from RocksDB where a memtable can grow arbitrarily large using -// a list of arena chunks. In RocksDB this is accomplished by storing pointers -// in the arena memory, but that isn't possible in Go. +// The size of a batch is limited to 4GB, the max that can be represented by +// a uint32 type. Be aware that indexed batches require considerably more +// memory for the skiplist structure (this skiplist is separate from the 4GB +// batch limit). For users that require atomic writes of data that's greater +// than 4GB, DB.Ingest() is able to atomically ingest pre-computed sstables. +// A given WAL file has a single memtable associated with it (this restriction +// could be removed, but doing so is onerous and complex). And a memtable has +// a fixed size due to the underlying fixed size arena. Note that this differs +// from RocksDB where a memtable can grow arbitrarily large using a list of +// arena chunks. In RocksDB this is accomplished by storing pointers in the +// arena memory, but that isn't possible in Go. 
// // During Batch.Commit, a batch which is larger than a threshold (> // MemTableSize/2) is wrapped in a flushableBatch and inserted into the queue @@ -185,6 +192,69 @@ func (d DeferredBatchOp) Finish() error { type Batch struct { batchInternal applied atomic.Bool + // lifecycle is used to negotiate the lifecycle of a Batch. A Batch and its + // underlying batchInternal.data byte slice may be reused. There are two + // mechanisms for reuse: + // + // 1. The caller may explicitly call [Batch.Reset] to reset the batch to be + // empty (while retaining the underlying repr's buffer). + // 2. The caller may call [Batch.Close], passing ownership off to Pebble, + // which may reuse the batch's memory to service new callers to + // [DB.NewBatch]. + // + // There's a complication to reuse: When WAL failover is configured, the + // Pebble commit pipeline may retain a pointer to the batch.data beyond the + // return of [Batch.Commit]. The user of the Batch may commit their batch + // and call Close or Reset before the commit pipeline is finished reading + // the data slice. Recycling immediately would cause a data race. + // + // To resolve this data race, this [lifecycle] atomic is used to determine + // safety and responsibility of reusing a batch. The low bits of the atomic + // are used as a reference count (really just the lowest bit—in practice + // there's only 1 code path that references). The [Batch] is passed into + // [wal.Writer]'s WriteRecord method as a [RefCount] implementation. The + // wal.Writer guarantees that if it will read [Batch.data] after the call to + // WriteRecord returns, it will increment the reference count. When it's + // complete, it'll unreference through invoking [Batch.Unref]. + // + // When the committer of a batch indicates intent to recycle a Batch through + // calling [Batch.Reset] or [Batch.Close], the lifecycle atomic is read. If + // an outstanding reference remains, it's unsafe to reuse Batch.data yet. 
In + // [Batch.Reset] the caller wants to reuse the [Batch] immediately, so we + // discard b.data to recycle the struct but not the underlying byte slice. + // In [Batch.Close], we set a special high bit [batchClosedBit] on lifecycle + // that indicates that the user will not use [Batch] again and we're free to + // recycle it when safe. When the commit pipeline eventually calls + // [Batch.Unref], the [batchClosedBit] is noticed and the batch is + // recycled. + lifecycle atomic.Int32 +} + +// batchClosedBit is a bit stored on Batch.lifecycle to indicate that the user +// called [Batch.Close] to release a Batch, but an open reference count +// prevented immediate recycling. +const batchClosedBit = 1 << 30 + +// TODO(jackson): Hide the wal.RefCount implementation from the public Batch interface. + +// Ref implements wal.RefCount. If the WAL writer may need to read b.data after +// it returns, it invokes Ref to increment the lifecycle's reference count. When +// it's finished, it invokes Unref. +func (b *Batch) Ref() { + b.lifecycle.Add(+1) +} + +// Unref implemets wal.RefCount. +func (b *Batch) Unref() { + if v := b.lifecycle.Add(-1); (v ^ batchClosedBit) == 0 { + // The [batchClosedBit] high bit is set, and there are no outstanding + // references. The user of the Batch called [Batch.Close], expecting the + // batch to be recycled. However, our outstanding reference count + // prevented recycling. As the last to dereference, we're now + // responsible for releasing the batch. + b.lifecycle.Store(0) + b.release() + } } // batchInternal contains the set of fields within Batch that are non-atomic and @@ -205,10 +275,9 @@ type batchInternal struct { // batches. Large batches will set the data field to nil when committed as // the data has been moved to a flushableBatch and inserted into the queue of // memtables. 
- data []byte - cmp Compare - formatKey base.FormatKey - abbreviatedKey AbbreviatedKey + data []byte + comparer *base.Comparer + opts batchOptions // An upper bound on required space to add this batch to a memtable. // Note that although batches are limited to 4 GiB in size, that limit @@ -252,7 +321,7 @@ type batchInternal struct { // tombstonesSeqNum. This is the case for all new iterators created over a // batch but it's not the case for all cloned iterators. tombstones []keyspan.Span - tombstonesSeqNum uint64 + tombstonesSeqNum base.SeqNum // Fragmented range key spans. Cached the first time a range key iterator is // requested. The cache is invalidated whenever a new range key @@ -261,7 +330,7 @@ type batchInternal struct { // tombstonesSeqNum. This is the case for all new iterators created over a // batch but it's not the case for all cloned iterators. rangeKeys []keyspan.Span - rangeKeysSeqNum uint64 + rangeKeysSeqNum base.SeqNum // The flushableBatch wrapper if the batch is too large to fit in the // memtable. @@ -378,14 +447,18 @@ var indexedBatchPool = sync.Pool{ }, } -func newBatch(db *DB) *Batch { +func newBatch(db *DB, opts ...BatchOption) *Batch { b := batchPool.Get().(*Batch) b.db = db + b.opts.ensureDefaults() + for _, opt := range opts { + opt(&b.opts) + } return b } -func newBatchWithSize(db *DB, size int) *Batch { - b := newBatch(db) +func newBatchWithSize(db *DB, size int, opts ...BatchOption) *Batch { + b := newBatch(db, opts...) 
if cap(b.data) < size { b.data = rawalloc.New(0, size) } @@ -394,12 +467,11 @@ func newBatchWithSize(db *DB, size int) *Batch { func newIndexedBatch(db *DB, comparer *Comparer) *Batch { i := indexedBatchPool.Get().(*indexedBatch) - i.batch.cmp = comparer.Compare - i.batch.formatKey = comparer.FormatKey - i.batch.abbreviatedKey = comparer.AbbreviatedKey + i.batch.comparer = comparer i.batch.db = db i.batch.index = &i.index - i.batch.index.Init(&i.batch.data, i.batch.cmp, i.batch.abbreviatedKey) + i.batch.index.Init(&i.batch.data, comparer.Compare, comparer.AbbreviatedKey) + i.batch.opts.ensureDefaults() return &i.batch } @@ -414,11 +486,11 @@ func newIndexedBatchWithSize(db *DB, comparer *Comparer, size int) *Batch { // nextSeqNum returns the batch "sequence number" that will be given to the next // key written to the batch. During iteration keys within an indexed batch are // given a sequence number consisting of their offset within the batch combined -// with the base.InternalKeySeqNumBatch bit. These sequence numbers are only +// with the base.SeqNumBatchBit bit. These sequence numbers are only // used during iteration, and the keys are assigned ordinary sequence numbers // when the batch is committed. -func (b *Batch) nextSeqNum() uint64 { - return uint64(len(b.data)) | base.InternalKeySeqNumBatch +func (b *Batch) nextSeqNum() base.SeqNum { + return base.SeqNum(len(b.data)) | base.SeqNumBatchBit } func (b *Batch) release() { @@ -436,10 +508,8 @@ func (b *Batch) release() { // but necessary so that we can use atomic.StoreUint32 for the Batch.applied // field. Without using an atomic to clear that field the Go race detector // complains. 
- b.Reset() - b.cmp = nil - b.formatKey = nil - b.abbreviatedKey = nil + b.reset() + b.comparer = nil if b.index == nil { batchPool.Put(b) @@ -451,7 +521,7 @@ func (b *Batch) release() { func (b *Batch) refreshMemTableSize() error { b.memTableSize = 0 - if len(b.data) < batchHeaderLen { + if len(b.data) < batchrepr.HeaderLen { return nil } @@ -471,28 +541,46 @@ func (b *Batch) refreshMemTableSize() error { b.countRangeDels++ case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: b.countRangeKeys++ + case InternalKeyKindSet, InternalKeyKindDelete, InternalKeyKindMerge, InternalKeyKindSingleDelete, InternalKeyKindSetWithDelete: + // fallthrough case InternalKeyKindDeleteSized: if b.minimumFormatMajorVersion < FormatDeleteSizedAndObsolete { b.minimumFormatMajorVersion = FormatDeleteSizedAndObsolete } + case InternalKeyKindLogData: + // LogData does not contribute to memtable size. + continue case InternalKeyKindIngestSST: if b.minimumFormatMajorVersion < FormatFlushableIngest { b.minimumFormatMajorVersion = FormatFlushableIngest } // This key kind doesn't contribute to the memtable size. continue + case InternalKeyKindExcise: + if b.minimumFormatMajorVersion < FormatFlushableIngestExcises { + b.minimumFormatMajorVersion = FormatFlushableIngestExcises + } + // This key kind doesn't contribute to the memtable size. + continue + default: + // Note In some circumstances this might be temporary memory + // corruption that can be recovered by discarding the batch and + // trying again. In other cases, the batch repr might've been + // already persisted elsewhere, and we'll loop continuously trying + // to commit the same corrupted batch. The caller is responsible for + // distinguishing. 
+ return errors.Wrapf(ErrInvalidBatch, "unrecognized kind %v", kind) } b.memTableSize += memTableEntrySize(len(key), len(value)) } - if b.countRangeKeys > 0 && b.minimumFormatMajorVersion < FormatRangeKeys { - b.minimumFormatMajorVersion = FormatRangeKeys - } return nil } // Apply the operations contained in the batch to the receiver batch. // // It is safe to modify the contents of the arguments after Apply returns. +// +// Apply returns ErrInvalidBatch if the provided batch is invalid in any way. func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error { if b.ingestedSSTBatch { panic("pebble: invalid batch application") @@ -500,23 +588,23 @@ func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error { if len(batch.data) == 0 { return nil } - if len(batch.data) < batchHeaderLen { + if len(batch.data) < batchrepr.HeaderLen { return ErrInvalidBatch } offset := len(b.data) if offset == 0 { b.init(offset) - offset = batchHeaderLen + offset = batchrepr.HeaderLen } - b.data = append(b.data, batch.data[batchHeaderLen:]...) + b.data = append(b.data, batch.data[batchrepr.HeaderLen:]...) b.setCount(b.Count() + batch.Count()) if b.db != nil || b.index != nil { // Only iterate over the new entries if we need to track memTableSize or in // order to update the index. - for iter := BatchReader(b.data[offset:]); len(iter) > 0; { + for iter := batchrepr.Reader(b.data[offset:]); len(iter) > 0; { offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0])) kind, key, value, ok, err := iter.Next() if !ok { @@ -530,8 +618,22 @@ func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error { b.countRangeDels++ case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: b.countRangeKeys++ - case InternalKeyKindIngestSST: + case InternalKeyKindIngestSST, InternalKeyKindExcise: panic("pebble: invalid key kind for batch") + case InternalKeyKindLogData: + // LogData does not contribute to memtable size. 
+ continue + case InternalKeyKindSet, InternalKeyKindDelete, InternalKeyKindMerge, + InternalKeyKindSingleDelete, InternalKeyKindSetWithDelete, InternalKeyKindDeleteSized: + // fallthrough + default: + // Note In some circumstances this might be temporary memory + // corruption that can be recovered by discarding the batch and + // trying again. In other cases, the batch repr might've been + // already persisted elsewhere, and we'll loop continuously + // trying to commit the same corrupted batch. The caller is + // responsible for distinguishing. + return errors.Wrapf(ErrInvalidBatch, "unrecognized kind %v", kind) } if b.index != nil { var err error @@ -540,14 +642,14 @@ func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error { b.tombstones = nil b.tombstonesSeqNum = 0 if b.rangeDelIndex == nil { - b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) + b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.comparer.Compare, b.comparer.AbbreviatedKey) } err = b.rangeDelIndex.Add(uint32(offset)) case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: b.rangeKeys = nil b.rangeKeysSeqNum = 0 if b.rangeKeyIndex == nil { - b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) + b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.comparer.Compare, b.comparer.AbbreviatedKey) } err = b.rangeKeyIndex.Add(uint32(offset)) default: @@ -582,7 +684,7 @@ func (b *Batch) prepareDeferredKeyValueRecord(keyLen, valueLen int, kind Interna panic("pebble: batch already committing") } if len(b.data) == 0 { - b.init(keyLen + valueLen + 2*binary.MaxVarintLen64 + batchHeaderLen) + b.init(keyLen + valueLen + 2*binary.MaxVarintLen64 + batchrepr.HeaderLen) } b.count++ b.memTableSize += memTableEntrySize(keyLen, valueLen) @@ -634,7 +736,7 @@ func (b *Batch) prepareDeferredKeyRecord(keyLen int, kind InternalKeyKind) { panic("pebble: batch already committing") } if len(b.data) == 0 { - b.init(keyLen + binary.MaxVarintLen64 + 
batchHeaderLen) + b.init(keyLen + binary.MaxVarintLen64 + batchrepr.HeaderLen) } b.count++ b.memTableSize += memTableEntrySize(keyLen, 0) @@ -666,31 +768,38 @@ func (b *Batch) prepareDeferredKeyRecord(keyLen int, kind InternalKeyKind) { b.data = b.data[:pos+keyLen] } -// AddInternalKey allows the caller to add an internal key of point key kinds to -// a batch. Passing in an internal key of kind RangeKey* or RangeDelete will -// result in a panic. Note that the seqnum in the internal key is effectively -// ignored, even though the Kind is preserved. This is because the batch format -// does not allow for a per-key seqnum to be specified, only a batch-wide one. +// AddInternalKey allows the caller to add an internal key of point key or range +// key kinds (but not RangeDelete) to a batch. Passing in an internal key of +// kind RangeDelete will result in a panic. Note that the seqnum in the internal +// key is effectively ignored, even though the Kind is preserved. This is +// because the batch format does not allow for a per-key seqnum to be specified, +// only a batch-wide one. // // Note that non-indexed keys (IngestKeyKind{LogData,IngestSST}) are not // supported with this method as they require specialized logic. 
func (b *Batch) AddInternalKey(key *base.InternalKey, value []byte, _ *WriteOptions) error { keyLen := len(key.UserKey) hasValue := false - switch key.Kind() { - case InternalKeyKindRangeDelete, InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: - panic("unexpected range delete or range key kind in AddInternalKey") + switch kind := key.Kind(); kind { + case InternalKeyKindRangeDelete: + panic("unexpected range delete in AddInternalKey") case InternalKeyKindSingleDelete, InternalKeyKindDelete: - b.prepareDeferredKeyRecord(len(key.UserKey), key.Kind()) + b.prepareDeferredKeyRecord(keyLen, kind) + b.deferredOp.index = b.index + case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: + b.prepareDeferredKeyValueRecord(keyLen, len(value), kind) + hasValue = true + b.incrementRangeKeysCount() default: - b.prepareDeferredKeyValueRecord(keyLen, len(value), key.Kind()) + b.prepareDeferredKeyValueRecord(keyLen, len(value), kind) hasValue = true + b.deferredOp.index = b.index } - b.deferredOp.index = b.index copy(b.deferredOp.Key, key.UserKey) if hasValue { copy(b.deferredOp.Value, value) } + // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining // in go1.13 will remove the need for this. if b.index != nil { @@ -850,7 +959,7 @@ func (b *Batch) DeleteSizedDeferred(keyLen int, deletedValueSize uint32) *Deferr } // SingleDelete adds an action to the batch that single deletes the entry for key. -// See Writer.SingleDelete for more details on the semantics of SingleDelete. +// WARNING: See the detailed warning in Writer.SingleDelete before using this. // // It is safe to modify the contents of the arguments after SingleDelete returns. 
func (b *Batch) SingleDelete(key []byte, _ *WriteOptions) error { @@ -870,6 +979,8 @@ func (b *Batch) SingleDelete(key []byte, _ *WriteOptions) error { // operation to the batch, except it only takes in key/value lengths instead of // complete slices, letting the caller encode into those objects and then call // Finish() on the returned object. +// +// WARNING: See the detailed warning in Writer.SingleDelete before using this. func (b *Batch) SingleDeleteDeferred(keyLen int) *DeferredBatchOp { b.prepareDeferredKeyRecord(keyLen, InternalKeyKindSingleDelete) b.deferredOp.index = b.index @@ -910,7 +1021,7 @@ func (b *Batch) DeleteRangeDeferred(startLen, endLen int) *DeferredBatchOp { b.tombstonesSeqNum = 0 // Range deletions are rare, so we lazily allocate the index for them. if b.rangeDelIndex == nil { - b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) + b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.comparer.Compare, b.comparer.AbbreviatedKey) } b.deferredOp.index = b.rangeDelIndex } @@ -924,6 +1035,15 @@ func (b *Batch) DeleteRangeDeferred(startLen, endLen int) *DeferredBatchOp { // // It is safe to modify the contents of the arguments after RangeKeySet returns. func (b *Batch) RangeKeySet(start, end, suffix, value []byte, _ *WriteOptions) error { + if invariants.Enabled && b.db != nil { + // RangeKeySet is only supported on prefix keys. 
+ if b.db.opts.Comparer.Split(start) != len(start) { + panic("RangeKeySet called with suffixed start key") + } + if b.db.opts.Comparer.Split(end) != len(end) { + panic("RangeKeySet called with suffixed end key") + } + } suffixValues := [1]rangekey.SuffixValue{{Suffix: suffix, Value: value}} internalValueLen := rangekey.EncodedSetValueLen(end, suffixValues[:]) @@ -951,15 +1071,12 @@ func (b *Batch) rangeKeySetDeferred(startLen, internalValueLen int) *DeferredBat func (b *Batch) incrementRangeKeysCount() { b.countRangeKeys++ - if b.minimumFormatMajorVersion < FormatRangeKeys { - b.minimumFormatMajorVersion = FormatRangeKeys - } if b.index != nil { b.rangeKeys = nil b.rangeKeysSeqNum = 0 // Range keys are rare, so we lazily allocate the index for them. if b.rangeKeyIndex == nil { - b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) + b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.comparer.Compare, b.comparer.AbbreviatedKey) } b.deferredOp.index = b.rangeKeyIndex } @@ -974,6 +1091,15 @@ func (b *Batch) incrementRangeKeysCount() { // It is safe to modify the contents of the arguments after RangeKeyUnset // returns. func (b *Batch) RangeKeyUnset(start, end, suffix []byte, _ *WriteOptions) error { + if invariants.Enabled && b.db != nil { + // RangeKeyUnset is only supported on prefix keys. + if b.db.opts.Comparer.Split(start) != len(start) { + panic("RangeKeyUnset called with suffixed start key") + } + if b.db.opts.Comparer.Split(end) != len(end) { + panic("RangeKeyUnset called with suffixed end key") + } + } suffixes := [1][]byte{suffix} internalValueLen := rangekey.EncodedUnsetValueLen(end, suffixes[:]) @@ -1007,6 +1133,15 @@ func (b *Batch) rangeKeyUnsetDeferred(startLen, internalValueLen int) *DeferredB // It is safe to modify the contents of the arguments after RangeKeyDelete // returns. 
func (b *Batch) RangeKeyDelete(start, end []byte, _ *WriteOptions) error { + if invariants.Enabled && b.db != nil { + // RangeKeyDelete is only supported on prefix keys. + if b.db.opts.Comparer.Split(start) != len(start) { + panic("RangeKeyDelete called with suffixed start key") + } + if b.db.opts.Comparer.Split(end) != len(end) { + panic("RangeKeyDelete called with suffixed end key") + } + } deferredOp := b.RangeKeyDeleteDeferred(len(start), len(end)) copy(deferredOp.Key, start) copy(deferredOp.Value, end) @@ -1047,9 +1182,9 @@ func (b *Batch) LogData(data []byte, _ *WriteOptions) error { return nil } -// IngestSST adds the FileNum for an sstable to the batch. The data will only be +// IngestSST adds the TableNum for an sstable to the batch. The data will only be // written to the WAL (not added to memtables or sstables). -func (b *Batch) ingestSST(fileNum base.FileNum) { +func (b *Batch) ingestSST(tableNum base.TableNum) { if b.Empty() { b.ingestedSSTBatch = true } else if !b.ingestedSSTBatch { @@ -1059,7 +1194,7 @@ func (b *Batch) ingestSST(fileNum base.FileNum) { origMemTableSize := b.memTableSize var buf [binary.MaxVarintLen64]byte - length := binary.PutUvarint(buf[:], uint64(fileNum)) + length := binary.PutUvarint(buf[:], uint64(tableNum)) b.prepareDeferredKeyRecord(length, InternalKeyKindIngestSST) copy(b.deferredOp.Key, buf[:length]) // Since IngestSST writes only to the WAL and does not affect the memtable, @@ -1070,17 +1205,36 @@ func (b *Batch) ingestSST(fileNum base.FileNum) { b.minimumFormatMajorVersion = FormatFlushableIngest } +// Excise adds the excise span for a flushable ingest containing an excise. The data +// will only be written to the WAL (not added to memtables or sstables). +func (b *Batch) excise(start, end []byte) { + if b.Empty() { + b.ingestedSSTBatch = true + } else if !b.ingestedSSTBatch { + // Batch contains other key kinds. 
+ panic("pebble: invalid call to excise") + } + + origMemTableSize := b.memTableSize + b.prepareDeferredKeyValueRecord(len(start), len(end), InternalKeyKindExcise) + copy(b.deferredOp.Key, start) + copy(b.deferredOp.Value, end) + // Since excise writes only to the WAL and does not affect the memtable, + // we restore b.memTableSize to its original value. Note that Batch.count + // is not reset because for the InternalKeyKindIngestSST/Excise the count + // is the number of sstable paths which have been added to the batch. + b.memTableSize = origMemTableSize + b.minimumFormatMajorVersion = FormatFlushableIngestExcises +} + // Empty returns true if the batch is empty, and false otherwise. func (b *Batch) Empty() bool { - return len(b.data) <= batchHeaderLen + return batchrepr.IsEmpty(b.data) } // Len returns the current size of the batch in bytes. func (b *Batch) Len() int { - if len(b.data) <= batchHeaderLen { - return batchHeaderLen - } - return len(b.data) + return max(batchrepr.HeaderLen, len(b.data)) } // Repr returns the underlying batch representation. It is not safe to modify @@ -1088,21 +1242,25 @@ func (b *Batch) Len() int { // though any other mutation operation may do so. func (b *Batch) Repr() []byte { if len(b.data) == 0 { - b.init(batchHeaderLen) + b.init(batchrepr.HeaderLen) } - binary.LittleEndian.PutUint32(b.countData(), b.Count()) + batchrepr.SetCount(b.data, b.Count()) return b.data } // SetRepr sets the underlying batch representation. The batch takes ownership // of the supplied slice. It is not safe to modify it afterwards until the // Batch is no longer in use. +// +// SetRepr may return ErrInvalidBatch if the supplied slice fails to decode in +// any way. It will not return an error in any other circumstance. 
func (b *Batch) SetRepr(data []byte) error { - if len(data) < batchHeaderLen { - return base.CorruptionErrorf("invalid batch") + h, ok := batchrepr.ReadHeader(data) + if !ok { + return ErrInvalidBatch } b.data = data - b.count = uint64(binary.LittleEndian.Uint32(b.countData())) + b.count = uint64(h.Count) var err error if b.db != nil { // Only track memTableSize for batches that will be committed to the DB. @@ -1119,16 +1277,29 @@ func (b *Batch) SetRepr(data []byte) error { // later mutations. Its view can be refreshed via RefreshBatchSnapshot or // SetOptions(). func (b *Batch) NewIter(o *IterOptions) (*Iterator, error) { - return b.NewIterWithContext(context.Background(), o), nil + return b.NewIterWithContext(context.Background(), o) } // NewIterWithContext is like NewIter, and additionally accepts a context for // tracing. -func (b *Batch) NewIterWithContext(ctx context.Context, o *IterOptions) *Iterator { +func (b *Batch) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) { + if b.index == nil { + return nil, ErrNotIndexed + } + return b.db.newIter(ctx, b, newIterOpts{}, o), nil +} + +// NewBatchOnlyIter constructs an iterator that only reads the contents of the +// batch, and does not overlay the batch mutations on top of the DB state. +// +// The returned Iterator observes all of the Batch's existing mutations, but +// no later mutations. Its view can be refreshed via RefreshBatchSnapshot or +// SetOptions(). 
+func (b *Batch) NewBatchOnlyIter(ctx context.Context, o *IterOptions) (*Iterator, error) { if b.index == nil { - return &Iterator{err: ErrNotIndexed} + return nil, ErrNotIndexed } - return b.db.newIter(ctx, b, snapshotIterOpts{}, o) + return b.db.newIter(ctx, b, newIterOpts{batch: batchIterOpts{batchOnly: true}}, o), nil } // newInternalIter creates a new internalIterator that iterates over the @@ -1141,7 +1312,6 @@ func (b *Batch) newInternalIter(o *IterOptions) *batchIter { func (b *Batch) initInternalIter(o *IterOptions, iter *batchIter) { *iter = batchIter{ - cmp: b.cmp, batch: b, iter: b.index.NewIter(o.GetLowerBound(), o.GetUpperBound()), // NB: We explicitly do not propagate the batch snapshot to the point @@ -1165,11 +1335,11 @@ func (b *Batch) initInternalIter(o *IterOptions, iter *batchIter) { // Filtering these batch points within the merging iterator ensures that // the batch iterator never needs to iterate beyond 'baz', because it // already found a smaller, visible key 'bax'. - snapshot: base.InternalKeySeqNumMax, + snapshot: base.SeqNumMax, } } -func (b *Batch) newRangeDelIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter { +func (b *Batch) newRangeDelIter(o *IterOptions, batchSnapshot base.SeqNum) *keyspan.Iter { // Construct an iterator even if rangeDelIndex is nil, because it is allowed // to refresh later, so we need the container to exist. iter := new(keyspan.Iter) @@ -1177,9 +1347,9 @@ func (b *Batch) newRangeDelIter(o *IterOptions, batchSnapshot uint64) *keyspan.I return iter } -func (b *Batch) initRangeDelIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) { +func (b *Batch) initRangeDelIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot base.SeqNum) { if b.rangeDelIndex == nil { - iter.Init(b.cmp, nil) + iter.Init(b.comparer.Compare, nil) return } @@ -1193,26 +1363,25 @@ func (b *Batch) initRangeDelIter(_ *IterOptions, iter *keyspan.Iter, batchSnapsh // cleared. 
nextSeqNum := b.nextSeqNum() if b.tombstones != nil && b.tombstonesSeqNum <= batchSnapshot { - iter.Init(b.cmp, b.tombstones) + iter.Init(b.comparer.Compare, b.tombstones) return } tombstones := make([]keyspan.Span, 0, b.countRangeDels) frag := &keyspan.Fragmenter{ - Cmp: b.cmp, - Format: b.formatKey, + Cmp: b.comparer.Compare, + Format: b.comparer.FormatKey, Emit: func(s keyspan.Span) { tombstones = append(tombstones, s) }, } it := &batchIter{ - cmp: b.cmp, batch: b, iter: b.rangeDelIndex.NewIter(nil, nil), snapshot: batchSnapshot, } fragmentRangeDels(frag, it, int(b.countRangeDels)) - iter.Init(b.cmp, tombstones) + iter.Init(b.comparer.Compare, tombstones) // If we just read all the tombstones in the batch (eg, batchSnapshot was // set to b.nextSeqNum()), then cache the tombstones so that a subsequent @@ -1234,8 +1403,8 @@ func fragmentRangeDels(frag *keyspan.Fragmenter, it internalIterator, count int) // Use a single []keyspan.Key buffer to avoid allocating many // individual []keyspan.Key slices with a single element each. keyBuf := make([]keyspan.Key, 0, count) - for key, val := it.First(); key != nil; key, val = it.Next() { - s := rangedel.Decode(*key, val.InPlaceValue(), keyBuf) + for kv := it.First(); kv != nil; kv = it.Next() { + s := rangedel.Decode(kv.K, kv.InPlaceValue(), keyBuf) keyBuf = s.Keys[len(s.Keys):] // Set a fixed capacity to avoid accidental overwriting. @@ -1245,7 +1414,7 @@ func fragmentRangeDels(frag *keyspan.Fragmenter, it internalIterator, count int) frag.Finish() } -func (b *Batch) newRangeKeyIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter { +func (b *Batch) newRangeKeyIter(o *IterOptions, batchSnapshot base.SeqNum) *keyspan.Iter { // Construct an iterator even if rangeKeyIndex is nil, because it is allowed // to refresh later, so we need the container to exist. 
iter := new(keyspan.Iter) @@ -1253,9 +1422,9 @@ func (b *Batch) newRangeKeyIter(o *IterOptions, batchSnapshot uint64) *keyspan.I return iter } -func (b *Batch) initRangeKeyIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) { +func (b *Batch) initRangeKeyIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot base.SeqNum) { if b.rangeKeyIndex == nil { - iter.Init(b.cmp, nil) + iter.Init(b.comparer.Compare, nil) return } @@ -1268,26 +1437,25 @@ func (b *Batch) initRangeKeyIter(_ *IterOptions, iter *keyspan.Iter, batchSnapsh // sequence number the cache would've been cleared. nextSeqNum := b.nextSeqNum() if b.rangeKeys != nil && b.rangeKeysSeqNum <= batchSnapshot { - iter.Init(b.cmp, b.rangeKeys) + iter.Init(b.comparer.Compare, b.rangeKeys) return } rangeKeys := make([]keyspan.Span, 0, b.countRangeKeys) frag := &keyspan.Fragmenter{ - Cmp: b.cmp, - Format: b.formatKey, + Cmp: b.comparer.Compare, + Format: b.comparer.FormatKey, Emit: func(s keyspan.Span) { rangeKeys = append(rangeKeys, s) }, } it := &batchIter{ - cmp: b.cmp, batch: b, iter: b.rangeKeyIndex.NewIter(nil, nil), snapshot: batchSnapshot, } - fragmentRangeKeys(frag, it, int(b.countRangeKeys)) - iter.Init(b.cmp, rangeKeys) + _ = fragmentRangeKeys(frag, it, int(b.countRangeKeys)) + iter.Init(b.comparer.Compare, rangeKeys) // If we just read all the range keys in the batch (eg, batchSnapshot was // set to b.nextSeqNum()), then cache the range keys so that a subsequent @@ -1309,8 +1477,8 @@ func fragmentRangeKeys(frag *keyspan.Fragmenter, it internalIterator, count int) // Use a single []keyspan.Key buffer to avoid allocating many // individual []keyspan.Key slices with a single element each. 
keyBuf := make([]keyspan.Key, 0, count) - for ik, val := it.First(); ik != nil; ik, val = it.Next() { - s, err := rangekey.Decode(*ik, val.InPlaceValue(), keyBuf) + for kv := it.First(); kv != nil; kv = it.Next() { + s, err := rangekey.Decode(kv.K, kv.InPlaceValue(), keyBuf) if err != nil { return err } @@ -1331,8 +1499,41 @@ func (b *Batch) Commit(o *WriteOptions) error { // Close closes the batch without committing it. func (b *Batch) Close() error { - b.release() - return nil + // The storage engine commit pipeline may retain a pointer to b.data beyond + // when Commit() returns. This is possible when configured for WAL failover; + // we don't know if we might need to read the batch data again until the + // batch has been durably synced [even if the committer doesn't care to wait + // for the sync and Sync()=false]. + // + // We still want to recycle these batches. The b.lifecycle atomic negotiates + // the batch's lifecycle. If the commit pipeline still might read b.data, + // b.lifecycle will be nonzeroed [the low bits hold a ref count]. + for { + v := b.lifecycle.Load() + switch { + case v == 0: + // A zero value indicates that the commit pipeline has no + // outstanding references to the batch. The commit pipeline is + // required to acquire a ref synchronously, so there is no risk that + // the commit pipeline will grab a ref after the call to release. We + // can simply release the batch. + b.release() + return nil + case (v & batchClosedBit) != 0: + // The batch has a batchClosedBit: This batch has already been closed. + return ErrClosed + default: + // There's an outstanding reference. Set the batch released bit so + // that the commit pipeline knows it should release the batch when + // it unrefs. + if b.lifecycle.CompareAndSwap(v, v|batchClosedBit) { + return nil + } + // CAS Failed—this indicates the outstanding reference just + // decremented (or the caller illegally closed the batch twice). + // Loop to reload. 
+ } + } } // Indexed returns true if the batch is indexed (i.e. supports read @@ -1344,18 +1545,16 @@ func (b *Batch) Indexed() bool { // init ensures that the batch data slice is initialized to meet the // minimum required size and allocates space for the batch header. func (b *Batch) init(size int) { - n := batchInitialSize + b.opts.ensureDefaults() + n := b.opts.initialSizeBytes for n < size { n *= 2 } if cap(b.data) < n { - b.data = rawalloc.New(batchHeaderLen, n) - } - b.data = b.data[:batchHeaderLen] - // Zero the sequence number in the header. - for i := 0; i < len(b.data); i++ { - b.data[i] = 0 + b.data = rawalloc.New(batchrepr.HeaderLen, n) } + b.data = b.data[:batchrepr.HeaderLen] + clear(b.data) // Zero the sequence number in the header } // Reset resets the batch for reuse. The underlying byte slice (that is @@ -1364,19 +1563,31 @@ func (b *Batch) init(size int) { // of releasing resources when appropriate for batches that are internally // being reused. func (b *Batch) Reset() { + // In some configurations (WAL failover) the commit pipeline may retain + // b.data beyond a call to commit the batch. When this happens, b.lifecycle + // is nonzero (see the comment above b.lifecycle). In this case it's unsafe + // to mutate b.data, so we discard it. Note that Reset must not be called on + // a closed batch, so v > 0 implies a non-zero ref count and not + // batchClosedBit being set. + if v := b.lifecycle.Load(); v > 0 { + b.data = nil + } + b.reset() +} + +func (b *Batch) reset() { // Zero out the struct, retaining only the fields necessary for manual // reuse. 
b.batchInternal = batchInternal{ - data: b.data, - cmp: b.cmp, - formatKey: b.formatKey, - abbreviatedKey: b.abbreviatedKey, - index: b.index, - db: b.db, + data: b.data, + comparer: b.comparer, + opts: b.opts, + index: b.index, + db: b.db, } b.applied.Store(false) if b.data != nil { - if cap(b.data) > batchMaxRetainedSize { + if cap(b.data) > b.opts.maxRetainedSizeBytes { // If the capacity of the buffer is larger than our maximum // retention size, don't re-use it. Let it be GC-ed instead. // This prevents the memory from an unusually large batch from @@ -1384,30 +1595,15 @@ func (b *Batch) Reset() { b.data = nil } else { // Otherwise, reset the buffer for re-use. - b.data = b.data[:batchHeaderLen] - // Zero the sequence number in the header. - for i := 0; i < len(b.data); i++ { - b.data[i] = 0 - } + b.data = b.data[:batchrepr.HeaderLen] + clear(b.data) } } if b.index != nil { - b.index.Init(&b.data, b.cmp, b.abbreviatedKey) + b.index.Init(&b.data, b.comparer.Compare, b.comparer.AbbreviatedKey) } } -// seqNumData returns the 8 byte little-endian sequence number. Zero means that -// the batch has not yet been applied. -func (b *Batch) seqNumData() []byte { - return b.data[:8] -} - -// countData returns the 4 byte little-endian count data. "\xff\xff\xff\xff" -// means that the batch is invalid. -func (b *Batch) countData() []byte { - return b.data[8:12] -} - func (b *Batch) grow(n int) { newSize := len(b.data) + n if uint64(newSize) >= maxBatchSize { @@ -1425,18 +1621,18 @@ func (b *Batch) grow(n int) { b.data = b.data[:newSize] } -func (b *Batch) setSeqNum(seqNum uint64) { - binary.LittleEndian.PutUint64(b.seqNumData(), seqNum) +func (b *Batch) setSeqNum(seqNum base.SeqNum) { + batchrepr.SetSeqNum(b.data, seqNum) } // SeqNum returns the batch sequence number which is applied to the first // record in the batch. The sequence number is incremented for each subsequent // record. It returns zero if the batch is empty. 
-func (b *Batch) SeqNum() uint64 { +func (b *Batch) SeqNum() base.SeqNum { if len(b.data) == 0 { - b.init(batchHeaderLen) + b.init(batchrepr.HeaderLen) } - return binary.LittleEndian.Uint64(b.seqNumData()) + return batchrepr.ReadSeqNum(b.data) } func (b *Batch) setCount(v uint32) { @@ -1449,62 +1645,28 @@ func (b *Batch) setCount(v uint32) { // batch isn't applied to the memtable. func (b *Batch) Count() uint32 { if b.count > math.MaxUint32 { - panic(ErrInvalidBatch) + panic(batchrepr.ErrInvalidBatch) } return uint32(b.count) } -// Reader returns a BatchReader for the current batch contents. If the batch is -// mutated, the new entries will not be visible to the reader. -func (b *Batch) Reader() BatchReader { +// Reader returns a batchrepr.Reader for the current batch contents. If the +// batch is mutated, the new entries will not be visible to the reader. +func (b *Batch) Reader() batchrepr.Reader { if len(b.data) == 0 { - b.init(batchHeaderLen) - } - return b.data[batchHeaderLen:] -} - -func batchDecodeStr(data []byte) (odata []byte, s []byte, ok bool) { - // TODO(jackson): This will index out of bounds if there's no varint or an - // invalid varint (eg, a single 0xff byte). Correcting will add a bit of - // overhead. We could avoid that overhead whenever len(data) >= - // binary.MaxVarint32? 
- - var v uint32 - var n int - ptr := unsafe.Pointer(&data[0]) - if a := *((*uint8)(ptr)); a < 128 { - v = uint32(a) - n = 1 - } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { - v = uint32(b)<<7 | uint32(a) - n = 2 - } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { - v = uint32(c)<<14 | uint32(b)<<7 | uint32(a) - n = 3 - } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { - v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - n = 4 - } else { - d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) - v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) - n = 5 - } - - data = data[n:] - if v > uint32(len(data)) { - return nil, nil, false + b.init(batchrepr.HeaderLen) } - return data[v:], data[:v], true + return batchrepr.Read(b.data) } // SyncWait is to be used in conjunction with DB.ApplyNoSyncWait. func (b *Batch) SyncWait() error { - now := time.Now() + now := crtime.NowMono() b.fsyncWait.Wait() if b.commitErr != nil { b.db = nil // prevent batch reuse on error } - waitDuration := time.Since(now) + waitDuration := now.Elapsed() b.commitStats.CommitWaitDuration += waitDuration b.commitStats.TotalDuration += waitDuration return b.commitErr @@ -1517,60 +1679,18 @@ func (b *Batch) CommitStats() BatchCommitStats { return b.commitStats } -// BatchReader iterates over the entries contained in a batch. -type BatchReader []byte - -// ReadBatch constructs a BatchReader from a batch representation. The -// header is not validated. ReadBatch returns a new batch reader and the -// count of entries contained within the batch. -func ReadBatch(repr []byte) (r BatchReader, count uint32) { - if len(repr) <= batchHeaderLen { - return nil, count - } - count = binary.LittleEndian.Uint32(repr[batchCountOffset:batchHeaderLen]) - return repr[batchHeaderLen:], count -} - -// Next returns the next entry in this batch, if there is one. 
If the reader has -// reached the end of the batch, Next returns ok=false and a nil error. If the -// batch is corrupt and the next entry is illegible, Next returns ok=false and a -// non-nil error. -func (r *BatchReader) Next() (kind InternalKeyKind, ukey []byte, value []byte, ok bool, err error) { - if len(*r) == 0 { - return 0, nil, nil, false, nil - } - kind = InternalKeyKind((*r)[0]) - if kind > InternalKeyKindMax { - return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "invalid key kind 0x%x", (*r)[0]) - } - *r, ukey, ok = batchDecodeStr((*r)[1:]) - if !ok { - return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "decoding user key") - } - switch kind { - case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete, - InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete, - InternalKeyKindDeleteSized: - *r, value, ok = batchDecodeStr(*r) - if !ok { - return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "decoding %s value", kind) - } - } - return kind, ukey, value, true, nil -} - // Note: batchIter mirrors the implementation of flushableBatchIter. Keep the // two in sync. type batchIter struct { - cmp Compare batch *Batch iter batchskl.Iterator + kv base.InternalKV err error // snapshot holds a batch "sequence number" at which the batch is being // read. This sequence number has the InternalKeySeqNumBatch bit set, so it // encodes an offset within the batch. Only batch entries earlier than the // offset are visible during iteration. - snapshot uint64 + snapshot base.SeqNum } // batchIter implements the base.InternalIterator interface. @@ -1580,7 +1700,7 @@ func (i *batchIter) String() string { return "batch" } -func (i *batchIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { +func (i *batchIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { // Ignore TrySeekUsingNext if the view of the batch changed. 
if flags.TrySeekUsingNext() && flags.BatchJustRefreshed() { flags = flags.DisableTrySeekUsingNext() @@ -1592,66 +1712,87 @@ func (i *batchIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, ba ikey = i.iter.Next() } if ikey == nil { - return nil, base.LazyValue{} + i.kv = base.InternalKV{} + return nil } - return ikey, base.MakeInPlaceValue(i.value()) + i.kv.K = *ikey + i.kv.V = base.MakeInPlaceValue(i.value()) + return &i.kv } -func (i *batchIter) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - i.err = nil // clear cached iteration error - return i.SeekGE(key, flags) +func (i *batchIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV { + kv := i.SeekGE(key, flags) + if kv == nil { + return nil + } + // If the key doesn't have the sought prefix, return nil. + if !bytes.Equal(i.batch.comparer.Split.Prefix(kv.K.UserKey), prefix) { + i.kv = base.InternalKV{} + return nil + } + return kv } -func (i *batchIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { +func (i *batchIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { i.err = nil // clear cached iteration error ikey := i.iter.SeekLT(key) for ikey != nil && ikey.SeqNum() >= i.snapshot { ikey = i.iter.Prev() } if ikey == nil { - return nil, base.LazyValue{} + i.kv = base.InternalKV{} + return nil } - return ikey, base.MakeInPlaceValue(i.value()) + i.kv.K = *ikey + i.kv.V = base.MakeInPlaceValue(i.value()) + return &i.kv } -func (i *batchIter) First() (*InternalKey, base.LazyValue) { +func (i *batchIter) First() *base.InternalKV { i.err = nil // clear cached iteration error ikey := i.iter.First() for ikey != nil && ikey.SeqNum() >= i.snapshot { ikey = i.iter.Next() } if ikey == nil { - return nil, base.LazyValue{} + i.kv = base.InternalKV{} + return nil } - return ikey, base.MakeInPlaceValue(i.value()) + i.kv.K = *ikey + i.kv.V = base.MakeInPlaceValue(i.value()) + return &i.kv } 
-func (i *batchIter) Last() (*InternalKey, base.LazyValue) { +func (i *batchIter) Last() *base.InternalKV { i.err = nil // clear cached iteration error ikey := i.iter.Last() for ikey != nil && ikey.SeqNum() >= i.snapshot { ikey = i.iter.Prev() } if ikey == nil { - return nil, base.LazyValue{} + i.kv = base.InternalKV{} + return nil } - return ikey, base.MakeInPlaceValue(i.value()) + i.kv.K = *ikey + i.kv.V = base.MakeInPlaceValue(i.value()) + return &i.kv } -func (i *batchIter) Next() (*InternalKey, base.LazyValue) { +func (i *batchIter) Next() *base.InternalKV { ikey := i.iter.Next() for ikey != nil && ikey.SeqNum() >= i.snapshot { ikey = i.iter.Next() } if ikey == nil { - return nil, base.LazyValue{} + i.kv = base.InternalKV{} + return nil } - return ikey, base.MakeInPlaceValue(i.value()) + i.kv.K = *ikey + i.kv.V = base.MakeInPlaceValue(i.value()) + return &i.kv } -func (i *batchIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) { +func (i *batchIter) NextPrefix(succKey []byte) *base.InternalKV { // Because NextPrefix was invoked `succKey` must be ≥ the key at i's current // position. Seek the arena iterator using TrySeekUsingNext. 
ikey := i.iter.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext()) @@ -1659,20 +1800,26 @@ func (i *batchIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) { ikey = i.iter.Next() } if ikey == nil { - return nil, base.LazyValue{} + i.kv = base.InternalKV{} + return nil } - return ikey, base.MakeInPlaceValue(i.value()) + i.kv.K = *ikey + i.kv.V = base.MakeInPlaceValue(i.value()) + return &i.kv } -func (i *batchIter) Prev() (*InternalKey, base.LazyValue) { +func (i *batchIter) Prev() *base.InternalKV { ikey := i.iter.Prev() for ikey != nil && ikey.SeqNum() >= i.snapshot { ikey = i.iter.Prev() } if ikey == nil { - return nil, base.LazyValue{} + i.kv = base.InternalKV{} + return nil } - return ikey, base.MakeInPlaceValue(i.value()) + i.kv.K = *ikey + i.kv.V = base.MakeInPlaceValue(i.value()) + return &i.kv } func (i *batchIter) value() []byte { @@ -1687,7 +1834,7 @@ func (i *batchIter) value() []byte { case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete, InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete, InternalKeyKindDeleteSized: - _, value, ok := batchDecodeStr(data[keyEnd:]) + _, value, ok := batchrepr.DecodeStr(data[keyEnd:]) if !ok { return nil } @@ -1710,6 +1857,13 @@ func (i *batchIter) SetBounds(lower, upper []byte) { i.iter.SetBounds(lower, upper) } +func (i *batchIter) SetContext(_ context.Context) {} + +// DebugTree is part of the InternalIterator interface. +func (i *batchIter) DebugTree(tp treeprinter.Node) { + tp.Childf("%T(%p)", i, i) +} + type flushableBatchEntry struct { // offset is the byte offset of the record within the batch repr. offset uint32 @@ -1726,13 +1880,13 @@ type flushableBatchEntry struct { // flushableBatch wraps an existing batch and provides the interfaces needed // for making the batch flushable (i.e. able to mimic a memtable). 
type flushableBatch struct { - cmp Compare - formatKey base.FormatKey - data []byte + cmp Compare + comparer *base.Comparer + data []byte // The base sequence number for the entries in the batch. This is the same // value as Batch.seqNum() and is cached here for performance. - seqNum uint64 + seqNum base.SeqNum // A slice of offsets and indices for the entries in the batch. Used to // implement flushableBatchIter. Unlike the indexing on a normal batch, a @@ -1760,10 +1914,10 @@ var _ flushable = (*flushableBatch)(nil) // of the batch data. func newFlushableBatch(batch *Batch, comparer *Comparer) (*flushableBatch, error) { b := &flushableBatch{ - data: batch.data, - cmp: comparer.Compare, - formatKey: comparer.FormatKey, - offsets: make([]flushableBatchEntry, 0, batch.Count()), + data: batch.data, + cmp: comparer.Compare, + comparer: comparer, + offsets: make([]flushableBatchEntry, 0, batch.Count()), } if b.data != nil { // Note that this sequence number is not correct when this batch has not @@ -1774,10 +1928,10 @@ func newFlushableBatch(batch *Batch, comparer *Comparer) (*flushableBatch, error } var rangeDelOffsets []flushableBatchEntry var rangeKeyOffsets []flushableBatchEntry - if len(b.data) > batchHeaderLen { + if len(b.data) > batchrepr.HeaderLen { // Non-empty batch. var index uint32 - for iter := BatchReader(b.data[batchHeaderLen:]); len(iter) > 0; index++ { + for iter := batchrepr.Read(b.data); len(iter) > 0; { offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0])) kind, key, _, ok, err := iter.Next() if !ok { @@ -1805,9 +1959,31 @@ func newFlushableBatch(batch *Batch, comparer *Comparer) (*flushableBatch, error rangeDelOffsets = append(rangeDelOffsets, entry) case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: rangeKeyOffsets = append(rangeKeyOffsets, entry) - default: + case InternalKeyKindLogData: + // Skip it; we never want to iterate over LogDatas. 
+ continue + case InternalKeyKindSet, InternalKeyKindDelete, InternalKeyKindMerge, + InternalKeyKindSingleDelete, InternalKeyKindSetWithDelete, InternalKeyKindDeleteSized: b.offsets = append(b.offsets, entry) + default: + // Note In some circumstances this might be temporary memory + // corruption that can be recovered by discarding the batch and + // trying again. In other cases, the batch repr might've been + // already persisted elsewhere, and we'll loop continuously trying + // to commit the same corrupted batch. The caller is responsible for + // distinguishing. + return nil, errors.Wrapf(ErrInvalidBatch, "unrecognized kind %v", kind) } + // NB: index (used for entry.offset above) must not reach the + // batch.count, because the offset is used in conjunction with the + // batch's sequence number to assign sequence numbers to keys within + // the batch. If we assign KV's indexes as high as batch.count, + // we'll begin assigning keys sequence numbers that weren't + // allocated. + if index >= uint32(batch.count) { + return nil, base.AssertionFailedf("pebble: batch entry index %d ≥ batch.count %d", index, batch.count) + } + index++ } } @@ -1824,7 +2000,7 @@ func newFlushableBatch(batch *Batch, comparer *Comparer) (*flushableBatch, error if len(rangeDelOffsets) > 0 { frag := &keyspan.Fragmenter{ Cmp: b.cmp, - Format: b.formatKey, + Format: b.comparer.FormatKey, Emit: func(s keyspan.Span) { b.tombstones = append(b.tombstones, s) }, @@ -1841,7 +2017,7 @@ func newFlushableBatch(batch *Batch, comparer *Comparer) (*flushableBatch, error if len(rangeKeyOffsets) > 0 { frag := &keyspan.Fragmenter{ Cmp: b.cmp, - Format: b.formatKey, + Format: b.comparer.FormatKey, Emit: func(s keyspan.Span) { b.rangeKeys = append(b.rangeKeys, s) }, @@ -1853,12 +2029,14 @@ func newFlushableBatch(batch *Batch, comparer *Comparer) (*flushableBatch, error cmp: b.cmp, index: -1, } - fragmentRangeKeys(frag, it, len(rangeKeyOffsets)) + if err := fragmentRangeKeys(frag, it, len(rangeKeyOffsets)); 
err != nil { + return nil, err + } } return b, nil } -func (b *flushableBatch) setSeqNum(seqNum uint64) { +func (b *flushableBatch) setSeqNum(seqNum base.SeqNum) { if b.seqNum != 0 { panic(fmt.Sprintf("pebble: flushableBatch.seqNum already set: %d", b.seqNum)) } @@ -1918,7 +2096,7 @@ func (b *flushableBatch) newIter(o *IterOptions) internalIterator { } // newFlushIter is part of the flushable interface. -func (b *flushableBatch) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator { +func (b *flushableBatch) newFlushIter(o *IterOptions) internalIterator { return &flushFlushableBatchIter{ flushableBatchIter: flushableBatchIter{ batch: b, @@ -1927,7 +2105,6 @@ func (b *flushableBatch) newFlushIter(o *IterOptions, bytesFlushed *uint64) inte cmp: b.cmp, index: -1, }, - bytesIterated: bytesFlushed, } } @@ -1952,7 +2129,7 @@ func (b *flushableBatch) containsRangeKeys() bool { return len(b.rangeKeys) > 0 // inuseBytes is part of the flushable interface. func (b *flushableBatch) inuseBytes() uint64 { - return uint64(len(b.data) - batchHeaderLen) + return uint64(len(b.data) - batchrepr.HeaderLen) } // totalBytes is part of the flushable interface. @@ -1967,6 +2144,13 @@ func (b *flushableBatch) readyForFlush() bool { return true } +// computePossibleOverlaps is part of the flushable interface. +func (b *flushableBatch) computePossibleOverlaps( + fn func(bounded) shouldContinue, bounded ...bounded, +) { + computePossibleOverlapsGenericImpl[*flushableBatch](b, b.cmp, fn, bounded) +} + // Note: flushableBatchIter mirrors the implementation of batchIter. Keep the // two in sync. type flushableBatchIter struct { @@ -1982,7 +2166,7 @@ type flushableBatchIter struct { index int // For internal use by the implementation. - key InternalKey + kv base.InternalKV err error // Optionally initialize to bounds of iteration, if any. 
@@ -2000,38 +2184,42 @@ func (i *flushableBatchIter) String() string { // SeekGE implements internalIterator.SeekGE, as documented in the pebble // package. Ignore flags.TrySeekUsingNext() since we don't expect this // optimization to provide much benefit here at the moment. -func (i *flushableBatchIter) SeekGE( - key []byte, flags base.SeekGEFlags, -) (*InternalKey, base.LazyValue) { +func (i *flushableBatchIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { i.err = nil // clear cached iteration error ikey := base.MakeSearchKey(key) i.index = sort.Search(len(i.offsets), func(j int) bool { return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0 }) if i.index >= len(i.offsets) { - return nil, base.LazyValue{} + return nil } - i.key = i.getKey(i.index) - if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 { + kv := i.getKV(i.index) + if i.upper != nil && i.cmp(kv.K.UserKey, i.upper) >= 0 { i.index = len(i.offsets) - return nil, base.LazyValue{} + return nil } - return &i.key, i.value() + return kv } // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the // pebble package. func (i *flushableBatchIter) SeekPrefixGE( prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - return i.SeekGE(key, flags) +) *base.InternalKV { + kv := i.SeekGE(key, flags) + if kv == nil { + return nil + } + // If the key doesn't have the sought prefix, return nil. + if !bytes.Equal(i.batch.comparer.Split.Prefix(kv.K.UserKey), prefix) { + return nil + } + return kv } // SeekLT implements internalIterator.SeekLT, as documented in the pebble // package. 
-func (i *flushableBatchIter) SeekLT( - key []byte, flags base.SeekLTFlags, -) (*InternalKey, base.LazyValue) { +func (i *flushableBatchIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { i.err = nil // clear cached iteration error ikey := base.MakeSearchKey(key) i.index = sort.Search(len(i.offsets), func(j int) bool { @@ -2039,85 +2227,85 @@ func (i *flushableBatchIter) SeekLT( }) i.index-- if i.index < 0 { - return nil, base.LazyValue{} + return nil } - i.key = i.getKey(i.index) - if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 { + kv := i.getKV(i.index) + if i.lower != nil && i.cmp(kv.K.UserKey, i.lower) < 0 { i.index = -1 - return nil, base.LazyValue{} + return nil } - return &i.key, i.value() + return kv } // First implements internalIterator.First, as documented in the pebble // package. -func (i *flushableBatchIter) First() (*InternalKey, base.LazyValue) { +func (i *flushableBatchIter) First() *base.InternalKV { i.err = nil // clear cached iteration error if len(i.offsets) == 0 { - return nil, base.LazyValue{} + return nil } i.index = 0 - i.key = i.getKey(i.index) - if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 { + kv := i.getKV(i.index) + if i.upper != nil && i.cmp(kv.K.UserKey, i.upper) >= 0 { i.index = len(i.offsets) - return nil, base.LazyValue{} + return nil } - return &i.key, i.value() + return kv } // Last implements internalIterator.Last, as documented in the pebble // package. 
-func (i *flushableBatchIter) Last() (*InternalKey, base.LazyValue) { +func (i *flushableBatchIter) Last() *base.InternalKV { i.err = nil // clear cached iteration error if len(i.offsets) == 0 { - return nil, base.LazyValue{} + return nil } i.index = len(i.offsets) - 1 - i.key = i.getKey(i.index) - if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 { + kv := i.getKV(i.index) + if i.lower != nil && i.cmp(kv.K.UserKey, i.lower) < 0 { i.index = -1 - return nil, base.LazyValue{} + return nil } - return &i.key, i.value() + return kv } // Note: flushFlushableBatchIter.Next mirrors the implementation of // flushableBatchIter.Next due to performance. Keep the two in sync. -func (i *flushableBatchIter) Next() (*InternalKey, base.LazyValue) { +func (i *flushableBatchIter) Next() *base.InternalKV { if i.index == len(i.offsets) { - return nil, base.LazyValue{} + return nil } i.index++ if i.index == len(i.offsets) { - return nil, base.LazyValue{} + return nil } - i.key = i.getKey(i.index) - if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 { + kv := i.getKV(i.index) + if i.upper != nil && i.cmp(kv.K.UserKey, i.upper) >= 0 { i.index = len(i.offsets) - return nil, base.LazyValue{} + return nil } - return &i.key, i.value() + return kv } -func (i *flushableBatchIter) Prev() (*InternalKey, base.LazyValue) { +func (i *flushableBatchIter) Prev() *base.InternalKV { if i.index < 0 { - return nil, base.LazyValue{} + return nil } i.index-- if i.index < 0 { - return nil, base.LazyValue{} + return nil } - i.key = i.getKey(i.index) - if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 { + kv := i.getKV(i.index) + if i.lower != nil && i.cmp(kv.K.UserKey, i.lower) < 0 { i.index = -1 - return nil, base.LazyValue{} + return nil } - return &i.key, i.value() + return kv } // Note: flushFlushableBatchIter.NextPrefix mirrors the implementation of // flushableBatchIter.NextPrefix due to performance. Keep the two in sync. 
-func (i *flushableBatchIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) { +func (i *flushableBatchIter) NextPrefix(succKey []byte) *base.InternalKV { return i.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext()) } @@ -2125,19 +2313,27 @@ func (i *flushableBatchIter) getKey(index int) InternalKey { e := &i.offsets[index] kind := InternalKeyKind(i.data[e.offset]) key := i.data[e.keyStart:e.keyEnd] - return base.MakeInternalKey(key, i.batch.seqNum+uint64(e.index), kind) + return base.MakeInternalKey(key, i.batch.seqNum+base.SeqNum(e.index), kind) +} + +func (i *flushableBatchIter) getKV(index int) *base.InternalKV { + i.kv = base.InternalKV{ + K: i.getKey(index), + V: base.MakeInPlaceValue(i.extractValue()), + } + return &i.kv } -func (i *flushableBatchIter) value() base.LazyValue { +func (i *flushableBatchIter) extractValue() []byte { p := i.data[i.offsets[i.index].offset:] if len(p) == 0 { i.err = base.CorruptionErrorf("corrupted batch") - return base.LazyValue{} + return nil } kind := InternalKeyKind(p[0]) if kind > InternalKeyKindMax { i.err = base.CorruptionErrorf("corrupted batch") - return base.LazyValue{} + return nil } var value []byte var ok bool @@ -2146,13 +2342,13 @@ func (i *flushableBatchIter) value() base.LazyValue { InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete, InternalKeyKindDeleteSized: keyEnd := i.offsets[i.index].keyEnd - _, value, ok = batchDecodeStr(i.data[keyEnd:]) + _, value, ok = batchrepr.DecodeStr(i.data[keyEnd:]) if !ok { i.err = base.CorruptionErrorf("corrupted batch") - return base.LazyValue{} + return nil } } - return base.MakeInPlaceValue(value) + return value } func (i *flushableBatchIter) Valid() bool { @@ -2172,11 +2368,17 @@ func (i *flushableBatchIter) SetBounds(lower, upper []byte) { i.upper = upper } +func (i *flushableBatchIter) SetContext(_ context.Context) {} + +// DebugTree is part of the InternalIterator interface. 
+func (i *flushableBatchIter) DebugTree(tp treeprinter.Node) { + tp.Childf("%T(%p)", i, i) +} + // flushFlushableBatchIter is similar to flushableBatchIter but it keeps track // of number of bytes iterated. type flushFlushableBatchIter struct { flushableBatchIter - bytesIterated *uint64 } // flushFlushableBatchIter implements the base.InternalIterator interface. @@ -2186,82 +2388,80 @@ func (i *flushFlushableBatchIter) String() string { return "flushable-batch" } -func (i *flushFlushableBatchIter) SeekGE( - key []byte, flags base.SeekGEFlags, -) (*InternalKey, base.LazyValue) { +func (i *flushFlushableBatchIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { panic("pebble: SeekGE unimplemented") } func (i *flushFlushableBatchIter) SeekPrefixGE( prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { +) *base.InternalKV { panic("pebble: SeekPrefixGE unimplemented") } -func (i *flushFlushableBatchIter) SeekLT( - key []byte, flags base.SeekLTFlags, -) (*InternalKey, base.LazyValue) { +func (i *flushFlushableBatchIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { panic("pebble: SeekLT unimplemented") } -func (i *flushFlushableBatchIter) First() (*InternalKey, base.LazyValue) { +func (i *flushFlushableBatchIter) First() *base.InternalKV { i.err = nil // clear cached iteration error - key, val := i.flushableBatchIter.First() - if key == nil { - return nil, base.LazyValue{} - } - entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset - *i.bytesIterated += uint64(entryBytes) + i.valueSize() - return key, val + return i.flushableBatchIter.First() } -func (i *flushFlushableBatchIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { +func (i *flushFlushableBatchIter) NextPrefix(succKey []byte) *base.InternalKV { panic("pebble: Prev unimplemented") } // Note: flushFlushableBatchIter.Next mirrors the implementation of // flushableBatchIter.Next due to performance. Keep the two in sync. 
-func (i *flushFlushableBatchIter) Next() (*InternalKey, base.LazyValue) { +func (i *flushFlushableBatchIter) Next() *base.InternalKV { if i.index == len(i.offsets) { - return nil, base.LazyValue{} + return nil } i.index++ if i.index == len(i.offsets) { - return nil, base.LazyValue{} + return nil } - i.key = i.getKey(i.index) - entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset - *i.bytesIterated += uint64(entryBytes) + i.valueSize() - return &i.key, i.value() + return i.getKV(i.index) } -func (i flushFlushableBatchIter) Prev() (*InternalKey, base.LazyValue) { +func (i flushFlushableBatchIter) Prev() *base.InternalKV { panic("pebble: Prev unimplemented") } -func (i flushFlushableBatchIter) valueSize() uint64 { - p := i.data[i.offsets[i.index].offset:] - if len(p) == 0 { - i.err = base.CorruptionErrorf("corrupted batch") - return 0 +// batchOptions holds the parameters to configure batch. +type batchOptions struct { + initialSizeBytes int + maxRetainedSizeBytes int +} + +// ensureDefaults creates batch options with default values. +func (o *batchOptions) ensureDefaults() { + if o.initialSizeBytes <= 0 { + o.initialSizeBytes = defaultBatchInitialSize } - kind := InternalKeyKind(p[0]) - if kind > InternalKeyKindMax { - i.err = base.CorruptionErrorf("corrupted batch") - return 0 + if o.maxRetainedSizeBytes <= 0 { + o.maxRetainedSizeBytes = defaultBatchMaxRetainedSize } - var length uint64 - switch kind { - case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete: - keyEnd := i.offsets[i.index].keyEnd - v, n := binary.Uvarint(i.data[keyEnd:]) - if n <= 0 { - i.err = base.CorruptionErrorf("corrupted batch") - return 0 - } - length = v + uint64(n) +} + +// BatchOption allows customizing the batch. +type BatchOption func(*batchOptions) + +// WithInitialSizeBytes sets a custom initial size for the batch. Defaults +// to 1KB. 
+func WithInitialSizeBytes(s int) BatchOption { + return func(opts *batchOptions) { + opts.initialSizeBytes = s + } +} + +// WithMaxRetainedSizeBytes sets a custom max size for the batch to be +// re-used. Any batch which exceeds the max retained size would be GC-ed. +// Defaults to 1MB. +func WithMaxRetainedSizeBytes(s int) BatchOption { + return func(opts *batchOptions) { + opts.maxRetainedSizeBytes = s } - return length } // batchSort returns iterators for the sorted contents of the batch. It is diff --git a/vendor/github.com/cockroachdb/pebble/v2/batchrepr/reader.go b/vendor/github.com/cockroachdb/pebble/v2/batchrepr/reader.go new file mode 100644 index 0000000..f8cf9b4 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/batchrepr/reader.go @@ -0,0 +1,146 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package batchrepr provides interfaces for reading and writing the binary +// batch representation. This batch representation is used in-memory while +// constructing a batch and on-disk within the write-ahead log. +package batchrepr + +import ( + "encoding/binary" + "fmt" + "unsafe" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/pkg/errors" +) + +// ErrInvalidBatch indicates that a batch is invalid or otherwise corrupted. +var ErrInvalidBatch = base.MarkCorruptionError(errors.New("pebble: invalid batch")) + +const ( + // HeaderLen is the length of the batch header in bytes. + HeaderLen = 12 + // countOffset is the index into the batch representation where the + // count is stored, encoded as a little-endian uint32. + countOffset = 8 +) + +// IsEmpty returns true iff the batch contains zero keys. +func IsEmpty(repr []byte) bool { + return len(repr) <= HeaderLen +} + +// ReadHeader reads the contents of the batch header. 
If the repr is too small +// to contain a valid batch header, ReadHeader returns ok=false. +func ReadHeader(repr []byte) (h Header, ok bool) { + if len(repr) < HeaderLen { + return h, false + } + return Header{ + SeqNum: ReadSeqNum(repr), + Count: binary.LittleEndian.Uint32(repr[countOffset:HeaderLen]), + }, true +} + +// Header describes the contents of a batch header. +type Header struct { + // SeqNum is the sequence number at which the batch is committed. A batch + // that has not yet committed will have a zero sequence number. + SeqNum base.SeqNum + // Count is the count of keys written to the batch. + Count uint32 +} + +// String returns a string representation of the header's contents. +func (h Header) String() string { + return fmt.Sprintf("[seqNum=%d,count=%d]", h.SeqNum, h.Count) +} + +// ReadSeqNum reads the sequence number encoded within the batch. ReadSeqNum +// does not validate that the repr is valid. It's exported only for very +// performance sensitive code paths that should not necessarily read the rest of +// the header as well. +func ReadSeqNum(repr []byte) base.SeqNum { + return base.SeqNum(binary.LittleEndian.Uint64(repr[:countOffset])) +} + +// Read constructs a Reader from an encoded batch representation, ignoring the +// contents of the Header. +func Read(repr []byte) (r Reader) { + if len(repr) <= HeaderLen { + return nil + } + return repr[HeaderLen:] +} + +// Reader iterates over the entries contained in a batch. +type Reader []byte + +// Next returns the next entry in this batch, if there is one. If the reader has +// reached the end of the batch, Next returns ok=false and a nil error. If the +// batch is corrupt and the next entry is illegible, Next returns ok=false and a +// non-nil error. 
+func (r *Reader) Next() (kind base.InternalKeyKind, ukey []byte, value []byte, ok bool, err error) { + if len(*r) == 0 { + return 0, nil, nil, false, nil + } + kind = base.InternalKeyKind((*r)[0]) + if kind > base.InternalKeyKindMax { + return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "invalid key kind 0x%x", (*r)[0]) + } + *r, ukey, ok = DecodeStr((*r)[1:]) + if !ok { + return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "decoding user key") + } + switch kind { + case base.InternalKeyKindSet, base.InternalKeyKindMerge, base.InternalKeyKindRangeDelete, + base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete, + base.InternalKeyKindDeleteSized, base.InternalKeyKindExcise: + *r, value, ok = DecodeStr(*r) + if !ok { + return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "decoding %s value", kind) + } + } + return kind, ukey, value, true, nil +} + +// DecodeStr decodes a varint encoded string from data, returning the remainder +// of data and the decoded string. It returns ok=false if the varint is invalid. +// +// TODO(jackson): This should be unexported once pebble package callers have +// been updated to use appropriate abstractions. +func DecodeStr(data []byte) (odata []byte, s []byte, ok bool) { + // TODO(jackson): This will index out of bounds if there's no varint or an + // invalid varint (eg, a single 0xff byte). Correcting will add a bit of + // overhead. We could avoid that overhead whenever len(data) >= + // binary.MaxVarint32? 
+ + var v uint32 + var n int + ptr := unsafe.Pointer(&data[0]) + if a := *((*uint8)(ptr)); a < 128 { + v = uint32(a) + n = 1 + } else if a, b := a&0x7f, *((*uint8)(unsafe.Add(ptr, 1))); b < 128 { + v = uint32(b)<<7 | uint32(a) + n = 2 + } else if b, c := b&0x7f, *((*uint8)(unsafe.Add(ptr, 2))); c < 128 { + v = uint32(c)<<14 | uint32(b)<<7 | uint32(a) + n = 3 + } else if c, d := c&0x7f, *((*uint8)(unsafe.Add(ptr, 3))); d < 128 { + v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) + n = 4 + } else { + d, e := d&0x7f, *((*uint8)(unsafe.Add(ptr, 4))) + v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) + n = 5 + } + + data = data[n:] + if v > uint32(len(data)) { + return nil, nil, false + } + return data[v:], data[:v], true +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/batchrepr/writer.go b/vendor/github.com/cockroachdb/pebble/v2/batchrepr/writer.go new file mode 100644 index 0000000..9e7b6c0 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/batchrepr/writer.go @@ -0,0 +1,25 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package batchrepr + +import ( + "encoding/binary" + + "github.com/cockroachdb/pebble/v2/internal/base" +) + +// SetSeqNum mutates the provided batch representation, storing the provided +// sequence number in its header. The provided byte slice must already be at +// least HeaderLen bytes long or else SetSeqNum will panic. +func SetSeqNum(repr []byte, seqNum base.SeqNum) { + binary.LittleEndian.PutUint64(repr[:countOffset], uint64(seqNum)) +} + +// SetCount mutates the provided batch representation, storing the provided +// count in its header. The provided byte slice must already be at least +// HeaderLen bytes long or else SetCount will panic. 
+func SetCount(repr []byte, count uint32) { + binary.LittleEndian.PutUint32(repr[countOffset:HeaderLen], count) +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/blob_rewrite.go b/vendor/github.com/cockroachdb/pebble/v2/blob_rewrite.go new file mode 100644 index 0000000..8d8ebae --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/blob_rewrite.go @@ -0,0 +1,484 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "container/heap" + "context" + "iter" + "runtime/pprof" + "slices" + "sync/atomic" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/problemspans" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/objiotracing" + "github.com/cockroachdb/pebble/v2/sstable" + "github.com/cockroachdb/pebble/v2/sstable/blob" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/sstable/colblk" +) + +// A pickedBlobFileCompaction is a blob file rewrite compaction that has been +// picked by the compaction picker. +type pickedBlobFileCompaction struct { + vers *manifest.Version + file manifest.BlobFileMetadata + referencingTables []*manifest.TableMetadata +} + +// Assert that *pickedBlobFileCompaction implements the pickedCompaction +// interface. 
+var _ pickedCompaction = (*pickedBlobFileCompaction)(nil) + +func (c *pickedBlobFileCompaction) ManualID() uint64 { return 0 } + +func (c *pickedBlobFileCompaction) WaitingCompaction() WaitingCompaction { + entry := scheduledCompactionMap[compactionKindBlobFileRewrite] + return WaitingCompaction{ + Optional: entry.optional, + Priority: entry.priority, + } +} + +func (c *pickedBlobFileCompaction) ConstructCompaction( + d *DB, grantHandle CompactionGrantHandle, +) compaction { + // Add a reference to the version. The compaction will release the reference + // when it completes. + c.vers.Ref() + return &blobFileRewriteCompaction{ + beganAt: d.timeNow(), + grantHandle: grantHandle, + version: c.vers, + input: c.file, + referencingTables: c.referencingTables, + objCreateOpts: objstorage.CreateOptions{ + // TODO(jackson): Enable shared storage for blob files. + PreferSharedStorage: false, + WriteCategory: getDiskWriteCategoryForCompaction(d.opts, compactionKindBlobFileRewrite), + }, + } +} + +// A blobFileRewriteCompaction is a special variant of a compaction that +// rewrites a blob file without rewriting sstables. When the compaction +// completes, the Version's mapping of blob file ID to disk file number is +// updated to point to the new blob file. The blob file is rewritten without +// copying over values that are no longer referenced by any tables, reclaiming +// disk space. +type blobFileRewriteCompaction struct { + // cancel is a bool that can be used by other goroutines to signal a compaction + // to cancel, such as if a conflicting excise operation raced it to manifest + // application. Only holders of the manifest lock will write to this atomic. + cancel atomic.Bool + // beganAt is the time when the compaction began. + beganAt time.Time + // grantHandle is a handle to the compaction that can be used to track + // progress. + grantHandle CompactionGrantHandle + // version is a referenced version obtained when the compaction was picked. 
+ // This version must be unreferenced when the compaction is complete. + version *manifest.Version + // versionEditApplied is set to true when a compaction has completed and the + // resulting version has been installed (if successful), but the compaction + // goroutine is still cleaning up (eg, deleting obsolete files). + versionEditApplied bool + // input is the blob file that is being rewritten. + input manifest.BlobFileMetadata + // referencingTables is the set of sstables that reference the input blob + // file in version. + referencingTables []*manifest.TableMetadata + objCreateOpts objstorage.CreateOptions + internalIteratorStats base.InternalIteratorStats + bytesWritten atomic.Int64 // Total bytes written to the new blob file. +} + +// Assert that *blobFileRewriteCompaction implements the Compaction interface. +var _ compaction = (*blobFileRewriteCompaction)(nil) + +func (c *blobFileRewriteCompaction) AddInProgressLocked(d *DB) { + d.mu.compact.inProgress[c] = struct{}{} + // TODO(jackson): Currently the compaction picker iterates through all + // ongoing compactions in order to limit the number of concurrent blob + // rewrite compactions to 1. + // + // Consider instead tracking which blob files are being rewritten, and we + // can allow multiple concurrent blob rewrite compactions as long as they + // compact different blob files. +} + +func (c *blobFileRewriteCompaction) BeganAt() time.Time { return c.beganAt } +func (c *blobFileRewriteCompaction) Bounds() *base.UserKeyBounds { return nil } +func (c *blobFileRewriteCompaction) Cancel() { c.cancel.Store(true) } +func (c *blobFileRewriteCompaction) IsDownload() bool { return false } +func (c *blobFileRewriteCompaction) IsFlush() bool { return false } +func (c *blobFileRewriteCompaction) GrantHandle() CompactionGrantHandle { return c.grantHandle } +func (c *blobFileRewriteCompaction) Tables() iter.Seq2[int, *manifest.TableMetadata] { + // No tables; return an empty iterator. 
+ return func(yield func(int, *manifest.TableMetadata) bool) {} +} + +func (c *blobFileRewriteCompaction) ObjioTracingContext(ctx context.Context) context.Context { + if objiotracing.Enabled { + ctx = objiotracing.WithReason(ctx, objiotracing.ForCompaction) + } + return ctx +} + +func (c *blobFileRewriteCompaction) PprofLabels(UserKeyCategories) pprof.LabelSet { + return pprof.Labels("pebble", "blob-rewrite") +} + +func (c *blobFileRewriteCompaction) VersionEditApplied() bool { + return c.versionEditApplied +} + +func (c *blobFileRewriteCompaction) Execute(jobID JobID, d *DB) error { + ctx := context.TODO() + if objiotracing.Enabled { + ctx = objiotracing.WithReason(ctx, objiotracing.ForCompaction) + } + c.grantHandle.Started() + // The version stored in the compaction is ref'd when the compaction is + // created. We're responsible for un-refing it when the compaction is + // complete. + defer c.version.UnrefLocked() + + // Notify the event listener that the compaction has begun. + info := BlobFileRewriteInfo{ + JobID: int(jobID), + Input: BlobFileInfo{ + BlobFileID: c.input.FileID, + DiskFileNum: c.input.Physical.FileNum, + Size: c.input.Physical.Size, + ValueSize: c.input.Physical.ValueSize, + }, + } + d.opts.EventListener.BlobFileRewriteBegin(info) + startTime := d.timeNow() + + // Run the blob file rewrite. + objMeta, ve, err := d.runBlobFileRewriteLocked(ctx, jobID, c) + + info.Duration = d.timeNow().Sub(startTime) + + // Update the version with the remapped blob file. + if err == nil { + info.Output.BlobFileID = ve.NewBlobFiles[0].FileID + info.Output.DiskFileNum = ve.NewBlobFiles[0].Physical.FileNum + info.Output.Size = ve.NewBlobFiles[0].Physical.Size + info.Output.ValueSize = ve.NewBlobFiles[0].Physical.ValueSize + err = d.mu.versions.UpdateVersionLocked(func() (versionUpdate, error) { + // It's possible that concurrent compactions removed references to + // the blob file while the blob file rewrite compaction was running. 
+ // Now that we have the manifest lock, check if the blob file is + // still current. If not, we bubble up ErrCancelledCompaction. + v := d.mu.versions.currentVersion() + currentDiskFileNum, ok := v.BlobFiles.Lookup(c.input.FileID) + if !ok { + return versionUpdate{}, errors.Wrapf(ErrCancelledCompaction, + "blob file %s became unreferenced", c.input.FileID) + } + // Assert that the current version's disk file number for the blob + // matches the one we rewrote. This compaction should be the only + // rewrite compaction running for this blob file. + if currentDiskFileNum != c.input.Physical.FileNum { + return versionUpdate{}, base.AssertionFailedf( + "blob file %s was rewritten to %s during rewrite compaction of %s", + c.input.FileID, currentDiskFileNum, c.input.Physical.FileNum) + } + return versionUpdate{ + VE: ve, + JobID: jobID, + InProgressCompactionsFn: func() []compactionInfo { + return d.getInProgressCompactionInfoLocked(c) + }, + }, nil + }) + } + + d.mu.versions.incrementCompactions(compactionKindBlobFileRewrite, nil, c.bytesWritten.Load(), err) + d.mu.versions.incrementCompactionBytes(-c.bytesWritten.Load()) + + // Update the read state to publish the new version. + if err == nil { + d.updateReadStateLocked(d.opts.DebugCheck) + } + + // Ensure we clean up the blob file we created on failure. + if err != nil { + if objMeta.DiskFileNum != 0 { + d.mu.versions.obsoleteBlobs = mergeObsoleteFiles(d.mu.versions.obsoleteBlobs, []obsoleteFile{ + { + fileType: base.FileTypeBlob, + fs: d.opts.FS, + path: d.objProvider.Path(objMeta), + fileNum: objMeta.DiskFileNum, + // We don't know the size of the output blob file--it may have + // been half-written. We use the input blob file size as an + // approximation for deletion pacing. + fileSize: c.input.Physical.Size, + isLocal: true, + }, + }) + } + } + + // Notify the event listener that the compaction has ended. 
+ now := d.timeNow() + info.TotalDuration = now.Sub(c.beganAt) + info.Done = true + info.Err = err + d.opts.EventListener.BlobFileRewriteEnd(info) + return nil +} + +func (c *blobFileRewriteCompaction) Info() compactionInfo { + return compactionInfo{ + kind: compactionKindBlobFileRewrite, + versionEditApplied: c.versionEditApplied, + outputLevel: -1, + } +} + +func (c *blobFileRewriteCompaction) RecordError(*problemspans.ByLevel, error) { + // TODO(jackson): Track problematic blob files and avoid re-picking the same + // blob file compaction. +} + +// runBlobFileRewriteLocked runs a blob file rewrite. d.mu must be held when +// calling this, although it may be dropped and re-acquired during the course of +// the method. +func (d *DB) runBlobFileRewriteLocked( + ctx context.Context, jobID JobID, c *blobFileRewriteCompaction, +) (objstorage.ObjectMetadata, *manifest.VersionEdit, error) { + // Drop the database mutex while we perform the rewrite, and re-acquire it + // before returning. + d.mu.Unlock() + defer d.mu.Lock() + + // Construct the block.ReadEnv configured with a buffer pool. Setting the + // buffer pool ensures we won't cache blocks in the block cache. As soon as + // the compaction finishes new iterators will read the new blob file, so it + // would be unlikely the cached blocks would be reused. + var bufferPool block.BufferPool + bufferPool.Init(4) + defer bufferPool.Release() + env := block.ReadEnv{ + Stats: &c.internalIteratorStats, + BufferPool: &bufferPool, + ReportCorruptionFn: d.reportCorruption, + } + + // Create a new file for the rewritten blob file. + writable, objMeta, err := d.newCompactionOutputBlob(jobID, compactionKindBlobFileRewrite, -1, &c.bytesWritten, c.objCreateOpts) + if err != nil { + return objstorage.ObjectMetadata{}, nil, err + } + // Initialize a blob file rewriter. We pass L6 to MakeBlobWriterOptions. + // There's no single associated level with a blob file. 
A long-lived blob + // file that gets rewritten is likely to mostly be referenced from L6. + // TODO(jackson): Consider refactoring to remove the level association. + rewriter := newBlobFileRewriter( + d.fileCache, + env, + objMeta.DiskFileNum, + writable, + d.opts.MakeBlobWriterOptions(6), + c.referencingTables, + c.input, + ) + // Perform the rewrite. + stats, err := rewriter.Rewrite(ctx) + if err != nil { + return objstorage.ObjectMetadata{}, nil, err + } + + // Sync the object provider to ensure the metadata for the blob file is + // persisted. + if err := d.objProvider.Sync(); err != nil { + return objstorage.ObjectMetadata{}, nil, err + } + + ve := &manifest.VersionEdit{ + DeletedBlobFiles: map[manifest.DeletedBlobFileEntry]*manifest.PhysicalBlobFile{ + { + FileID: c.input.FileID, + FileNum: c.input.Physical.FileNum, + }: c.input.Physical, + }, + NewBlobFiles: []manifest.BlobFileMetadata{ + { + FileID: c.input.FileID, + Physical: &manifest.PhysicalBlobFile{ + FileNum: objMeta.DiskFileNum, + Size: stats.FileLen, + ValueSize: stats.UncompressedValueBytes, + CreationTime: uint64(d.timeNow().Unix()), + }, + }, + }, + } + return objMeta, ve, nil +} + +// blockHeap is a min-heap of blob reference liveness encodings, ordered by +// blockID. We use this to help us determine the overall liveness of values in +// each blob block by combining the blob reference liveness encodings of all +// referencing sstables for a particular blockID. +type blockHeap []*sstable.BlobRefLivenessEncoding + +// Len implements sort.Interface. +func (h blockHeap) Len() int { return len(h) } + +// Less implements sort.Interface. +func (h blockHeap) Less(i, j int) bool { return h[i].BlockID < h[j].BlockID } + +// Swap implements sort.Interface. +func (h blockHeap) Swap(i, j int) { + h[i], h[j] = h[j], h[i] +} + +// Push implements heap.Interface. 
+func (h *blockHeap) Push(x any) { + blobEnc := x.(*sstable.BlobRefLivenessEncoding) + *h = append(*h, blobEnc) +} + +// Pop implements heap.Interface. +func (h *blockHeap) Pop() any { + old := *h + n := len(old) + item := old[n-1] + old[n-1] = nil + *h = old[0 : n-1] + return item +} + +// blockValues holds the accumulated liveness data for blockID. +type blockValues struct { + blockID blob.BlockID + valuesSize int + liveValueIDs []int +} + +// blobFileRewriter is responsible for rewriting blob files by combining and +// processing blob reference liveness encodings from multiple SSTables. It +// maintains state for writing to an output blob file. +type blobFileRewriter struct { + fc *fileCacheHandle + readEnv block.ReadEnv + sstables []*manifest.TableMetadata + inputBlob manifest.BlobFileMetadata + rw *blob.FileRewriter + blkHeap blockHeap +} + +func newBlobFileRewriter( + fc *fileCacheHandle, + readEnv block.ReadEnv, + outputFileNum base.DiskFileNum, + w objstorage.Writable, + opts blob.FileWriterOptions, + sstables []*manifest.TableMetadata, + inputBlob manifest.BlobFileMetadata, +) *blobFileRewriter { + rw := blob.NewFileRewriter(inputBlob.FileID, inputBlob.Physical.FileNum, fc, readEnv, outputFileNum, w, opts) + return &blobFileRewriter{ + fc: fc, + readEnv: readEnv, + rw: rw, + sstables: sstables, + inputBlob: inputBlob, + blkHeap: blockHeap{}, + } +} + +// generateHeap populates rw.blkHeap with the blob reference liveness encodings +// for each referencing sstable, rw.sstables. +func (rw *blobFileRewriter) generateHeap(ctx context.Context) error { + heap.Init(&rw.blkHeap) + + var decoder colblk.ReferenceLivenessBlockDecoder + // For each sstable that references the input blob file, push its + // sstable.BlobLivenessEncoding on to the heap. + for _, sst := range rw.sstables { + // Validate that the sstable contains a reference to the input blob + // file. 
+ refID, ok := sst.BlobReferences.IDByBlobFileID(rw.inputBlob.FileID) + if !ok { + return errors.AssertionFailedf("table %s doesn't contain a reference to blob file %s", + sst.TableNum, rw.inputBlob.FileID) + } + err := rw.fc.withReader(ctx, rw.readEnv, sst, func(r *sstable.Reader, readEnv sstable.ReadEnv) error { + h, err := r.ReadBlobRefIndexBlock(ctx, readEnv.Block) + if err != nil { + return err + } + defer h.Release() + decoder.Init(h.BlockData()) + bitmapEncodings := slices.Clone(decoder.LivenessAtReference(int(refID))) + // TODO(annie): We should instead maintain 1 heap item per sstable + // instead of 1 heap item per sstable block ref to reduce the heap + // comparisons to O(sstables). + for _, enc := range sstable.DecodeBlobRefLivenessEncoding(bitmapEncodings) { + heap.Push(&rw.blkHeap, &enc) + } + return nil + }) + if err != nil { + return err + } + } + return nil +} + +func (rw *blobFileRewriter) Rewrite(ctx context.Context) (blob.FileWriterStats, error) { + if err := rw.generateHeap(ctx); err != nil { + return blob.FileWriterStats{}, err + } + if rw.blkHeap.Len() == 0 { + return blob.FileWriterStats{}, errors.AssertionFailedf("heap empty") + } + + // Begin constructing our output blob file. We maintain a map of blockID + // to accumulated liveness data across all referencing sstables. + firstBlock := heap.Pop(&rw.blkHeap).(*sstable.BlobRefLivenessEncoding) + pending := blockValues{ + blockID: firstBlock.BlockID, + valuesSize: firstBlock.ValuesSize, + liveValueIDs: slices.Collect(sstable.IterSetBitsInRunLengthBitmap(firstBlock.Bitmap)), + } + for rw.blkHeap.Len() > 0 { + nextBlock := heap.Pop(&rw.blkHeap).(*sstable.BlobRefLivenessEncoding) + + // If we are encountering a new block, write the last accumulated block + // to the blob file. + if pending.blockID != nextBlock.BlockID { + // Write the last accumulated block's values to the blob file. 
+ err := rw.rw.CopyBlock(ctx, pending.blockID, pending.valuesSize, pending.liveValueIDs) + if err != nil { + return blob.FileWriterStats{}, err + } + pending = blockValues{blockID: nextBlock.BlockID, liveValueIDs: pending.liveValueIDs[:0]} + } + // Update the accumulated encoding for this block. + pending.valuesSize += nextBlock.ValuesSize + pending.liveValueIDs = slices.AppendSeq(pending.liveValueIDs, + sstable.IterSetBitsInRunLengthBitmap(nextBlock.Bitmap)) + } + + // Copy the last accumulated block. + err := rw.rw.CopyBlock(ctx, pending.blockID, pending.valuesSize, pending.liveValueIDs) + if err != nil { + return blob.FileWriterStats{}, err + } + return rw.rw.Close() +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/bloom/bloom.go b/vendor/github.com/cockroachdb/pebble/v2/bloom/bloom.go new file mode 100644 index 0000000..c08daac --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/bloom/bloom.go @@ -0,0 +1,248 @@ +// Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package bloom implements Bloom filters. 
+package bloom // import "github.com/cockroachdb/pebble/v2/bloom" + +import ( + "encoding/binary" + "fmt" + "sync" + + "github.com/cockroachdb/pebble/v2/internal/base" +) + +const ( + cacheLineSize = 64 + cacheLineBits = cacheLineSize * 8 +) + +type tableFilter []byte + +func (f tableFilter) MayContain(key []byte) bool { + if len(f) <= 5 { + return false + } + n := len(f) - 5 + nProbes := f[n] + nLines := binary.LittleEndian.Uint32(f[n+1:]) + cacheLineBits := 8 * (uint32(n) / nLines) + + h := hash(key) + delta := h>>17 | h<<15 + b := (h % nLines) * cacheLineBits + + for j := uint8(0); j < nProbes; j++ { + bitPos := b + (h % cacheLineBits) + if f[bitPos/8]&(1<<(bitPos%8)) == 0 { + return false + } + h += delta + } + return true +} + +func calculateProbes(bitsPerKey int) uint32 { + // We intentionally round down to reduce probing cost a little bit + n := uint32(float64(bitsPerKey) * 0.69) // 0.69 =~ ln(2) + if n < 1 { + n = 1 + } + if n > 30 { + n = 30 + } + return n +} + +// extend appends n zero bytes to b. It returns the overall slice (of length +// n+len(originalB)) and the slice of n trailing zeroes. +func extend(b []byte, n int) (overall, trailer []byte) { + want := n + len(b) + if want <= cap(b) { + overall = b[:want] + trailer = overall[len(b):] + clear(trailer) + } else { + // Grow the capacity exponentially, with a 1KiB minimum. + c := 1024 + for c < want { + c += c / 4 + } + overall = make([]byte, want, c) + trailer = overall[len(b):] + copy(overall, b) + } + return overall, trailer +} + +// hash implements a hashing algorithm similar to the Murmur hash. +func hash(b []byte) uint32 { + const ( + seed = 0xbc9f1d34 + m = 0xc6a4a793 + ) + h := uint32(seed) ^ uint32(uint64(uint32(len(b))*m)) + for ; len(b) >= 4; b = b[4:] { + h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 + h *= m + h ^= h >> 16 + } + + // The code below first casts each byte to a signed 8-bit integer. This is + // necessary to match RocksDB's behavior. 
Note that the `byte` type in Go is + // unsigned. What is the difference between casting a signed 8-bit value vs + // unsigned 8-bit value into an unsigned 32-bit value? + // Sign-extension. Consider the value 250 which has the bit pattern 11111010: + // + // uint32(250) = 00000000000000000000000011111010 + // uint32(int8(250)) = 11111111111111111111111111111010 + // + // Note that the original LevelDB code did not explicitly cast to a signed + // 8-bit value which left the behavior dependent on whether C characters were + // signed or unsigned which is a compiler flag for gcc (-funsigned-char). + switch len(b) { + case 3: + h += uint32(int8(b[2])) << 16 + fallthrough + case 2: + h += uint32(int8(b[1])) << 8 + fallthrough + case 1: + h += uint32(int8(b[0])) + h *= m + h ^= h >> 24 + } + return h +} + +const hashBlockLen = 16384 + +type hashBlock [hashBlockLen]uint32 + +var hashBlockPool = sync.Pool{ + New: func() interface{} { + return &hashBlock{} + }, +} + +type tableFilterWriter struct { + bitsPerKey int + + numHashes int + // We store the hashes in blocks. + blocks []*hashBlock + lastHash uint32 + + // Initial "in-line" storage for the blocks slice (to avoid some small + // allocations). + blocksBuf [16]*hashBlock +} + +func newTableFilterWriter(bitsPerKey int) *tableFilterWriter { + w := &tableFilterWriter{ + bitsPerKey: bitsPerKey, + } + w.blocks = w.blocksBuf[:0] + return w +} + +// AddKey implements the base.FilterWriter interface. +func (w *tableFilterWriter) AddKey(key []byte) { + h := hash(key) + if w.numHashes != 0 && h == w.lastHash { + return + } + ofs := w.numHashes % hashBlockLen + if ofs == 0 { + // Time for a new block. + w.blocks = append(w.blocks, hashBlockPool.Get().(*hashBlock)) + } + w.blocks[len(w.blocks)-1][ofs] = h + w.numHashes++ + w.lastHash = h +} + +// Finish implements the base.FilterWriter interface. +func (w *tableFilterWriter) Finish(buf []byte) []byte { + // The table filter format matches the RocksDB full-file filter format. 
+ var nLines int + if w.numHashes != 0 { + nLines = (w.numHashes*w.bitsPerKey + cacheLineBits - 1) / (cacheLineBits) + // Make nLines an odd number to make sure more bits are involved when + // determining which block. + if nLines%2 == 0 { + nLines++ + } + } + + nBytes := nLines * cacheLineSize + // +5: 4 bytes for num-lines, 1 byte for num-probes + buf, filter := extend(buf, nBytes+5) + + if nLines != 0 { + nProbes := calculateProbes(w.bitsPerKey) + for bIdx, b := range w.blocks { + length := hashBlockLen + if bIdx == len(w.blocks)-1 && w.numHashes%hashBlockLen != 0 { + length = w.numHashes % hashBlockLen + } + for _, h := range b[:length] { + delta := h>>17 | h<<15 // rotate right 17 bits + b := (h % uint32(nLines)) * (cacheLineBits) + for i := uint32(0); i < nProbes; i++ { + bitPos := b + (h % cacheLineBits) + filter[bitPos/8] |= (1 << (bitPos % 8)) + h += delta + } + } + } + filter[nBytes] = byte(nProbes) + binary.LittleEndian.PutUint32(filter[nBytes+1:], uint32(nLines)) + } + + // Release the hash blocks. + for i, b := range w.blocks { + hashBlockPool.Put(b) + w.blocks[i] = nil + } + w.blocks = w.blocks[:0] + w.numHashes = 0 + return buf +} + +// FilterPolicy implements the FilterPolicy interface from the pebble package. +// +// The integer value is the approximate number of bits used per key. A good +// value is 10, which yields a filter with ~ 1% false positive rate. +type FilterPolicy int + +var _ base.FilterPolicy = FilterPolicy(0) + +// Name implements the pebble.FilterPolicy interface. +func (p FilterPolicy) Name() string { + // This string looks arbitrary, but its value is written to LevelDB .sst + // files, and should be this exact value to be compatible with those files + // and with the C++ LevelDB code. + return "rocksdb.BuiltinBloomFilter" +} + +// MayContain implements the pebble.FilterPolicy interface. 
+func (p FilterPolicy) MayContain(ftype base.FilterType, f, key []byte) bool { + switch ftype { + case base.TableFilter: + return tableFilter(f).MayContain(key) + default: + panic(fmt.Sprintf("unknown filter type: %v", ftype)) + } +} + +// NewWriter implements the pebble.FilterPolicy interface. +func (p FilterPolicy) NewWriter(ftype base.FilterType) base.FilterWriter { + switch ftype { + case base.TableFilter: + return newTableFilterWriter(int(p)) + default: + panic(fmt.Sprintf("unknown filter type: %v", ftype)) + } +} diff --git a/vendor/github.com/cockroachdb/pebble/cache.go b/vendor/github.com/cockroachdb/pebble/v2/cache.go similarity index 93% rename from vendor/github.com/cockroachdb/pebble/cache.go rename to vendor/github.com/cockroachdb/pebble/v2/cache.go index 91f5532..f96e377 100644 --- a/vendor/github.com/cockroachdb/pebble/cache.go +++ b/vendor/github.com/cockroachdb/pebble/v2/cache.go @@ -4,7 +4,7 @@ package pebble -import "github.com/cockroachdb/pebble/internal/cache" +import "github.com/cockroachdb/pebble/v2/internal/cache" // Cache exports the cache.Cache type. 
type Cache = cache.Cache diff --git a/vendor/github.com/cockroachdb/pebble/checkpoint.go b/vendor/github.com/cockroachdb/pebble/v2/checkpoint.go similarity index 56% rename from vendor/github.com/cockroachdb/pebble/checkpoint.go rename to vendor/github.com/cockroachdb/pebble/v2/checkpoint.go index 6eb72fd..c4b7ca8 100644 --- a/vendor/github.com/cockroachdb/pebble/checkpoint.go +++ b/vendor/github.com/cockroachdb/pebble/v2/checkpoint.go @@ -5,14 +5,17 @@ package pebble import ( + "bytes" "io" "os" + "github.com/cockroachdb/errors" "github.com/cockroachdb/errors/oserror" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/record" - "github.com/cockroachdb/pebble/vfs" - "github.com/cockroachdb/pebble/vfs/atomicfs" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/record" + "github.com/cockroachdb/pebble/v2/vfs" + "github.com/cockroachdb/pebble/v2/vfs/atomicfs" ) // checkpointOptions hold the optional parameters to construct checkpoint @@ -67,13 +70,14 @@ type CheckpointSpan struct { // excludeFromCheckpoint returns true if an SST file should be excluded from the // checkpoint because it does not overlap with the spans of interest // (opt.restrictToSpans). -func excludeFromCheckpoint(f *fileMetadata, opt *checkpointOptions, cmp Compare) bool { +func excludeFromCheckpoint(f *manifest.TableMetadata, opt *checkpointOptions, cmp Compare) bool { if len(opt.restrictToSpans) == 0 { // Option not set; don't exclude anything. return false } for _, s := range opt.restrictToSpans { - if f.Overlaps(cmp, s.Start, s.End, true /* exclusiveEnd */) { + spanBounds := base.UserKeyBoundsEndExclusive(s.Start, s.End) + if f.Overlaps(cmp, &spanBounds) { return false } } @@ -132,6 +136,11 @@ func mkdirAllAndSyncParents(fs vfs.FS, destDir string) (vfs.File, error) { // space overhead for a checkpoint if hard links are disabled. 
Also beware that // even if hard links are used, the space overhead for the checkpoint will // increase over time as the DB performs compactions. +// +// Note that shared files in a checkpoint could get deleted if the DB is +// restarted after a checkpoint operation, as the reference for the checkpoint +// is only maintained in memory. This is okay as long as users of Checkpoint +// crash shortly afterwards with a "poison file" preventing further restarts. func (d *DB) Checkpoint( destDir string, opts ...CheckpointOption, ) ( @@ -161,6 +170,9 @@ func (d *DB) Checkpoint( } // Disable file deletions. + // We acquire a reference on the version down below that will prevent any + // sstables or blob files from becoming "obsolete" and potentially deleted, + // but this doesn't protect the current WALs or manifests. d.mu.Lock() d.disableFileDeletions() defer func() { @@ -177,22 +189,35 @@ func (d *DB) Checkpoint( // we read, otherwise we might copy a versionEdit not reflected in the // sstables we copy/link. d.mu.versions.logLock() - // Get the unflushed log files, the current version, and the current manifest - // file number. - memQueue := d.mu.mem.queue + // Get the the current version and the current manifest file number. current := d.mu.versions.currentVersion() formatVers := d.FormatMajorVersion() manifestFileNum := d.mu.versions.manifestFileNum manifestSize := d.mu.versions.manifest.Size() optionsFileNum := d.optionsFileNum + virtualBackingFiles := make(map[base.DiskFileNum]struct{}) - for diskFileNum := range d.mu.versions.backingState.fileBackingMap { - virtualBackingFiles[diskFileNum] = struct{}{} - } - // Release the manifest and DB.mu so we don't block other operations on - // the database. 
+ d.mu.versions.latest.virtualBackings.ForEach(func(backing *manifest.TableBacking) { + virtualBackingFiles[backing.DiskFileNum] = struct{}{} + }) + versionBlobFiles := d.mu.versions.latest.blobFiles.Metadatas() + + // Acquire the logs while holding mutexes to ensure we don't race with a + // flush that might mark a log that's relevant to `current` as obsolete + // before our call to List. + allLogicalLogs := d.mu.log.manager.List() + + // Release the manifest and DB.mu so we don't block other operations on the + // database. + // + // But first reference the version to ensure that the version's in-memory + // state and its physical files remain available for the checkpoint. In + // particular, the Version.BlobFileSet is only valid while a version is + // referenced. + current.Ref() d.mu.versions.logUnlock() d.mu.Unlock() + defer current.Unref() // Wrap the normal filesystem with one which wraps newly created files with // vfs.NewSyncingFile. @@ -218,10 +243,10 @@ func (d *DB) Checkpoint( } { - // Link or copy the OPTIONS. - srcPath := base.MakeFilepath(fs, d.dirname, fileTypeOptions, optionsFileNum) + // Copy the OPTIONS. + srcPath := base.MakeFilepath(fs, d.dirname, base.FileTypeOptions, optionsFileNum) destPath := fs.PathJoin(destDir, fs.PathBase(srcPath)) - ckErr = vfs.LinkOrCopy(fs, srcPath, destPath) + ckErr = copyCheckpointOptions(fs, srcPath, destPath) if ckErr != nil { return ckErr } @@ -249,36 +274,78 @@ func (d *DB) Checkpoint( } } - var excludedFiles map[deletedFileEntry]*fileMetadata - // Set of FileBacking.DiskFileNum which will be required by virtual sstables + var excludedTables map[manifest.DeletedTableEntry]*manifest.TableMetadata + var includedBlobFiles map[base.BlobFileID]struct{} + var remoteFiles []base.DiskFileNum + // Set of TableBacking.DiskFileNum which will be required by virtual sstables // in the checkpoint. 
requiredVirtualBackingFiles := make(map[base.DiskFileNum]struct{}) + + copyFile := func(typ base.FileType, fileNum base.DiskFileNum) error { + meta, err := d.objProvider.Lookup(typ, fileNum) + if err != nil { + return err + } + if meta.IsRemote() { + // We don't copy remote files. This is desirable as checkpointing is + // supposed to be a fast operation, and references to remote files can + // always be resolved by any checkpoint readers by reading the object + // catalog. We don't add this file to excludedFiles either, as that'd + // cause it to be deleted in the second manifest entry which is also + // inaccurate. + remoteFiles = append(remoteFiles, meta.DiskFileNum) + return nil + } + srcPath := base.MakeFilepath(fs, d.dirname, typ, fileNum) + destPath := fs.PathJoin(destDir, fs.PathBase(srcPath)) + return vfs.LinkOrCopy(fs, srcPath, destPath) + } + // Link or copy the sstables. for l := range current.Levels { iter := current.Levels[l].Iter() for f := iter.First(); f != nil; f = iter.Next() { if excludeFromCheckpoint(f, opt, d.cmp) { - if excludedFiles == nil { - excludedFiles = make(map[deletedFileEntry]*fileMetadata) + if excludedTables == nil { + excludedTables = make(map[manifest.DeletedTableEntry]*manifest.TableMetadata) } - excludedFiles[deletedFileEntry{ + excludedTables[manifest.DeletedTableEntry{ Level: l, - FileNum: f.FileNum, + FileNum: f.TableNum, }] = f continue } - fileBacking := f.FileBacking + // Copy any referenced blob files that have not already been copied. + if len(f.BlobReferences) > 0 { + if includedBlobFiles == nil { + includedBlobFiles = make(map[base.BlobFileID]struct{}) + } + for _, ref := range f.BlobReferences { + if _, ok := includedBlobFiles[ref.FileID]; !ok { + includedBlobFiles[ref.FileID] = struct{}{} + + // Map the BlobFileID to a DiskFileNum in the current version. 
+ diskFileNum, ok := current.BlobFiles.Lookup(ref.FileID) + if !ok { + return errors.Errorf("blob file %s not found", ref.FileID) + } + ckErr = copyFile(base.FileTypeBlob, diskFileNum) + if ckErr != nil { + return ckErr + } + } + } + } + + tableBacking := f.TableBacking if f.Virtual { - if _, ok := requiredVirtualBackingFiles[fileBacking.DiskFileNum]; ok { + if _, ok := requiredVirtualBackingFiles[tableBacking.DiskFileNum]; ok { continue } - requiredVirtualBackingFiles[fileBacking.DiskFileNum] = struct{}{} + requiredVirtualBackingFiles[tableBacking.DiskFileNum] = struct{}{} } - - srcPath := base.MakeFilepath(fs, d.dirname, fileTypeTable, fileBacking.DiskFileNum) - destPath := fs.PathJoin(destDir, fs.PathBase(srcPath)) - ckErr = vfs.LinkOrCopy(fs, srcPath, destPath) + ckErr = copyFile(base.FileTypeTable, tableBacking.DiskFileNum) if ckErr != nil { return ckErr } @@ -293,28 +360,55 @@ func (d *DB) Checkpoint( removeBackingTables = append(removeBackingTables, diskFileNum) } } + // Record the blob files that are not referenced by any included sstables. + // When we write the MANIFEST of the checkpoint, we'll include a final + // VersionEdit that removes these blob files so that the checkpointed + // manifest is consistent. 
+ var excludedBlobFiles map[manifest.DeletedBlobFileEntry]*manifest.PhysicalBlobFile + if len(includedBlobFiles) < len(versionBlobFiles) { + excludedBlobFiles = make(map[manifest.DeletedBlobFileEntry]*manifest.PhysicalBlobFile, len(versionBlobFiles)-len(includedBlobFiles)) + for _, meta := range versionBlobFiles { + if _, ok := includedBlobFiles[meta.FileID]; !ok { + excludedBlobFiles[manifest.DeletedBlobFileEntry{ + FileID: meta.FileID, + FileNum: meta.Physical.FileNum, + }] = meta.Physical + } + } + } ckErr = d.writeCheckpointManifest( - fs, formatVers, destDir, dir, manifestFileNum.DiskFileNum(), manifestSize, - excludedFiles, removeBackingTables, + fs, formatVers, destDir, dir, manifestFileNum, manifestSize, + excludedTables, removeBackingTables, excludedBlobFiles, ) if ckErr != nil { return ckErr } + if len(remoteFiles) > 0 { + ckErr = d.objProvider.CheckpointState(fs, destDir, remoteFiles) + if ckErr != nil { + return ckErr + } + } // Copy the WAL files. We copy rather than link because WAL file recycling // will cause the WAL files to be reused which would invalidate the - // checkpoint. - for i := range memQueue { - logNum := memQueue[i].logNum - if logNum == 0 { - continue - } - srcPath := base.MakeFilepath(fs, d.walDirname, fileTypeLog, logNum.DiskFileNum()) - destPath := fs.PathJoin(destDir, fs.PathBase(srcPath)) - ckErr = vfs.Copy(fs, srcPath, destPath) - if ckErr != nil { - return ckErr + // checkpoint. It's possible allLogicalLogs includes logs that are not + // relevant (beneath the version's MinUnflushedLogNum). These extra files + // are harmless. The earlier (wal.Manager).List call will not include + // obsolete logs that are sitting in the recycler or have already been + // passed off to the cleanup manager for deletion. + // + // TODO(jackson): It would be desirable to copy all recycling and obsolete + // WALs to aid corruption postmortem debugging should we need them. 
+ for _, log := range allLogicalLogs { + for i := 0; i < log.NumSegments(); i++ { + srcFS, srcPath := log.SegmentLocation(i) + destPath := fs.PathJoin(destDir, srcFS.PathBase(srcPath)) + ckErr = vfs.CopyAcrossFS(srcFS, srcPath, fs, destPath) + if ckErr != nil { + return ckErr + } } } @@ -328,6 +422,58 @@ func (d *DB) Checkpoint( return ckErr } +// copyCheckpointOptions copies an OPTIONS file, commenting out some options +// that existed on the original database but no longer apply to the checkpointed +// database. For example, the entire [WAL Failover] stanza is commented out +// because Checkpoint will copy all WAL segment files from both the primary and +// secondary WAL directories into the checkpoint. +func copyCheckpointOptions(fs vfs.FS, srcPath, dstPath string) error { + var buf bytes.Buffer + f, err := fs.Open(srcPath) + if err != nil { + return err + } + defer f.Close() + b, err := io.ReadAll(f) + if err != nil { + return err + } + // Copy the OPTIONS file verbatim, but commenting out the [WAL Failover] + // section. 
+ err = parseOptions(string(b), parseOptionsFuncs{ + visitNewSection: func(startOff, endOff int, section string) error { + if section == "WAL Failover" { + buf.WriteString("# ") + } + buf.Write(b[startOff:endOff]) + return nil + }, + visitKeyValue: func(startOff, endOff int, section, key, value string) error { + if section == "WAL Failover" { + buf.WriteString("# ") + } + buf.Write(b[startOff:endOff]) + return nil + }, + visitCommentOrWhitespace: func(startOff, endOff int, line string) error { + buf.Write(b[startOff:endOff]) + return nil + }, + }) + if err != nil { + return err + } + nf, err := fs.Create(dstPath, vfs.WriteCategoryUnspecified) + if err != nil { + return err + } + _, err = io.Copy(nf, &buf) + if err != nil { + return err + } + return errors.CombineErrors(nf.Sync(), nf.Close()) +} + func (d *DB) writeCheckpointManifest( fs vfs.FS, formatVers FormatMajorVersion, @@ -335,8 +481,9 @@ func (d *DB) writeCheckpointManifest( destDir vfs.File, manifestFileNum base.DiskFileNum, manifestSize int64, - excludedFiles map[deletedFileEntry]*fileMetadata, + excludedTables map[manifest.DeletedTableEntry]*manifest.TableMetadata, removeBackingTables []base.DiskFileNum, + excludedBlobFiles map[manifest.DeletedBlobFileEntry]*manifest.PhysicalBlobFile, ) error { // Copy the MANIFEST, and create a pointer to it. We copy rather // than link because additional version edits added to the @@ -347,7 +494,7 @@ func (d *DB) writeCheckpointManifest( // If some files are excluded from the checkpoint, also append a block that // records those files as deleted. 
if err := func() error { - srcPath := base.MakeFilepath(fs, d.dirname, fileTypeManifest, manifestFileNum) + srcPath := base.MakeFilepath(fs, d.dirname, base.FileTypeManifest, manifestFileNum) destPath := fs.PathJoin(destDirPath, fs.PathBase(srcPath)) src, err := fs.Open(srcPath, vfs.SequentialReadsOption) if err != nil { @@ -355,7 +502,7 @@ func (d *DB) writeCheckpointManifest( } defer src.Close() - dst, err := fs.Create(destPath) + dst, err := fs.Create(destPath, vfs.WriteCategoryUnspecified) if err != nil { return err } @@ -365,7 +512,7 @@ func (d *DB) writeCheckpointManifest( // need to append another record with the excluded files (we cannot simply // append a record after a raw data copy; see // https://github.com/cockroachdb/cockroach/issues/100935). - r := record.NewReader(&io.LimitedReader{R: src, N: manifestSize}, manifestFileNum.FileNum()) + r := record.NewReader(&io.LimitedReader{R: src, N: manifestSize}, manifestFileNum) w := record.NewWriter(dst) for { rr, err := r.Next() @@ -385,11 +532,12 @@ func (d *DB) writeCheckpointManifest( } } - if len(excludedFiles) > 0 { + if len(excludedTables) > 0 || len(excludedBlobFiles) > 0 { // Write out an additional VersionEdit that deletes the excluded SST files. - ve := versionEdit{ - DeletedFiles: excludedFiles, + ve := manifest.VersionEdit{ + DeletedTables: excludedTables, RemovedBackingTables: removeBackingTables, + DeletedBlobFiles: excludedBlobFiles, } rw, err := w.Next() @@ -408,17 +556,12 @@ func (d *DB) writeCheckpointManifest( return err } - // Recent format versions use an atomic marker for setting the - // active manifest. Older versions use the CURRENT file. The - // setCurrentFunc function will return a closure that will - // take the appropriate action for the database's format - // version. 
var manifestMarker *atomicfs.Marker manifestMarker, _, err := atomicfs.LocateMarker(fs, destDirPath, manifestMarkerName) if err != nil { return err } - if err := setCurrentFunc(formatVers, manifestMarker, fs, destDirPath, destDir)(manifestFileNum.FileNum()); err != nil { + if err := manifestMarker.Move(base.MakeFilename(base.FileTypeManifest, manifestFileNum)); err != nil { return err } return manifestMarker.Close() diff --git a/vendor/github.com/cockroachdb/pebble/commit.go b/vendor/github.com/cockroachdb/pebble/v2/commit.go similarity index 95% rename from vendor/github.com/cockroachdb/pebble/commit.go rename to vendor/github.com/cockroachdb/pebble/v2/commit.go index 38cdbb8..b9efed7 100644 --- a/vendor/github.com/cockroachdb/pebble/commit.go +++ b/vendor/github.com/cockroachdb/pebble/v2/commit.go @@ -8,9 +8,11 @@ import ( "runtime" "sync" "sync/atomic" - "time" - "github.com/cockroachdb/pebble/record" + "github.com/cockroachdb/crlib/crtime" + "github.com/cockroachdb/pebble/v2/batchrepr" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/record" ) // commitQueue is a lock-free fixed-size single-producer, multi-consumer @@ -125,10 +127,10 @@ func (q *commitQueue) dequeueApplied() *Batch { type commitEnv struct { // The next sequence number to give to a batch. Protected by // commitPipeline.mu. - logSeqNum *atomic.Uint64 + logSeqNum *base.AtomicSeqNum // The visible sequence number at which reads should be performed. Ratcheted // upwards atomically as batches are applied to the memtable. - visibleSeqNum *atomic.Uint64 + visibleSeqNum *base.AtomicSeqNum // Apply the batch to the specified memtable. Called concurrently. apply func(b *Batch, mem *memTable) error @@ -298,13 +300,13 @@ func (p *commitPipeline) Commit(b *Batch, syncWAL bool, noSyncWait bool) error { return nil } - commitStartTime := time.Now() + commitStartTime := crtime.NowMono() // Acquire semaphores. 
p.commitQueueSem <- struct{}{} if syncWAL { p.logSyncQSem <- struct{}{} } - b.commitStats.SemaphoreWaitDuration = time.Since(commitStartTime) + b.commitStats.SemaphoreWaitDuration = commitStartTime.Elapsed() // Prepare the batch for committing: enqueuing the batch in the pending // queue, determining the batch sequence number and writing the data to the @@ -346,7 +348,7 @@ func (p *commitPipeline) Commit(b *Batch, syncWAL bool, noSyncWait bool) error { // b.commitErr. We will read b.commitErr in Batch.SyncWait after the // LogWriter is done writing. - b.commitStats.TotalDuration = time.Since(commitStartTime) + b.commitStats.TotalDuration = commitStartTime.Elapsed() return err } @@ -359,18 +361,18 @@ func (p *commitPipeline) Commit(b *Batch, syncWAL bool, noSyncWait bool) error { // invoked with commitPipeline.mu held, but note that DB.mu is not held and // must be locked if necessary. func (p *commitPipeline) AllocateSeqNum( - count int, prepare func(seqNum uint64), apply func(seqNum uint64), + count int, prepare func(seqNum base.SeqNum), apply func(seqNum base.SeqNum), ) { // This method is similar to Commit and prepare. Be careful about trying to // share additional code with those methods because Commit and prepare are // performance critical code paths. b := newBatch(nil) - defer b.release() + defer func() { _ = b.Close() }() // Give the batch a count of 1 so that the log and visible sequence number // are incremented correctly. - b.data = make([]byte, batchHeaderLen) + b.data = make([]byte, batchrepr.HeaderLen) b.setCount(uint32(count)) b.commit.Add(1) @@ -386,7 +388,7 @@ func (p *commitPipeline) AllocateSeqNum( // Assign the batch a sequence number. Note that we use atomic operations // here to handle concurrent reads of logSeqNum. commitPipeline.mu provides // mutual exclusion for other goroutines writing to logSeqNum. 
- logSeqNum := p.env.logSeqNum.Add(uint64(count)) - uint64(count) + logSeqNum := p.env.logSeqNum.Add(base.SeqNum(count)) - base.SeqNum(count) seqNum := logSeqNum if seqNum == 0 { // We can't use the value 0 for the global seqnum during ingestion, because @@ -460,7 +462,7 @@ func (p *commitPipeline) prepare(b *Batch, syncWAL bool, noSyncWait bool) (*memT // Assign the batch a sequence number. Note that we use atomic operations // here to handle concurrent reads of logSeqNum. commitPipeline.mu provides // mutual exclusion for other goroutines writing to logSeqNum. - b.setSeqNum(p.env.logSeqNum.Add(n) - n) + b.setSeqNum(p.env.logSeqNum.Add(base.SeqNum(n)) - base.SeqNum(n)) // Write the data to the WAL. mem, err := p.env.write(b, syncWG, syncErr) @@ -486,9 +488,9 @@ func (p *commitPipeline) publish(b *Batch) { if t == nil { // Wait for another goroutine to publish us. We might also be waiting for // the WAL sync to finish. - now := time.Now() + now := crtime.NowMono() b.commit.Wait() - b.commitStats.CommitWaitDuration += time.Since(now) + b.commitStats.CommitWaitDuration += now.Elapsed() break } if !t.applied.Load() { @@ -501,7 +503,7 @@ func (p *commitPipeline) publish(b *Batch) { // that the sequence number ratchets up. for { curSeqNum := p.env.visibleSeqNum.Load() - newSeqNum := t.SeqNum() + uint64(t.Count()) + newSeqNum := t.SeqNum() + base.SeqNum(t.Count()) if newSeqNum <= curSeqNum { // t's sequence number has already been published. break diff --git a/vendor/github.com/cockroachdb/pebble/v2/compaction.go b/vendor/github.com/cockroachdb/pebble/v2/compaction.go new file mode 100644 index 0000000..d035141 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/compaction.go @@ -0,0 +1,3676 @@ +// Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "bytes" + stdcmp "cmp" + "context" + "fmt" + "iter" + "maps" + "math" + "runtime/pprof" + "slices" + "sort" + "sync/atomic" + "time" + "unsafe" + + "github.com/cockroachdb/crlib/crtime" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/compact" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/problemspans" + "github.com/cockroachdb/pebble/v2/internal/sstableinternal" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/objiotracing" + "github.com/cockroachdb/pebble/v2/objstorage/remote" + "github.com/cockroachdb/pebble/v2/sstable" + "github.com/cockroachdb/pebble/v2/sstable/blob" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/vfs" + "github.com/cockroachdb/redact" +) + +var errEmptyTable = errors.New("pebble: empty table") + +// ErrCancelledCompaction is returned if a compaction is cancelled by a +// concurrent excise or ingest-split operation. +var ErrCancelledCompaction = errors.New("pebble: compaction cancelled by a concurrent operation, will retry compaction") + +var flushLabels = pprof.Labels("pebble", "flush", "output-level", "L0") +var gcLabels = pprof.Labels("pebble", "gc") + +// expandedCompactionByteSizeLimit is the maximum number of bytes in all +// compacted files. We avoid expanding the lower level file set of a compaction +// if it would make the total compaction cover more than this many bytes. +func expandedCompactionByteSizeLimit( + opts *Options, targetFileSize int64, availBytes uint64, +) uint64 { + v := uint64(25 * targetFileSize) + + // Never expand a compaction beyond half the available capacity, divided + // by the maximum number of concurrent compactions. 
Each of the concurrent + // compactions may expand up to this limit, so this attempts to limit + // compactions to half of available disk space. Note that this will not + // prevent compaction picking from pursuing compactions that are larger + // than this threshold before expansion. + // + // NB: this heuristic is an approximation since we may run more compactions + // than the upper concurrency limit. + _, maxConcurrency := opts.CompactionConcurrencyRange() + diskMax := (availBytes / 2) / uint64(maxConcurrency) + if v > diskMax { + v = diskMax + } + return v +} + +// maxGrandparentOverlapBytes is the maximum bytes of overlap with level+1 +// before we stop building a single file in a level-1 to level compaction. +func maxGrandparentOverlapBytes(targetFileSize int64) uint64 { + return uint64(10 * targetFileSize) +} + +// maxReadCompactionBytes is used to prevent read compactions which +// are too wide. +func maxReadCompactionBytes(targetFileSize int64) uint64 { + return uint64(10 * targetFileSize) +} + +// noCloseIter wraps around a FragmentIterator, intercepting and eliding +// calls to Close. It is used during compaction to ensure that rangeDelIters +// are not closed prematurely. +type noCloseIter struct { + keyspan.FragmentIterator +} + +func (i *noCloseIter) Close() {} + +type compactionLevel struct { + level int + files manifest.LevelSlice + // l0SublevelInfo contains information about L0 sublevels being compacted. + // It's only set for the start level of a compaction starting out of L0 and + // is nil for all other compactions. 
+ l0SublevelInfo []sublevelInfo +} + +func (cl compactionLevel) Clone() compactionLevel { + newCL := compactionLevel{ + level: cl.level, + files: cl.files, + } + return newCL +} +func (cl compactionLevel) String() string { + return fmt.Sprintf(`Level %d, Files %s`, cl.level, cl.files) +} + +// compactionWritable is a objstorage.Writable wrapper that, on every write, +// updates a metric in `versions` on bytes written by in-progress compactions so +// far. It also increments a per-compaction `written` atomic int. +type compactionWritable struct { + objstorage.Writable + + versions *versionSet + written *atomic.Int64 +} + +// Write is part of the objstorage.Writable interface. +func (c *compactionWritable) Write(p []byte) error { + if err := c.Writable.Write(p); err != nil { + return err + } + + c.written.Add(int64(len(p))) + c.versions.incrementCompactionBytes(int64(len(p))) + return nil +} + +type compactionKind int + +const ( + compactionKindDefault compactionKind = iota + compactionKindFlush + // compactionKindMove denotes a move compaction where the input file is + // retained and linked in a new level without being obsoleted. + compactionKindMove + // compactionKindCopy denotes a copy compaction where the input file is + // copied byte-by-byte into a new file with a new TableNum in the output level. + compactionKindCopy + // compactionKindDeleteOnly denotes a compaction that only deletes input + // files. It can occur when wide range tombstones completely contain sstables. 
+ compactionKindDeleteOnly + compactionKindElisionOnly + compactionKindRead + compactionKindTombstoneDensity + compactionKindRewrite + compactionKindIngestedFlushable + compactionKindBlobFileRewrite +) + +func (k compactionKind) String() string { + switch k { + case compactionKindDefault: + return "default" + case compactionKindFlush: + return "flush" + case compactionKindMove: + return "move" + case compactionKindDeleteOnly: + return "delete-only" + case compactionKindElisionOnly: + return "elision-only" + case compactionKindRead: + return "read" + case compactionKindTombstoneDensity: + return "tombstone-density" + case compactionKindRewrite: + return "rewrite" + case compactionKindIngestedFlushable: + return "ingested-flushable" + case compactionKindCopy: + return "copy" + case compactionKindBlobFileRewrite: + return "blob-file-rewrite" + } + return "?" +} + +// compactingOrFlushing returns "flushing" if the compaction kind is a flush, +// otherwise it returns "compacting". +func (k compactionKind) compactingOrFlushing() string { + if k == compactionKindFlush { + return "flushing" + } + return "compacting" +} + +type compaction interface { + AddInProgressLocked(*DB) + BeganAt() time.Time + Bounds() *base.UserKeyBounds + Cancel() + Execute(JobID, *DB) error + GrantHandle() CompactionGrantHandle + Info() compactionInfo + IsDownload() bool + IsFlush() bool + PprofLabels(UserKeyCategories) pprof.LabelSet + RecordError(*problemspans.ByLevel, error) + Tables() iter.Seq2[int, *manifest.TableMetadata] + VersionEditApplied() bool +} + +// tableCompaction is a table compaction from one level to the next, starting +// from a given version. It implements the compaction interface. +type tableCompaction struct { + // cancel is a bool that can be used by other goroutines to signal a compaction + // to cancel, such as if a conflicting excise operation raced it to manifest + // application. Only holders of the manifest lock will write to this atomic. 
+ cancel atomic.Bool + // kind indicates the kind of compaction. Different compaction kinds have + // different semantics and mechanics. Some may have additional fields. + kind compactionKind + // isDownload is true if this compaction was started as part of a Download + // operation. In this case kind is compactionKindCopy or + // compactionKindRewrite. + isDownload bool + + comparer *base.Comparer + logger Logger + version *manifest.Version + // versionEditApplied is set to true when a compaction has completed and the + // resulting version has been installed (if successful), but the compaction + // goroutine is still cleaning up (eg, deleting obsolete files). + versionEditApplied bool + // getValueSeparation constructs a compact.ValueSeparation for use in a + // compaction. It implements heuristics around choosing whether a compaction + // should: + // + // a) preserve existing blob references: The compaction does not write any + // new blob files, but propagates existing references to blob files.This + // conserves write bandwidth by avoiding rewriting the referenced values. It + // also reduces the locality of the referenced values which can reduce scan + // performance because a scan must load values from more unique blob files. + // It can also delay reclamation of disk space if some of the references to + // blob values are elided by the compaction, increasing space amplification. + // + // b) rewrite blob files: The compaction will write eligible values to new + // blob files. This consumes more write bandwidth because all values are + // rewritten. However it restores locality. + getValueSeparation func(JobID, *tableCompaction, sstable.TableFormat) compact.ValueSeparation + + // startLevel is the level that is being compacted. Inputs from startLevel + // and outputLevel will be merged to produce a set of outputLevel files. + startLevel *compactionLevel + + // outputLevel is the level that files are being produced in. 
outputLevel is + // equal to startLevel+1 except when: + // - if startLevel is 0, the output level equals compactionPicker.baseLevel(). + // - in multilevel compaction, the output level is the lowest level involved in + // the compaction + // A compaction's outputLevel is nil for delete-only compactions. + outputLevel *compactionLevel + + // extraLevels point to additional levels in between the input and output + // levels that get compacted in multilevel compactions + extraLevels []*compactionLevel + + inputs []compactionLevel + + // maxOutputFileSize is the maximum size of an individual table created + // during compaction. + maxOutputFileSize uint64 + // maxOverlapBytes is the maximum number of bytes of overlap allowed for a + // single output table with the tables in the grandparent level. + maxOverlapBytes uint64 + + // The boundaries of the input data. + bounds base.UserKeyBounds + + // grandparents are the tables in level+2 that overlap with the files being + // compacted. Used to determine output table boundaries. Do not assume that the actual files + // in the grandparent when this compaction finishes will be the same. + grandparents manifest.LevelSlice + + delElision compact.TombstoneElision + rangeKeyElision compact.TombstoneElision + + // deleteOnly contains information specific to compactions with kind + // compactionKindDeleteOnly. A delete-only compaction is a special + // compaction that does not merge or write sstables. Instead, it only + // performs deletions either through removing whole sstables from the LSM or + // virtualizing them into virtual sstables. + deleteOnly struct { + // hints are collected by the table stats collector and describe range + // deletions and the files containing keys deleted by them. + hints []deleteCompactionHint + // exciseEnabled is set to true if this compaction is allowed to excise + // files. 
If false, the compaction will only remove whole sstables that + // are wholly contained within the bounds of range deletions. + exciseEnabled bool + } + // flush contains information specific to flushes (compactionKindFlush and + // compactionKindIngestedFlushable). A flush is modeled by a compaction + // because it has similar mechanics to a default compaction. + flush struct { + // flushables contains the flushables (aka memtables, large batches, + // flushable ingestions, etc) that are being flushed. + flushables flushableList + // Boundaries at which sstables flushed to L0 should be split. + // Determined by L0Sublevels. If nil, ignored. + l0Limits [][]byte + } + // iterationState contains state used during compaction iteration. + iterationState struct { + // bufferPool is a pool of buffers used when reading blocks. Compactions + // do not populate the block cache under the assumption that the blocks + // we read will soon be irrelevant when their containing sstables are + // removed from the LSM. + bufferPool sstable.BufferPool + // keyspanIterClosers is a list of fragment iterators to close when the + // compaction finishes. As iteration opens new keyspan iterators, + // elements are appended. Keyspan iterators must remain open for the + // lifetime of the compaction, so they're accumulated here. When the + // compaction finishes, all the underlying keyspan iterators are closed. + keyspanIterClosers []*noCloseIter + // valueFetcher is used to fetch values from blob files. It's propagated + // down the iterator tree through the internal iterator options. + valueFetcher blob.ValueFetcher + } + // metrics encapsulates various metrics collected during a compaction. + metrics compactionMetrics + + grantHandle CompactionGrantHandle + + tableFormat sstable.TableFormat + objCreateOpts objstorage.CreateOptions + + annotations []string +} + +// Assert that tableCompaction implements the compaction interface. 
+var _ compaction = (*tableCompaction)(nil) + +func (c *tableCompaction) AddInProgressLocked(d *DB) { + d.mu.compact.inProgress[c] = struct{}{} + var isBase, isIntraL0 bool + for _, cl := range c.inputs { + for f := range cl.files.All() { + if f.IsCompacting() { + d.opts.Logger.Fatalf("L%d->L%d: %s already being compacted", c.startLevel.level, c.outputLevel.level, f.TableNum) + } + f.SetCompactionState(manifest.CompactionStateCompacting) + if c.startLevel != nil && c.outputLevel != nil && c.startLevel.level == 0 { + if c.outputLevel.level == 0 { + f.IsIntraL0Compacting = true + isIntraL0 = true + } else { + isBase = true + } + } + } + } + + if isIntraL0 || isBase { + l0Inputs := []manifest.LevelSlice{c.startLevel.files} + if isIntraL0 { + l0Inputs = append(l0Inputs, c.outputLevel.files) + } + if err := d.mu.versions.latest.l0Organizer.UpdateStateForStartedCompaction(l0Inputs, isBase); err != nil { + d.opts.Logger.Fatalf("could not update state for compaction: %s", err) + } + } +} + +func (c *tableCompaction) BeganAt() time.Time { return c.metrics.beganAt } +func (c *tableCompaction) Bounds() *base.UserKeyBounds { return &c.bounds } +func (c *tableCompaction) Cancel() { c.cancel.Store(true) } + +func (c *tableCompaction) Execute(jobID JobID, d *DB) error { + c.grantHandle.Started() + err := d.compact1(jobID, c) + // The version stored in the compaction is ref'd when the compaction is + // created. We're responsible for un-refing it when the compaction is + // complete. + if c.version != nil { + c.version.UnrefLocked() + } + return err +} + +func (c *tableCompaction) RecordError(problemSpans *problemspans.ByLevel, err error) { + // Record problem spans for a short duration, unless the error is a + // corruption. + expiration := 30 * time.Second + if IsCorruptionError(err) { + // TODO(radu): ideally, we should be using the corruption reporting + // mechanism which has a tighter span for the corruption. We would need to + // somehow plumb the level of the file. 
+ expiration = 5 * time.Minute + } + + for i := range c.inputs { + level := c.inputs[i].level + if level == 0 { + // We do not set problem spans on L0, as they could block flushes. + continue + } + it := c.inputs[i].files.Iter() + for f := it.First(); f != nil; f = it.Next() { + problemSpans.Add(level, f.UserKeyBounds(), expiration) + } + } +} + +func (c *tableCompaction) GrantHandle() CompactionGrantHandle { return c.grantHandle } +func (c *tableCompaction) IsDownload() bool { return c.isDownload } +func (c *tableCompaction) IsFlush() bool { return len(c.flush.flushables) > 0 } +func (c *tableCompaction) Info() compactionInfo { + info := compactionInfo{ + versionEditApplied: c.versionEditApplied, + kind: c.kind, + inputs: c.inputs, + bounds: &c.bounds, + outputLevel: -1, + } + if c.outputLevel != nil { + info.outputLevel = c.outputLevel.level + } + return info +} +func (c *tableCompaction) PprofLabels(kc UserKeyCategories) pprof.LabelSet { + activity := "compact" + if len(c.flush.flushables) != 0 { + activity = "flush" + } + level := "L?" + // Delete-only compactions don't have an output level. + if c.outputLevel != nil { + level = fmt.Sprintf("L%d", c.outputLevel.level) + } + if kc.Len() > 0 { + cat := kc.CategorizeKeyRange(c.bounds.Start, c.bounds.End.Key) + return pprof.Labels("pebble", activity, "output-level", level, "key-type", cat) + } + return pprof.Labels("pebble", activity, "output-level", level) +} + +func (c *tableCompaction) Tables() iter.Seq2[int, *manifest.TableMetadata] { + return func(yield func(int, *manifest.TableMetadata) bool) { + for _, cl := range c.inputs { + for f := range cl.files.All() { + if !yield(cl.level, f) { + return + } + } + } + } +} + +func (c *tableCompaction) VersionEditApplied() bool { return c.versionEditApplied } + +// compactionMetrics contians metrics surrounding a compaction. +type compactionMetrics struct { + // beganAt is the time when the compaction began. 
+ beganAt time.Time + // bytesWritten contains the number of bytes that have been written to + // outputs. It's updated whenever the compaction outputs' + // objstorage.Writables receive new writes. See newCompactionOutputObj. + bytesWritten atomic.Int64 + // internalIterStats contains statistics from the internal iterators used by + // the compaction. + // + // TODO(jackson): Use these to power the compaction BytesRead metric. + internalIterStats base.InternalIteratorStats + // perLevel contains metrics for each level involved in the compaction. + perLevel levelMetricsDelta + // picker contains metrics from the compaction picker when the compaction + // was picked. + picker pickedCompactionMetrics +} + +// inputLargestSeqNumAbsolute returns the maximum LargestSeqNumAbsolute of any +// input sstables. +func (c *tableCompaction) inputLargestSeqNumAbsolute() base.SeqNum { + var seqNum base.SeqNum + for _, cl := range c.inputs { + for m := range cl.files.All() { + seqNum = max(seqNum, m.LargestSeqNumAbsolute) + } + } + return seqNum +} + +func (c *tableCompaction) makeInfo(jobID JobID) CompactionInfo { + info := CompactionInfo{ + JobID: int(jobID), + Reason: c.kind.String(), + Input: make([]LevelInfo, 0, len(c.inputs)), + Annotations: []string{}, + } + if c.isDownload { + info.Reason = "download," + info.Reason + } + for _, cl := range c.inputs { + inputInfo := LevelInfo{Level: cl.level, Tables: nil} + for m := range cl.files.All() { + inputInfo.Tables = append(inputInfo.Tables, m.TableInfo()) + } + info.Input = append(info.Input, inputInfo) + } + if c.outputLevel != nil { + info.Output.Level = c.outputLevel.level + + // If there are no inputs from the output level (eg, a move + // compaction), add an empty LevelInfo to info.Input. + if len(c.inputs) > 0 && c.inputs[len(c.inputs)-1].level != c.outputLevel.level { + info.Input = append(info.Input, LevelInfo{Level: c.outputLevel.level}) + } + } else { + // For a delete-only compaction, set the output level to L6. 
The + // output level is not meaningful here, but complicating the + // info.Output interface with a pointer doesn't seem worth the + // semantic distinction. + info.Output.Level = numLevels - 1 + } + + for i, score := range c.metrics.picker.scores { + info.Input[i].Score = score + } + info.SingleLevelOverlappingRatio = c.metrics.picker.singleLevelOverlappingRatio + info.MultiLevelOverlappingRatio = c.metrics.picker.multiLevelOverlappingRatio + if len(info.Input) > 2 { + info.Annotations = append(info.Annotations, "multilevel") + } + return info +} + +type getValueSeparation func(JobID, *tableCompaction, sstable.TableFormat) compact.ValueSeparation + +// newCompaction constructs a compaction from the provided picked compaction. +// +// The compaction is created with a reference to its version that must be +// released when the compaction is complete. +func newCompaction( + pc *pickedTableCompaction, + opts *Options, + beganAt time.Time, + provider objstorage.Provider, + grantHandle CompactionGrantHandle, + tableFormat sstable.TableFormat, + getValueSeparation getValueSeparation, +) *tableCompaction { + c := &tableCompaction{ + kind: compactionKindDefault, + comparer: opts.Comparer, + inputs: pc.inputs, + bounds: pc.bounds, + logger: opts.Logger, + version: pc.version, + getValueSeparation: getValueSeparation, + maxOutputFileSize: pc.maxOutputFileSize, + maxOverlapBytes: pc.maxOverlapBytes, + metrics: compactionMetrics{ + beganAt: beganAt, + picker: pc.pickerMetrics, + }, + grantHandle: grantHandle, + tableFormat: tableFormat, + } + // Acquire a reference to the version to ensure that files and in-memory + // version state necessary for reading files remain available. Ignoring + // excises, this isn't strictly necessary for reading the sstables that are + // inputs to the compaction because those files are 'marked as compacting' + // and shouldn't be subject to any competing compactions. 
However with + // excises, a concurrent excise may remove a compaction's file from the + // Version and then cancel the compaction. The file shouldn't be physically + // removed until the cancelled compaction stops reading it. + // + // Additionally, we need any blob files referenced by input sstables to + // remain available, even if the blob file is rewritten. Maintaining a + // reference ensures that all these files remain available for the + // compaction's reads. + c.version.Ref() + + c.startLevel = &c.inputs[0] + if pc.startLevel.l0SublevelInfo != nil { + c.startLevel.l0SublevelInfo = pc.startLevel.l0SublevelInfo + } + + c.outputLevel = &c.inputs[len(c.inputs)-1] + + if len(pc.inputs) > 2 { + // TODO(xinhaoz): Look into removing extraLevels on the compaction struct. + c.extraLevels = make([]*compactionLevel, 0, len(pc.inputs)-2) + for i := 1; i < len(pc.inputs)-1; i++ { + c.extraLevels = append(c.extraLevels, &c.inputs[i]) + } + } + // Compute the set of outputLevel+1 files that overlap this compaction (these + // are the grandparent sstables). + if c.outputLevel.level+1 < numLevels { + c.grandparents = c.version.Overlaps(c.outputLevel.level+1, c.bounds) + } + c.delElision, c.rangeKeyElision = compact.SetupTombstoneElision( + c.comparer.Compare, c.version, pc.l0Organizer, c.outputLevel.level, c.bounds, + ) + c.kind = pc.kind + + preferSharedStorage := tableFormat >= FormatMinForSharedObjects.MaxTableFormat() && + remote.ShouldCreateShared(opts.Experimental.CreateOnShared, c.outputLevel.level) + c.maybeSwitchToMoveOrCopy(preferSharedStorage, provider) + c.objCreateOpts = objstorage.CreateOptions{ + PreferSharedStorage: preferSharedStorage, + WriteCategory: getDiskWriteCategoryForCompaction(opts, c.kind), + } + if preferSharedStorage { + c.getValueSeparation = neverSeparateValues + } + + return c +} + +// maybeSwitchToMoveOrCopy decides if the compaction can be changed into a move +// or copy compaction, in which case c.kind is updated. 
+func (c *tableCompaction) maybeSwitchToMoveOrCopy( + preferSharedStorage bool, provider objstorage.Provider, +) { + // Only non-multi-level compactions with a single input file can be + // considered. + if c.startLevel.files.Len() != 1 || !c.outputLevel.files.Empty() || c.hasExtraLevelData() { + return + } + + // In addition to the default compaction, we also check whether a tombstone + // density compaction can be optimized into a move compaction. However, we + // want to avoid performing a move compaction into the lowest level, since the + // goal there is to actually remove the tombstones. + // + // Tombstone density compaction is meant to address cases where tombstones + // don't reclaim much space but are still expensive to scan over. We can only + // remove the tombstones once there's nothing at all underneath them. + switch c.kind { + case compactionKindDefault: + // Proceed. + case compactionKindTombstoneDensity: + // Tombstone density compaction can be optimized into a move compaction. + // However, we want to avoid performing a move compaction into the lowest + // level, since the goal there is to actually remove the tombstones; even if + // they don't prevent a lot of space from being reclaimed, tombstones can + // still be expensive to scan over. + if c.outputLevel.level == numLevels-1 { + return + } + default: + // Other compaction kinds not supported. + return + } + + // We avoid a move or copy if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require a very + // expensive merge later on. + if c.grandparents.AggregateSizeSum() > c.maxOverlapBytes { + return + } + + iter := c.startLevel.files.Iter() + meta := iter.First() + + // We should always be passed a provider, except in some unit tests. + isRemote := provider != nil && !objstorage.IsLocalTable(provider, meta.TableBacking.DiskFileNum) + + // Shared and external tables can always be moved. 
We can also move a local + // table unless we need the result to be on shared storage. + if isRemote || !preferSharedStorage { + c.kind = compactionKindMove + return + } + + // We can rewrite the table (regular compaction) or we can use a copy compaction. + switch { + case meta.Virtual: + // We want to avoid a copy compaction if the table is virtual, as we may end + // up copying a lot more data than necessary. + case meta.BlobReferenceDepth != 0: + // We also want to avoid copy compactions for tables with blob references, + // as we currently lack a mechanism to propagate blob references along with + // the sstable. + default: + c.kind = compactionKindCopy + } +} + +// newDeleteOnlyCompaction constructs a delete-only compaction from the provided +// inputs. +// +// The compaction is created with a reference to its version that must be +// released when the compaction is complete. +func newDeleteOnlyCompaction( + opts *Options, + cur *manifest.Version, + inputs []compactionLevel, + beganAt time.Time, + hints []deleteCompactionHint, + exciseEnabled bool, +) *tableCompaction { + c := &tableCompaction{ + kind: compactionKindDeleteOnly, + comparer: opts.Comparer, + logger: opts.Logger, + version: cur, + inputs: inputs, + grantHandle: noopGrantHandle{}, + metrics: compactionMetrics{ + beganAt: beganAt, + }, + } + c.deleteOnly.hints = hints + c.deleteOnly.exciseEnabled = exciseEnabled + // Acquire a reference to the version to ensure that files and in-memory + // version state necessary for reading files remain available. Ignoring + // excises, this isn't strictly necessary for reading the sstables that are + // inputs to the compaction because those files are 'marked as compacting' + // and shouldn't be subject to any competing compactions. However with + // excises, a concurrent excise may remove a compaction's file from the + // Version and then cancel the compaction. The file shouldn't be physically + // removed until the cancelled compaction stops reading it. 
+ // + // Additionally, we need any blob files referenced by input sstables to + // remain available, even if the blob file is rewritten. Maintaining a + // reference ensures that all these files remain available for the + // compaction's reads. + c.version.Ref() + + // Set c.smallest, c.largest. + cmp := opts.Comparer.Compare + for _, in := range inputs { + c.bounds = manifest.ExtendKeyRange(cmp, c.bounds, in.files.All()) + } + return c +} + +func adjustGrandparentOverlapBytesForFlush(c *tableCompaction, flushingBytes uint64) { + // Heuristic to place a lower bound on compaction output file size + // caused by Lbase. Prior to this heuristic we have observed an L0 in + // production with 310K files of which 290K files were < 10KB in size. + // Our hypothesis is that it was caused by L1 having 2600 files and + // ~10GB, such that each flush got split into many tiny files due to + // overlapping with most of the files in Lbase. + // + // The computation below is general in that it accounts + // for flushing different volumes of data (e.g. we may be flushing + // many memtables). For illustration, we consider the typical + // example of flushing a 64MB memtable. So 12.8MB output, + // based on the compression guess below. If the compressed bytes + // guess is an over-estimate we will end up with smaller files, + // and if an under-estimate we will end up with larger files. + // With a 2MB target file size, 7 files. We are willing to accept + // 4x the number of files, if it results in better write amplification + // when later compacting to Lbase, i.e., ~450KB files (target file + // size / 4). + // + // Note that this is a pessimistic heuristic in that + // fileCountUpperBoundDueToGrandparents could be far from the actual + // number of files produced due to the grandparent limits. 
For + // example, in the extreme, consider a flush that overlaps with 1000 + // files in Lbase f0...f999, and the initially calculated value of + // maxOverlapBytes will cause splits at f10, f20,..., f990, which + // means an upper bound file count of 100 files. Say the input bytes + // in the flush are such that acceptableFileCount=10. We will fatten + // up maxOverlapBytes by 10x to ensure that the upper bound file count + // drops to 10. However, it is possible that in practice, even without + // this change, we would have produced no more than 10 files, and that + // this change makes the files unnecessarily wide. Say the input bytes + // are distributed such that 10% are in f0...f9, 10% in f10...f19, ... + // 10% in f80...f89 and 10% in f990...f999. The original value of + // maxOverlapBytes would have actually produced only 10 sstables. But + // by increasing maxOverlapBytes by 10x, we may produce 1 sstable that + // spans f0...f89, i.e., a much wider sstable than necessary. + // + // We could produce a tighter estimate of + // fileCountUpperBoundDueToGrandparents if we had knowledge of the key + // distribution of the flush. The 4x multiplier mentioned earlier is + // a way to try to compensate for this pessimism. + // + // TODO(sumeer): we don't have compression info for the data being + // flushed, but it is likely that existing files that overlap with + // this flush in Lbase are representative wrt compression ratio. We + // could store the uncompressed size in TableMetadata and estimate + // the compression ratio. 
+ const approxCompressionRatio = 0.2 + approxOutputBytes := approxCompressionRatio * float64(flushingBytes) + approxNumFilesBasedOnTargetSize := + int(math.Ceil(approxOutputBytes / float64(c.maxOutputFileSize))) + acceptableFileCount := float64(4 * approxNumFilesBasedOnTargetSize) + // The byte calculation is linear in numGrandparentFiles, but we will + // incur this linear cost in compact.Runner.TableSplitLimit() too, so we are + // also willing to pay it now. We could approximate this cheaply by using the + // mean file size of Lbase. + grandparentFileBytes := c.grandparents.AggregateSizeSum() + fileCountUpperBoundDueToGrandparents := + float64(grandparentFileBytes) / float64(c.maxOverlapBytes) + if fileCountUpperBoundDueToGrandparents > acceptableFileCount { + c.maxOverlapBytes = uint64( + float64(c.maxOverlapBytes) * + (fileCountUpperBoundDueToGrandparents / acceptableFileCount)) + } +} + +// newFlush creates the state necessary for a flush (modeled with the compaction +// struct). +// +// newFlush takes the current Version in order to populate grandparent flushing +// limits, but it does not reference the version. +// +// TODO(jackson): Consider maintaining a reference to the version anyways since +// in the future in-memory Version state may only be available while a Version +// is referenced (eg, if we start recycling B-Tree nodes once they're no longer +// referenced). There's subtlety around unref'ing the version at the right +// moment, so we defer it for now. 
+func newFlush( + opts *Options, + cur *manifest.Version, + l0Organizer *manifest.L0Organizer, + baseLevel int, + flushing flushableList, + beganAt time.Time, + tableFormat sstable.TableFormat, + getValueSeparation getValueSeparation, +) (*tableCompaction, error) { + c := &tableCompaction{ + kind: compactionKindFlush, + comparer: opts.Comparer, + logger: opts.Logger, + inputs: []compactionLevel{{level: -1}, {level: 0}}, + getValueSeparation: getValueSeparation, + maxOutputFileSize: math.MaxUint64, + maxOverlapBytes: math.MaxUint64, + grantHandle: noopGrantHandle{}, + tableFormat: tableFormat, + metrics: compactionMetrics{ + beganAt: beganAt, + }, + } + c.flush.flushables = flushing + c.flush.l0Limits = l0Organizer.FlushSplitKeys() + c.startLevel = &c.inputs[0] + c.outputLevel = &c.inputs[1] + if len(flushing) > 0 { + if _, ok := flushing[0].flushable.(*ingestedFlushable); ok { + if len(flushing) != 1 { + panic("pebble: ingestedFlushable must be flushed one at a time.") + } + c.kind = compactionKindIngestedFlushable + return c, nil + } else { + // Make sure there's no ingestedFlushable after the first flushable + // in the list. 
+ for _, f := range c.flush.flushables[1:] { + if _, ok := f.flushable.(*ingestedFlushable); ok { + panic("pebble: flushables shouldn't contain ingestedFlushable") + } + } + } + } + + preferSharedStorage := tableFormat >= FormatMinForSharedObjects.MaxTableFormat() && + remote.ShouldCreateShared(opts.Experimental.CreateOnShared, c.outputLevel.level) + c.objCreateOpts = objstorage.CreateOptions{ + PreferSharedStorage: preferSharedStorage, + WriteCategory: getDiskWriteCategoryForCompaction(opts, c.kind), + } + if preferSharedStorage { + c.getValueSeparation = neverSeparateValues + } + + cmp := c.comparer.Compare + updatePointBounds := func(iter internalIterator) { + if kv := iter.First(); kv != nil { + if c.bounds.Start == nil || cmp(c.bounds.Start, kv.K.UserKey) > 0 { + c.bounds.Start = slices.Clone(kv.K.UserKey) + } + } + if kv := iter.Last(); kv != nil { + if c.bounds.End.Key == nil || !c.bounds.End.IsUpperBoundForInternalKey(cmp, kv.K) { + c.bounds.End = base.UserKeyExclusiveIf(slices.Clone(kv.K.UserKey), kv.K.IsExclusiveSentinel()) + } + } + } + + updateRangeBounds := func(iter keyspan.FragmentIterator) error { + // File bounds require s != nil && !s.Empty(). We only need to check for + // s != nil here, as the memtable's FragmentIterator would never surface + // empty spans. 
+ if s, err := iter.First(); err != nil { + return err + } else if s != nil { + c.bounds = c.bounds.Union(cmp, s.Bounds().Clone()) + } + if s, err := iter.Last(); err != nil { + return err + } else if s != nil { + c.bounds = c.bounds.Union(cmp, s.Bounds().Clone()) + } + return nil + } + + var flushingBytes uint64 + for i := range flushing { + f := flushing[i] + updatePointBounds(f.newIter(nil)) + if rangeDelIter := f.newRangeDelIter(nil); rangeDelIter != nil { + if err := updateRangeBounds(rangeDelIter); err != nil { + return nil, err + } + } + if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil { + if err := updateRangeBounds(rangeKeyIter); err != nil { + return nil, err + } + } + flushingBytes += f.inuseBytes() + } + + if opts.FlushSplitBytes > 0 { + c.maxOutputFileSize = uint64(opts.TargetFileSizes[0]) + c.maxOverlapBytes = maxGrandparentOverlapBytes(opts.TargetFileSizes[0]) + c.grandparents = cur.Overlaps(baseLevel, c.bounds) + adjustGrandparentOverlapBytesForFlush(c, flushingBytes) + } + + // We don't elide tombstones for flushes. + c.delElision, c.rangeKeyElision = compact.NoTombstoneElision(), compact.NoTombstoneElision() + return c, nil +} + +func (c *tableCompaction) hasExtraLevelData() bool { + if len(c.extraLevels) == 0 { + // not a multi level compaction + return false + } else if c.extraLevels[0].files.Empty() { + // a multi level compaction without data in the intermediate input level; + // e.g. for a multi level compaction with levels 4,5, and 6, this could + // occur if there is no files to compact in 5, or in 5 and 6 (i.e. a move). + return false + } + return true +} + +// errorOnUserKeyOverlap returns an error if the last two written sstables in +// this compaction have revisions of the same user key present in both sstables, +// when it shouldn't (eg. when splitting flushes). 
+func (c *tableCompaction) errorOnUserKeyOverlap(ve *manifest.VersionEdit) error { + if n := len(ve.NewTables); n > 1 { + meta := ve.NewTables[n-1].Meta + prevMeta := ve.NewTables[n-2].Meta + if !prevMeta.Largest().IsExclusiveSentinel() && + c.comparer.Compare(prevMeta.Largest().UserKey, meta.Smallest().UserKey) >= 0 { + return errors.Errorf("pebble: compaction split user key across two sstables: %s in %s and %s", + prevMeta.Largest().Pretty(c.comparer.FormatKey), + prevMeta.TableNum, + meta.TableNum) + } + } + return nil +} + +// isBottommostDataLayer returns true if the compaction's inputs are known to be +// the bottommost layer of data for the compaction's key range. If true, this +// allows the compaction iterator to perform transformations to keys such as +// setting a key's sequence number to zero. +// +// This function performs this determination by looking at the TombstoneElision +// values which are set up based on sstables which overlap the bounds of the +// compaction at a lower level in the LSM. This function always returns false +// for flushes. +func (c *tableCompaction) isBottommostDataLayer() bool { + // TODO(peter): we disable zeroing of seqnums during flushing to match + // RocksDB behavior and to avoid generating overlapping sstables during + // DB.replayWAL. When replaying WAL files at startup, we flush after each + // WAL is replayed building up a single version edit that is + // applied. Because we don't apply the version edit after each flush, this + // code doesn't know that L0 contains files and zeroing of seqnums should + // be disabled. That is fixable, but it seems safer to just match the + // RocksDB behavior for now. + return len(c.flush.flushables) == 0 && c.delElision.ElidesEverything() && c.rangeKeyElision.ElidesEverything() +} + +// newInputIters returns an iterator over all the input tables in a compaction. 
+func (c *tableCompaction) newInputIters( + newIters tableNewIters, iiopts internalIterOpts, +) ( + pointIter internalIterator, + rangeDelIter, rangeKeyIter keyspan.FragmentIterator, + retErr error, +) { + ctx := context.TODO() + cmp := c.comparer.Compare + + // Validate the ordering of compaction input files for defense in depth. + if len(c.flush.flushables) == 0 { + if c.startLevel.level >= 0 { + err := manifest.CheckOrdering(c.comparer, manifest.Level(c.startLevel.level), + c.startLevel.files.Iter()) + if err != nil { + return nil, nil, nil, err + } + } + err := manifest.CheckOrdering(c.comparer, manifest.Level(c.outputLevel.level), + c.outputLevel.files.Iter()) + if err != nil { + return nil, nil, nil, err + } + if c.startLevel.level == 0 { + if c.startLevel.l0SublevelInfo == nil { + panic("l0SublevelInfo not created for compaction out of L0") + } + for _, info := range c.startLevel.l0SublevelInfo { + err := manifest.CheckOrdering(c.comparer, info.sublevel, info.Iter()) + if err != nil { + return nil, nil, nil, err + } + } + } + if len(c.extraLevels) > 0 { + if len(c.extraLevels) > 1 { + panic("n>2 multi level compaction not implemented yet") + } + interLevel := c.extraLevels[0] + err := manifest.CheckOrdering(c.comparer, manifest.Level(interLevel.level), + interLevel.files.Iter()) + if err != nil { + return nil, nil, nil, err + } + } + } + + // There are three classes of keys that a compaction needs to process: point + // keys, range deletion tombstones and range keys. Collect all iterators for + // all these classes of keys from all the levels. We'll aggregate them + // together farther below. + // + // numInputLevels is an approximation of the number of iterator levels. Due + // to idiosyncrasies in iterator construction, we may (rarely) exceed this + // initial capacity. 
+ numInputLevels := max(len(c.flush.flushables), len(c.inputs)) + iters := make([]internalIterator, 0, numInputLevels) + rangeDelIters := make([]keyspan.FragmentIterator, 0, numInputLevels) + rangeKeyIters := make([]keyspan.FragmentIterator, 0, numInputLevels) + + // If construction of the iterator inputs fails, ensure that we close all + // the consitutent iterators. + defer func() { + if retErr != nil { + for _, iter := range iters { + if iter != nil { + _ = iter.Close() + } + } + for _, rangeDelIter := range rangeDelIters { + rangeDelIter.Close() + } + } + }() + iterOpts := IterOptions{ + Category: categoryCompaction, + logger: c.logger, + } + + // Populate iters, rangeDelIters and rangeKeyIters with the appropriate + // constituent iterators. This depends on whether this is a flush or a + // compaction. + if len(c.flush.flushables) != 0 { + // If flushing, we need to build the input iterators over the memtables + // stored in c.flush.flushables. + for _, f := range c.flush.flushables { + iters = append(iters, f.newFlushIter(nil)) + rangeDelIter := f.newRangeDelIter(nil) + if rangeDelIter != nil { + rangeDelIters = append(rangeDelIters, rangeDelIter) + } + if rangeKeyIter := f.newRangeKeyIter(nil); rangeKeyIter != nil { + rangeKeyIters = append(rangeKeyIters, rangeKeyIter) + } + } + } else { + addItersForLevel := func(level *compactionLevel, l manifest.Layer) error { + // Add a *levelIter for point iterators. Because we don't call + // initRangeDel, the levelIter will close and forget the range + // deletion iterator when it steps on to a new file. Surfacing range + // deletions to compactions are handled below. + iters = append(iters, newLevelIter(ctx, iterOpts, c.comparer, + newIters, level.files.Iter(), l, iiopts)) + // TODO(jackson): Use keyspanimpl.LevelIter to avoid loading all the range + // deletions into memory upfront. (See #2015, which reverted this.) 
There + // will be no user keys that are split between sstables within a level in + // Cockroach 23.1, which unblocks this optimization. + + // Add the range deletion iterator for each file as an independent level + // in mergingIter, as opposed to making a levelIter out of those. This + // is safer as levelIter expects all keys coming from underlying + // iterators to be in order. Due to compaction / tombstone writing + // logic in finishOutput(), it is possible for range tombstones to not + // be strictly ordered across all files in one level. + // + // Consider this example from the metamorphic tests (also repeated in + // finishOutput()), consisting of three L3 files with their bounds + // specified in square brackets next to the file name: + // + // ./000240.sst [tmgc#391,MERGE-tmgc#391,MERGE] + // tmgc#391,MERGE [786e627a] + // tmgc-udkatvs#331,RANGEDEL + // + // ./000241.sst [tmgc#384,MERGE-tmgc#384,MERGE] + // tmgc#384,MERGE [666c7070] + // tmgc-tvsalezade#383,RANGEDEL + // tmgc-tvsalezade#331,RANGEDEL + // + // ./000242.sst [tmgc#383,RANGEDEL-tvsalezade#72057594037927935,RANGEDEL] + // tmgc-tvsalezade#383,RANGEDEL + // tmgc#375,SET [72646c78766965616c72776865676e79] + // tmgc-tvsalezade#356,RANGEDEL + // + // Here, the range tombstone in 000240.sst falls "after" one in + // 000241.sst, despite 000240.sst being ordered "before" 000241.sst for + // levelIter's purposes. While each file is still consistent before its + // bounds, it's safer to have all rangedel iterators be visible to + // mergingIter. + iter := level.files.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + rangeDelIter, err := c.newRangeDelIter(ctx, newIters, iter.Take(), iterOpts, iiopts, l) + if err != nil { + // The error will already be annotated with the BackingFileNum, so + // we annotate it with the FileNum. 
+ return errors.Wrapf(err, "pebble: could not open table %s", errors.Safe(f.TableNum)) + } + if rangeDelIter == nil { + continue + } + rangeDelIters = append(rangeDelIters, rangeDelIter) + c.iterationState.keyspanIterClosers = append(c.iterationState.keyspanIterClosers, rangeDelIter) + } + + // Check if this level has any range keys. + hasRangeKeys := false + for f := iter.First(); f != nil; f = iter.Next() { + if f.HasRangeKeys { + hasRangeKeys = true + break + } + } + if hasRangeKeys { + newRangeKeyIterWrapper := func(ctx context.Context, file *manifest.TableMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) { + iters, err := newIters(ctx, file, &iterOpts, iiopts, iterRangeKeys) + if err != nil { + return nil, err + } else if iters.rangeKey == nil { + return emptyKeyspanIter, nil + } + // Ensure that the range key iter is not closed until the compaction is + // finished. This is necessary because range key processing + // requires the range keys to be held in memory for up to the + // lifetime of the compaction. + noCloseIter := &noCloseIter{iters.rangeKey} + c.iterationState.keyspanIterClosers = append(c.iterationState.keyspanIterClosers, noCloseIter) + + // We do not need to truncate range keys to sstable boundaries, or + // only read within the file's atomic compaction units, unlike with + // range tombstones. This is because range keys were added after we + // stopped splitting user keys across sstables, so all the range keys + // in this sstable must wholly lie within the file's bounds. + return noCloseIter, err + } + li := keyspanimpl.NewLevelIter(ctx, keyspan.SpanIterOptions{}, cmp, + newRangeKeyIterWrapper, level.files.Iter(), l, manifest.KeyTypeRange) + rangeKeyIters = append(rangeKeyIters, li) + } + return nil + } + + for i := range c.inputs { + // If the level is annotated with l0SublevelInfo, expand it into one + // level per sublevel. 
+ // TODO(jackson): Perform this expansion even earlier when we pick the + // compaction? + if len(c.inputs[i].l0SublevelInfo) > 0 { + for _, info := range c.startLevel.l0SublevelInfo { + sublevelCompactionLevel := &compactionLevel{0, info.LevelSlice, nil} + if err := addItersForLevel(sublevelCompactionLevel, info.sublevel); err != nil { + return nil, nil, nil, err + } + } + continue + } + if err := addItersForLevel(&c.inputs[i], manifest.Level(c.inputs[i].level)); err != nil { + return nil, nil, nil, err + } + } + } + + // If there's only one constituent point iterator, we can avoid the overhead + // of a *mergingIter. This is possible, for example, when performing a flush + // of a single memtable. Otherwise, combine all the iterators into a merging + // iter. + pointIter = iters[0] + if len(iters) > 1 { + pointIter = newMergingIter(c.logger, &c.metrics.internalIterStats, cmp, nil, iters...) + } + + // In normal operation, levelIter iterates over the point operations in a + // level, and initializes a rangeDelIter pointer for the range deletions in + // each table. During compaction, we want to iterate over the merged view of + // point operations and range deletions. In order to do this we create one + // levelIter per level to iterate over the point operations, and collect up + // all the range deletion files. + // + // The range deletion levels are combined with a keyspanimpl.MergingIter. The + // resulting merged rangedel iterator is then included using an + // InterleavingIter. + // TODO(jackson): Consider using a defragmenting iterator to stitch together + // logical range deletions that were fragmented due to previous file + // boundaries. + if len(rangeDelIters) > 0 { + mi := &keyspanimpl.MergingIter{} + mi.Init(c.comparer, keyspan.NoopTransform, new(keyspanimpl.MergingBuffers), rangeDelIters...) 
+		rangeDelIter = mi
+	}
+
+	// If there are range key iterators, we need to combine them using
+	// keyspanimpl.MergingIter, and then interleave them among the points.
+	if len(rangeKeyIters) > 0 {
+		mi := &keyspanimpl.MergingIter{}
+		mi.Init(c.comparer, keyspan.NoopTransform, new(keyspanimpl.MergingBuffers), rangeKeyIters...)
+		// TODO(radu): why do we have a defragmenter here but not above?
+		di := &keyspan.DefragmentingIter{}
+		di.Init(c.comparer, mi, keyspan.DefragmentInternal, keyspan.StaticDefragmentReducer, new(keyspan.DefragmentingBuffers))
+		rangeKeyIter = di
+	}
+	return pointIter, rangeDelIter, rangeKeyIter, nil
+}
+
+// newRangeDelIter opens the range-deletion iterator for a single input table
+// f at layer l, returning (nil, nil) when the table contains no range
+// deletions. The returned iterator is wrapped in a noCloseIter so the
+// tombstones stay in memory; the caller appends it to
+// c.iterationState.keyspanIterClosers so it is closed only when the
+// compaction finishes.
+func (c *tableCompaction) newRangeDelIter(
+	ctx context.Context,
+	newIters tableNewIters,
+	f manifest.LevelFile,
+	opts IterOptions,
+	iiopts internalIterOpts,
+	l manifest.Layer,
+) (*noCloseIter, error) {
+	opts.layer = l
+	iterSet, err := newIters(ctx, f.TableMetadata, &opts, iiopts, iterRangeDeletions)
+	if err != nil {
+		return nil, err
+	} else if iterSet.rangeDeletion == nil {
+		// The file doesn't contain any range deletions.
+		return nil, nil
+	}
+	// Ensure that rangeDelIter is not closed until the compaction is
+	// finished. This is necessary because range tombstone processing
+	// requires the range tombstones to be held in memory for up to the
+	// lifetime of the compaction.
+	return &noCloseIter{iterSet.rangeDeletion}, nil
+}
+
+// String returns a human-readable description of the compaction's inputs:
+// the literal "flush" for flushes, otherwise one line per input level
+// listing each table's number and key bounds.
+func (c *tableCompaction) String() string {
+	if len(c.flush.flushables) != 0 {
+		return "flush\n"
+	}
+
+	var buf bytes.Buffer
+	for level := c.startLevel.level; level <= c.outputLevel.level; level++ {
+		i := level - c.startLevel.level
+		fmt.Fprintf(&buf, "%d:", level)
+		for f := range c.inputs[i].files.All() {
+			fmt.Fprintf(&buf, " %s:%s-%s", f.TableNum, f.Smallest(), f.Largest())
+		}
+		fmt.Fprintf(&buf, "\n")
+	}
+	return buf.String()
+}
+
+// manualCompaction tracks a single user-requested compaction while it sits
+// queued in d.mu.compact.manual; runPickedCompaction dequeues it by id when
+// the compaction actually starts.
+type manualCompaction struct {
+	// id is for internal bookkeeping.
+	id uint64
+	// Count of the retries due to concurrent compaction to overlapping levels.
+	retries int
+	// level and outputLevel identify the input and destination levels of the
+	// requested compaction. NOTE(review): inferred from the field names —
+	// confirm against the enqueueing code outside this chunk.
+	level       int
+	outputLevel int
+	// done delivers the compaction's result to the waiting caller;
+	// runPickedCompaction hands it to d.compact when the compaction starts.
+	done chan error
+	// start and end bound the keyspace the caller asked to compact.
+	// NOTE(review): presumed user-key bounds — verify with callers.
+	start []byte
+	end   []byte
+	// split presumably indicates the request may be split into multiple
+	// compactions. NOTE(review): assumption — verify.
+	split bool
+}
+
+// readCompaction describes a compaction triggered by read activity.
+type readCompaction struct {
+	level int
+	// [start, end] key ranges are used for de-duping.
+	start []byte
+	end   []byte
+
+	// The file associated with the compaction.
+	// If the file no longer belongs in the same
+	// level, then we skip the compaction.
+	tableNum base.TableNum
+}
+
+// Removes compaction markers from files in a compaction. The rollback parameter
+// indicates whether the compaction state should be rolled back to its original
+// state in the case of an unsuccessful compaction.
+//
+// DB.mu must be held when calling this method, however this method can drop and
+// re-acquire that mutex. All writes to the manifest for this compaction should
+// have completed by this point.
+func (d *DB) clearCompactingState(c *tableCompaction, rollback bool) {
+	c.versionEditApplied = true
+	for _, cl := range c.inputs {
+		for f := range cl.files.All() {
+			// Every input file must have been marked compacting when the
+			// compaction began; anything else is an invariant violation.
+			if !f.IsCompacting() {
+				d.opts.Logger.Fatalf("L%d->L%d: %s not being compacted", c.startLevel.level, c.outputLevel.level, f.TableNum)
+			}
+			if !rollback {
+				// On success all compactions other than move and delete-only compactions
+				// transition the file into the Compacted state. Move-compacted files
+				// become eligible for compaction again and transition back to NotCompacting.
+				// Delete-only compactions could, on rare occasion, leave files untouched
+				// (eg. if files have a loose bound), so we revert them all to NotCompacting
+				// just in case they need to be compacted again.
+				if c.kind != compactionKindMove && c.kind != compactionKindDeleteOnly {
+					f.SetCompactionState(manifest.CompactionStateCompacted)
+				} else {
+					f.SetCompactionState(manifest.CompactionStateNotCompacting)
+				}
+			} else {
+				// Else, on rollback, all input files unconditionally transition back to
+				// NotCompacting.
+				f.SetCompactionState(manifest.CompactionStateNotCompacting)
+			}
+			f.IsIntraL0Compacting = false
+		}
+	}
+	l0InProgress := inProgressL0Compactions(d.getInProgressCompactionInfoLocked(c))
+	func() {
+		// InitCompactingFileInfo requires that no other manifest writes be
+		// happening in parallel with it, i.e. we're not in the midst of installing
+		// another version. Otherwise, it's possible that we've created another
+		// L0Sublevels instance, but not added it to the versions list, causing
+		// all the indices in TableMetadata to be inaccurate. To ensure this,
+		// grab the manifest lock.
+		d.mu.versions.logLock()
+		// It is a bit peculiar that we are fiddling with the current version state
+		// in a separate critical section from when this version was installed.
+		// But this fiddling is necessary if the compaction failed. When the
+		// compaction succeeded, we've already done this in UpdateVersionLocked, so
+		// this seems redundant. Anyway, we clear the pickedCompactionCache since we
+		// may be able to pick a better compaction (though when this compaction
+		// succeeded we've also cleared the cache in UpdateVersionLocked).
+		defer d.mu.versions.logUnlockAndInvalidatePickedCompactionCache()
+		d.mu.versions.latest.l0Organizer.InitCompactingFileInfo(l0InProgress)
+	}()
+}
+
+// calculateDiskAvailableBytes queries the filesystem for available disk
+// space, reports it to the low-disk-space reporter, caches it in
+// d.diskAvailBytes, and returns it. If the query fails, the last
+// successfully cached value is returned; errors other than
+// vfs.ErrUnsupported are surfaced through the event listener.
+func (d *DB) calculateDiskAvailableBytes() uint64 {
+	space, err := d.opts.FS.GetDiskUsage(d.dirname)
+	if err != nil {
+		if !errors.Is(err, vfs.ErrUnsupported) {
+			d.opts.EventListener.BackgroundError(err)
+		}
+		// Return the last value we managed to obtain.
+		return d.diskAvailBytes.Load()
+	}
+
+	d.lowDiskSpaceReporter.Report(space.AvailBytes, space.TotalBytes, d.opts.EventListener)
+	d.diskAvailBytes.Store(space.AvailBytes)
+	return space.AvailBytes
+}
+
+// maybeScheduleFlush schedules a flush if necessary.
+//
+// d.mu must be held when calling this.
+func (d *DB) maybeScheduleFlush() {
+	// Nothing to do if a flush is already running, the DB is closed, or the
+	// DB is read-only.
+	if d.mu.compact.flushing || d.closed.Load() != nil || d.opts.ReadOnly {
+		return
+	}
+	// The queue always ends with the mutable memtable; <=1 means there are no
+	// immutable memtables to flush.
+	if len(d.mu.mem.queue) <= 1 {
+		return
+	}
+
+	if !d.passedFlushThreshold() {
+		return
+	}
+
+	d.mu.compact.flushing = true
+	go d.flush()
+}
+
+// passedFlushThreshold reports whether the flush-ready prefix of immutable
+// memtables holds enough data (or was explicitly force-flagged) to warrant
+// starting a flush.
+//
+// Called with d.mu held (see maybeScheduleFlush / makeCompactionEnvLocked).
+func (d *DB) passedFlushThreshold() bool {
+	var n int
+	var size uint64
+	for ; n < len(d.mu.mem.queue)-1; n++ {
+		if !d.mu.mem.queue[n].readyForFlush() {
+			break
+		}
+		if d.mu.mem.queue[n].flushForced {
+			// A flush was forced. Pretend the memtable size is the configured
+			// size. See minFlushSize below.
+			size += d.opts.MemTableSize
+		} else {
+			size += d.mu.mem.queue[n].totalBytes()
+		}
+	}
+	if n == 0 {
+		// None of the immutable memtables are ready for flushing.
+		return false
+	}
+
+	// Only flush once the sum of the queued memtable sizes exceeds half the
+	// configured memtable size. This prevents flushing of memtables at startup
+	// while we're undergoing the ramp period on the memtable size. See
+	// DB.newMemTable().
+	minFlushSize := d.opts.MemTableSize / 2
+	return size >= minFlushSize
+}
+
+// maybeScheduleDelayedFlush arranges for memtable tbl to be force-flushed
+// roughly dur from now, unless it is flushed or the DB is closed first. If a
+// sooner delayed flush is already pending for tbl, the earlier deadline is
+// kept and this call is a no-op.
+func (d *DB) maybeScheduleDelayedFlush(tbl *memTable, dur time.Duration) {
+	var mem *flushableEntry
+	for _, m := range d.mu.mem.queue {
+		if m.flushable == tbl {
+			mem = m
+			break
+		}
+	}
+	// tbl may already have left the queue, or a flush may already be forced.
+	if mem == nil || mem.flushForced {
+		return
+	}
+	deadline := d.timeNow().Add(dur)
+	if !mem.delayedFlushForcedAt.IsZero() && deadline.After(mem.delayedFlushForcedAt) {
+		// Already scheduled to flush sooner than within `dur`.
+		return
+	}
+	mem.delayedFlushForcedAt = deadline
+	go func() {
+		timer := time.NewTimer(dur)
+		defer timer.Stop()
+
+		select {
+		case <-d.closedCh:
+			return
+		case <-mem.flushed:
+			return
+		case <-timer.C:
+			d.commit.mu.Lock()
+			defer d.commit.mu.Unlock()
+			d.mu.Lock()
+			defer d.mu.Unlock()
+
+			// NB: The timer may fire concurrently with a call to Close. If a
+			// Close call beat us to acquiring d.mu, d.closed holds ErrClosed,
+			// and it's too late to flush anything.
Otherwise, the Close call + // will block on locking d.mu until we've finished scheduling the + // flush and set `d.mu.compact.flushing` to true. Close will wait + // for the current flush to complete. + if d.closed.Load() != nil { + return + } + + if d.mu.mem.mutable == tbl { + _ = d.makeRoomForWrite(nil) + } else { + mem.flushForced = true + } + d.maybeScheduleFlush() + } + }() +} + +func (d *DB) flush() { + pprof.Do(context.Background(), flushLabels, func(context.Context) { + flushingWorkStart := crtime.NowMono() + d.mu.Lock() + defer d.mu.Unlock() + idleDuration := flushingWorkStart.Sub(d.mu.compact.noOngoingFlushStartTime) + var bytesFlushed uint64 + var err error + if bytesFlushed, err = d.flush1(); err != nil { + // TODO(peter): count consecutive flush errors and backoff. + d.opts.EventListener.BackgroundError(err) + } + d.mu.compact.flushing = false + d.mu.compact.noOngoingFlushStartTime = crtime.NowMono() + workDuration := d.mu.compact.noOngoingFlushStartTime.Sub(flushingWorkStart) + d.mu.compact.flushWriteThroughput.Bytes += int64(bytesFlushed) + d.mu.compact.flushWriteThroughput.WorkDuration += workDuration + d.mu.compact.flushWriteThroughput.IdleDuration += idleDuration + // More flush work may have arrived while we were flushing, so schedule + // another flush if needed. + d.maybeScheduleFlush() + // Let the CompactionScheduler know, so that it can react immediately to + // an increase in DB.GetAllowedWithoutPermission. + d.opts.Experimental.CompactionScheduler.UpdateGetAllowedWithoutPermission() + // The flush may have produced too many files in a level, so schedule a + // compaction if needed. + d.maybeScheduleCompaction() + d.mu.compact.cond.Broadcast() + }) +} + +// runIngestFlush is used to generate a flush version edit for sstables which +// were ingested as flushables. Both DB.mu and the manifest lock must be held +// while runIngestFlush is called. 
+func (d *DB) runIngestFlush(c *tableCompaction) (*manifest.VersionEdit, error) { + if len(c.flush.flushables) != 1 { + panic("pebble: ingestedFlushable must be flushed one at a time.") + } + + // Finding the target level for ingestion must use the latest version + // after the logLock has been acquired. + version := d.mu.versions.currentVersion() + + baseLevel := d.mu.versions.picker.getBaseLevel() + ve := &manifest.VersionEdit{} + var ingestSplitFiles []ingestSplitFile + ingestFlushable := c.flush.flushables[0].flushable.(*ingestedFlushable) + + updateLevelMetricsOnExcise := func(m *manifest.TableMetadata, level int, added []manifest.NewTableEntry) { + levelMetrics := c.metrics.perLevel[level] + if levelMetrics == nil { + levelMetrics = &LevelMetrics{} + c.metrics.perLevel[level] = levelMetrics + } + levelMetrics.TablesCount-- + levelMetrics.TablesSize -= int64(m.Size) + levelMetrics.EstimatedReferencesSize -= m.EstimatedReferenceSize() + for i := range added { + levelMetrics.TablesCount++ + levelMetrics.TablesSize += int64(added[i].Meta.Size) + levelMetrics.EstimatedReferencesSize += added[i].Meta.EstimatedReferenceSize() + } + } + + suggestSplit := d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit() && + d.FormatMajorVersion() >= FormatVirtualSSTables + + if suggestSplit || ingestFlushable.exciseSpan.Valid() { + // We could add deleted files to ve. + ve.DeletedTables = make(map[manifest.DeletedTableEntry]*manifest.TableMetadata) + } + + ctx := context.Background() + overlapChecker := &overlapChecker{ + comparer: d.opts.Comparer, + newIters: d.newIters, + opts: IterOptions{ + logger: d.opts.Logger, + Category: categoryIngest, + }, + v: version, + } + replacedTables := make(map[base.TableNum][]manifest.NewTableEntry) + for _, file := range ingestFlushable.files { + var fileToSplit *manifest.TableMetadata + var level int + + // This file fits perfectly within the excise span, so we can slot it at L6. 
+ if ingestFlushable.exciseSpan.Valid() && + ingestFlushable.exciseSpan.Contains(d.cmp, file.Smallest()) && + ingestFlushable.exciseSpan.Contains(d.cmp, file.Largest()) { + level = 6 + } else { + // TODO(radu): this can perform I/O; we should not do this while holding DB.mu. + lsmOverlap, err := overlapChecker.DetermineLSMOverlap(ctx, file.UserKeyBounds()) + if err != nil { + return nil, err + } + level, fileToSplit, err = ingestTargetLevel( + ctx, d.cmp, lsmOverlap, baseLevel, d.mu.compact.inProgress, file, suggestSplit, + ) + if err != nil { + return nil, err + } + } + + // Add the current flushableIngest file to the version. + ve.NewTables = append(ve.NewTables, manifest.NewTableEntry{Level: level, Meta: file}) + if fileToSplit != nil { + ingestSplitFiles = append(ingestSplitFiles, ingestSplitFile{ + ingestFile: file, + splitFile: fileToSplit, + level: level, + }) + } + levelMetrics := c.metrics.perLevel.level(level) + levelMetrics.TableBytesIngested += file.Size + levelMetrics.TablesIngested++ + } + if ingestFlushable.exciseSpan.Valid() { + exciseBounds := ingestFlushable.exciseSpan.UserKeyBounds() + // Iterate through all levels and find files that intersect with exciseSpan. + for layer, ls := range version.AllLevelsAndSublevels() { + for m := range ls.Overlaps(d.cmp, ingestFlushable.exciseSpan.UserKeyBounds()).All() { + leftTable, rightTable, err := d.exciseTable(context.TODO(), exciseBounds, m, layer.Level(), tightExciseBounds) + if err != nil { + return nil, err + } + newFiles := applyExciseToVersionEdit(ve, m, leftTable, rightTable, layer.Level()) + replacedTables[m.TableNum] = newFiles + updateLevelMetricsOnExcise(m, layer.Level(), newFiles) + } + } + } + + if len(ingestSplitFiles) > 0 { + if err := d.ingestSplit(context.TODO(), ve, updateLevelMetricsOnExcise, ingestSplitFiles, replacedTables); err != nil { + return nil, err + } + } + + return ve, nil +} + +// flush runs a compaction that copies the immutable memtables from memory to +// disk. 
+// +// d.mu must be held when calling this, but the mutex may be dropped and +// re-acquired during the course of this method. +func (d *DB) flush1() (bytesFlushed uint64, err error) { + // NB: The flushable queue can contain flushables of type ingestedFlushable. + // The sstables in ingestedFlushable.files must be placed into the appropriate + // level in the lsm. Let's say the flushable queue contains a prefix of + // regular immutable memtables, then an ingestedFlushable, and then the + // mutable memtable. When the flush of the ingestedFlushable is performed, + // it needs an updated view of the lsm. That is, the prefix of immutable + // memtables must have already been flushed. Similarly, if there are two + // contiguous ingestedFlushables in the queue, then the first flushable must + // be flushed, so that the second flushable can see an updated view of the + // lsm. + // + // Given the above, we restrict flushes to either some prefix of regular + // memtables, or a single flushable of type ingestedFlushable. The DB.flush + // function will call DB.maybeScheduleFlush again, so a new flush to finish + // the remaining flush work should be scheduled right away. + // + // NB: Large batches placed in the flushable queue share the WAL with the + // previous memtable in the queue. We must ensure the property that both the + // large batch and the memtable with which it shares a WAL are flushed + // together. The property ensures that the minimum unflushed log number + // isn't incremented incorrectly. Since a flushableBatch.readyToFlush always + // returns true, and since the large batch will always be placed right after + // the memtable with which it shares a WAL, the property is naturally + // ensured. The large batch will always be placed after the memtable with + // which it shares a WAL because we ensure it in DB.commitWrite by holding + // the commitPipeline.mu and then holding DB.mu. 
As an extra defensive + // measure, if we try to flush the memtable without also flushing the + // flushable batch in the same flush, since the memtable and flushableBatch + // have the same logNum, the logNum invariant check below will trigger. + var n, inputs int + var inputBytes uint64 + var ingest bool + for ; n < len(d.mu.mem.queue)-1; n++ { + if f, ok := d.mu.mem.queue[n].flushable.(*ingestedFlushable); ok { + if n == 0 { + // The first flushable is of type ingestedFlushable. Since these + // must be flushed individually, we perform a flush for just + // this. + if !f.readyForFlush() { + // This check is almost unnecessary, but we guard against it + // just in case this invariant changes in the future. + panic("pebble: ingestedFlushable should always be ready to flush.") + } + // By setting n = 1, we ensure that the first flushable(n == 0) + // is scheduled for a flush. The number of tables added is equal to the + // number of files in the ingest operation. + n = 1 + inputs = len(f.files) + ingest = true + break + } else { + // There was some prefix of flushables which weren't of type + // ingestedFlushable. So, perform a flush for those. + break + } + } + if !d.mu.mem.queue[n].readyForFlush() { + break + } + inputBytes += d.mu.mem.queue[n].inuseBytes() + } + if n == 0 { + // None of the immutable memtables are ready for flushing. + return 0, nil + } + if !ingest { + // Flushes of memtables add the prefix of n memtables from the flushable + // queue. + inputs = n + } + + // Require that every memtable being flushed has a log number less than the + // new minimum unflushed log number. 
+ minUnflushedLogNum := d.mu.mem.queue[n].logNum + if !d.opts.DisableWAL { + for i := 0; i < n; i++ { + if logNum := d.mu.mem.queue[i].logNum; logNum >= minUnflushedLogNum { + panic(errors.AssertionFailedf("logNum invariant violated: flushing %d items; %d:type=%T,logNum=%d; %d:type=%T,logNum=%d", + n, + i, d.mu.mem.queue[i].flushable, logNum, + n, d.mu.mem.queue[n].flushable, minUnflushedLogNum)) + } + } + } + + c, err := newFlush(d.opts, d.mu.versions.currentVersion(), d.mu.versions.latest.l0Organizer, + d.mu.versions.picker.getBaseLevel(), d.mu.mem.queue[:n], d.timeNow(), d.TableFormat(), d.determineCompactionValueSeparation) + if err != nil { + return 0, err + } + c.AddInProgressLocked(d) + + jobID := d.newJobIDLocked() + info := FlushInfo{ + JobID: int(jobID), + Input: inputs, + InputBytes: inputBytes, + Ingest: ingest, + } + d.opts.EventListener.FlushBegin(info) + + startTime := d.timeNow() + + var ve *manifest.VersionEdit + var stats compact.Stats + // To determine the target level of the files in the ingestedFlushable, we + // need to acquire the logLock, and not release it for that duration. Since + // UpdateVersionLocked acquires it anyway, we create the VersionEdit for + // ingestedFlushable outside runCompaction. For all other flush cases, we + // construct the VersionEdit inside runCompaction. 
+ var compactionErr error + if c.kind != compactionKindIngestedFlushable { + ve, stats, compactionErr = d.runCompaction(jobID, c) + } + + err = d.mu.versions.UpdateVersionLocked(func() (versionUpdate, error) { + err := compactionErr + if c.kind == compactionKindIngestedFlushable { + ve, err = d.runIngestFlush(c) + } + info.Duration = d.timeNow().Sub(startTime) + if err != nil { + return versionUpdate{}, err + } + + validateVersionEdit(ve, d.opts.Comparer.ValidateKey, d.opts.Comparer.FormatKey, d.opts.Logger) + for i := range ve.NewTables { + e := &ve.NewTables[i] + info.Output = append(info.Output, e.Meta.TableInfo()) + // Ingested tables are not necessarily flushed to L0. Record the level of + // each ingested file explicitly. + if ingest { + info.IngestLevels = append(info.IngestLevels, e.Level) + } + } + + // The flush succeeded or it produced an empty sstable. In either case we + // want to bump the minimum unflushed log number to the log number of the + // oldest unflushed memtable. + ve.MinUnflushedLogNum = minUnflushedLogNum + if c.kind != compactionKindIngestedFlushable { + l0Metrics := c.metrics.perLevel.level(0) + if d.opts.DisableWAL { + // If the WAL is disabled, every flushable has a zero [logSize], + // resulting in zero bytes in. Instead, use the number of bytes we + // flushed as the BytesIn. This ensures we get a reasonable w-amp + // calculation even when the WAL is disabled. + l0Metrics.TableBytesIn = l0Metrics.TableBytesFlushed + l0Metrics.BlobBytesFlushed + } else { + for i := 0; i < n; i++ { + l0Metrics.TableBytesIn += d.mu.mem.queue[i].logSize + } + } + } else { + // c.kind == compactionKindIngestedFlushable && we could have deleted files due + // to ingest-time splits or excises. + ingestFlushable := c.flush.flushables[0].flushable.(*ingestedFlushable) + exciseBounds := ingestFlushable.exciseSpan.UserKeyBounds() + for c2 := range d.mu.compact.inProgress { + // Check if this compaction overlaps with the excise span. 
Note that just + // checking if the inputs individually overlap with the excise span + // isn't sufficient; for instance, a compaction could have [a,b] and [e,f] + // as inputs and write it all out as [a,b,e,f] in one sstable. If we're + // doing a [c,d) excise at the same time as this compaction, we will have + // to error out the whole compaction as we can't guarantee it hasn't/won't + // write a file overlapping with the excise span. + bounds := c2.Bounds() + if bounds != nil && bounds.Overlaps(d.cmp, &exciseBounds) { + c2.Cancel() + } + } + + if len(ve.DeletedTables) > 0 { + // Iterate through all other compactions, and check if their inputs have + // been replaced due to an ingest-time split or excise. In that case, + // cancel the compaction. + for c2 := range d.mu.compact.inProgress { + for level, table := range c2.Tables() { + if _, ok := ve.DeletedTables[manifest.DeletedTableEntry{FileNum: table.TableNum, Level: level}]; ok { + c2.Cancel() + break + } + } + } + } + } + return versionUpdate{ + VE: ve, + JobID: jobID, + Metrics: c.metrics.perLevel, + InProgressCompactionsFn: func() []compactionInfo { return d.getInProgressCompactionInfoLocked(c) }, + }, nil + }) + + // If err != nil, then the flush will be retried, and we will recalculate + // these metrics. 
+ if err == nil { + d.mu.snapshots.cumulativePinnedCount += stats.CumulativePinnedKeys + d.mu.snapshots.cumulativePinnedSize += stats.CumulativePinnedSize + d.mu.versions.metrics.Keys.MissizedTombstonesCount += stats.CountMissizedDels + } + + d.clearCompactingState(c, err != nil) + delete(d.mu.compact.inProgress, c) + d.mu.versions.incrementCompactions(c.kind, c.extraLevels, c.metrics.bytesWritten.Load(), err) + + var flushed flushableList + if err == nil { + flushed = d.mu.mem.queue[:n] + d.mu.mem.queue = d.mu.mem.queue[n:] + d.updateReadStateLocked(d.opts.DebugCheck) + d.updateTableStatsLocked(ve.NewTables) + if ingest { + d.mu.versions.metrics.Flush.AsIngestCount++ + for _, l := range c.metrics.perLevel { + if l != nil { + d.mu.versions.metrics.Flush.AsIngestBytes += l.TableBytesIngested + d.mu.versions.metrics.Flush.AsIngestTableCount += l.TablesIngested + } + } + } + d.maybeTransitionSnapshotsToFileOnlyLocked() + } + // Signal FlushEnd after installing the new readState. This helps for unit + // tests that use the callback to trigger a read using an iterator with + // IterOptions.OnlyReadGuaranteedDurable. + info.Err = err + if info.Err == nil && len(ve.NewTables) == 0 { + info.Err = errEmptyTable + } + info.Done = true + info.TotalDuration = d.timeNow().Sub(startTime) + d.opts.EventListener.FlushEnd(info) + + // The order of these operations matters here for ease of testing. + // Removing the reader reference first allows tests to be guaranteed that + // the memtable reservation has been released by the time a synchronous + // flush returns. readerUnrefLocked may also produce obsolete files so the + // call to deleteObsoleteFiles must happen after it. + for i := range flushed { + flushed[i].readerUnrefLocked(true) + } + + d.deleteObsoleteFiles(jobID) + + // Mark all the memtables we flushed as flushed. 
+ for i := range flushed { + close(flushed[i].flushed) + } + + return inputBytes, err +} + +// maybeTransitionSnapshotsToFileOnlyLocked transitions any "eventually +// file-only" snapshots to be file-only if all their visible state has been +// flushed to sstables. +// +// REQUIRES: d.mu. +func (d *DB) maybeTransitionSnapshotsToFileOnlyLocked() { + earliestUnflushedSeqNum := d.getEarliestUnflushedSeqNumLocked() + currentVersion := d.mu.versions.currentVersion() + for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; { + if s.efos == nil { + s = s.next + continue + } + overlapsFlushable := false + if base.Visible(earliestUnflushedSeqNum, s.efos.seqNum, base.SeqNumMax) { + // There are some unflushed keys that are still visible to the EFOS. + // Check if any memtables older than the EFOS contain keys within a + // protected range of the EFOS. If no, we can transition. + protectedRanges := make([]bounded, len(s.efos.protectedRanges)) + for i := range s.efos.protectedRanges { + protectedRanges[i] = s.efos.protectedRanges[i] + } + for i := range d.mu.mem.queue { + if !base.Visible(d.mu.mem.queue[i].logSeqNum, s.efos.seqNum, base.SeqNumMax) { + // All keys in this memtable are newer than the EFOS. Skip this + // memtable. + continue + } + // NB: computePossibleOverlaps could have false positives, such as if + // the flushable is a flushable ingest and not a memtable. In that + // case we don't open the sstables to check; we just pessimistically + // assume an overlap. + d.mu.mem.queue[i].computePossibleOverlaps(func(b bounded) shouldContinue { + overlapsFlushable = true + return stopIteration + }, protectedRanges...) + if overlapsFlushable { + break + } + } + } + if overlapsFlushable { + s = s.next + continue + } + currentVersion.Ref() + + // NB: s.efos.transitionToFileOnlySnapshot could close s, in which + // case s.next would be nil. Save it before calling it. 
+		next := s.next
+		_ = s.efos.transitionToFileOnlySnapshot(currentVersion)
+		s = next
+	}
+}
+
+// maybeScheduleCompactionAsync should be used when
+// we want to possibly schedule a compaction, but don't
+// want to eat the cost of running maybeScheduleCompaction.
+// This method should be launched in a separate goroutine.
+// d.mu must not be held when this is called.
+func (d *DB) maybeScheduleCompactionAsync() {
+	// Pairs with the compactionSchedulers.Add performed by whoever launched
+	// this goroutine. NOTE(review): the Add is outside this chunk — confirm.
+	defer d.compactionSchedulers.Done()
+
+	d.mu.Lock()
+	d.maybeScheduleCompaction()
+	d.mu.Unlock()
+}
+
+// maybeScheduleCompaction schedules a compaction if necessary.
+//
+// WARNING: maybeScheduleCompaction and Schedule must be the only ways that
+// any compactions are run. These ensure that the pickedCompactionCache is
+// used and not stale (by ensuring invalidation is done).
+//
+// Even compactions that are not scheduled by the CompactionScheduler must be
+// run using maybeScheduleCompaction, since starting those compactions needs
+// to invalidate the pickedCompactionCache.
+//
+// Requires d.mu to be held.
+func (d *DB) maybeScheduleCompaction() {
+	d.mu.versions.logLock()
+	defer d.mu.versions.logUnlock()
+	env := d.makeCompactionEnvLocked()
+	// A nil env means the DB is closed or read-only; no compactions may run.
+	if env == nil {
+		return
+	}
+	// env.inProgressCompactions will become stale once we pick a compaction, so
+	// it needs to be kept fresh. Also, the pickedCompaction in the
+	// pickedCompactionCache is not valid if we pick a compaction before using
+	// it, since those earlier compactions can mark the same file as compacting.
+
+	// Delete-only compactions are expected to be cheap and reduce future
+	// compaction work, so schedule them directly instead of using the
+	// CompactionScheduler.
+	if d.tryScheduleDeleteOnlyCompaction() {
+		env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil)
+		d.mu.versions.pickedCompactionCache.invalidate()
+	}
+	// Download compactions have their own concurrency and do not currently
+	// interact with CompactionScheduler.
+	//
+	// TODO(sumeer): integrate with CompactionScheduler, since these consume
+	// disk write bandwidth.
+	if d.tryScheduleDownloadCompactions(*env, d.opts.MaxConcurrentDownloads()) {
+		env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil)
+		d.mu.versions.pickedCompactionCache.invalidate()
+	}
+	// The remaining compactions are scheduled by the CompactionScheduler.
+	if d.mu.versions.pickedCompactionCache.isWaiting() {
+		// CompactionScheduler already knows that the DB is waiting to run a
+		// compaction.
+		return
+	}
+	// INVARIANT: !pickedCompactionCache.isWaiting. The following loop will
+	// either exit after successfully starting all the compactions it can pick,
+	// or will exit with one pickedCompaction in the cache, and isWaiting=true.
+	for {
+		// Do not have a pickedCompaction in the cache.
+		pc := d.pickAnyCompaction(*env)
+		if pc == nil {
+			return
+		}
+		success, grantHandle := d.opts.Experimental.CompactionScheduler.TrySchedule()
+		if !success {
+			// Can't run now, but remember this pickedCompaction in the cache.
+			d.mu.versions.pickedCompactionCache.add(pc)
+			return
+		}
+		d.runPickedCompaction(pc, grantHandle)
+		env.inProgressCompactions = d.getInProgressCompactionInfoLocked(nil)
+	}
+}
+
+// makeCompactionEnvLocked attempts to create a compactionEnv necessary during
+// compaction picking. If the DB is closed or marked as read-only,
+// makeCompactionEnvLocked returns nil to indicate that compactions may not be
+// performed. Else, a new compactionEnv is constructed using the current DB
+// state.
+//
+// Compaction picking needs a coherent view of a Version. For example, we need
+// to exclude concurrent ingestions from making a decision on which level to
+// ingest into that conflicts with our compaction decision.
+//
+// A pickedCompaction constructed using a compactionEnv must only be used if
+// the latest Version has not changed.
+//
+// REQUIRES: d.mu and d.mu.versions.logLock are held.
+func (d *DB) makeCompactionEnvLocked() *compactionEnv {
+	// Compaction picking is disallowed on a read-only or closed DB.
+	if d.opts.ReadOnly || d.closed.Load() != nil {
+		return nil
+	}
+	// Snapshot the read-triggered-compaction state; note that "flushing" also
+	// covers the case where a flush is imminent but not yet started.
+	readEnv := readCompactionEnv{
+		readCompactions:          &d.mu.compact.readCompactions,
+		flushing:                 d.mu.compact.flushing || d.passedFlushThreshold(),
+		rescheduleReadCompaction: &d.mu.compact.rescheduleReadCompaction,
+	}
+	env := &compactionEnv{
+		diskAvailBytes:          d.diskAvailBytes.Load(),
+		earliestSnapshotSeqNum:  d.mu.snapshots.earliest(),
+		earliestUnflushedSeqNum: d.getEarliestUnflushedSeqNumLocked(),
+		inProgressCompactions:   d.getInProgressCompactionInfoLocked(nil),
+		readCompactionEnv:       readEnv,
+	}
+	// Only expose problem spans when there are any to avoid needless checks.
+	if !d.problemSpans.IsEmpty() {
+		env.problemSpans = &d.problemSpans
+	}
+	return env
+}
+
+// pickAnyCompaction tries to pick a manual or automatic compaction.
+func (d *DB) pickAnyCompaction(env compactionEnv) (pc pickedCompaction) {
+	autoAllowed := !d.opts.DisableAutomaticCompactions
+	// A misshapen LSM is the most pressing problem, so score-based picks
+	// take priority.
+	if autoAllowed {
+		if scored := d.mu.versions.picker.pickAutoScore(env); scored != nil {
+			return scored
+		}
+	}
+	// Next preference: any outstanding manual compaction.
+	if manual := d.pickManualCompaction(env); manual != nil {
+		return manual
+	}
+	// Finally, fall back to non-score-based automatic compactions.
+	if autoAllowed {
+		return d.mu.versions.picker.pickAutoNonScore(env)
+	}
+	return nil
+}
+
+// runPickedCompaction kicks off the provided pickedCompaction. In case the
+// pickedCompaction is a manual compaction, the corresponding manualCompaction
+// is removed from d.mu.compact.manual.
+//
+// REQUIRES: d.mu and d.mu.versions.logLock is held.
+func (d *DB) runPickedCompaction(pc pickedCompaction, grantHandle CompactionGrantHandle) {
+	var doneChannel chan error
+	if pc.ManualID() > 0 {
+		// This compaction originated from a manual request: find and remove
+		// the matching queue entry, and remember its done channel so the
+		// waiter can be notified when the compaction finishes.
+		for i := range d.mu.compact.manual {
+			if d.mu.compact.manual[i].id == pc.ManualID() {
+				doneChannel = d.mu.compact.manual[i].done
+				d.mu.compact.manual = slices.Delete(d.mu.compact.manual, i, i+1)
+				d.mu.compact.manualLen.Store(int32(len(d.mu.compact.manual)))
+				break
+			}
+		}
+		if doneChannel == nil {
+			panic(errors.AssertionFailedf("did not find manual compaction with id %d", pc.ManualID()))
+		}
+	}
+
+	// Bookkeeping happens under the caller's locks; the compaction itself
+	// runs on its own goroutine.
+	d.mu.compact.compactingCount++
+	c := pc.ConstructCompaction(d, grantHandle)
+	c.AddInProgressLocked(d)
+	go func() {
+		d.compact(c, doneChannel)
+	}()
+}
+
+// Schedule implements DBForCompaction (it is called by the
+// CompactionScheduler). It returns true iff a compaction was started.
+func (d *DB) Schedule(grantHandle CompactionGrantHandle) bool {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	d.mu.versions.logLock()
+	defer d.mu.versions.logUnlock()
+	isWaiting := d.mu.versions.pickedCompactionCache.isWaiting()
+	if !isWaiting {
+		return false
+	}
+	pc := d.mu.versions.pickedCompactionCache.getForRunning()
+	if pc == nil {
+		// Nothing cached; try to pick a fresh compaction now.
+		env := d.makeCompactionEnvLocked()
+		if env != nil {
+			pc = d.pickAnyCompaction(*env)
+		}
+		if pc == nil {
+			d.mu.versions.pickedCompactionCache.setNotWaiting()
+			return false
+		}
+	}
+	// INVARIANT: pc != nil and is not in the cache. isWaiting is true, since
+	// there may be more compactions to run.
+	d.runPickedCompaction(pc, grantHandle)
+	return true
+}
+
+// GetWaitingCompaction implements DBForCompaction (it is called by the
+// CompactionScheduler).
+func (d *DB) GetWaitingCompaction() (bool, WaitingCompaction) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	d.mu.versions.logLock()
+	defer d.mu.versions.logUnlock()
+	isWaiting := d.mu.versions.pickedCompactionCache.isWaiting()
+	if !isWaiting {
+		return false, WaitingCompaction{}
+	}
+	pc := d.mu.versions.pickedCompactionCache.peek()
+	if pc == nil {
+		// Need to pick a compaction.
+		env := d.makeCompactionEnvLocked()
+		if env != nil {
+			pc = d.pickAnyCompaction(*env)
+		}
+		if pc == nil {
+			// Call setNotWaiting so that next call to GetWaitingCompaction can
+			// return early.
+			d.mu.versions.pickedCompactionCache.setNotWaiting()
+			return false, WaitingCompaction{}
+		} else {
+			d.mu.versions.pickedCompactionCache.add(pc)
+		}
+	}
+	// INVARIANT: pc != nil and is in the cache.
+	return true, pc.WaitingCompaction()
+}
+
+// GetAllowedWithoutPermission implements DBForCompaction (it is called by the
+// CompactionScheduler). The result is computed from atomically-read values,
+// so it is a point-in-time snapshot. A pending manual-compaction backlog can
+// raise the allowance above the backlog-based value, capped at the maximum
+// configured compaction concurrency.
+func (d *DB) GetAllowedWithoutPermission() int {
+	allowedBasedOnBacklog := int(d.mu.versions.curCompactionConcurrency.Load())
+	allowedBasedOnManual := 0
+	manualBacklog := int(d.mu.compact.manualLen.Load())
+	if manualBacklog > 0 {
+		_, maxAllowed := d.opts.CompactionConcurrencyRange()
+		allowedBasedOnManual = min(maxAllowed, manualBacklog+allowedBasedOnBacklog)
+	}
+	return max(allowedBasedOnBacklog, allowedBasedOnManual)
+}
+
+// tryScheduleDownloadCompactions tries to start download compactions.
+//
+// Requires d.mu to be held. Updates d.mu.compact.downloads.
+//
+// Returns true iff at least one compaction was started.
+func (d *DB) tryScheduleDownloadCompactions(env compactionEnv, maxConcurrentDownloads int) bool {
+	started := false
+	vers := d.mu.versions.currentVersion()
+	for i := 0; i < len(d.mu.compact.downloads); {
+		if d.mu.compact.downloadingCount >= maxConcurrentDownloads {
+			break
+		}
+		download := d.mu.compact.downloads[i]
+		switch d.tryLaunchDownloadCompaction(download, vers, d.mu.versions.latest.l0Organizer, env, maxConcurrentDownloads) {
+		case launchedCompaction:
+			// Re-examine the same download task on the next iteration (i is
+			// not advanced, and the slice was not modified).
+			started = true
+			continue
+		case didNotLaunchCompaction:
+			// See if we can launch a compaction for another download task.
+			i++
+		case downloadTaskCompleted:
+			// Task is completed and must be removed.
+			d.mu.compact.downloads = slices.Delete(d.mu.compact.downloads, i, i+1)
+		}
+	}
+	return started
+}
+
+// pickManualCompaction tries to pick a compaction for the manual compaction
+// at the head of d.mu.compact.manual. Returns nil if no manual compaction
+// can be run right now. As a side effect, no-op manual compactions are
+// completed (their done channel is signalled) and removed from the queue.
+func (d *DB) pickManualCompaction(env compactionEnv) (pc pickedCompaction) {
+	v := d.mu.versions.currentVersion()
+	for len(d.mu.compact.manual) > 0 {
+		manual := d.mu.compact.manual[0]
+		// NB: this pc shadows the named return; it is returned explicitly
+		// below.
+		pc, retryLater := newPickedManualCompaction(v, d.mu.versions.latest.l0Organizer,
+			d.opts, env, d.mu.versions.picker.getBaseLevel(), manual)
+		if pc != nil {
+			return pc
+		}
+		if retryLater {
+			// We are not able to run this manual compaction at this time.
+			// Inability to run the head blocks later manual compactions.
+			manual.retries++
+			return nil
+		}
+		// Manual compaction is a no-op. Signal that it's complete.
+		manual.done <- nil
+		d.mu.compact.manual = d.mu.compact.manual[1:]
+		d.mu.compact.manualLen.Store(int32(len(d.mu.compact.manual)))
+	}
+	return nil
+}
+
+// tryScheduleDeleteOnlyCompaction tries to kick off a delete-only compaction
+// for all files that can be deleted as suggested by deletionHints.
+//
+// Requires d.mu to be held. Updates d.mu.compact.deletionHints.
+//
+// Returns true iff a compaction was started.
+func (d *DB) tryScheduleDeleteOnlyCompaction() bool {
+	if d.opts.private.disableDeleteOnlyCompactions || d.opts.DisableAutomaticCompactions ||
+		len(d.mu.compact.deletionHints) == 0 {
+		return false
+	}
+	if _, maxConcurrency := d.opts.CompactionConcurrencyRange(); d.mu.compact.compactingCount >= maxConcurrency {
+		return false
+	}
+	v := d.mu.versions.currentVersion()
+	snapshots := d.mu.snapshots.toSlice()
+	// We need to save the value of exciseEnabled in the compaction itself, as
+	// it can change dynamically between now and when the compaction runs.
+	exciseEnabled := d.FormatMajorVersion() >= FormatVirtualSSTables &&
+		d.opts.Experimental.EnableDeleteOnlyCompactionExcises != nil && d.opts.Experimental.EnableDeleteOnlyCompactionExcises()
+	inputs, resolvedHints, unresolvedHints := checkDeleteCompactionHints(d.cmp, v, d.mu.compact.deletionHints, snapshots, exciseEnabled)
+	// Hints that could not be resolved are retained for a future attempt.
+	d.mu.compact.deletionHints = unresolvedHints
+
+	if len(inputs) > 0 {
+		c := newDeleteOnlyCompaction(d.opts, v, inputs, d.timeNow(), resolvedHints, exciseEnabled)
+		d.mu.compact.compactingCount++
+		c.AddInProgressLocked(d)
+		go d.compact(c, nil)
+		return true
+	}
+	return false
+}
+
+// deleteCompactionHintType indicates whether the deleteCompactionHint was
+// generated from a span containing a range del (point key only), a range key
+// delete (range key only), or both a point and range key.
+type deleteCompactionHintType uint8
+
+const (
+	// NOTE: While these are primarily used as enumeration types, they are also
+	// used for some bitwise operations. Care should be taken when updating.
+	deleteCompactionHintTypeUnknown deleteCompactionHintType = iota
+	deleteCompactionHintTypePointKeyOnly
+	deleteCompactionHintTypeRangeKeyOnly
+	deleteCompactionHintTypePointAndRangeKey
+)
+
+// String implements fmt.Stringer.
+func (h deleteCompactionHintType) String() string {
+	switch h {
+	case deleteCompactionHintTypeUnknown:
+		return "unknown"
+	case deleteCompactionHintTypePointKeyOnly:
+		return "point-key-only"
+	case deleteCompactionHintTypeRangeKeyOnly:
+		return "range-key-only"
+	case deleteCompactionHintTypePointAndRangeKey:
+		return "point-and-range-key"
+	default:
+		panic(fmt.Sprintf("unknown hint type: %d", h))
+	}
+}
+
+// compactionHintFromKeys returns a deleteCompactionHintType given a slice of
+// keyspan.Keys.
+func compactionHintFromKeys(keys []keyspan.Key) deleteCompactionHintType {
+	var hintType deleteCompactionHintType
+	for _, k := range keys {
+		switch k.Kind() {
+		case base.InternalKeyKindRangeDelete:
+			hintType |= deleteCompactionHintTypePointKeyOnly
+		case base.InternalKeyKindRangeKeyDelete:
+			hintType |= deleteCompactionHintTypeRangeKeyOnly
+		default:
+			panic(fmt.Sprintf("unsupported key kind: %s", k.Kind()))
+		}
+	}
+	// NB: ORing the point-only and range-only bits yields
+	// deleteCompactionHintTypePointAndRangeKey when both kinds are present.
+	return hintType
+}
+
+// A deleteCompactionHint records a user key and sequence number span that has been
+// deleted by a range tombstone. A hint is recorded if at least one sstable
+// falls completely within both the user key and sequence number spans.
+// Once the tombstones and the observed completely-contained sstables fall
+// into the same snapshot stripe, a delete-only compaction may delete any
+// sstables within the range.
+type deleteCompactionHint struct {
+	// The type of key span that generated this hint (point key, range key, or
+	// both).
+	hintType deleteCompactionHintType
+	// start and end are user keys specifying a key range [start, end) of
+	// deleted keys.
+	start []byte
+	end   []byte
+	// The level of the file containing the range tombstone(s) when the hint
+	// was created. Only lower levels need to be searched for files that may
+	// be deleted.
+	tombstoneLevel int
+	// The file containing the range tombstone(s) that created the hint.
+	tombstoneFile *manifest.TableMetadata
+	// The smallest and largest sequence numbers of the abutting tombstones
+	// merged to form this hint. All of a table's keys must be less than the
+	// tombstone smallest sequence number to be deleted. All of a table's
+	// sequence numbers must fall into the same snapshot stripe as the
+	// tombstone largest sequence number to be deleted.
+	tombstoneLargestSeqNum  base.SeqNum
+	tombstoneSmallestSeqNum base.SeqNum
+	// The smallest sequence number of a sstable that was found to be covered
+	// by this hint.
The hint cannot be resolved until this sequence number is + // in the same snapshot stripe as the largest tombstone sequence number. + // This is set when a hint is created, so the LSM may look different and + // notably no longer contain the sstable that contained the key at this + // sequence number. + fileSmallestSeqNum base.SeqNum +} + +type deletionHintOverlap int8 + +const ( + // hintDoesNotApply indicates that the hint does not apply to the file. + hintDoesNotApply deletionHintOverlap = iota + // hintExcisesFile indicates that the hint excises a portion of the file, + // and the format major version of the DB supports excises. + hintExcisesFile + // hintDeletesFile indicates that the hint deletes the entirety of the file. + hintDeletesFile +) + +func (h deleteCompactionHint) String() string { + return fmt.Sprintf( + "L%d.%s %s-%s seqnums(tombstone=%d-%d, file-smallest=%d, type=%s)", + h.tombstoneLevel, h.tombstoneFile.TableNum, h.start, h.end, + h.tombstoneSmallestSeqNum, h.tombstoneLargestSeqNum, h.fileSmallestSeqNum, + h.hintType, + ) +} + +func (h *deleteCompactionHint) canDeleteOrExcise( + cmp Compare, m *manifest.TableMetadata, snapshots compact.Snapshots, exciseEnabled bool, +) deletionHintOverlap { + // The file can only be deleted if all of its keys are older than the + // earliest tombstone aggregated into the hint. Note that we use + // m.LargestSeqNumAbsolute, not m.LargestSeqNum. Consider a compaction that + // zeroes sequence numbers. A compaction may zero the sequence number of a + // key with a sequence number > h.tombstoneSmallestSeqNum and set it to + // zero. If we looked at m.LargestSeqNum, the resulting output file would + // appear to not contain any keys more recent than the oldest tombstone. To + // avoid this error, the largest pre-zeroing sequence number is maintained + // in LargestSeqNumAbsolute and used here to make the determination whether + // the file's keys are older than all of the hint's tombstones. 
+ if m.LargestSeqNumAbsolute >= h.tombstoneSmallestSeqNum || m.SmallestSeqNum < h.fileSmallestSeqNum { + return hintDoesNotApply + } + + // The file's oldest key must be in the same snapshot stripe as the + // newest tombstone. NB: We already checked the hint's sequence numbers, + // but this file's oldest sequence number might be lower than the hint's + // smallest sequence number despite the file falling within the key range + // if this file was constructed after the hint by a compaction. + if snapshots.Index(h.tombstoneLargestSeqNum) != snapshots.Index(m.SmallestSeqNum) { + return hintDoesNotApply + } + + switch h.hintType { + case deleteCompactionHintTypePointKeyOnly: + // A hint generated by a range del span cannot delete tables that contain + // range keys. + if m.HasRangeKeys { + return hintDoesNotApply + } + case deleteCompactionHintTypeRangeKeyOnly: + // A hint generated by a range key del span cannot delete tables that + // contain point keys. + if m.HasPointKeys { + return hintDoesNotApply + } + case deleteCompactionHintTypePointAndRangeKey: + // A hint from a span that contains both range dels *and* range keys can + // only be deleted if both bounds fall within the hint. The next check takes + // care of this. + default: + panic(fmt.Sprintf("pebble: unknown delete compaction hint type: %d", h.hintType)) + } + if cmp(h.start, m.Smallest().UserKey) <= 0 && + base.UserKeyExclusive(h.end).CompareUpperBounds(cmp, m.UserKeyBounds().End) >= 0 { + return hintDeletesFile + } + if !exciseEnabled { + // The file's keys must be completely contained within the hint range; excises + // aren't allowed. + return hintDoesNotApply + } + // Check for any overlap. In cases of partial overlap, we can excise the part of the file + // that overlaps with the deletion hint. 
+ if cmp(h.end, m.Smallest().UserKey) > 0 && + (m.UserKeyBounds().End.CompareUpperBounds(cmp, base.UserKeyInclusive(h.start)) >= 0) { + return hintExcisesFile + } + return hintDoesNotApply +} + +// checkDeleteCompactionHints checks the passed-in deleteCompactionHints for those that +// can be resolved and those that cannot. A hint is considered resolved when its largest +// tombstone sequence number and the smallest sequence number of covered files fall in +// the same snapshot stripe. No more than maxHintsPerDeleteOnlyCompaction will be resolved +// per method call. Resolved and unresolved hints are returned in separate return values. +// The files that the resolved hints apply to, are returned as compactionLevels. +func checkDeleteCompactionHints( + cmp Compare, + v *manifest.Version, + hints []deleteCompactionHint, + snapshots compact.Snapshots, + exciseEnabled bool, +) (levels []compactionLevel, resolved, unresolved []deleteCompactionHint) { + var files map[*manifest.TableMetadata]bool + var byLevel [numLevels][]*manifest.TableMetadata + + // Delete-only compactions can be quadratic (O(mn)) in terms of runtime + // where m = number of files in the delete-only compaction and n = number + // of resolved hints. To prevent these from growing unbounded, we cap + // the number of hints we resolve for one delete-only compaction. This + // cap only applies if exciseEnabled == true. + const maxHintsPerDeleteOnlyCompaction = 10 + + unresolvedHints := hints[:0] + // Lazily populate resolvedHints, similar to files above. + resolvedHints := make([]deleteCompactionHint, 0) + for _, h := range hints { + // Check each compaction hint to see if it's resolvable. Resolvable + // hints are removed and trigger a delete-only compaction if any files + // in the current LSM still meet their criteria. Unresolvable hints + // are saved and don't trigger a delete-only compaction. 
+ // + // When a compaction hint is created, the sequence numbers of the + // range tombstones and the covered file with the oldest key are + // recorded. The largest tombstone sequence number and the smallest + // file sequence number must be in the same snapshot stripe for the + // hint to be resolved. The below graphic models a compaction hint + // covering the keyspace [b, r). The hint completely contains two + // files, 000002 and 000003. The file 000003 contains the lowest + // covered sequence number at #90. The tombstone b.RANGEDEL.230:h has + // the highest tombstone sequence number incorporated into the hint. + // The hint may be resolved only once the snapshots at #100, #180 and + // #210 are all closed. File 000001 is not included within the hint + // because it extends beyond the range tombstones in user key space. + // + // 250 + // + // |-b...230:h-| + // _____________________________________________________ snapshot #210 + // 200 |--h.RANGEDEL.200:r--| + // + // _____________________________________________________ snapshot #180 + // + // 150 +--------+ + // +---------+ | 000003 | + // | 000002 | | | + // +_________+ | | + // 100_____________________|________|___________________ snapshot #100 + // +--------+ + // _____________________________________________________ snapshot #70 + // +---------------+ + // 50 | 000001 | + // | | + // +---------------+ + // ______________________________________________________________ + // a b c d e f g h i j k l m n o p q r s t u v w x y z + + if snapshots.Index(h.tombstoneLargestSeqNum) != snapshots.Index(h.fileSmallestSeqNum) || + (len(resolvedHints) >= maxHintsPerDeleteOnlyCompaction && exciseEnabled) { + // Cannot resolve yet. + unresolvedHints = append(unresolvedHints, h) + continue + } + + // The hint h will be resolved and dropped, if it either affects no files at all + // or if the number of files it creates (eg. through excision) is less than or + // equal to the number of files it deletes. 
First, determine how many files are + // affected by this hint. + filesDeletedByCurrentHint := 0 + var filesDeletedByLevel [7][]*manifest.TableMetadata + for l := h.tombstoneLevel + 1; l < numLevels; l++ { + for m := range v.Overlaps(l, base.UserKeyBoundsEndExclusive(h.start, h.end)).All() { + doesHintApply := h.canDeleteOrExcise(cmp, m, snapshots, exciseEnabled) + if m.IsCompacting() || doesHintApply == hintDoesNotApply || files[m] { + continue + } + switch doesHintApply { + case hintDeletesFile: + filesDeletedByCurrentHint++ + case hintExcisesFile: + // Account for the original file being deleted. + filesDeletedByCurrentHint++ + // An excise could produce up to 2 new files. If the hint + // leaves a fragment of the file on the left, decrement + // the counter once. If the hint leaves a fragment of the + // file on the right, decrement the counter once. + if cmp(h.start, m.Smallest().UserKey) > 0 { + filesDeletedByCurrentHint-- + } + if m.UserKeyBounds().End.IsUpperBoundFor(cmp, h.end) { + filesDeletedByCurrentHint-- + } + } + filesDeletedByLevel[l] = append(filesDeletedByLevel[l], m) + } + } + if filesDeletedByCurrentHint < 0 { + // This hint does not delete a sufficient number of files to warrant + // a delete-only compaction at this stage. Drop it (ie. don't add it + // to either resolved or unresolved hints) so it doesn't stick around + // forever. + continue + } + // This hint will be resolved and dropped. + for l := h.tombstoneLevel + 1; l < numLevels; l++ { + byLevel[l] = append(byLevel[l], filesDeletedByLevel[l]...) + for _, m := range filesDeletedByLevel[l] { + if files == nil { + // Construct files lazily, assuming most calls will not + // produce delete-only compactions. 
+ files = make(map[*manifest.TableMetadata]bool) + } + files[m] = true + } + } + resolvedHints = append(resolvedHints, h) + } + + var compactLevels []compactionLevel + for l, files := range byLevel { + if len(files) == 0 { + continue + } + compactLevels = append(compactLevels, compactionLevel{ + level: l, + files: manifest.NewLevelSliceKeySorted(cmp, files), + }) + } + return compactLevels, resolvedHints, unresolvedHints +} + +// compact runs one compaction and maybe schedules another call to compact. +func (d *DB) compact(c compaction, errChannel chan error) { + pprof.Do(context.Background(), c.PprofLabels(d.opts.Experimental.UserKeyCategories), func(context.Context) { + func() { + d.mu.Lock() + defer d.mu.Unlock() + jobID := d.newJobIDLocked() + + compactErr := c.Execute(jobID, d) + + d.deleteObsoleteFiles(jobID) + // We send on the error channel only after we've deleted + // obsolete files so that tests performing manual compactions + // block until the obsolete files are deleted, and the test + // observes the deletion. + if errChannel != nil { + errChannel <- compactErr + } + if compactErr != nil { + d.handleCompactFailure(c, compactErr) + } + if c.IsDownload() { + d.mu.compact.downloadingCount-- + } else { + d.mu.compact.compactingCount-- + } + delete(d.mu.compact.inProgress, c) + // Add this compaction's duration to the cumulative duration. NB: This + // must be atomic with the above removal of c from + // d.mu.compact.InProgress to ensure Metrics.Compact.Duration does not + // miss or double count a completing compaction's duration. + d.mu.compact.duration += d.timeNow().Sub(c.BeganAt()) + }() + // Done must not be called while holding any lock that needs to be + // acquired by Schedule. Also, it must be called after new Version has + // been installed, and metadata related to compactingCount and inProgress + // compactions has been updated. 
This is because when we are running at + // the limit of permitted compactions, Done can cause the + // CompactionScheduler to schedule another compaction. Note that the only + // compactions that may be scheduled by Done are those integrated with the + // CompactionScheduler. + c.GrantHandle().Done() + // The previous compaction may have produced too many files in a level, so + // reschedule another compaction if needed. + // + // The preceding Done call will not necessarily cause a compaction to be + // scheduled, so we also need to call maybeScheduleCompaction. And + // maybeScheduleCompaction encompasses all compactions, and not only those + // scheduled via the CompactionScheduler. + func() { + d.mu.Lock() + defer d.mu.Unlock() + d.maybeScheduleCompaction() + d.mu.compact.cond.Broadcast() + }() + }) +} + +func (d *DB) handleCompactFailure(c compaction, err error) { + if errors.Is(err, ErrCancelledCompaction) { + // ErrCancelledCompaction is expected during normal operation, so we don't + // want to report it as a background error. + d.opts.Logger.Infof("%v", err) + return + } + c.RecordError(&d.problemSpans, err) + // TODO(peter): count consecutive compaction errors and backoff. + d.opts.EventListener.BackgroundError(err) +} + +// cleanupVersionEdit cleans up any on-disk artifacts that were created +// for the application of a versionEdit that is no longer going to be applied. +// +// d.mu must be held when calling this method. 
+func (d *DB) cleanupVersionEdit(ve *manifest.VersionEdit) {
+	obsoleteFiles := manifest.ObsoleteFiles{
+		TableBackings: make([]*manifest.TableBacking, 0, len(ve.NewTables)),
+		BlobFiles:     make([]*manifest.PhysicalBlobFile, 0, len(ve.NewBlobFiles)),
+	}
+	// Record the table numbers the ve deletes, so that tables it merely
+	// moves between levels are not marked obsolete below.
+	deletedTables := make(map[base.TableNum]struct{})
+	for key := range ve.DeletedTables {
+		deletedTables[key.FileNum] = struct{}{}
+	}
+	// New blob files created for this ve are now obsolete; track them as
+	// zombies so the versionSet's accounting stays consistent.
+	for i := range ve.NewBlobFiles {
+		obsoleteFiles.AddBlob(ve.NewBlobFiles[i].Physical)
+		d.mu.versions.zombieBlobs.Add(objectInfo{
+			fileInfo: fileInfo{
+				FileNum:  ve.NewBlobFiles[i].Physical.FileNum,
+				FileSize: ve.NewBlobFiles[i].Physical.Size,
+			},
+			isLocal: objstorage.IsLocalBlobFile(d.objProvider, ve.NewBlobFiles[i].Physical.FileNum),
+		})
+	}
+	for i := range ve.NewTables {
+		if ve.NewTables[i].Meta.Virtual {
+			// We handle backing files separately.
+			continue
+		}
+		if _, ok := deletedTables[ve.NewTables[i].Meta.TableNum]; ok {
+			// This file is being moved in this ve to a different level.
+			// Don't mark it as obsolete.
+			continue
+		}
+		obsoleteFiles.AddBacking(ve.NewTables[i].Meta.PhysicalMeta().TableBacking)
+	}
+	for i := range ve.CreatedBackingTables {
+		if ve.CreatedBackingTables[i].IsUnused() {
+			obsoleteFiles.AddBacking(ve.CreatedBackingTables[i])
+		}
+	}
+	for _, of := range obsoleteFiles.TableBackings {
+		// Add this file to zombie tables as well, as the versionSet
+		// asserts on whether every obsolete file was at one point
+		// marked zombie.
+		d.mu.versions.zombieTables.Add(objectInfo{
+			fileInfo: fileInfo{
+				FileNum:  of.DiskFileNum,
+				FileSize: of.Size,
+			},
+			isLocal: objstorage.IsLocalTable(d.objProvider, of.DiskFileNum),
+		})
+	}
+	d.mu.versions.addObsoleteLocked(obsoleteFiles)
+}
+
+// compact1 runs one compaction.
+//
+// d.mu must be held when calling this, but the mutex may be dropped and
+// re-acquired during the course of this method.
+func (d *DB) compact1(jobID JobID, c *tableCompaction) (err error) { + info := c.makeInfo(jobID) + d.opts.EventListener.CompactionBegin(info) + startTime := d.timeNow() + + ve, stats, err := d.runCompaction(jobID, c) + + info.Annotations = append(info.Annotations, c.annotations...) + info.Duration = d.timeNow().Sub(startTime) + if err == nil { + validateVersionEdit(ve, d.opts.Comparer.ValidateKey, d.opts.Comparer.FormatKey, d.opts.Logger) + err = d.mu.versions.UpdateVersionLocked(func() (versionUpdate, error) { + // Check if this compaction had a conflicting operation (eg. a d.excise()) + // that necessitates it restarting from scratch. Note that since we hold + // the manifest lock, we don't expect this bool to change its value + // as only the holder of the manifest lock will ever write to it. + if c.cancel.Load() { + err = firstError(err, ErrCancelledCompaction) + // This is the first time we've seen a cancellation during the + // life of this compaction (or the original condition on err == nil + // would not have been true). We should delete any tables already + // created, as d.runCompaction did not do that. + d.cleanupVersionEdit(ve) + // Note that UpdateVersionLocked invalidates the pickedCompactionCache + // when we return, which is relevant because this failed compaction + // may be the highest priority to run next. 
+ return versionUpdate{}, err + } + return versionUpdate{ + VE: ve, + JobID: jobID, + Metrics: c.metrics.perLevel, + InProgressCompactionsFn: func() []compactionInfo { return d.getInProgressCompactionInfoLocked(c) }, + }, nil + }) + } + + info.Done = true + info.Err = err + if err == nil { + for i := range ve.NewTables { + e := &ve.NewTables[i] + info.Output.Tables = append(info.Output.Tables, e.Meta.TableInfo()) + } + d.mu.snapshots.cumulativePinnedCount += stats.CumulativePinnedKeys + d.mu.snapshots.cumulativePinnedSize += stats.CumulativePinnedSize + d.mu.versions.metrics.Keys.MissizedTombstonesCount += stats.CountMissizedDels + } + + // NB: clearing compacting state must occur before updating the read state; + // L0Sublevels initialization depends on it. + d.clearCompactingState(c, err != nil) + d.mu.versions.incrementCompactions(c.kind, c.extraLevels, c.metrics.bytesWritten.Load(), err) + d.mu.versions.incrementCompactionBytes(-c.metrics.bytesWritten.Load()) + + info.TotalDuration = d.timeNow().Sub(c.metrics.beganAt) + d.opts.EventListener.CompactionEnd(info) + + // Update the read state before deleting obsolete files because the + // read-state update will cause the previous version to be unref'd and if + // there are no references obsolete tables will be added to the obsolete + // table list. + if err == nil { + d.updateReadStateLocked(d.opts.DebugCheck) + d.updateTableStatsLocked(ve.NewTables) + } + + return err +} + +// runCopyCompaction runs a copy compaction where a new TableNum is created that +// is a byte-for-byte copy of the input file or span thereof in some cases. This +// is used in lieu of a move compaction when a file is being moved across the +// local/remote storage boundary. It could also be used in lieu of a rewrite +// compaction as part of a Download() call, which allows copying only a span of +// the external file, provided the file does not contain range keys or value +// blocks (see sstable.CopySpan). 
+// +// d.mu must be held when calling this method. The mutex will be released when +// doing IO. +func (d *DB) runCopyCompaction( + jobID JobID, c *tableCompaction, +) (ve *manifest.VersionEdit, stats compact.Stats, _ error) { + if c.cancel.Load() { + return nil, compact.Stats{}, ErrCancelledCompaction + } + iter := c.startLevel.files.Iter() + inputMeta := iter.First() + if iter.Next() != nil { + return nil, compact.Stats{}, base.AssertionFailedf("got more than one file for a move compaction") + } + if inputMeta.BlobReferenceDepth > 0 || len(inputMeta.BlobReferences) > 0 { + return nil, compact.Stats{}, base.AssertionFailedf( + "copy compaction for %s with blob references (depth=%d, refs=%d)", + inputMeta.TableNum, inputMeta.BlobReferenceDepth, len(inputMeta.BlobReferences), + ) + } + ve = &manifest.VersionEdit{ + DeletedTables: map[manifest.DeletedTableEntry]*manifest.TableMetadata{ + {Level: c.startLevel.level, FileNum: inputMeta.TableNum}: inputMeta, + }, + } + + objMeta, err := d.objProvider.Lookup(base.FileTypeTable, inputMeta.TableBacking.DiskFileNum) + if err != nil { + return nil, compact.Stats{}, err + } + // This code does not support copying a shared table (which should never be necessary). + if objMeta.IsShared() { + return nil, compact.Stats{}, base.AssertionFailedf("copy compaction of shared table") + } + + // We are in the relatively more complex case where we need to copy this + // file to remote storage. Drop the db mutex while we do the copy + // + // To ease up cleanup of the local file and tracking of refs, we create + // a new FileNum. This has the potential of making the block cache less + // effective, however. 
+ newMeta := &manifest.TableMetadata{ + Size: inputMeta.Size, + CreationTime: inputMeta.CreationTime, + SmallestSeqNum: inputMeta.SmallestSeqNum, + LargestSeqNum: inputMeta.LargestSeqNum, + LargestSeqNumAbsolute: inputMeta.LargestSeqNumAbsolute, + Stats: inputMeta.Stats, + Virtual: inputMeta.Virtual, + SyntheticPrefixAndSuffix: inputMeta.SyntheticPrefixAndSuffix, + } + if inputMeta.HasPointKeys { + newMeta.ExtendPointKeyBounds(c.comparer.Compare, + inputMeta.PointKeyBounds.Smallest(), + inputMeta.PointKeyBounds.Largest()) + } + if inputMeta.HasRangeKeys { + newMeta.ExtendRangeKeyBounds(c.comparer.Compare, + inputMeta.RangeKeyBounds.Smallest(), + inputMeta.RangeKeyBounds.Largest()) + } + newMeta.TableNum = d.mu.versions.getNextTableNum() + if objMeta.IsExternal() { + // external -> local/shared copy. File must be virtual. + // We will update this size later after we produce the new backing file. + newMeta.InitVirtualBacking(base.DiskFileNum(newMeta.TableNum), inputMeta.TableBacking.Size) + } else { + // local -> shared copy. New file is guaranteed to not be virtual. + newMeta.InitPhysicalBacking() + } + + // NB: The order here is reversed, lock after unlock. This is similar to + // runCompaction. + d.mu.Unlock() + defer d.mu.Lock() + + deleteOnExit := false + defer func() { + if deleteOnExit { + _ = d.objProvider.Remove(base.FileTypeTable, newMeta.TableBacking.DiskFileNum) + } + }() + + // If the src obj is external, we're doing an external to local/shared copy. 
+ if objMeta.IsExternal() { + ctx := context.TODO() + src, err := d.objProvider.OpenForReading( + ctx, base.FileTypeTable, inputMeta.TableBacking.DiskFileNum, objstorage.OpenOptions{}, + ) + if err != nil { + return nil, compact.Stats{}, err + } + defer func() { + if src != nil { + _ = src.Close() + } + }() + + w, _, err := d.objProvider.Create( + ctx, base.FileTypeTable, newMeta.TableBacking.DiskFileNum, + objstorage.CreateOptions{ + PreferSharedStorage: d.shouldCreateShared(c.outputLevel.level), + }, + ) + if err != nil { + return nil, compact.Stats{}, err + } + deleteOnExit = true + + start, end := newMeta.Smallest(), newMeta.Largest() + if newMeta.SyntheticPrefixAndSuffix.HasPrefix() { + syntheticPrefix := newMeta.SyntheticPrefixAndSuffix.Prefix() + start.UserKey = syntheticPrefix.Invert(start.UserKey) + end.UserKey = syntheticPrefix.Invert(end.UserKey) + } + if newMeta.SyntheticPrefixAndSuffix.HasSuffix() { + // Extend the bounds as necessary so that the keys don't include suffixes. + start.UserKey = start.UserKey[:c.comparer.Split(start.UserKey)] + if n := c.comparer.Split(end.UserKey); n < len(end.UserKey) { + end = base.MakeRangeDeleteSentinelKey(c.comparer.ImmediateSuccessor(nil, end.UserKey[:n])) + } + } + + // NB: external files are always virtual. + var wrote uint64 + err = d.fileCache.withReader(ctx, block.NoReadEnv, inputMeta.VirtualMeta(), func(r *sstable.Reader, env sstable.ReadEnv) error { + var err error + // TODO(radu): plumb a ReadEnv to CopySpan (it could use the buffer pool + // or update category stats). + wrote, err = sstable.CopySpan(ctx, + src, r, d.opts.MakeReaderOptions(), + w, d.opts.MakeWriterOptions(c.outputLevel.level, d.TableFormat()), + start, end, + ) + return err + }) + + src = nil // We passed src to CopySpan; it's responsible for closing it. + if err != nil { + if errors.Is(err, sstable.ErrEmptySpan) { + // The virtual table was empty. Just remove the backing file. 
+ // Note that deleteOnExit is true so we will delete the created object. + outputMetrics := c.metrics.perLevel.level(c.outputLevel.level) + outputMetrics.TableBytesIn = inputMeta.Size + + return ve, compact.Stats{}, nil + } + return nil, compact.Stats{}, err + } + newMeta.TableBacking.Size = wrote + newMeta.Size = wrote + } else { + _, err := d.objProvider.LinkOrCopyFromLocal(context.TODO(), d.opts.FS, + d.objProvider.Path(objMeta), base.FileTypeTable, newMeta.TableBacking.DiskFileNum, + objstorage.CreateOptions{PreferSharedStorage: true}) + if err != nil { + return nil, compact.Stats{}, err + } + deleteOnExit = true + } + ve.NewTables = []manifest.NewTableEntry{{ + Level: c.outputLevel.level, + Meta: newMeta, + }} + if newMeta.Virtual { + ve.CreatedBackingTables = []*manifest.TableBacking{newMeta.TableBacking} + } + outputMetrics := c.metrics.perLevel.level(c.outputLevel.level) + outputMetrics.TableBytesIn = inputMeta.Size + outputMetrics.TableBytesCompacted = newMeta.Size + outputMetrics.TablesCompacted = 1 + + if err := d.objProvider.Sync(); err != nil { + return nil, compact.Stats{}, err + } + deleteOnExit = false + return ve, compact.Stats{}, nil +} + +// applyHintOnFile applies a deleteCompactionHint to a file, and updates the +// versionEdit accordingly. It returns a list of new files that were created +// if the hint was applied partially to a file (eg. through an exciseTable as opposed +// to an outright deletion). levelMetrics is kept up-to-date with the number +// of tables deleted or excised. +func (d *DB) applyHintOnFile( + h deleteCompactionHint, + f *manifest.TableMetadata, + level int, + levelMetrics *LevelMetrics, + ve *manifest.VersionEdit, + hintOverlap deletionHintOverlap, +) (newFiles []manifest.NewTableEntry, err error) { + if hintOverlap == hintDoesNotApply { + return nil, nil + } + + // The hint overlaps with at least part of the file. + if hintOverlap == hintDeletesFile { + // The hint deletes the entirety of this file. 
+ ve.DeletedTables[manifest.DeletedTableEntry{ + Level: level, + FileNum: f.TableNum, + }] = f + levelMetrics.TablesDeleted++ + return nil, nil + } + // The hint overlaps with only a part of the file, not the entirety of it. We need + // to use d.exciseTable. (hintOverlap == hintExcisesFile) + if d.FormatMajorVersion() < FormatVirtualSSTables { + panic("pebble: delete-only compaction hint excising a file is not supported in this version") + } + + levelMetrics.TablesExcised++ + exciseBounds := base.UserKeyBoundsEndExclusive(h.start, h.end) + leftTable, rightTable, err := d.exciseTable(context.TODO(), exciseBounds, f, level, tightExciseBounds) + if err != nil { + return nil, errors.Wrap(err, "error when running excise for delete-only compaction") + } + newFiles = applyExciseToVersionEdit(ve, f, leftTable, rightTable, level) + return newFiles, nil +} + +func (d *DB) runDeleteOnlyCompactionForLevel( + cl compactionLevel, + levelMetrics *LevelMetrics, + ve *manifest.VersionEdit, + snapshots compact.Snapshots, + fragments []deleteCompactionHintFragment, + exciseEnabled bool, +) error { + if cl.level == 0 { + panic("cannot run delete-only compaction for L0") + } + curFragment := 0 + + // Outer loop loops on files. Middle loop loops on fragments. Inner loop + // loops on raw fragments of hints. Number of fragments are bounded by + // the number of hints this compaction was created with, which is capped + // in the compaction picker to avoid very CPU-hot loops here. + for f := range cl.files.All() { + // curFile usually matches f, except if f got excised in which case + // it maps to a virtual file that replaces f, or nil if f got removed + // in its entirety. 
+ curFile := f + for curFragment < len(fragments) && d.cmp(fragments[curFragment].start, f.Smallest().UserKey) <= 0 { + curFragment++ + } + if curFragment > 0 { + curFragment-- + } + + for ; curFragment < len(fragments); curFragment++ { + if f.UserKeyBounds().End.CompareUpperBounds(d.cmp, base.UserKeyInclusive(fragments[curFragment].start)) < 0 { + break + } + // Process all overlapping hints with this file. Note that applying + // a hint twice is idempotent; curFile should have already been excised + // the first time, resulting in no change the second time. + for _, h := range fragments[curFragment].hints { + if h.tombstoneLevel >= cl.level { + // We cannot excise out the deletion tombstone itself, or anything + // above it. + continue + } + hintOverlap := h.canDeleteOrExcise(d.cmp, curFile, snapshots, exciseEnabled) + if hintOverlap == hintDoesNotApply { + continue + } + newFiles, err := d.applyHintOnFile(h, curFile, cl.level, levelMetrics, ve, hintOverlap) + if err != nil { + return err + } + if _, ok := ve.DeletedTables[manifest.DeletedTableEntry{Level: cl.level, FileNum: curFile.TableNum}]; ok { + curFile = nil + } + if len(newFiles) > 0 { + curFile = newFiles[len(newFiles)-1].Meta + } else if curFile == nil { + // Nothing remains of the file. + break + } + } + if curFile == nil { + // Nothing remains of the file. + break + } + } + if _, ok := ve.DeletedTables[manifest.DeletedTableEntry{ + Level: cl.level, + FileNum: f.TableNum, + }]; !ok { + panic("pebble: delete-only compaction scheduled with hints that did not delete or excise a file") + } + } + return nil +} + +// deleteCompactionHintFragment represents a fragment of the key space and +// contains a set of deleteCompactionHints that apply to that fragment; a +// fragment starts at the start field and ends where the next fragment starts. 
+type deleteCompactionHintFragment struct { + start []byte + hints []deleteCompactionHint +} + +// Delete compaction hints can overlap with each other, and multiple fragments +// can apply to a single file. This function takes a list of hints and fragments +// them, to make it easier to apply them to non-overlapping files occupying a level; +// that way, files and hint fragments can be iterated on in lockstep, while efficiently +// being able to apply all hints overlapping with a given file. +func fragmentDeleteCompactionHints( + cmp Compare, hints []deleteCompactionHint, +) []deleteCompactionHintFragment { + fragments := make([]deleteCompactionHintFragment, 0, len(hints)*2) + for i := range hints { + fragments = append(fragments, deleteCompactionHintFragment{start: hints[i].start}, + deleteCompactionHintFragment{start: hints[i].end}) + } + slices.SortFunc(fragments, func(i, j deleteCompactionHintFragment) int { + return cmp(i.start, j.start) + }) + fragments = slices.CompactFunc(fragments, func(i, j deleteCompactionHintFragment) bool { + return bytes.Equal(i.start, j.start) + }) + for _, h := range hints { + startIdx := sort.Search(len(fragments), func(i int) bool { + return cmp(fragments[i].start, h.start) >= 0 + }) + endIdx := sort.Search(len(fragments), func(i int) bool { + return cmp(fragments[i].start, h.end) >= 0 + }) + for i := startIdx; i < endIdx; i++ { + fragments[i].hints = append(fragments[i].hints, h) + } + } + return fragments +} + +// Runs a delete-only compaction. +// +// d.mu must *not* be held when calling this. 
+func (d *DB) runDeleteOnlyCompaction( + jobID JobID, c *tableCompaction, snapshots compact.Snapshots, +) (ve *manifest.VersionEdit, stats compact.Stats, retErr error) { + fragments := fragmentDeleteCompactionHints(d.cmp, c.deleteOnly.hints) + ve = &manifest.VersionEdit{ + DeletedTables: map[manifest.DeletedTableEntry]*manifest.TableMetadata{}, + } + for _, cl := range c.inputs { + levelMetrics := c.metrics.perLevel.level(cl.level) + err := d.runDeleteOnlyCompactionForLevel(cl, levelMetrics, ve, snapshots, fragments, c.deleteOnly.exciseEnabled) + if err != nil { + return nil, stats, err + } + } + // Remove any files that were added and deleted in the same versionEdit. + ve.NewTables = slices.DeleteFunc(ve.NewTables, func(e manifest.NewTableEntry) bool { + entry := manifest.DeletedTableEntry{Level: e.Level, FileNum: e.Meta.TableNum} + if _, deleted := ve.DeletedTables[entry]; deleted { + delete(ve.DeletedTables, entry) + return true + } + return false + }) + sort.Slice(ve.NewTables, func(i, j int) bool { + return ve.NewTables[i].Meta.TableNum < ve.NewTables[j].Meta.TableNum + }) + deletedTableEntries := slices.Collect(maps.Keys(ve.DeletedTables)) + slices.SortFunc(deletedTableEntries, func(a, b manifest.DeletedTableEntry) int { + return stdcmp.Compare(a.FileNum, b.FileNum) + }) + // Remove any entries from CreatedBackingTables that are not used in any + // NewFiles. + usedBackingFiles := make(map[base.DiskFileNum]struct{}) + for _, e := range ve.NewTables { + if e.Meta.Virtual { + usedBackingFiles[e.Meta.TableBacking.DiskFileNum] = struct{}{} + } + } + ve.CreatedBackingTables = slices.DeleteFunc(ve.CreatedBackingTables, func(b *manifest.TableBacking) bool { + _, used := usedBackingFiles[b.DiskFileNum] + return !used + }) + + // Iterate through the deleted tables and new tables to annotate excised tables. + // If a new table is virtual and the base.DiskFileNum is the same as a deleted table, then + // our deleted table was excised. 
+ for _, table := range deletedTableEntries { + for _, newEntry := range ve.NewTables { + if newEntry.Meta.Virtual && + newEntry.Meta.TableBacking.DiskFileNum == ve.DeletedTables[table].TableBacking.DiskFileNum { + c.annotations = append(c.annotations, + fmt.Sprintf("(excised: %s)", ve.DeletedTables[table].TableNum)) + break + } + } + + } + + // Refresh the disk available statistic whenever a compaction/flush + // completes, before re-acquiring the mutex. + d.calculateDiskAvailableBytes() + return ve, stats, nil +} + +func (d *DB) runMoveCompaction( + jobID JobID, c *tableCompaction, +) (ve *manifest.VersionEdit, stats compact.Stats, _ error) { + iter := c.startLevel.files.Iter() + meta := iter.First() + if iter.Next() != nil { + return nil, stats, base.AssertionFailedf("got more than one file for a move compaction") + } + if c.cancel.Load() { + return ve, stats, ErrCancelledCompaction + } + outputMetrics := c.metrics.perLevel.level(c.outputLevel.level) + outputMetrics.TableBytesMoved = meta.Size + outputMetrics.TablesMoved = 1 + ve = &manifest.VersionEdit{ + DeletedTables: map[manifest.DeletedTableEntry]*manifest.TableMetadata{ + {Level: c.startLevel.level, FileNum: meta.TableNum}: meta, + }, + NewTables: []manifest.NewTableEntry{ + {Level: c.outputLevel.level, Meta: meta}, + }, + } + + return ve, stats, nil +} + +// runCompaction runs a compaction that produces new on-disk tables from +// memtables or old on-disk tables. +// +// runCompaction cannot be used for compactionKindIngestedFlushable. +// +// d.mu must be held when calling this, but the mutex may be dropped and +// re-acquired during the course of this method. +func (d *DB) runCompaction( + jobID JobID, c *tableCompaction, +) (ve *manifest.VersionEdit, stats compact.Stats, retErr error) { + if c.cancel.Load() { + return ve, stats, ErrCancelledCompaction + } + switch c.kind { + case compactionKindDeleteOnly: + // Release the d.mu lock while doing I/O. + // Note the unusual order: Unlock and then Lock. 
+ snapshots := d.mu.snapshots.toSlice() + d.mu.Unlock() + defer d.mu.Lock() + return d.runDeleteOnlyCompaction(jobID, c, snapshots) + case compactionKindMove: + return d.runMoveCompaction(jobID, c) + case compactionKindCopy: + return d.runCopyCompaction(jobID, c) + case compactionKindIngestedFlushable: + panic("pebble: runCompaction cannot handle compactionKindIngestedFlushable.") + } + + snapshots := d.mu.snapshots.toSlice() + + // Release the d.mu lock while doing I/O. + // Note the unusual order: Unlock and then Lock. + d.mu.Unlock() + defer d.mu.Lock() + + // Determine whether we should separate values into blob files. + // + // TODO(jackson): Currently we never separate values in non-tests. Choose + // and initialize the appropriate ValueSeparation implementation based on + // Options and the compaction inputs. + valueSeparation := c.getValueSeparation(jobID, c, c.tableFormat) + + result := d.compactAndWrite(jobID, c, snapshots, c.tableFormat, valueSeparation) + if result.Err == nil { + ve, result.Err = c.makeVersionEdit(result) + } + if result.Err != nil { + // Delete any created tables or blob files. + obsoleteFiles := manifest.ObsoleteFiles{ + TableBackings: make([]*manifest.TableBacking, 0, len(result.Tables)), + BlobFiles: make([]*manifest.PhysicalBlobFile, 0, len(result.Blobs)), + } + d.mu.Lock() + for i := range result.Tables { + backing := &manifest.TableBacking{ + DiskFileNum: result.Tables[i].ObjMeta.DiskFileNum, + Size: result.Tables[i].WriterMeta.Size, + } + obsoleteFiles.AddBacking(backing) + // Add this file to zombie tables as well, as the versionSet + // asserts on whether every obsolete file was at one point + // marked zombie. + d.mu.versions.zombieTables.AddMetadata(&result.Tables[i].ObjMeta, backing.Size) + } + for i := range result.Blobs { + obsoleteFiles.AddBlob(result.Blobs[i].Metadata) + // Add this file to zombie blobs as well, as the versionSet + // asserts on whether every obsolete file was at one point + // marked zombie. 
+ d.mu.versions.zombieBlobs.AddMetadata(&result.Blobs[i].ObjMeta, result.Blobs[i].Metadata.Size) + } + d.mu.versions.addObsoleteLocked(obsoleteFiles) + d.mu.Unlock() + } + // Refresh the disk available statistic whenever a compaction/flush + // completes, before re-acquiring the mutex. + d.calculateDiskAvailableBytes() + return ve, result.Stats, result.Err +} + +// compactAndWrite runs the data part of a compaction, where we set up a +// compaction iterator and use it to write output tables. +func (d *DB) compactAndWrite( + jobID JobID, + c *tableCompaction, + snapshots compact.Snapshots, + tableFormat sstable.TableFormat, + valueSeparation compact.ValueSeparation, +) (result compact.Result) { + // Compactions use a pool of buffers to read blocks, avoiding polluting the + // block cache with blocks that will not be read again. We initialize the + // buffer pool with a size 12. This initial size does not need to be + // accurate, because the pool will grow to accommodate the maximum number of + // blocks allocated at a given time over the course of the compaction. But + // choosing a size larger than that working set avoids any additional + // allocations to grow the size of the pool over the course of iteration. + // + // Justification for initial size 12: In a two-level compaction, at any + // given moment we'll have 2 index blocks in-use and 2 data blocks in-use. + // Additionally, when decoding a compressed block, we'll temporarily + // allocate 1 additional block to hold the compressed buffer. In the worst + // case that all input sstables have two-level index blocks (+2), value + // blocks (+2), range deletion blocks (+n) and range key blocks (+n), we'll + // additionally require 2n+4 blocks where n is the number of input sstables. + // Range deletion and range key blocks are relatively rare, and the cost of + // an additional allocation or two over the course of the compaction is + // considered to be okay. 
A larger initial size would cause the pool to hold + // on to more memory, even when it's not in-use because the pool will + // recycle buffers up to the current capacity of the pool. The memory use of + // a 12-buffer pool is expected to be within reason, even if all the buffers + // grow to the typical size of an index block (256 KiB) which would + // translate to 3 MiB per compaction. + c.iterationState.bufferPool.Init(12) + defer c.iterationState.bufferPool.Release() + blockReadEnv := block.ReadEnv{ + BufferPool: &c.iterationState.bufferPool, + Stats: &c.metrics.internalIterStats, + IterStats: d.fileCache.SSTStatsCollector().Accumulator( + uint64(uintptr(unsafe.Pointer(c))), + categoryCompaction, + ), + } + if c.version != nil { + c.iterationState.valueFetcher.Init(&c.version.BlobFiles, d.fileCache, blockReadEnv) + } + iiopts := internalIterOpts{ + compaction: true, + readEnv: sstable.ReadEnv{Block: blockReadEnv}, + blobValueFetcher: &c.iterationState.valueFetcher, + } + defer func() { _ = c.iterationState.valueFetcher.Close() }() + + pointIter, rangeDelIter, rangeKeyIter, err := c.newInputIters(d.newIters, iiopts) + defer func() { + for _, closer := range c.iterationState.keyspanIterClosers { + closer.FragmentIterator.Close() + } + }() + if err != nil { + return compact.Result{Err: err} + } + cfg := compact.IterConfig{ + Comparer: c.comparer, + Merge: d.merge, + TombstoneElision: c.delElision, + RangeKeyElision: c.rangeKeyElision, + Snapshots: snapshots, + IsBottommostDataLayer: c.isBottommostDataLayer(), + IneffectualSingleDeleteCallback: func(userKey []byte) { + d.opts.EventListener.PossibleAPIMisuse(PossibleAPIMisuseInfo{ + Kind: IneffectualSingleDelete, + UserKey: slices.Clone(userKey), + }) + }, + NondeterministicSingleDeleteCallback: func(userKey []byte) { + d.opts.EventListener.PossibleAPIMisuse(PossibleAPIMisuseInfo{ + Kind: NondeterministicSingleDelete, + UserKey: slices.Clone(userKey), + }) + }, + MissizedDeleteCallback: func(userKey []byte, 
elidedSize, expectedSize uint64) { + d.opts.EventListener.PossibleAPIMisuse(PossibleAPIMisuseInfo{ + Kind: MissizedDelete, + UserKey: slices.Clone(userKey), + ExtraInfo: redact.Sprintf("elidedSize=%d,expectedSize=%d", + redact.SafeUint(elidedSize), redact.SafeUint(expectedSize)), + }) + }, + } + iter := compact.NewIter(cfg, pointIter, rangeDelIter, rangeKeyIter) + + runnerCfg := compact.RunnerConfig{ + CompactionBounds: c.bounds, + L0SplitKeys: c.flush.l0Limits, + Grandparents: c.grandparents, + MaxGrandparentOverlapBytes: c.maxOverlapBytes, + TargetOutputFileSize: c.maxOutputFileSize, + GrantHandle: c.grantHandle, + } + runner := compact.NewRunner(runnerCfg, iter) + + var spanPolicyValid bool + var spanPolicy SpanPolicy + // If spanPolicyValid is true and spanPolicyEndKey is empty, then spanPolicy + // applies for the rest of the keyspace. + var spanPolicyEndKey []byte + + for runner.MoreDataToWrite() { + if c.cancel.Load() { + return runner.Finish().WithError(ErrCancelledCompaction) + } + // Create a new table. 
+ firstKey := runner.FirstKey() + if !spanPolicyValid || (len(spanPolicyEndKey) > 0 && d.cmp(firstKey, spanPolicyEndKey) >= 0) { + var err error + spanPolicy, spanPolicyEndKey, err = d.opts.Experimental.SpanPolicyFunc(firstKey) + if err != nil { + return runner.Finish().WithError(err) + } + spanPolicyValid = true + } + + writerOpts := d.opts.MakeWriterOptions(c.outputLevel.level, tableFormat) + if spanPolicy.DisableValueSeparationBySuffix { + writerOpts.DisableValueBlocks = true + } + if spanPolicy.PreferFastCompression && writerOpts.Compression != block.NoCompression { + writerOpts.Compression = block.FastestCompression + } + vSep := valueSeparation + if spanPolicy.ValueStoragePolicy == ValueStorageLowReadLatency { + vSep = compact.NeverSeparateValues{} + } + objMeta, tw, err := d.newCompactionOutputTable(jobID, c, writerOpts) + if err != nil { + return runner.Finish().WithError(err) + } + runner.WriteTable(objMeta, tw, spanPolicyEndKey, vSep) + } + result = runner.Finish() + if result.Err == nil { + result.Err = d.objProvider.Sync() + } + return result +} + +// makeVersionEdit creates the version edit for a compaction, based on the +// tables in compact.Result. +func (c *tableCompaction) makeVersionEdit(result compact.Result) (*manifest.VersionEdit, error) { + ve := &manifest.VersionEdit{ + DeletedTables: map[manifest.DeletedTableEntry]*manifest.TableMetadata{}, + } + for _, cl := range c.inputs { + for f := range cl.files.All() { + ve.DeletedTables[manifest.DeletedTableEntry{ + Level: cl.level, + FileNum: f.TableNum, + }] = f + } + } + // Add any newly constructed blob files to the version edit. 
+ ve.NewBlobFiles = make([]manifest.BlobFileMetadata, len(result.Blobs)) + for i := range result.Blobs { + ve.NewBlobFiles[i] = manifest.BlobFileMetadata{ + FileID: base.BlobFileID(result.Blobs[i].Metadata.FileNum), + Physical: result.Blobs[i].Metadata, + } + } + + startLevelBytes := c.startLevel.files.TableSizeSum() + + outputMetrics := c.metrics.perLevel.level(c.outputLevel.level) + outputMetrics.TableBytesIn = startLevelBytes + // TODO(jackson): This BytesRead value does not include any blob files + // written. It either should, or we should add a separate metric. + outputMetrics.TableBytesRead = c.outputLevel.files.TableSizeSum() + outputMetrics.BlobBytesCompacted = result.Stats.CumulativeBlobFileSize + if c.flush.flushables != nil { + outputMetrics.BlobBytesFlushed = result.Stats.CumulativeBlobFileSize + } + if len(c.extraLevels) > 0 { + outputMetrics.TableBytesIn += c.extraLevels[0].files.TableSizeSum() + } + outputMetrics.TableBytesRead += outputMetrics.TableBytesIn + + if len(c.flush.flushables) == 0 { + c.metrics.perLevel.level(c.startLevel.level) + } + if len(c.extraLevels) > 0 { + c.metrics.perLevel.level(c.extraLevels[0].level) + outputMetrics.MultiLevel.TableBytesInTop = startLevelBytes + outputMetrics.MultiLevel.TableBytesIn = outputMetrics.TableBytesIn + outputMetrics.MultiLevel.TableBytesRead = outputMetrics.TableBytesRead + } + + inputLargestSeqNumAbsolute := c.inputLargestSeqNumAbsolute() + ve.NewTables = make([]manifest.NewTableEntry, len(result.Tables)) + for i := range result.Tables { + t := &result.Tables[i] + + if t.WriterMeta.Properties.NumValuesInBlobFiles > 0 { + if len(t.BlobReferences) == 0 { + return nil, base.AssertionFailedf("num values in blob files %d but no blob references", + t.WriterMeta.Properties.NumValuesInBlobFiles) + } + } + + fileMeta := &manifest.TableMetadata{ + TableNum: base.PhysicalTableFileNum(t.ObjMeta.DiskFileNum), + CreationTime: t.CreationTime.Unix(), + Size: t.WriterMeta.Size, + SmallestSeqNum: 
t.WriterMeta.SmallestSeqNum, + LargestSeqNum: t.WriterMeta.LargestSeqNum, + BlobReferences: t.BlobReferences, + BlobReferenceDepth: t.BlobReferenceDepth, + } + if c.flush.flushables == nil { + // Set the file's LargestSeqNumAbsolute to be the maximum value of any + // of the compaction's input sstables. + // TODO(jackson): This could be narrowed to be the maximum of input + // sstables that overlap the output sstable's key range. + fileMeta.LargestSeqNumAbsolute = inputLargestSeqNumAbsolute + } else { + fileMeta.LargestSeqNumAbsolute = t.WriterMeta.LargestSeqNum + } + fileMeta.InitPhysicalBacking() + + // If the file didn't contain any range deletions, we can fill its + // table stats now, avoiding unnecessarily loading the table later. + maybeSetStatsFromProperties( + fileMeta.PhysicalMeta(), &t.WriterMeta.Properties.CommonProperties, c.logger, + ) + + if t.WriterMeta.HasPointKeys { + fileMeta.ExtendPointKeyBounds(c.comparer.Compare, + t.WriterMeta.SmallestPoint, + t.WriterMeta.LargestPoint) + } + if t.WriterMeta.HasRangeDelKeys { + fileMeta.ExtendPointKeyBounds(c.comparer.Compare, + t.WriterMeta.SmallestRangeDel, + t.WriterMeta.LargestRangeDel) + } + if t.WriterMeta.HasRangeKeys { + fileMeta.ExtendRangeKeyBounds(c.comparer.Compare, + t.WriterMeta.SmallestRangeKey, + t.WriterMeta.LargestRangeKey) + } + + ve.NewTables[i] = manifest.NewTableEntry{ + Level: c.outputLevel.level, + Meta: fileMeta, + } + + // Update metrics. 
+ if c.flush.flushables == nil { + outputMetrics.TablesCompacted++ + outputMetrics.TableBytesCompacted += fileMeta.Size + } else { + outputMetrics.TablesFlushed++ + outputMetrics.TableBytesFlushed += fileMeta.Size + } + outputMetrics.EstimatedReferencesSize += fileMeta.EstimatedReferenceSize() + outputMetrics.BlobBytesReadEstimate += fileMeta.EstimatedReferenceSize() + outputMetrics.TablesSize += int64(fileMeta.Size) + outputMetrics.TablesCount++ + outputMetrics.Additional.BytesWrittenDataBlocks += t.WriterMeta.Properties.DataSize + outputMetrics.Additional.BytesWrittenValueBlocks += t.WriterMeta.Properties.ValueBlocksSize + } + + // Sanity check that the tables are ordered and don't overlap. + for i := 1; i < len(ve.NewTables); i++ { + if ve.NewTables[i-1].Meta.Largest().IsUpperBoundFor(c.comparer.Compare, ve.NewTables[i].Meta.Smallest().UserKey) { + return nil, base.AssertionFailedf("pebble: compaction output tables overlap: %s and %s", + ve.NewTables[i-1].Meta.DebugString(c.comparer.FormatKey, true), + ve.NewTables[i].Meta.DebugString(c.comparer.FormatKey, true), + ) + } + } + + return ve, nil +} + +// newCompactionOutputTable creates an object for a new table produced by a +// compaction or flush. 
+func (d *DB) newCompactionOutputTable( + jobID JobID, c *tableCompaction, writerOpts sstable.WriterOptions, +) (objstorage.ObjectMetadata, sstable.RawWriter, error) { + writable, objMeta, err := d.newCompactionOutputObj( + base.FileTypeTable, c.kind, c.outputLevel.level, &c.metrics.bytesWritten, c.objCreateOpts) + if err != nil { + return objstorage.ObjectMetadata{}, nil, err + } + d.opts.EventListener.TableCreated(TableCreateInfo{ + JobID: int(jobID), + Reason: c.kind.compactingOrFlushing(), + Path: d.objProvider.Path(objMeta), + FileNum: objMeta.DiskFileNum, + }) + writerOpts.SetInternal(sstableinternal.WriterOptions{ + CacheOpts: sstableinternal.CacheOptions{ + CacheHandle: d.cacheHandle, + FileNum: objMeta.DiskFileNum, + }, + }) + tw := sstable.NewRawWriterWithCPUMeasurer(writable, writerOpts, c.grantHandle) + return objMeta, tw, nil +} + +// newCompactionOutputBlob creates an object for a new blob produced by a +// compaction or flush. +func (d *DB) newCompactionOutputBlob( + jobID JobID, + kind compactionKind, + outputLevel int, + bytesWritten *atomic.Int64, + opts objstorage.CreateOptions, +) (objstorage.Writable, objstorage.ObjectMetadata, error) { + writable, objMeta, err := d.newCompactionOutputObj(base.FileTypeBlob, kind, outputLevel, bytesWritten, opts) + if err != nil { + return nil, objstorage.ObjectMetadata{}, err + } + d.opts.EventListener.BlobFileCreated(BlobFileCreateInfo{ + JobID: int(jobID), + Reason: kind.compactingOrFlushing(), + Path: d.objProvider.Path(objMeta), + FileNum: objMeta.DiskFileNum, + }) + return writable, objMeta, nil +} + +// newCompactionOutputObj creates an object produced by a compaction or flush. 
+func (d *DB) newCompactionOutputObj( + typ base.FileType, + kind compactionKind, + outputLevel int, + bytesWritten *atomic.Int64, + opts objstorage.CreateOptions, +) (objstorage.Writable, objstorage.ObjectMetadata, error) { + diskFileNum := d.mu.versions.getNextDiskFileNum() + ctx := context.TODO() + + if objiotracing.Enabled { + ctx = objiotracing.WithLevel(ctx, outputLevel) + if kind == compactionKindFlush { + ctx = objiotracing.WithReason(ctx, objiotracing.ForFlush) + } else { + ctx = objiotracing.WithReason(ctx, objiotracing.ForCompaction) + } + } + + writable, objMeta, err := d.objProvider.Create(ctx, typ, diskFileNum, opts) + if err != nil { + return nil, objstorage.ObjectMetadata{}, err + } + + if kind != compactionKindFlush { + writable = &compactionWritable{ + Writable: writable, + versions: d.mu.versions, + written: bytesWritten, + } + } + return writable, objMeta, nil +} + +// validateVersionEdit validates that start and end keys across new and deleted +// files in a versionEdit pass the given validation function. +func validateVersionEdit( + ve *manifest.VersionEdit, vk base.ValidateKey, format base.FormatKey, logger Logger, +) { + validateKey := func(f *manifest.TableMetadata, key []byte) { + if err := vk.Validate(key); err != nil { + logger.Fatalf("pebble: version edit validation failed (key=%s file=%s): %v", format(key), f, err) + } + } + + // Validate both new and deleted files. + for _, f := range ve.NewTables { + validateKey(f.Meta, f.Meta.Smallest().UserKey) + validateKey(f.Meta, f.Meta.Largest().UserKey) + } + for _, m := range ve.DeletedTables { + validateKey(m, m.Smallest().UserKey) + validateKey(m, m.Largest().UserKey) + } +} + +func getDiskWriteCategoryForCompaction(opts *Options, kind compactionKind) vfs.DiskWriteCategory { + if opts.EnableSQLRowSpillMetrics { + // In the scenario that the Pebble engine is used for SQL row spills the + // data written to the memtable will correspond to spills to disk and + // should be categorized as such. 
+ return "sql-row-spill" + } else if kind == compactionKindFlush { + return "pebble-memtable-flush" + } else if kind == compactionKindBlobFileRewrite { + return "pebble-blob-file-rewrite" + } else { + return "pebble-compaction" + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/compaction_picker.go b/vendor/github.com/cockroachdb/pebble/v2/compaction_picker.go new file mode 100644 index 0000000..668dd1d --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/compaction_picker.go @@ -0,0 +1,2223 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "bytes" + "cmp" + "fmt" + "iter" + "math" + "slices" + "sort" + "strings" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/humanize" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/problemspans" +) + +// The minimum count for an intra-L0 compaction. This matches the RocksDB +// heuristic. +const minIntraL0Count = 4 + +type compactionEnv struct { + // diskAvailBytes holds a statistic on the number of bytes available on + // disk, as reported by the filesystem. It's used to be more restrictive in + // expanding compactions if available disk space is limited. + // + // The cached value (d.diskAvailBytes) is updated whenever a file is deleted + // and whenever a compaction or flush completes. Since file removal is the + // primary means of reclaiming space, there is a rough bound on the + // statistic's staleness when available bytes is growing. Compactions and + // flushes are longer, slower operations and provide a much looser bound + // when available bytes is decreasing. 
+ diskAvailBytes uint64 + earliestUnflushedSeqNum base.SeqNum + earliestSnapshotSeqNum base.SeqNum + inProgressCompactions []compactionInfo + readCompactionEnv readCompactionEnv + // problemSpans is checked by the compaction picker to avoid compactions that + // overlap an active "problem span". It can be nil when there are no problem + // spans. + problemSpans *problemspans.ByLevel +} + +type compactionPickerMetrics struct { + levels [numLevels]struct { + score float64 + fillFactor float64 + compensatedFillFactor float64 + } +} + +type compactionPicker interface { + getMetrics([]compactionInfo) compactionPickerMetrics + getBaseLevel() int + estimatedCompactionDebt() uint64 + pickAutoScore(env compactionEnv) (pc pickedCompaction) + pickAutoNonScore(env compactionEnv) (pc pickedCompaction) + forceBaseLevel1() +} + +// A pickedCompaction describes a potential compaction that the compaction +// picker has selected, based on its heuristics. When a compaction begins to +// execute, it is converted into a compaction struct by ConstructCompaction. +type pickedCompaction interface { + // ManualID returns the ID of the manual compaction, or 0 if the picked + // compaction is not a result of a manual compaction. + ManualID() uint64 + // ConstructCompaction creates a compaction from the picked compaction. + ConstructCompaction(*DB, CompactionGrantHandle) compaction + // WaitingCompaction returns a WaitingCompaction description of this + // compaction for consumption by the compaction scheduler. + WaitingCompaction() WaitingCompaction +} + +// readCompactionEnv is used to hold data required to perform read compactions +type readCompactionEnv struct { + rescheduleReadCompaction *bool + readCompactions *readCompactionQueue + flushing bool +} + +// Information about in-progress compactions provided to the compaction picker. +// These are used to constrain the new compactions that will be picked. 
+type compactionInfo struct { + // versionEditApplied is true if this compaction's version edit has already + // been committed. The compaction may still be in-progress deleting newly + // obsolete files. + versionEditApplied bool + // kind indicates the kind of compaction. + kind compactionKind + inputs []compactionLevel + outputLevel int + // bounds may be nil if the compaction does not involve sstables + // (specifically, a blob file rewrite). + bounds *base.UserKeyBounds +} + +func (info compactionInfo) String() string { + var buf bytes.Buffer + var largest int + for i, in := range info.inputs { + if i > 0 { + fmt.Fprintf(&buf, " -> ") + } + fmt.Fprintf(&buf, "L%d", in.level) + for f := range in.files.All() { + fmt.Fprintf(&buf, " %s", f.TableNum) + } + if largest < in.level { + largest = in.level + } + } + if largest != info.outputLevel || len(info.inputs) == 1 { + fmt.Fprintf(&buf, " -> L%d", info.outputLevel) + } + return buf.String() +} + +// sublevelInfo is used to tag a LevelSlice for an L0 sublevel with the +// sublevel. +type sublevelInfo struct { + manifest.LevelSlice + sublevel manifest.Layer +} + +func (cl sublevelInfo) Clone() sublevelInfo { + return sublevelInfo{ + sublevel: cl.sublevel, + LevelSlice: cl.LevelSlice, + } +} +func (cl sublevelInfo) String() string { + return fmt.Sprintf(`Sublevel %s; Levels %s`, cl.sublevel, cl.LevelSlice) +} + +// generateSublevelInfo will generate the level slices for each of the sublevels +// from the level slice for all of L0. 
+func generateSublevelInfo(cmp base.Compare, levelFiles manifest.LevelSlice) []sublevelInfo { + sublevelMap := make(map[uint64][]*manifest.TableMetadata) + for f := range levelFiles.All() { + sublevelMap[uint64(f.SubLevel)] = append(sublevelMap[uint64(f.SubLevel)], f) + } + + var sublevels []int + for level := range sublevelMap { + sublevels = append(sublevels, int(level)) + } + sort.Ints(sublevels) + + var levelSlices []sublevelInfo + for _, sublevel := range sublevels { + metas := sublevelMap[uint64(sublevel)] + levelSlices = append( + levelSlices, + sublevelInfo{ + manifest.NewLevelSliceKeySorted(cmp, metas), + manifest.L0Sublevel(sublevel), + }, + ) + } + return levelSlices +} + +// pickedCompactionMetrics holds metrics related to the compaction picking process +type pickedCompactionMetrics struct { + // scores contains candidateLevelInfo.scores. + scores []float64 + singleLevelOverlappingRatio float64 + multiLevelOverlappingRatio float64 +} + +// pickedTableCompaction contains information about a compaction of sstables +// that has already been chosen, and is being constructed. Compaction +// construction info lives in this struct, and is copied over into the +// compaction struct in constructCompaction. +type pickedTableCompaction struct { + // score of the chosen compaction (candidateLevelInfo.score). + score float64 + // kind indicates the kind of compaction. + kind compactionKind + // manualID > 0 iff this is a manual compaction. It exists solely for + // internal bookkeeping. + manualID uint64 + // startLevel is the level that is being compacted. Inputs from startLevel + // and outputLevel will be merged to produce a set of outputLevel files. + startLevel *compactionLevel + // outputLevel is the level that files are being produced in. outputLevel is + // equal to startLevel+1 except when: + // - if startLevel is 0, the output level equals compactionPicker.baseLevel(). 
+ // - in multilevel compaction, the output level is the lowest level involved in + // the compaction + outputLevel *compactionLevel + // inputs contain levels involved in the compaction in ascending order + inputs []compactionLevel + // LBase at the time of compaction picking. Might be uninitialized for + // intra-L0 compactions. + baseLevel int + // L0-specific compaction info. Set to a non-nil value for all compactions + // where startLevel == 0 that were generated by L0Sublevels. + lcf *manifest.L0CompactionFiles + // maxOutputFileSize is the maximum size of an individual table created + // during compaction. + maxOutputFileSize uint64 + // maxOverlapBytes is the maximum number of bytes of overlap allowed for a + // single output table with the tables in the grandparent level. + maxOverlapBytes uint64 + // maxReadCompactionBytes is the maximum bytes a read compaction is allowed to + // overlap in its output level with. If the overlap is greater than + // maxReadCompaction bytes, then we don't proceed with the compaction. + maxReadCompactionBytes uint64 + + // The boundaries of the input data. + bounds base.UserKeyBounds + version *manifest.Version + l0Organizer *manifest.L0Organizer + pickerMetrics pickedCompactionMetrics +} + +// Assert that *pickedTableCompaction implements pickedCompaction. +var _ pickedCompaction = (*pickedTableCompaction)(nil) + +// ManualID returns the ID of the manual compaction, or 0 if the picked +// compaction is not a result of a manual compaction. +func (pc *pickedTableCompaction) ManualID() uint64 { return pc.manualID } + +// Kind returns the kind of compaction. +func (pc *pickedTableCompaction) Kind() compactionKind { return pc.kind } + +// Score returns the score of the level at the time the compaction was picked. +func (pc *pickedTableCompaction) Score() float64 { return pc.score } + +// ConstructCompaction creates a compaction struct from the +// pickedTableCompaction. 
+func (pc *pickedTableCompaction) ConstructCompaction( + d *DB, grantHandle CompactionGrantHandle, +) compaction { + return newCompaction( + pc, + d.opts, + d.timeNow(), + d.ObjProvider(), + grantHandle, + d.TableFormat(), + d.determineCompactionValueSeparation) +} + +// WaitingCompaction returns a WaitingCompaction description of this compaction +// for consumption by the compaction scheduler. +func (pc *pickedTableCompaction) WaitingCompaction() WaitingCompaction { + if pc.manualID > 0 { + return WaitingCompaction{Priority: manualCompactionPriority, Score: pc.score} + } + entry, ok := scheduledCompactionMap[pc.kind] + if !ok { + panic(errors.AssertionFailedf("unexpected compactionKind %s", pc.kind)) + } + return WaitingCompaction{ + Optional: entry.optional, + Priority: entry.priority, + Score: pc.score, + } +} + +func defaultOutputLevel(startLevel, baseLevel int) int { + outputLevel := startLevel + 1 + if startLevel == 0 { + outputLevel = baseLevel + } + if outputLevel >= numLevels-1 { + outputLevel = numLevels - 1 + } + return outputLevel +} + +func newPickedTableCompaction( + opts *Options, + cur *manifest.Version, + l0Organizer *manifest.L0Organizer, + startLevel, outputLevel, baseLevel int, +) *pickedTableCompaction { + if outputLevel > 0 && baseLevel == 0 { + panic("base level cannot be 0") + } + if startLevel > 0 && startLevel < baseLevel { + panic(fmt.Sprintf("invalid compaction: start level %d should not be empty (base level %d)", + startLevel, baseLevel)) + } + + targetFileSize := opts.TargetFileSize(outputLevel, baseLevel) + pc := &pickedTableCompaction{ + version: cur, + l0Organizer: l0Organizer, + baseLevel: baseLevel, + inputs: []compactionLevel{{level: startLevel}, {level: outputLevel}}, + maxOutputFileSize: uint64(targetFileSize), + maxOverlapBytes: maxGrandparentOverlapBytes(targetFileSize), + maxReadCompactionBytes: maxReadCompactionBytes(targetFileSize), + } + pc.startLevel = &pc.inputs[0] + pc.outputLevel = &pc.inputs[1] + return pc +} + +// 
adjustedOutputLevel is the output level used for the purpose of +// determining the target output file size, overlap bytes, and expanded +// bytes, taking into account the base level. +func adjustedOutputLevel(outputLevel int, baseLevel int) int { + if outputLevel == 0 { + return 0 + } + if baseLevel == 0 { + panic("base level cannot be 0") + } + // Output level is in the range [baseLevel, numLevels). For the purpose of + // determining the target output file size, overlap bytes, and expanded + // bytes, we want to adjust the range to [1, numLevels). + return 1 + outputLevel - baseLevel +} + +func newPickedCompactionFromL0( + lcf *manifest.L0CompactionFiles, + opts *Options, + vers *manifest.Version, + l0Organizer *manifest.L0Organizer, + baseLevel int, + isBase bool, +) *pickedTableCompaction { + outputLevel := baseLevel + if !isBase { + outputLevel = 0 // Intra L0 + } + + pc := newPickedTableCompaction(opts, vers, l0Organizer, 0, outputLevel, baseLevel) + pc.lcf = lcf + pc.outputLevel.level = outputLevel + + // Manually build the compaction as opposed to calling + // pickAutoHelper. This is because L0Sublevels has already added + // any overlapping L0 SSTables that need to be added, and + // because compactions built by L0SSTables do not necessarily + // pick contiguous sequences of files in pc.version.Levels[0]. 
+ pc.startLevel.files = manifest.NewLevelSliceSeqSorted(lcf.Files) + return pc +} + +func (pc *pickedTableCompaction) String() string { + var builder strings.Builder + builder.WriteString(fmt.Sprintf(`Score=%f, `, pc.score)) + builder.WriteString(fmt.Sprintf(`Kind=%s, `, pc.kind)) + builder.WriteString(fmt.Sprintf(`AdjustedOutputLevel=%d, `, adjustedOutputLevel(pc.outputLevel.level, pc.baseLevel))) + builder.WriteString(fmt.Sprintf(`maxOutputFileSize=%d, `, pc.maxOutputFileSize)) + builder.WriteString(fmt.Sprintf(`maxReadCompactionBytes=%d, `, pc.maxReadCompactionBytes)) + builder.WriteString(fmt.Sprintf(`bounds=%s, `, pc.bounds)) + builder.WriteString(fmt.Sprintf(`version=%s, `, pc.version)) + builder.WriteString(fmt.Sprintf(`inputs=%s, `, pc.inputs)) + builder.WriteString(fmt.Sprintf(`startlevel=%s, `, pc.startLevel)) + builder.WriteString(fmt.Sprintf(`outputLevel=%s, `, pc.outputLevel)) + builder.WriteString(fmt.Sprintf(`l0SublevelInfo=%s, `, pc.startLevel.l0SublevelInfo)) + builder.WriteString(fmt.Sprintf(`lcf=%s`, pc.lcf)) + return builder.String() +} + +// Clone creates a deep copy of the pickedCompaction +func (pc *pickedTableCompaction) clone() *pickedTableCompaction { + + // Quickly copy over fields that do not require special deep copy care, and + // set all fields that will require a deep copy to nil. + newPC := &pickedTableCompaction{ + score: pc.score, + kind: pc.kind, + baseLevel: pc.baseLevel, + maxOutputFileSize: pc.maxOutputFileSize, + maxOverlapBytes: pc.maxOverlapBytes, + maxReadCompactionBytes: pc.maxReadCompactionBytes, + bounds: pc.bounds.Clone(), + + // TODO(msbutler): properly clone picker metrics + pickerMetrics: pc.pickerMetrics, + + // Both copies see the same manifest, therefore, it's ok for them to share + // the same pc.version and pc.l0Organizer. 
+ version: pc.version,
+ l0Organizer: pc.l0Organizer,
+ }
+
+ newPC.inputs = make([]compactionLevel, len(pc.inputs))
+ for i := range pc.inputs {
+ newPC.inputs[i] = pc.inputs[i].Clone()
+ if i == 0 {
+ newPC.startLevel = &newPC.inputs[i]
+ } else if i == len(pc.inputs)-1 {
+ newPC.outputLevel = &newPC.inputs[i]
+ }
+ }
+
+ if len(pc.startLevel.l0SublevelInfo) > 0 {
+ newPC.startLevel.l0SublevelInfo = make([]sublevelInfo, len(pc.startLevel.l0SublevelInfo))
+ for i := range pc.startLevel.l0SublevelInfo {
+ newPC.startLevel.l0SublevelInfo[i] = pc.startLevel.l0SublevelInfo[i].Clone()
+ }
+ }
+ if pc.lcf != nil {
+ newPC.lcf = pc.lcf.Clone()
+ }
+ return newPC
+}
+
+// setupInputs returns true if a compaction has been set up using the provided inputLevel and
+// pc.outputLevel. It returns false if a concurrent compaction is occurring on the start or
+// output level files. Note that inputLevel is not necessarily pc.startLevel. In multiLevel
+// compactions, inputs are set by calling setupInputs once for each adjacent pair of levels.
+// This will preserve level invariants when expanding the compaction. pc.bounds
+// will be updated to reflect the key range of the inputs.
+func (pc *pickedTableCompaction) setupInputs(
+ opts *Options,
+ diskAvailBytes uint64,
+ inputLevel *compactionLevel,
+ problemSpans *problemspans.ByLevel,
+) bool {
+ cmp := opts.Comparer.Compare
+ if !canCompactTables(inputLevel.files, inputLevel.level, problemSpans) {
+ return false
+ }
+ pc.bounds = manifest.ExtendKeyRange(cmp, pc.bounds, inputLevel.files.All())
+
+ // Setup output files and attempt to grow the inputLevel files with
+ // the expanded key range. No need to do this for intra-L0 compactions;
+ // outputLevel.files is left empty for those.
+ if inputLevel.level != pc.outputLevel.level {
+ // Determine the sstables in the output level which overlap with the compaction
+ // key range. 
+ pc.outputLevel.files = pc.version.Overlaps(pc.outputLevel.level, pc.bounds)
+ if !canCompactTables(pc.outputLevel.files, pc.outputLevel.level, problemSpans) {
+ return false
+ }
+ pc.bounds = manifest.ExtendKeyRange(cmp, pc.bounds, pc.outputLevel.files.All())
+
+ // maxExpandedBytes is the maximum size of an expanded compaction. If
+ // growing a compaction results in a larger size, the original compaction
+ // is used instead.
+ targetFileSize := opts.TargetFileSize(pc.outputLevel.level, pc.baseLevel)
+ maxExpandedBytes := expandedCompactionByteSizeLimit(opts, targetFileSize, diskAvailBytes)
+
+ // Grow the sstables in inputLevel.level as long as it doesn't affect the number
+ // of sstables included from pc.outputLevel.level.
+ if pc.lcf != nil && inputLevel.level == 0 {
+ pc.growL0ForBase(cmp, maxExpandedBytes)
+ } else if pc.grow(cmp, pc.bounds, maxExpandedBytes, inputLevel, problemSpans) {
+ // inputLevel was expanded, adjust key range if necessary.
+ pc.bounds = manifest.ExtendKeyRange(cmp, pc.bounds, inputLevel.files.All())
+ }
+ }
+
+ if inputLevel.level == 0 {
+ // If L0 is involved, it should always be the startLevel of the compaction.
+ pc.startLevel.l0SublevelInfo = generateSublevelInfo(cmp, pc.startLevel.files)
+ }
+ return true
+}
+
+// grow grows the number of inputs at startLevel without changing the number of
+// pc.outputLevel files in the compaction, and returns whether the inputs grew.
+// bounds is the key range that the expanded inputs are selected to overlap.
+func (pc *pickedTableCompaction) grow(
+ cmp base.Compare,
+ bounds base.UserKeyBounds,
+ maxExpandedBytes uint64,
+ inputLevel *compactionLevel,
+ problemSpans *problemspans.ByLevel,
+) bool {
+ if pc.outputLevel.files.Empty() {
+ return false
+ }
+ expandedInputLevel := pc.version.Overlaps(inputLevel.level, bounds)
+ if !canCompactTables(expandedInputLevel, inputLevel.level, problemSpans) {
+ return false
+ }
+ if expandedInputLevel.Len() <= inputLevel.files.Len() {
+ return false
+ }
+ if expandedInputLevel.AggregateSizeSum()+pc.outputLevel.files.AggregateSizeSum() >= maxExpandedBytes {
+ return false
+ }
+ // Check that expanding the input level does not change the number of overlapping files in output level.
+ // We need to include the outputLevel iter because without it, in a multiLevel scenario,
+ // expandedInputLevel's key range may not fully cover all files currently in pc.outputLevel,
+ // since pc.outputLevel was created using the entire key range which includes higher levels.
+ expandedOutputLevel := pc.version.Overlaps(pc.outputLevel.level,
+ manifest.KeyRange(cmp, expandedInputLevel.All(), pc.outputLevel.files.All()))
+ if expandedOutputLevel.Len() != pc.outputLevel.files.Len() {
+ return false
+ }
+ if !canCompactTables(expandedOutputLevel, pc.outputLevel.level, problemSpans) {
+ return false
+ }
+ inputLevel.files = expandedInputLevel
+ return true
+}
+
+// Similar logic as pc.grow. Additional L0 files are optionally added to the
+// compaction at this step. Note that the bounds passed in are not the bounds
+// of the compaction, but rather the smallest and largest internal keys that
+// the compaction cannot include from L0 without pulling in more Lbase
+// files. Consider this example:
+//
+// L0: c-d e+f g-h
+// Lbase: a-b e+f i-j
+//
+// a b c d e f g h i j
+//
+// The e-f files have already been chosen in the compaction. 
As pulling +// in more LBase files is undesirable, the logic below will pass in +// smallest = b and largest = i to ExtendL0ForBaseCompactionTo, which +// will expand the compaction to include c-d and g-h from L0. The +// bounds passed in are exclusive; the compaction cannot be expanded +// to include files that "touch" it. +func (pc *pickedTableCompaction) growL0ForBase(cmp base.Compare, maxExpandedBytes uint64) bool { + if invariants.Enabled { + if pc.startLevel.level != 0 { + panic(fmt.Sprintf("pc.startLevel.level is %d, expected 0", pc.startLevel.level)) + } + } + smallestBaseKey := base.InvalidInternalKey + largestBaseKey := base.InvalidInternalKey + if pc.outputLevel.files.Empty() { + baseIter := pc.version.Levels[pc.outputLevel.level].Iter() + if sm := baseIter.SeekLT(cmp, pc.bounds.Start); sm != nil { + smallestBaseKey = sm.Largest() + } + if la := baseIter.SeekGE(cmp, pc.bounds.End.Key); la != nil { + largestBaseKey = la.Smallest() + } + } else { + // NB: We use Reslice to access the underlying level's files, but + // we discard the returned slice. The pc.outputLevel.files slice + // is not modified. 
+ _ = pc.outputLevel.files.Reslice(func(start, end *manifest.LevelIterator) { + if sm := start.Prev(); sm != nil { + smallestBaseKey = sm.Largest() + } + if la := end.Next(); la != nil { + largestBaseKey = la.Smallest() + } + }) + } + oldLcf := pc.lcf.Clone() + if !pc.l0Organizer.ExtendL0ForBaseCompactionTo(smallestBaseKey, largestBaseKey, pc.lcf) { + return false + } + + var newStartLevelFiles []*manifest.TableMetadata + iter := pc.version.Levels[0].Iter() + var sizeSum uint64 + for j, f := 0, iter.First(); f != nil; j, f = j+1, iter.Next() { + if pc.lcf.FilesIncluded[f.L0Index] { + newStartLevelFiles = append(newStartLevelFiles, f) + sizeSum += f.Size + } + } + + if sizeSum+pc.outputLevel.files.AggregateSizeSum() >= maxExpandedBytes { + *pc.lcf = *oldLcf + return false + } + + pc.startLevel.files = manifest.NewLevelSliceSeqSorted(newStartLevelFiles) + pc.bounds = manifest.ExtendKeyRange(cmp, pc.bounds, + pc.startLevel.files.All(), pc.outputLevel.files.All()) + return true +} + +// estimatedInputSize returns an estimate of the size of the compaction's +// inputs, including the estimated physical size of input tables' blob +// references. +func (pc *pickedTableCompaction) estimatedInputSize() uint64 { + var bytesToCompact uint64 + for i := range pc.inputs { + bytesToCompact += pc.inputs[i].files.AggregateSizeSum() + } + return bytesToCompact +} + +// setupMultiLevelCandidate returns true if it successfully added another level +// to the compaction. +func (pc *pickedTableCompaction) setupMultiLevelCandidate( + opts *Options, diskAvailBytes uint64, +) bool { + pc.inputs = append(pc.inputs, compactionLevel{level: pc.outputLevel.level + 1}) + + // Recalibrate startLevel and outputLevel: + // - startLevel and outputLevel pointers may be obsolete after appending to pc.inputs. 
+ // - push outputLevel to extraLevels and move the new level to outputLevel + pc.startLevel = &pc.inputs[0] + pc.outputLevel = &pc.inputs[2] + return pc.setupInputs(opts, diskAvailBytes, &pc.inputs[1], nil /* TODO(radu) */) +} + +// canCompactTables returns true if the tables in the level slice are not +// compacting already and don't intersect any problem spans. +func canCompactTables( + inputs manifest.LevelSlice, level int, problemSpans *problemspans.ByLevel, +) bool { + for f := range inputs.All() { + if f.IsCompacting() { + return false + } + if problemSpans != nil && problemSpans.Overlaps(level, f.UserKeyBounds()) { + return false + } + } + return true +} + +// newCompactionPickerByScore creates a compactionPickerByScore associated with +// the newest version. The picker is used under logLock (until a new version is +// installed). +func newCompactionPickerByScore( + v *manifest.Version, + lvs *latestVersionState, + opts *Options, + inProgressCompactions []compactionInfo, +) *compactionPickerByScore { + p := &compactionPickerByScore{ + opts: opts, + vers: v, + latestVersionState: lvs, + } + p.initLevelMaxBytes(inProgressCompactions) + return p +} + +// Information about a candidate compaction level that has been identified by +// the compaction picker. +type candidateLevelInfo struct { + // The fill factor of the level, calculated using uncompensated file sizes and + // without any adjustments. A factor > 1 means that the level has more data + // than the ideal size for that level. + // + // For L0, the fill factor is calculated based on the number of sublevels + // (see calculateL0FillFactor). + // + // For L1+, the fill factor is the ratio between the total uncompensated file + // size and the ideal size of the level (based on the total size of the DB). + fillFactor float64 + + // The score of the level, used to rank levels. + // + // If the level doesn't require compaction, the score is 0. 
Otherwise: + // - for L6 the score is equal to the fillFactor; + // - for L0-L5: + // - if the fillFactor is < 1: the score is equal to the fillFactor; + // - if the fillFactor is >= 1: the score is the ratio between the + // fillFactor and the next level's fillFactor. + score float64 + + // The fill factor of the level after accounting for level size compensation. + // + // For L0, the compensatedFillFactor is equal to the fillFactor as we don't + // account for level size compensation in L0. + // + // For l1+, the compensatedFillFactor takes into account the estimated + // savings in the lower levels because of deletions. + // + // The compensated fill factor is used to determine if the level should be + // compacted (see calculateLevelScores). + compensatedFillFactor float64 + + level int + // The level to compact to. + outputLevel int + // The file in level that will be compacted. Additional files may be + // picked by the compaction, and a pickedCompaction created for the + // compaction. + file manifest.LevelFile +} + +func (c *candidateLevelInfo) shouldCompact() bool { + return c.score > 0 +} + +func tableTombstoneCompensation(t *manifest.TableMetadata) uint64 { + return t.Stats.PointDeletionsBytesEstimate + t.Stats.RangeDeletionsBytesEstimate +} + +// tableCompensatedSize returns t's size, including an estimate of the physical +// size of its external references, and inflated according to compaction +// priorities. +func tableCompensatedSize(t *manifest.TableMetadata) uint64 { + // Add in the estimate of disk space that may be reclaimed by compacting the + // table's tombstones. + return t.Size + t.EstimatedReferenceSize() + tableTombstoneCompensation(t) +} + +// totalCompensatedSize computes the compensated size over a table metadata +// iterator. Note that this function is linear in the files available to the +// iterator. Use the compensatedSizeAnnotator if querying the total +// compensated size of a level. 
+func totalCompensatedSize(iter iter.Seq[*manifest.TableMetadata]) uint64 { + var sz uint64 + for f := range iter { + sz += tableCompensatedSize(f) + } + return sz +} + +// compactionPickerByScore holds the state and logic for picking a compaction. A +// compaction picker is associated with a single version. A new compaction +// picker is created and initialized every time a new version is installed. +type compactionPickerByScore struct { + opts *Options + vers *manifest.Version + // Unlike vers, which is immutable and the latest version when this picker + // is created, latestVersionState represents the mutable state of the latest + // version. This means that at some point in the future a + // compactionPickerByScore created in the past will have mutually + // inconsistent state in vers and latestVersionState. This is not a problem + // since (a) a new picker is created in UpdateVersionLocked when a new + // version is installed, and (b) only the latest picker is used for picking + // compactions. This is ensured by holding versionSet.logLock for both (a) + // and (b). + latestVersionState *latestVersionState + // The level to target for L0 compactions. Levels L1 to baseLevel must be + // empty. + baseLevel int + // levelMaxBytes holds the dynamically adjusted max bytes setting for each + // level. 
+ levelMaxBytes [numLevels]int64 + dbSizeBytes uint64 +} + +var _ compactionPicker = &compactionPickerByScore{} + +func (p *compactionPickerByScore) getMetrics(inProgress []compactionInfo) compactionPickerMetrics { + var m compactionPickerMetrics + for _, info := range p.calculateLevelScores(inProgress) { + m.levels[info.level].score = info.score + m.levels[info.level].fillFactor = info.fillFactor + m.levels[info.level].compensatedFillFactor = info.compensatedFillFactor + } + return m +} + +func (p *compactionPickerByScore) getBaseLevel() int { + if p == nil { + return 1 + } + return p.baseLevel +} + +// estimatedCompactionDebt estimates the number of bytes which need to be +// compacted before the LSM tree becomes stable. +func (p *compactionPickerByScore) estimatedCompactionDebt() uint64 { + if p == nil { + return 0 + } + + // We assume that all the bytes in L0 need to be compacted to Lbase. This is + // unlike the RocksDB logic that figures out whether L0 needs compaction. + bytesAddedToNextLevel := p.vers.Levels[0].AggregateSize() + lbaseSize := p.vers.Levels[p.baseLevel].AggregateSize() + + var compactionDebt uint64 + if bytesAddedToNextLevel > 0 && lbaseSize > 0 { + // We only incur compaction debt if both L0 and Lbase contain data. If L0 + // is empty, no compaction is necessary. If Lbase is empty, a move-based + // compaction from L0 would occur. + compactionDebt += bytesAddedToNextLevel + lbaseSize + } + + // loop invariant: At the beginning of the loop, bytesAddedToNextLevel is the + // bytes added to `level` in the loop. + for level := p.baseLevel; level < numLevels-1; level++ { + levelSize := p.vers.Levels[level].AggregateSize() + bytesAddedToNextLevel + nextLevelSize := p.vers.Levels[level+1].AggregateSize() + if levelSize > uint64(p.levelMaxBytes[level]) { + bytesAddedToNextLevel = levelSize - uint64(p.levelMaxBytes[level]) + if nextLevelSize > 0 { + // We only incur compaction debt if the next level contains data. 
If the + // next level is empty, a move-based compaction would be used. + levelRatio := float64(nextLevelSize) / float64(levelSize) + // The current level contributes bytesAddedToNextLevel to compactions. + // The next level contributes levelRatio * bytesAddedToNextLevel. + compactionDebt += uint64(float64(bytesAddedToNextLevel) * (levelRatio + 1)) + } + } else { + // We're not moving any bytes to the next level. + bytesAddedToNextLevel = 0 + } + } + return compactionDebt +} + +func (p *compactionPickerByScore) initLevelMaxBytes(inProgressCompactions []compactionInfo) { + // The levelMaxBytes calculations here differ from RocksDB in two ways: + // + // 1. The use of dbSize vs maxLevelSize. RocksDB uses the size of the maximum + // level in L1-L6, rather than determining the size of the bottom level + // based on the total amount of data in the dB. The RocksDB calculation is + // problematic if L0 contains a significant fraction of data, or if the + // level sizes are roughly equal and thus there is a significant fraction + // of data outside of the largest level. + // + // 2. Not adjusting the size of Lbase based on L0. RocksDB computes + // baseBytesMax as the maximum of the configured LBaseMaxBytes and the + // size of L0. This is problematic because baseBytesMax is used to compute + // the max size of lower levels. A very large baseBytesMax will result in + // an overly large value for the size of lower levels which will caused + // those levels not to be compacted even when they should be + // compacted. This often results in "inverted" LSM shapes where Ln is + // larger than Ln+1. + + // Determine the first non-empty level and the total DB size. 
+ firstNonEmptyLevel := -1 + var dbSize uint64 + for level := 1; level < numLevels; level++ { + if p.vers.Levels[level].AggregateSize() > 0 { + if firstNonEmptyLevel == -1 { + firstNonEmptyLevel = level + } + dbSize += p.vers.Levels[level].AggregateSize() + } + } + for _, c := range inProgressCompactions { + if c.outputLevel == 0 || c.outputLevel == -1 { + continue + } + if c.inputs[0].level == 0 && (firstNonEmptyLevel == -1 || c.outputLevel < firstNonEmptyLevel) { + firstNonEmptyLevel = c.outputLevel + } + } + + // Initialize the max-bytes setting for each level to "infinity" which will + // disallow compaction for that level. We'll fill in the actual value below + // for levels we want to allow compactions from. + for level := 0; level < numLevels; level++ { + p.levelMaxBytes[level] = math.MaxInt64 + } + + dbSizeBelowL0 := dbSize + dbSize += p.vers.Levels[0].AggregateSize() + p.dbSizeBytes = dbSize + if dbSizeBelowL0 == 0 { + // No levels for L1 and up contain any data. Target L0 compactions for the + // last level or to the level to which there is an ongoing L0 compaction. + p.baseLevel = numLevels - 1 + if firstNonEmptyLevel >= 0 { + p.baseLevel = firstNonEmptyLevel + } + return + } + + bottomLevelSize := dbSize - dbSize/uint64(p.opts.Experimental.LevelMultiplier) + + curLevelSize := bottomLevelSize + for level := numLevels - 2; level >= firstNonEmptyLevel; level-- { + curLevelSize = uint64(float64(curLevelSize) / float64(p.opts.Experimental.LevelMultiplier)) + } + + // Compute base level (where L0 data is compacted to). 
+ baseBytesMax := uint64(p.opts.LBaseMaxBytes) + p.baseLevel = firstNonEmptyLevel + for p.baseLevel > 1 && curLevelSize > baseBytesMax { + p.baseLevel-- + curLevelSize = uint64(float64(curLevelSize) / float64(p.opts.Experimental.LevelMultiplier)) + } + + smoothedLevelMultiplier := 1.0 + if p.baseLevel < numLevels-1 { + smoothedLevelMultiplier = math.Pow( + float64(bottomLevelSize)/float64(baseBytesMax), + 1.0/float64(numLevels-p.baseLevel-1)) + } + + levelSize := float64(baseBytesMax) + for level := p.baseLevel; level < numLevels; level++ { + if level > p.baseLevel && levelSize > 0 { + levelSize *= smoothedLevelMultiplier + } + // Round the result since test cases use small target level sizes, which + // can be impacted by floating-point imprecision + integer truncation. + roundedLevelSize := math.Round(levelSize) + if roundedLevelSize > float64(math.MaxInt64) { + p.levelMaxBytes[level] = math.MaxInt64 + } else { + p.levelMaxBytes[level] = int64(roundedLevelSize) + } + } +} + +type levelSizeAdjust struct { + incomingActualBytes uint64 + outgoingActualBytes uint64 + outgoingCompensatedBytes uint64 +} + +func (a levelSizeAdjust) compensated() uint64 { + return a.incomingActualBytes - a.outgoingCompensatedBytes +} + +func (a levelSizeAdjust) actual() uint64 { + return a.incomingActualBytes - a.outgoingActualBytes +} + +func calculateSizeAdjust(inProgressCompactions []compactionInfo) [numLevels]levelSizeAdjust { + // Compute size adjustments for each level based on the in-progress + // compactions. We sum the file sizes of all files leaving and entering each + // level in in-progress compactions. For outgoing files, we also sum a + // separate sum of 'compensated file sizes', which are inflated according + // to deletion estimates. + // + // When we adjust a level's size according to these values during score + // calculation, we subtract the compensated size of start level inputs to + // account for the fact that score calculation uses compensated sizes. 
+ // + // Since compensated file sizes may be compensated because they reclaim + // space from the output level's files, we only add the real file size to + // the output level. + // + // This is slightly different from RocksDB's behavior, which simply elides + // compacting files from the level size calculation. + var sizeAdjust [numLevels]levelSizeAdjust + for i := range inProgressCompactions { + c := &inProgressCompactions[i] + // If this compaction's version edit has already been applied, there's + // no need to adjust: The LSM we'll examine will already reflect the + // new LSM state. + if c.versionEditApplied { + continue + } + + for _, input := range c.inputs { + actualSize := input.files.AggregateSizeSum() + compensatedSize := totalCompensatedSize(input.files.All()) + + if input.level != c.outputLevel { + sizeAdjust[input.level].outgoingCompensatedBytes += compensatedSize + sizeAdjust[input.level].outgoingActualBytes += actualSize + if c.outputLevel != -1 { + sizeAdjust[c.outputLevel].incomingActualBytes += actualSize + } + } + } + } + return sizeAdjust +} + +// calculateLevelScores calculates the candidateLevelInfo for all levels and +// returns them in decreasing score order. +func (p *compactionPickerByScore) calculateLevelScores( + inProgressCompactions []compactionInfo, +) [numLevels]candidateLevelInfo { + var scores [numLevels]candidateLevelInfo + for i := range scores { + scores[i].level = i + scores[i].outputLevel = i + 1 + } + l0FillFactor := calculateL0FillFactor(p.vers, p.latestVersionState.l0Organizer, p.opts, inProgressCompactions) + scores[0] = candidateLevelInfo{ + outputLevel: p.baseLevel, + fillFactor: l0FillFactor, + compensatedFillFactor: l0FillFactor, // No compensation for L0. + } + sizeAdjust := calculateSizeAdjust(inProgressCompactions) + for level := 1; level < numLevels; level++ { + compensatedLevelSize := + // Actual file size. + p.vers.Levels[level].AggregateSize() + + // Point deletions. 
+ *pointDeletionsBytesEstimateAnnotator.LevelAnnotation(p.vers.Levels[level]) + + // Range deletions. + *rangeDeletionsBytesEstimateAnnotator.LevelAnnotation(p.vers.Levels[level]) + + // Adjustments for in-progress compactions. + sizeAdjust[level].compensated() + scores[level].compensatedFillFactor = float64(compensatedLevelSize) / float64(p.levelMaxBytes[level]) + scores[level].fillFactor = float64(p.vers.Levels[level].AggregateSize()+sizeAdjust[level].actual()) / float64(p.levelMaxBytes[level]) + } + + // Adjust each level's fill factor by the fill factor of the next level to get + // an (uncompensated) score; and each level's compensated fill factor by the + // fill factor of the next level to get a compensated score. + // + // The compensated score is used to determine if the level should be compacted + // at all. The (uncompensated) score is used as the value used to rank levels. + // + // If the next level has a high fill factor, and is thus a priority for + // compaction, this reduces the priority for compacting the current level. If + // the next level has a low fill factor (i.e. it is below its target size), + // this increases the priority for compacting the current level. + // + // The effect of this adjustment is to help prioritize compactions in lower + // levels. The following example shows the scores and the fill factors. In this + // scenario, L0 has 68 sublevels. L3 (a.k.a. Lbase) is significantly above its + // target size. The original score prioritizes compactions from those two + // levels, but doing so ends up causing a future problem: data piles up in the + // higher levels, starving L5->L6 compactions, and to a lesser degree starving + // L4->L5 compactions. + // + // Note that in the example shown there is no level size compensation so the + // compensatedFillFactor and fillFactor are the same for each level. 
+ // + // score fillFactor compensatedFillFactor size max-size + // L0 3.2 68.0 68.0 2.2 G - + // L3 3.2 21.1 21.1 1.3 G 64 M + // L4 3.4 6.7 6.7 3.1 G 467 M + // L5 3.4 2.0 2.0 6.6 G 3.3 G + // L6 0 0.6 0.6 14 G 24 G + // + // TODO(radu): the way compensation works needs some rethinking. For example, + // if compacting L5 can free up a lot of space in L6, the score of L5 should + // go *up* with the fill factor of L6, not the other way around. + for level := 0; level < numLevels; level++ { + if level > 0 && level < p.baseLevel { + continue + } + const compensatedFillFactorThreshold = 1.0 + if scores[level].compensatedFillFactor < compensatedFillFactorThreshold { + // No need to compact this level; score stays 0. + continue + } + score := scores[level].fillFactor + compensatedScore := scores[level].compensatedFillFactor + if level < numLevels-1 { + nextLevel := scores[level].outputLevel + // Avoid absurdly large scores by placing a floor on the factor that we'll + // adjust a level by. The value of 0.01 was chosen somewhat arbitrarily. + denominator := max(0.01, scores[nextLevel].fillFactor) + score /= denominator + compensatedScore /= denominator + } + // The level requires compaction iff both compensatedFillFactor and + // compensatedScore are >= 1.0. + // + // TODO(radu): this seems ad-hoc. In principle, the state of other levels + // should not come into play when we're determining this level's eligibility + // for compaction. The score should take care of correctly prioritizing the + // levels. + const compensatedScoreThreshold = 1.0 + if compensatedScore < compensatedScoreThreshold { + // No need to compact this level; score stays 0. + continue + } + scores[level].score = score + } + // Sort by score (decreasing) and break ties by level (increasing). 
+ slices.SortFunc(scores[:], func(a, b candidateLevelInfo) int { + if a.score != b.score { + return cmp.Compare(b.score, a.score) + } + return cmp.Compare(a.level, b.level) + }) + return scores +} + +// calculateL0FillFactor calculates a float value representing the relative +// priority of compacting L0. A value less than 1 indicates that L0 does not +// need any compactions. +// +// L0 is special in that files within L0 may overlap one another, so a different +// set of heuristics that take into account read amplification apply. +func calculateL0FillFactor( + vers *manifest.Version, + l0Organizer *manifest.L0Organizer, + opts *Options, + inProgressCompactions []compactionInfo, +) float64 { + // Use the sublevel count to calculate the score. The base vs intra-L0 + // compaction determination happens in pickAuto, not here. + score := float64(2*l0Organizer.MaxDepthAfterOngoingCompactions()) / + float64(opts.L0CompactionThreshold) + + // Also calculate a score based on the file count but use it only if it + // produces a higher score than the sublevel-based one. This heuristic is + // designed to accommodate cases where L0 is accumulating non-overlapping + // files in L0. Letting too many non-overlapping files accumulate in few + // sublevels is undesirable, because: + // 1) we can produce a massive backlog to compact once files do overlap. + // 2) constructing L0 sublevels has a runtime that grows superlinearly with + // the number of files in L0 and must be done while holding D.mu. + noncompactingFiles := vers.Levels[0].Len() + for _, c := range inProgressCompactions { + for _, cl := range c.inputs { + if cl.level == 0 { + noncompactingFiles -= cl.files.Len() + } + } + } + fileScore := float64(noncompactingFiles) / float64(opts.L0CompactionFileThreshold) + if score < fileScore { + score = fileScore + } + return score +} + +// pickCompactionSeedFile picks a file from `level` in the `vers` to build a +// compaction around. 
Currently, this function implements a heuristic similar to
+// RocksDB's kMinOverlappingRatio, seeking to minimize write amplification. This
+// function is linear with respect to the number of files in `level` and
+// `outputLevel`.
+func pickCompactionSeedFile(
+	vers *manifest.Version,
+	virtualBackings *manifest.VirtualBackings,
+	opts *Options,
+	level, outputLevel int,
+	earliestSnapshotSeqNum base.SeqNum,
+	problemSpans *problemspans.ByLevel,
+) (manifest.LevelFile, bool) {
+	// Select the file within the level to compact. We want to minimize write
+	// amplification, but also ensure that (a) deletes are propagated to the
+	// bottom level in a timely fashion, and (b) virtual sstables that are
+	// pinning backing sstables where most of the data is garbage are compacted
+	// away. Doing (a) and (b) reclaims disk space. A table's smallest sequence
+	// number provides a measure of its age. The ratio of overlapping-bytes /
+	// table-size gives an indication of write amplification (a smaller ratio is
+	// preferable).
+	//
+	// The current heuristic is based off the RocksDB kMinOverlappingRatio
+	// heuristic. It chooses the file with the minimum overlapping ratio with
+	// the target level, which minimizes write amplification.
+	//
+	// The heuristic uses a "compensated size" for the denominator, which is the
+	// file size inflated by (a) an estimate of the space that may be reclaimed
+	// through compaction, and (b) a fraction of the amount of garbage in the
+	// backing sstable pinned by this (virtual) sstable.
+	//
+	// TODO(peter): For concurrent compactions, we may want to try harder to
+	// pick a seed file whose resulting compaction bounds do not overlap with
+	// an in-progress compaction.
+ + cmp := opts.Comparer.Compare + startIter := vers.Levels[level].Iter() + outputIter := vers.Levels[outputLevel].Iter() + + var file manifest.LevelFile + smallestRatio := uint64(math.MaxUint64) + + outputFile := outputIter.First() + + for f := startIter.First(); f != nil; f = startIter.Next() { + var overlappingBytes uint64 + if f.IsCompacting() { + // Move on if this file is already being compacted. We'll likely + // still need to move past the overlapping output files regardless, + // but in cases where all start-level files are compacting we won't. + continue + } + if problemSpans != nil && problemSpans.Overlaps(level, f.UserKeyBounds()) { + // File touches problem span which temporarily disallows auto compactions. + continue + } + + // Trim any output-level files smaller than f. + for outputFile != nil && sstableKeyCompare(cmp, outputFile.Largest(), f.Smallest()) < 0 { + outputFile = outputIter.Next() + } + + skip := false + for outputFile != nil && sstableKeyCompare(cmp, outputFile.Smallest(), f.Largest()) <= 0 { + overlappingBytes += outputFile.Size + if outputFile.IsCompacting() { + // If one of the overlapping files is compacting, we're not going to be + // able to compact f anyway, so skip it. + skip = true + break + } + if problemSpans != nil && problemSpans.Overlaps(outputLevel, outputFile.UserKeyBounds()) { + // Overlapping file touches problem span which temporarily disallows auto compactions. + skip = true + break + } + + // For files in the bottommost level of the LSM, the + // Stats.RangeDeletionsBytesEstimate field is set to the estimate + // of bytes /within/ the file itself that may be dropped by + // recompacting the file. These bytes from obsolete keys would not + // need to be rewritten if we compacted `f` into `outputFile`, so + // they don't contribute to write amplification. Subtracting them + // out of the overlapping bytes helps prioritize these compactions + // that are cheaper than their file sizes suggest. 
+ if outputLevel == numLevels-1 && outputFile.LargestSeqNum < earliestSnapshotSeqNum { + overlappingBytes -= outputFile.Stats.RangeDeletionsBytesEstimate + } + + // If the file in the next level extends beyond f's largest key, + // break out and don't advance outputIter because f's successor + // might also overlap. + // + // Note, we stop as soon as we encounter an output-level file with a + // largest key beyond the input-level file's largest bound. We + // perform a simple user key comparison here using sstableKeyCompare + // which handles the potential for exclusive largest key bounds. + // There's some subtlety when the bounds are equal (eg, equal and + // inclusive, or equal and exclusive). Current Pebble doesn't split + // user keys across sstables within a level (and in format versions + // FormatSplitUserKeysMarkedCompacted and later we guarantee no + // split user keys exist within the entire LSM). In that case, we're + // assured that neither the input level nor the output level's next + // file shares the same user key, so compaction expansion will not + // include them in any compaction compacting `f`. + // + // NB: If we /did/ allow split user keys, or we're running on an + // old database with an earlier format major version where there are + // existing split user keys, this logic would be incorrect. Consider + // L1: [a#120,a#100] [a#80,a#60] + // L2: [a#55,a#45] [a#35,a#25] [a#15,a#5] + // While considering the first file in L1, [a#120,a#100], we'd skip + // past all of the files in L2. When considering the second file in + // L1, we'd improperly conclude that the second file overlaps + // nothing in the second level and is cheap to compact, when in + // reality we'd need to expand the compaction to include all 5 + // files. 
+ if sstableKeyCompare(cmp, outputFile.Largest(), f.Largest()) > 0 { + break + } + outputFile = outputIter.Next() + } + if skip { + continue + } + + compSz := tableCompensatedSize(f) + responsibleForGarbageBytes(virtualBackings, f) + scaledRatio := overlappingBytes * 1024 / compSz + if scaledRatio < smallestRatio { + smallestRatio = scaledRatio + file = startIter.Take() + } + } + return file, file.TableMetadata != nil +} + +// responsibleForGarbageBytes returns the amount of garbage in the backing +// sstable that we consider the responsibility of this virtual sstable. For +// non-virtual sstables, this is of course 0. For virtual sstables, we equally +// distribute the responsibility of the garbage across all the virtual +// sstables that are referencing the same backing sstable. One could +// alternatively distribute this in proportion to the virtual sst sizes, but +// it isn't clear that more sophisticated heuristics are worth it, given that +// the garbage cannot be reclaimed until all the referencing virtual sstables +// are compacted. +func responsibleForGarbageBytes( + virtualBackings *manifest.VirtualBackings, m *manifest.TableMetadata, +) uint64 { + if !m.Virtual { + return 0 + } + useCount, virtualizedSize := virtualBackings.Usage(m.TableBacking.DiskFileNum) + // Since virtualizedSize is the sum of the estimated size of all virtual + // ssts, we allow for the possibility that virtualizedSize could exceed + // m.TableBacking.Size. + totalGarbage := int64(m.TableBacking.Size) - int64(virtualizedSize) + if totalGarbage <= 0 { + return 0 + } + if useCount == 0 { + // This cannot happen if m exists in the latest version. The call to + // ResponsibleForGarbageBytes during compaction picking ensures that m + // exists in the latest version by holding versionSet.logLock. 
+ panic(errors.AssertionFailedf("%s has zero useCount", m.String())) + } + return uint64(totalGarbage) / uint64(useCount) +} + +func (p *compactionPickerByScore) getCompactionConcurrency() int { + lower, upper := p.opts.CompactionConcurrencyRange() + if lower >= upper { + return upper + } + // Compaction concurrency is controlled by L0 read-amp. We allow one + // additional compaction per L0CompactionConcurrency sublevels, as well as + // one additional compaction per CompactionDebtConcurrency bytes of + // compaction debt. Compaction concurrency is tied to L0 sublevels as that + // signal is independent of the database size. We tack on the compaction + // debt as a second signal to prevent compaction concurrency from dropping + // significantly right after a base compaction finishes, and before those + // bytes have been compacted further down the LSM. + // + // Let n be the number of in-progress compactions. + // + // l0ReadAmp >= ccSignal1 then can run another compaction, where + // ccSignal1 = n * p.opts.Experimental.L0CompactionConcurrency + // Rearranging, + // n <= l0ReadAmp / p.opts.Experimental.L0CompactionConcurrency. + // So we can run up to + // l0ReadAmp / p.opts.Experimental.L0CompactionConcurrency extra compactions. + l0ReadAmpCompactions := 0 + if p.opts.Experimental.L0CompactionConcurrency > 0 { + l0ReadAmp := p.latestVersionState.l0Organizer.MaxDepthAfterOngoingCompactions() + l0ReadAmpCompactions = (l0ReadAmp / p.opts.Experimental.L0CompactionConcurrency) + } + // compactionDebt >= ccSignal2 then can run another compaction, where + // ccSignal2 = uint64(n) * p.opts.Experimental.CompactionDebtConcurrency + // Rearranging, + // n <= compactionDebt / p.opts.Experimental.CompactionDebtConcurrency + // So we can run up to + // compactionDebt / p.opts.Experimental.CompactionDebtConcurrency extra + // compactions. 
+ compactionDebtCompactions := 0 + if p.opts.Experimental.CompactionDebtConcurrency > 0 { + compactionDebt := p.estimatedCompactionDebt() + compactionDebtCompactions = int(compactionDebt / p.opts.Experimental.CompactionDebtConcurrency) + } + + compactableGarbageCompactions := 0 + garbageFractionLimit := p.opts.Experimental.CompactionGarbageFractionForMaxConcurrency() + if garbageFractionLimit > 0 && p.dbSizeBytes > 0 { + compactableGarbageBytes := + *pointDeletionsBytesEstimateAnnotator.MultiLevelAnnotation(p.vers.Levels[:]) + + *rangeDeletionsBytesEstimateAnnotator.MultiLevelAnnotation(p.vers.Levels[:]) + garbageFraction := float64(compactableGarbageBytes) / float64(p.dbSizeBytes) + compactableGarbageCompactions = + int((garbageFraction / garbageFractionLimit) * float64(upper-lower)) + } + + extraCompactions := max(l0ReadAmpCompactions, compactionDebtCompactions, compactableGarbageCompactions, 0) + + return min(lower+extraCompactions, upper) +} + +// TODO(sumeer): remove unless someone actually finds this useful. 
+func (p *compactionPickerByScore) logCompactionForTesting( + env compactionEnv, scores [numLevels]candidateLevelInfo, pc *pickedTableCompaction, +) { + var buf bytes.Buffer + for i := 0; i < numLevels; i++ { + if i != 0 && i < p.baseLevel { + continue + } + + var info *candidateLevelInfo + for j := range scores { + if scores[j].level == i { + info = &scores[j] + break + } + } + + marker := " " + if pc.startLevel.level == info.level { + marker = "*" + } + fmt.Fprintf(&buf, " %sL%d: score:%5.1f fillFactor:%5.1f compensatedFillFactor:%5.1f %8s %8s", + marker, info.level, info.score, info.fillFactor, info.compensatedFillFactor, + humanize.Bytes.Int64(int64(totalCompensatedSize( + p.vers.Levels[info.level].All(), + ))), + humanize.Bytes.Int64(p.levelMaxBytes[info.level]), + ) + + count := 0 + for i := range env.inProgressCompactions { + c := &env.inProgressCompactions[i] + if c.inputs[0].level != info.level { + continue + } + count++ + if count == 1 { + fmt.Fprintf(&buf, " [") + } else { + fmt.Fprintf(&buf, " ") + } + fmt.Fprintf(&buf, "L%d->L%d", c.inputs[0].level, c.outputLevel) + } + if count > 0 { + fmt.Fprintf(&buf, "]") + } + fmt.Fprintf(&buf, "\n") + } + p.opts.Logger.Infof("pickAuto: L%d->L%d\n%s", + pc.startLevel.level, pc.outputLevel.level, buf.String()) +} + +// pickAutoScore picks the best score-based compaction, if any. +// +// On each call, pickAutoScore computes per-level size adjustments based on +// in-progress compactions, and computes a per-level score. The levels are +// iterated over in decreasing score order trying to find a valid compaction +// anchored at that level. +// +// If a score-based compaction cannot be found, pickAuto falls back to looking +// for an elision-only compaction to remove obsolete keys. +func (p *compactionPickerByScore) pickAutoScore(env compactionEnv) pickedCompaction { + scores := p.calculateLevelScores(env.inProgressCompactions) + + // Check for a score-based compaction. 
candidateLevelInfos are first sorted + // by whether they should be compacted, so if we find a level which shouldn't + // be compacted, we can break early. + for i := range scores { + info := &scores[i] + if !info.shouldCompact() { + break + } + if info.level == numLevels-1 { + continue + } + + if info.level == 0 { + ptc := pickL0(env, p.opts, p.vers, p.latestVersionState.l0Organizer, p.baseLevel) + // Fail-safe to protect against compacting the same sstable + // concurrently. + if ptc != nil && !inputRangeAlreadyCompacting(p.opts.Comparer.Compare, env, ptc) { + p.addScoresToPickedCompactionMetrics(ptc, scores) + ptc.score = info.score + if false { + p.logCompactionForTesting(env, scores, ptc) + } + return ptc + } + continue + } + + // info.level > 0 + var ok bool + info.file, ok = pickCompactionSeedFile(p.vers, &p.latestVersionState.virtualBackings, p.opts, info.level, info.outputLevel, env.earliestSnapshotSeqNum, env.problemSpans) + if !ok { + continue + } + + pc := pickAutoLPositive(env, p.opts, p.vers, p.latestVersionState.l0Organizer, *info, p.baseLevel) + // Fail-safe to protect against compacting the same sstable concurrently. + if pc != nil && !inputRangeAlreadyCompacting(p.opts.Comparer.Compare, env, pc) { + p.addScoresToPickedCompactionMetrics(pc, scores) + pc.score = info.score + if false { + p.logCompactionForTesting(env, scores, pc) + } + return pc + } + } + return nil +} + +// pickAutoNonScore picks the best non-score-based compaction, if any. +func (p *compactionPickerByScore) pickAutoNonScore(env compactionEnv) (pc pickedCompaction) { + // Check for files which contain excessive point tombstones that could slow + // down reads. Unlike elision-only compactions, these compactions may select + // a file at any level rather than only the lowest level. + if pc := p.pickTombstoneDensityCompaction(env); pc != nil { + return pc + } + + // Check for L6 files with tombstones that may be elided. 
These files may + // exist if a snapshot prevented the elision of a tombstone or because of + // a move compaction. These are low-priority compactions because they + // don't help us keep up with writes, just reclaim disk space. + if pc := p.pickElisionOnlyCompaction(env); pc != nil { + return pc + } + + // Check for blob file rewrites. These are low-priority compactions because + // they don't help us keep up with writes, just reclaim disk space. + if pc := p.pickBlobFileRewriteCompaction(env); pc != nil { + return pc + } + + if pc := p.pickReadTriggeredCompaction(env); pc != nil { + return pc + } + + // NB: This should only be run if a read compaction wasn't + // scheduled. + // + // We won't be scheduling a read compaction right now, and in + // read heavy workloads, compactions won't be scheduled frequently + // because flushes aren't frequent. So we need to signal to the + // iterator to schedule a compaction when it adds compactions to + // the read compaction queue. + // + // We need the nil check here because without it, we have some + // tests which don't set that variable fail. Since there's a + // chance that one of those tests wouldn't want extra compactions + // to be scheduled, I added this check here, instead of + // setting rescheduleReadCompaction in those tests. + if env.readCompactionEnv.rescheduleReadCompaction != nil { + *env.readCompactionEnv.rescheduleReadCompaction = true + } + + // At the lowest possible compaction-picking priority, look for files marked + // for compaction. Pebble will mark files for compaction if they have atomic + // compaction units that span multiple files. While current Pebble code does + // not construct such sstables, RocksDB and earlier versions of Pebble may + // have created them. These split user keys form sets of files that must be + // compacted together for correctness (referred to as "atomic compaction + // units" within the code). Rewrite them in-place. 
+	//
+	// It's also possible that a file may have been marked for compaction by
+	// even earlier versions of Pebble code, since TableMetadata's
+	// MarkedForCompaction field is persisted in the manifest. That's okay. We
+	// previously would've ignored the designation, whereas now we'll re-compact
+	// the file in place.
+	if p.vers.Stats.MarkedForCompaction > 0 {
+		if pc := p.pickRewriteCompaction(env); pc != nil {
+			return pc
+		}
+	}
+
+	return nil
+}
+
+func (p *compactionPickerByScore) addScoresToPickedCompactionMetrics(
+	pc *pickedTableCompaction, candInfo [numLevels]candidateLevelInfo,
+) {
+
+	// candInfo is sorted by score, not by compaction level.
+	infoByLevel := [numLevels]candidateLevelInfo{}
+	for i := range candInfo {
+		level := candInfo[i].level
+		infoByLevel[level] = candInfo[i]
+	}
+	// Gather the compaction scores for the levels participating in the compaction.
+	pc.pickerMetrics.scores = make([]float64, len(pc.inputs))
+	inputIdx := 0
+	for i := range infoByLevel {
+		if pc.inputs[inputIdx].level == infoByLevel[i].level {
+			pc.pickerMetrics.scores[inputIdx] = infoByLevel[i].score
+			inputIdx++
+		}
+		if inputIdx == len(pc.inputs) {
+			break
+		}
+	}
+}
+
+// elisionOnlyAnnotator is a manifest.Annotator that annotates B-Tree
+// nodes with the *manifest.TableMetadata of a file meeting the obsolete keys criteria
+// for an elision-only compaction within the subtree. If multiple files meet
+// the criteria, it chooses whichever file has the lowest LargestSeqNum. The
+// lowest LargestSeqNum file will be the first eligible for an elision-only
+// compaction once snapshots less than or equal to its LargestSeqNum are closed.
+var elisionOnlyAnnotator = &manifest.Annotator[manifest.TableMetadata]{
+	Aggregator: manifest.PickFileAggregator{
+		Filter: func(f *manifest.TableMetadata) (eligible bool, cacheOK bool) {
+			if f.IsCompacting() {
+				return false, true
+			}
+			if !f.StatsValid() {
+				return false, false
+			}
+			// Bottommost files are large and not worthwhile to compact just
+			// to remove a few tombstones. Consider a file eligible only if
+			// either its own range deletions delete at least 10% of its data or
+			// its deletion tombstones make at least 10% of its entries.
+			//
+			// TODO(jackson): This does not account for duplicate user keys
+			// which may be collapsed. Ideally, we would have 'obsolete keys'
+			// statistics that would include tombstones, the keys that are
+			// dropped by tombstones and duplicated user keys. See #847.
+			//
+			// Note that tables that contain exclusively range keys (i.e. no point keys,
+			// `NumEntries` and `RangeDeletionsBytesEstimate` are both zero) are excluded
+			// from elision-only compactions.
+			// TODO(travers): Consider an alternative heuristic for elision of range-keys.
+			return f.Stats.RangeDeletionsBytesEstimate*10 >= f.Size || f.Stats.NumDeletions*10 > f.Stats.NumEntries, true
+		},
+		Compare: func(f1 *manifest.TableMetadata, f2 *manifest.TableMetadata) bool {
+			return f1.LargestSeqNum < f2.LargestSeqNum
+		},
+	},
+}
+
+// markedForCompactionAnnotator is a manifest.Annotator that annotates B-Tree
+// nodes with the *manifest.TableMetadata of a file that is marked for compaction
+// within the subtree. If multiple files meet the criteria, it chooses
+// whichever file has the lowest LargestSeqNum.
+var markedForCompactionAnnotator = &manifest.Annotator[manifest.TableMetadata]{
+	Aggregator: manifest.PickFileAggregator{
+		Filter: func(f *manifest.TableMetadata) (eligible bool, cacheOK bool) {
+			return f.MarkedForCompaction, true
+		},
+		Compare: func(f1 *manifest.TableMetadata, f2 *manifest.TableMetadata) bool {
+			return f1.LargestSeqNum < f2.LargestSeqNum
+		},
+	},
+}
+
+// pickedCompactionFromCandidateFile creates a pickedCompaction from a *manifest.TableMetadata
+// with various checks to ensure that the file still exists in the expected level
+// and isn't already being compacted.
+func (p *compactionPickerByScore) pickedCompactionFromCandidateFile(
+	candidate *manifest.TableMetadata,
+	env compactionEnv,
+	startLevel int,
+	outputLevel int,
+	kind compactionKind,
+) *pickedTableCompaction {
+	if candidate == nil || candidate.IsCompacting() {
+		return nil
+	}
+
+	var inputs manifest.LevelSlice
+	if startLevel == 0 {
+		// Overlapping L0 files must also be compacted alongside the candidate.
+		inputs = p.vers.Overlaps(0, candidate.UserKeyBounds())
+	} else {
+		inputs = p.vers.Levels[startLevel].Find(p.opts.Comparer.Compare, candidate)
+	}
+	if invariants.Enabled {
+		found := false
+		for f := range inputs.All() {
+			if f.TableNum == candidate.TableNum {
+				found = true
+			}
+		}
+		if !found {
+			panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.TableNum, startLevel))
+		}
+	}
+
+	pc := newPickedTableCompaction(p.opts, p.vers, p.latestVersionState.l0Organizer,
+		startLevel, outputLevel, p.baseLevel)
+	pc.kind = kind
+	pc.startLevel.files = inputs
+	pc.bounds = manifest.KeyRange(p.opts.Comparer.Compare, pc.startLevel.files.All())
+
+	// Fail-safe to protect against compacting the same sstable concurrently.
+ if inputRangeAlreadyCompacting(p.opts.Comparer.Compare, env, pc) { + return nil + } + + if !pc.setupInputs(p.opts, env.diskAvailBytes, pc.startLevel, env.problemSpans) { + return nil + } + + return pc +} + +// pickElisionOnlyCompaction looks for compactions of sstables in the +// bottommost level containing obsolete records that may now be dropped. +func (p *compactionPickerByScore) pickElisionOnlyCompaction( + env compactionEnv, +) (pc *pickedTableCompaction) { + if p.opts.private.disableElisionOnlyCompactions { + return nil + } + candidate := elisionOnlyAnnotator.LevelAnnotation(p.vers.Levels[numLevels-1]) + if candidate == nil { + return nil + } + if candidate.LargestSeqNum >= env.earliestSnapshotSeqNum { + return nil + } + return p.pickedCompactionFromCandidateFile(candidate, env, numLevels-1, numLevels-1, compactionKindElisionOnly) +} + +// pickRewriteCompaction attempts to construct a compaction that +// rewrites a file marked for compaction. pickRewriteCompaction will +// pull in adjacent files in the file's atomic compaction unit if +// necessary. A rewrite compaction outputs files to the same level as +// the input level. +func (p *compactionPickerByScore) pickRewriteCompaction( + env compactionEnv, +) (pc *pickedTableCompaction) { + if p.vers.Stats.MarkedForCompaction == 0 { + return nil + } + for l := numLevels - 1; l >= 0; l-- { + candidate := markedForCompactionAnnotator.LevelAnnotation(p.vers.Levels[l]) + if candidate == nil { + // Try the next level. + continue + } + pc := p.pickedCompactionFromCandidateFile(candidate, env, l, l, compactionKindRewrite) + if pc != nil { + return pc + } + } + return nil +} + +// pickBlobFileRewriteCompaction looks for compactions of blob files that +// can be rewritten to reclaim disk space. 
+func (p *compactionPickerByScore) pickBlobFileRewriteCompaction( + env compactionEnv, +) (pc *pickedBlobFileCompaction) { + aggregateStats, heuristicStats := p.latestVersionState.blobFiles.Stats() + if heuristicStats.CountFilesEligible == 0 && heuristicStats.CountFilesTooRecent == 0 { + // No blob files with any garbage to rewrite. + return nil + } + policy := p.opts.Experimental.ValueSeparationPolicy() + if policy.TargetGarbageRatio >= 1.0 { + // Blob file rewrite compactions are disabled. + return nil + } + garbagePct := float64(aggregateStats.ValueSize-aggregateStats.ReferencedValueSize) / + float64(aggregateStats.ValueSize) + if garbagePct <= policy.TargetGarbageRatio { + // Not enough garbage to warrant a rewrite compaction. + return nil + } + + // Check if there is an ongoing blob file rewrite compaction. If there is, + // don't schedule a new one. + for _, c := range env.inProgressCompactions { + if c.kind == compactionKindBlobFileRewrite { + return nil + } + } + + candidate, ok := p.latestVersionState.blobFiles.ReplacementCandidate() + if !ok { + // None meet the heuristic. + return nil + } + return &pickedBlobFileCompaction{ + vers: p.vers, + file: candidate, + referencingTables: p.latestVersionState.blobFiles.ReferencingTables(candidate.FileID), + } +} + +// pickTombstoneDensityCompaction looks for a compaction that eliminates +// regions of extremely high point tombstone density. For each level, it picks +// a file where the ratio of tombstone-dense blocks is at least +// options.Experimental.MinTombstoneDenseRatio, prioritizing compaction of +// files with higher ratios of tombstone-dense blocks. +func (p *compactionPickerByScore) pickTombstoneDensityCompaction( + env compactionEnv, +) (pc *pickedTableCompaction) { + if p.opts.Experimental.TombstoneDenseCompactionThreshold <= 0 { + // Tombstone density compactions are disabled. 
+ return nil + } + + var candidate *manifest.TableMetadata + var level int + // If a candidate file has a very high overlapping ratio, point tombstones + // in it are likely sparse in keyspace even if the sstable itself is tombstone + // dense. These tombstones likely wouldn't be slow to iterate over, so we exclude + // these files from tombstone density compactions. The threshold of 40.0 is + // chosen somewhat arbitrarily, after some observations around excessively large + // tombstone density compactions. + const maxOverlappingRatio = 40.0 + // NB: we don't consider the lowest level because elision-only compactions + // handle that case. + lastNonEmptyLevel := numLevels - 1 + for l := numLevels - 2; l >= 0; l-- { + iter := p.vers.Levels[l].Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if f.IsCompacting() || !f.StatsValid() || f.Size == 0 { + continue + } + if f.Stats.TombstoneDenseBlocksRatio < p.opts.Experimental.TombstoneDenseCompactionThreshold { + continue + } + overlaps := p.vers.Overlaps(lastNonEmptyLevel, f.UserKeyBounds()) + if float64(overlaps.AggregateSizeSum())/float64(f.Size) > maxOverlappingRatio { + continue + } + if candidate == nil || candidate.Stats.TombstoneDenseBlocksRatio < f.Stats.TombstoneDenseBlocksRatio { + candidate = f + level = l + } + } + // We prefer lower level (ie. L5) candidates over higher level (ie. L4) ones. + if candidate != nil { + break + } + if !p.vers.Levels[l].Empty() { + lastNonEmptyLevel = l + } + } + + return p.pickedCompactionFromCandidateFile(candidate, env, level, defaultOutputLevel(level, p.baseLevel), compactionKindTombstoneDensity) +} + +// pickAutoLPositive picks an automatic compaction for the candidate +// file in a positive-numbered level. This function must not be used for +// L0. 
+func pickAutoLPositive( + env compactionEnv, + opts *Options, + vers *manifest.Version, + l0Organizer *manifest.L0Organizer, + cInfo candidateLevelInfo, + baseLevel int, +) (pc *pickedTableCompaction) { + if cInfo.level == 0 { + panic("pebble: pickAutoLPositive called for L0") + } + + pc = newPickedTableCompaction(opts, vers, l0Organizer, cInfo.level, defaultOutputLevel(cInfo.level, baseLevel), baseLevel) + if pc.outputLevel.level != cInfo.outputLevel { + panic("pebble: compaction picked unexpected output level") + } + pc.startLevel.files = cInfo.file.Slice() + + if !pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel, env.problemSpans) { + return nil + } + return pc.maybeAddLevel(opts, env.diskAvailBytes) +} + +// maybeAddLevel maybe adds a level to the picked compaction. +func (pc *pickedTableCompaction) maybeAddLevel( + opts *Options, diskAvailBytes uint64, +) *pickedTableCompaction { + pc.pickerMetrics.singleLevelOverlappingRatio = pc.overlappingRatio() + if pc.outputLevel.level == numLevels-1 { + // Don't add a level if the current output level is in L6. + return pc + } + if !opts.Experimental.MultiLevelCompactionHeuristic.allowL0() && pc.startLevel.level == 0 { + return pc + } + targetFileSize := opts.TargetFileSize(pc.outputLevel.level, pc.baseLevel) + if pc.estimatedInputSize() > expandedCompactionByteSizeLimit(opts, targetFileSize, diskAvailBytes) { + // Don't add a level if the current compaction exceeds the compaction size limit + return pc + } + return opts.Experimental.MultiLevelCompactionHeuristic.pick(pc, opts, diskAvailBytes) +} + +// MultiLevelHeuristic evaluates whether to add files from the next level into the compaction. +type MultiLevelHeuristic interface { + // Evaluate returns the preferred compaction. + pick(pc *pickedTableCompaction, opts *Options, diskAvailBytes uint64) *pickedTableCompaction + + // Returns if the heuristic allows L0 to be involved in ML compaction + allowL0() bool + + // String implements fmt.Stringer. 
+ String() string +} + +// NoMultiLevel will never add an additional level to the compaction. +type NoMultiLevel struct{} + +var _ MultiLevelHeuristic = (*NoMultiLevel)(nil) + +func (nml NoMultiLevel) pick( + pc *pickedTableCompaction, opts *Options, diskAvailBytes uint64, +) *pickedTableCompaction { + return pc +} + +func (nml NoMultiLevel) allowL0() bool { return false } +func (nml NoMultiLevel) String() string { return "none" } + +func (pc *pickedTableCompaction) predictedWriteAmp() float64 { + var bytesToCompact uint64 + var higherLevelBytes uint64 + for i := range pc.inputs { + levelSize := pc.inputs[i].files.AggregateSizeSum() + bytesToCompact += levelSize + if i != len(pc.inputs)-1 { + higherLevelBytes += levelSize + } + } + return float64(bytesToCompact) / float64(higherLevelBytes) +} + +func (pc *pickedTableCompaction) overlappingRatio() float64 { + var higherLevelBytes uint64 + var lowestLevelBytes uint64 + for i := range pc.inputs { + levelSize := pc.inputs[i].files.AggregateSizeSum() + if i == len(pc.inputs)-1 { + lowestLevelBytes += levelSize + continue + } + higherLevelBytes += levelSize + } + return float64(lowestLevelBytes) / float64(higherLevelBytes) +} + +// WriteAmpHeuristic defines a multi level compaction heuristic which will add +// an additional level to the picked compaction if it reduces predicted write +// amp of the compaction + the addPropensity constant. +type WriteAmpHeuristic struct { + // addPropensity is a constant that affects the propensity to conduct multilevel + // compactions. If positive, a multilevel compaction may get picked even if + // the single level compaction has lower write amp, and vice versa. + AddPropensity float64 + + // AllowL0 if true, allow l0 to be involved in a ML compaction. + AllowL0 bool +} + +var _ MultiLevelHeuristic = (*WriteAmpHeuristic)(nil) + +// TODO(msbutler): microbenchmark the extent to which multilevel compaction +// picking slows down the compaction picking process. 
This should be as fast as +// possible since Compaction-picking holds d.mu, which prevents WAL rotations, +// in-progress flushes and compactions from completing, etc. Consider ways to +// deduplicate work, given that setupInputs has already been called. +func (wa WriteAmpHeuristic) pick( + pcOrig *pickedTableCompaction, opts *Options, diskAvailBytes uint64, +) *pickedTableCompaction { + pcMulti := pcOrig.clone() + if !pcMulti.setupMultiLevelCandidate(opts, diskAvailBytes) { + return pcOrig + } + // We consider the addition of a level as an "expansion" of the compaction. + // If pcMulti is past the expanded compaction byte size limit already, + // we don't consider it. + targetFileSize := opts.TargetFileSize(pcMulti.outputLevel.level, pcMulti.baseLevel) + if pcMulti.estimatedInputSize() >= expandedCompactionByteSizeLimit(opts, targetFileSize, diskAvailBytes) { + return pcOrig + } + picked := pcOrig + if pcMulti.predictedWriteAmp() <= pcOrig.predictedWriteAmp()+wa.AddPropensity { + picked = pcMulti + } + // Regardless of what compaction was picked, log the multilevelOverlapping ratio. + picked.pickerMetrics.multiLevelOverlappingRatio = pcMulti.overlappingRatio() + return picked +} + +func (wa WriteAmpHeuristic) allowL0() bool { + return wa.AllowL0 +} + +// String implements fmt.Stringer. +func (wa WriteAmpHeuristic) String() string { + return fmt.Sprintf("wamp(%.2f, %t)", wa.AddPropensity, wa.AllowL0) +} + +// Helper method to pick compactions originating from L0. Uses information about +// sublevels to generate a compaction. +func pickL0( + env compactionEnv, + opts *Options, + vers *manifest.Version, + l0Organizer *manifest.L0Organizer, + baseLevel int, +) *pickedTableCompaction { + // It is important to pass information about Lbase files to L0Sublevels + // so it can pick a compaction that does not conflict with an Lbase => Lbase+1 + // compaction. 
Without this, we observed reduced concurrency of L0=>Lbase + // compactions, and increasing read amplification in L0. + // + // TODO(bilal) Remove the minCompactionDepth parameter once fixing it at 1 + // has been shown to not cause a performance regression. + lcf := l0Organizer.PickBaseCompaction(opts.Logger, 1, vers.Levels[baseLevel].Slice(), baseLevel, env.problemSpans) + if lcf != nil { + pc := newPickedCompactionFromL0(lcf, opts, vers, l0Organizer, baseLevel, true) + if pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel, env.problemSpans) { + if pc.startLevel.files.Empty() { + opts.Logger.Errorf("%v", base.AssertionFailedf("empty compaction chosen")) + } + return pc.maybeAddLevel(opts, env.diskAvailBytes) + } + // TODO(radu): investigate why this happens. + // opts.Logger.Errorf("%v", base.AssertionFailedf("setupInputs failed")) + } + + // Couldn't choose a base compaction. Try choosing an intra-L0 + // compaction. Note that we pass in L0CompactionThreshold here as opposed to + // 1, since choosing a single sublevel intra-L0 compaction is + // counterproductive. + lcf = l0Organizer.PickIntraL0Compaction(env.earliestUnflushedSeqNum, minIntraL0Count, env.problemSpans) + if lcf != nil { + pc := newPickedCompactionFromL0(lcf, opts, vers, l0Organizer, baseLevel, false) + if pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel, env.problemSpans) { + if pc.startLevel.files.Empty() { + opts.Logger.Fatalf("empty compaction chosen") + } + // A single-file intra-L0 compaction is unproductive. + if iter := pc.startLevel.files.Iter(); iter.First() != nil && iter.Next() != nil { + pc.bounds = manifest.KeyRange(opts.Comparer.Compare, pc.startLevel.files.All()) + return pc + } + } else { + // TODO(radu): investigate why this happens. 
+ // opts.Logger.Errorf("%v", base.AssertionFailedf("setupInputs failed")) + } + } + return nil +} + +func newPickedManualCompaction( + vers *manifest.Version, + l0Organizer *manifest.L0Organizer, + opts *Options, + env compactionEnv, + baseLevel int, + manual *manualCompaction, +) (pc *pickedTableCompaction, retryLater bool) { + outputLevel := manual.level + 1 + if manual.level == 0 { + outputLevel = baseLevel + } else if manual.level < baseLevel { + // The start level for a compaction must be >= Lbase. A manual + // compaction could have been created adhering to that condition, and + // then an automatic compaction came in and compacted all of the + // sstables in Lbase to Lbase+1 which caused Lbase to change. Simply + // ignore this manual compaction as there is nothing to do (manual.level + // points to an empty level). + return nil, false + } + // This conflictsWithInProgress call is necessary for the manual compaction to + // be retried when it conflicts with an ongoing automatic compaction. Without + // it, the compaction is dropped due to pc.setupInputs returning false since + // the input/output range is already being compacted, and the manual + // compaction ends with a non-compacted LSM. + if conflictsWithInProgress(manual, outputLevel, env.inProgressCompactions, opts.Comparer.Compare) { + return nil, true + } + pc = newPickedTableCompaction(opts, vers, l0Organizer, manual.level, defaultOutputLevel(manual.level, baseLevel), baseLevel) + pc.manualID = manual.id + manual.outputLevel = pc.outputLevel.level + pc.startLevel.files = vers.Overlaps(manual.level, base.UserKeyBoundsInclusive(manual.start, manual.end)) + if pc.startLevel.files.Empty() { + // Nothing to do + return nil, false + } + // We use nil problemSpans because we don't want problem spans to prevent + // manual compactions. 
+ if !pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel, nil /* problemSpans */) { + // setupInputs returned false indicating there's a conflicting + // concurrent compaction. + return nil, true + } + if pc = pc.maybeAddLevel(opts, env.diskAvailBytes); pc == nil { + return nil, false + } + if pc.outputLevel.level != outputLevel { + if len(pc.inputs) > 2 { + // Multilevel compactions relax this invariant. + } else { + panic("pebble: compaction picked unexpected output level") + } + } + // Fail-safe to protect against compacting the same sstable concurrently. + if inputRangeAlreadyCompacting(opts.Comparer.Compare, env, pc) { + return nil, true + } + return pc, false +} + +// pickDownloadCompaction picks a download compaction for the downloadSpan, +// which could be specified as being performed either by a copy compaction of +// the backing file or a rewrite compaction. +func pickDownloadCompaction( + vers *manifest.Version, + l0Organizer *manifest.L0Organizer, + opts *Options, + env compactionEnv, + baseLevel int, + kind compactionKind, + level int, + file *manifest.TableMetadata, +) (pc *pickedTableCompaction) { + // Check if the file is compacting already. + if file.CompactionState == manifest.CompactionStateCompacting { + return nil + } + if kind != compactionKindCopy && kind != compactionKindRewrite { + panic("invalid download/rewrite compaction kind") + } + pc = newPickedTableCompaction(opts, vers, l0Organizer, level, level, baseLevel) + pc.kind = kind + pc.startLevel.files = manifest.NewLevelSliceKeySorted(opts.Comparer.Compare, []*manifest.TableMetadata{file}) + if !pc.setupInputs(opts, env.diskAvailBytes, pc.startLevel, nil /* problemSpans */) { + // setupInputs returned false indicating there's a conflicting + // concurrent compaction. + return nil + } + if pc.outputLevel.level != level { + panic("pebble: download compaction picked unexpected output level") + } + // Fail-safe to protect against compacting the same sstable concurrently. 
+ if inputRangeAlreadyCompacting(opts.Comparer.Compare, env, pc) { + return nil + } + return pc +} + +func (p *compactionPickerByScore) pickReadTriggeredCompaction( + env compactionEnv, +) (pc *pickedTableCompaction) { + // If a flush is in-progress or expected to happen soon, it means more writes are taking place. We would + // soon be scheduling more write focussed compactions. In this case, skip read compactions as they are + // lower priority. + if env.readCompactionEnv.flushing || env.readCompactionEnv.readCompactions == nil { + return nil + } + for env.readCompactionEnv.readCompactions.size > 0 { + rc := env.readCompactionEnv.readCompactions.remove() + if pc = pickReadTriggeredCompactionHelper(p, rc, env); pc != nil { + break + } + } + return pc +} + +func pickReadTriggeredCompactionHelper( + p *compactionPickerByScore, rc *readCompaction, env compactionEnv, +) (pc *pickedTableCompaction) { + overlapSlice := p.vers.Overlaps(rc.level, base.UserKeyBoundsInclusive(rc.start, rc.end)) + var fileMatches bool + for f := range overlapSlice.All() { + if f.TableNum == rc.tableNum { + fileMatches = true + break + } + } + if !fileMatches { + return nil + } + + pc = newPickedTableCompaction(p.opts, p.vers, p.latestVersionState.l0Organizer, + rc.level, defaultOutputLevel(rc.level, p.baseLevel), p.baseLevel) + + pc.startLevel.files = overlapSlice + if !pc.setupInputs(p.opts, env.diskAvailBytes, pc.startLevel, env.problemSpans) { + return nil + } + if inputRangeAlreadyCompacting(p.opts.Comparer.Compare, env, pc) { + return nil + } + pc.kind = compactionKindRead + + // Prevent read compactions which are too wide. + outputOverlaps := pc.version.Overlaps(pc.outputLevel.level, pc.bounds) + if outputOverlaps.AggregateSizeSum() > pc.maxReadCompactionBytes { + return nil + } + + // Prevent compactions which start with a small seed file X, but overlap + // with over allowedCompactionWidth * X file sizes in the output layer. 
+ const allowedCompactionWidth = 35 + if outputOverlaps.AggregateSizeSum() > overlapSlice.AggregateSizeSum()*allowedCompactionWidth { + return nil + } + + return pc +} + +func (p *compactionPickerByScore) forceBaseLevel1() { + p.baseLevel = 1 +} + +func inputRangeAlreadyCompacting( + cmp base.Compare, env compactionEnv, pc *pickedTableCompaction, +) bool { + for _, cl := range pc.inputs { + for f := range cl.files.All() { + if f.IsCompacting() { + return true + } + } + } + + // Look for active compactions outputting to the same region of the key + // space in the same output level. Two potential compactions may conflict + // without sharing input files if there are no files in the output level + // that overlap with the intersection of the compactions' key spaces. + // + // Consider an active L0->Lbase compaction compacting two L0 files one + // [a-f] and the other [t-z] into Lbase. + // + // L0 + // ↦ 000100 ↤ ↦ 000101 ↤ + // L1 + // ↦ 000004 ↤ + // a b c d e f g h i j k l m n o p q r s t u v w x y z + // + // If a new file 000102 [j-p] is flushed while the existing compaction is + // still ongoing, new file would not be in any compacting sublevel + // intervals and would not overlap with any Lbase files that are also + // compacting. However, this compaction cannot be picked because the + // compaction's output key space [j-p] would overlap the existing + // compaction's output key space [a-z]. + // + // L0 + // ↦ 000100* ↤ ↦ 000102 ↤ ↦ 000101* ↤ + // L1 + // ↦ 000004* ↤ + // a b c d e f g h i j k l m n o p q r s t u v w x y z + // + // * - currently compacting + if pc.outputLevel != nil && pc.outputLevel.level != 0 { + for _, c := range env.inProgressCompactions { + if pc.outputLevel.level != c.outputLevel { + continue + } + if !c.bounds.Overlaps(cmp, &pc.bounds) { + continue + } + // The picked compaction and the in-progress compaction c are + // outputting to the same region of the key space of the same + // level. 
+ return true + } + } + return false +} + +// conflictsWithInProgress checks if there are any in-progress compactions with overlapping keyspace. +func conflictsWithInProgress( + manual *manualCompaction, outputLevel int, inProgressCompactions []compactionInfo, cmp Compare, +) bool { + for _, c := range inProgressCompactions { + if (c.outputLevel == manual.level || c.outputLevel == outputLevel) && + areUserKeysOverlapping(manual.start, manual.end, c.bounds.Start, c.bounds.End.Key, cmp) { + return true + } + for _, in := range c.inputs { + if in.files.Empty() { + continue + } + iter := in.files.Iter() + smallest := iter.First().Smallest().UserKey + largest := iter.Last().Largest().UserKey + if (in.level == manual.level || in.level == outputLevel) && + areUserKeysOverlapping(manual.start, manual.end, smallest, largest, cmp) { + return true + } + } + } + return false +} + +func areUserKeysOverlapping(x1, x2, y1, y2 []byte, cmp Compare) bool { + return cmp(x1, y2) <= 0 && cmp(y1, x2) <= 0 +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/compaction_scheduler.go b/vendor/github.com/cockroachdb/pebble/v2/compaction_scheduler.go new file mode 100644 index 0000000..abe92f4 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/compaction_scheduler.go @@ -0,0 +1,473 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "sync" + "time" + + "github.com/cockroachdb/pebble/v2/internal/base" +) + +type CompactionGrantHandle = base.CompactionGrantHandle +type CompactionGrantHandleStats = base.CompactionGrantHandleStats +type CompactionGoroutineKind = base.CompactionGoroutineKind + +const ( + CompactionGoroutinePrimary = base.CompactionGoroutinePrimary + CompactionGoroutineSSTableSecondary = base.CompactionGoroutineSSTableSecondary + CompactionGoroutineBlobFileSecondary = base.CompactionGoroutineBlobFileSecondary +) + +// NB: This interface is experimental and subject to change. +// +// For instance, we may incorporate more information in TrySchedule and in the +// return value of Schedule to tell CompactionScheduler of the sub-category of +// compaction so that the scheduler can have more granular estimates. For +// example, the input or output level could affect the write bandwidth if the +// inputs are better cached (say at higher levels). + +// CompactionScheduler is responsible for scheduling both automatic and manual +// compactions. In the case of multiple DB instances on a node (i.e. a +// multi-store configuration), implementations of CompactionScheduler may +// enforce a global maximum compaction concurrency. Additionally, +// implementations of CompactionScheduler may be resource aware and permit +// more than the compactions that are "allowed without permission" if +// resources are available. +// +// Locking: CompactionScheduler's mutexes are ordered after DBForCompaction +// mutexes. We need to specify some lock ordering since CompactionScheduler +// and DBForCompaction call into each other. This ordering choice is made to +// simplify the implementation of DBForCompaction. There are three exceptions +// to this DBForCompaction.GetAllowedWithoutPermission, +// CompactionScheduler.Unregister, CompactionGrantHandle.Done -- see those +// declarations for details. 
+type CompactionScheduler interface { + // Register is called to register this DB and to specify the number of + // goroutines that consume CPU in each compaction (see the CPU reporting + // interface, CompactionGrantHandle.MeasureCPU). Must be called exactly once + // by this DB if it successfully opens. + Register(numGoroutinesPerCompaction int, db DBForCompaction) + // Unregister is used to unregister the DB. Must be called once when the DB + // is being closed. Unregister waits until all ongoing calls to + // DBForCompaction are finished, so Unregister must not be called while + // holding locks that DBForCompaction acquires in those calls. + Unregister() + // TrySchedule is called by DB when it wants to run a compaction. The bool + // is true iff permission is granted, and in that case the + // CompactionGrantHandle needs to be exercised by the DB. + TrySchedule() (bool, CompactionGrantHandle) + // UpdateGetAllowedWithoutPermission is to inform the scheduler that some + // external behavior may have caused this value to change. It exists because + // flushes are not otherwise visible to the CompactionScheduler, and can + // cause the value to increase. CompactionScheduler implementation should do + // periodic sampling (e.g. as done by + // ConcurrencyLimitScheduler.periodicGranter), but this provides an + // instantaneous opportunity to act. + UpdateGetAllowedWithoutPermission() +} + +// DBForCompaction is the interface implemented by the DB to interact with the +// CompactionScheduler. +type DBForCompaction interface { + // GetAllowedWithoutPermission returns what is permitted at the DB-level + // (there may be further restrictions at the node level, when there are + // multiple DBs at a node, which is not captured by this number). This can + // vary based on compaction backlog or other factors. This method must not + // acquire any mutex in DBForCompaction that is covered by the general mutex + // ordering rule stated earlier. 
+ GetAllowedWithoutPermission() int + // GetWaitingCompaction returns true iff the DB can run a compaction. The + // true return is accompanied by a populated WaitingForCompaction, that the + // scheduler can use to pick across DBs or other work in the system. This + // method should typically be efficient, in that the DB should try to cache + // some state if its previous call to TrySchedule resulted in a failure to + // get permission. It is ok if it is sometimes slow since all work scheduled + // by CompactionScheduler is long-lived (often executing for multiple + // seconds). + GetWaitingCompaction() (bool, WaitingCompaction) + // Schedule grants the DB permission to run a compaction. The DB returns + // true iff it accepts the grant, in which case it must exercise the + // CompactionGrantHandle. + Schedule(CompactionGrantHandle) bool +} + +// WaitingCompaction captures state for a compaction that can be used to +// prioritize wrt compactions in other DBs or other long-lived work in the +// system. +type WaitingCompaction struct { + // Optional is true for a compaction that isn't necessary for maintaining an + // overall healthy LSM. This value can be compared across compactions and + // other long-lived work. + Optional bool + // Priority is the priority of a compaction. It is only compared across + // compactions, and when the Optional value is the same. + Priority int + // Score is only compared across compactions. It is only compared across + // compactions, and when the Optional and Priority are the same. + Score float64 +} + +// Ordering is by priority and if the optional value is different, false is +// more important than true. +// +// The ordering here must be consistent with the order in which compactions +// are picked in compactionPickerByScore.pickAuto. 
+type compactionOptionalAndPriority struct { + optional bool + priority int +} + +var scheduledCompactionMap map[compactionKind]compactionOptionalAndPriority +var manualCompactionPriority int + +func init() { + // Manual compactions have priority just below the score-rebased + // compactions, since DB.pickAnyCompaction first picks score-based + // compactions, and then manual compactions. + manualCompactionPriority = 70 + scheduledCompactionMap = map[compactionKind]compactionOptionalAndPriority{} + // Score-based-compactions have priorities {100, 90, 80}. + // + // We don't actually know if it is a compactionKindMove or + // compactionKindCopy until a compactionKindDefault is turned from a + // pickedCompaction into a compaction struct. So we will never see those + // values here, but for completeness we include them. + scheduledCompactionMap[compactionKindMove] = compactionOptionalAndPriority{priority: 100} + scheduledCompactionMap[compactionKindCopy] = compactionOptionalAndPriority{priority: 90} + scheduledCompactionMap[compactionKindDefault] = compactionOptionalAndPriority{priority: 80} + scheduledCompactionMap[compactionKindTombstoneDensity] = + compactionOptionalAndPriority{optional: true, priority: 60} + scheduledCompactionMap[compactionKindElisionOnly] = + compactionOptionalAndPriority{optional: true, priority: 50} + scheduledCompactionMap[compactionKindBlobFileRewrite] = + compactionOptionalAndPriority{optional: true, priority: 40} + scheduledCompactionMap[compactionKindRead] = + compactionOptionalAndPriority{optional: true, priority: 30} + scheduledCompactionMap[compactionKindRewrite] = + compactionOptionalAndPriority{optional: true, priority: 20} +} + +// noopGrantHandle is used in cases that don't interact with a CompactionScheduler. 
+type noopGrantHandle struct{} + +var _ CompactionGrantHandle = noopGrantHandle{} + +func (h noopGrantHandle) Started() {} +func (h noopGrantHandle) MeasureCPU(CompactionGoroutineKind) {} +func (h noopGrantHandle) CumulativeStats(stats base.CompactionGrantHandleStats) {} +func (h noopGrantHandle) Done() {} + +// pickedCompactionCache is used to avoid the work of repeatedly picking a +// compaction that then fails to run immediately because TrySchedule returns +// false. +// +// The high-level approach is to construct a pickedCompaction in +// DB.maybeScheduleCompaction if there isn't one in the cache, and if +// TrySchedule returns false, to remember it. Ignoring flushes, the worst-case +// behavior is 1 of 2 pickedCompactions gets to run (so half the picking work +// is wasted). This worst-case happens when the system is running at the limit +// of the long-lived work (including compactions) it can support. In this +// setting, each started compaction invalidates the pickedCompaction in the +// cache when it completes, and the reason the cache has a pickedCompaction +// (that got invalidated) is that the CompactionScheduler called +// GetWaitingCompaction and decided not to run the pickedCompaction (some +// other work won). We consider the CPU overhead of this waste acceptable. +// +// For the default case of a ConcurrencyLimitScheduler, which only considers a +// single DB, the aforementioned worst-case is avoided by not constructing a +// new pickedCompaction in DB.maybeScheduleCompaction when +// pickedCompactionCache.isWaiting is already true (which became true once, +// when a backlog developed). Whenever a compaction completes and a new +// compaction can be started, the call to DBForCompaction.GetWaitingCompaction +// constructs a new pickedCompaction and caches it, and then this immediately +// gets to run when DBForCompaction.Schedule is called. +type pickedCompactionCache struct { + // pc != nil => waiting. 
+ // + // It is acceptable for waiting to be true and pc to be nil, when pc is + // invalidated due to starting a compaction, or completing a + // compaction/flush (since it changes the latest version). + waiting bool + pc pickedCompaction +} + +// invalidate the cache because a new Version is installed or a compaction is +// started (since a new in-progress compaction affects future compaction +// picking). The value of waiting is not changed. +func (c *pickedCompactionCache) invalidate() { + c.pc = nil +} + +// isWaiting returns the value of waiting. +func (c *pickedCompactionCache) isWaiting() bool { + return c.waiting +} + +// getForRunning returns a pickedCompaction if in the cache. The cache is +// cleared. It may return nil. +func (c *pickedCompactionCache) getForRunning() pickedCompaction { + // NB: This does not set c.waiting = false, since there may be more + // compactions to run. + pc := c.pc + c.pc = nil + return pc +} + +// setNotWaiting sets waiting to false. +func (c *pickedCompactionCache) setNotWaiting() { + c.waiting = false + c.pc = nil +} + +// peek return the pickedCompaction, if any, in the cache. +func (c *pickedCompactionCache) peek() pickedCompaction { + return c.pc +} + +// add adds a pickedCompaction to the cache and sets waiting to true. +func (c *pickedCompactionCache) add(pc pickedCompaction) { + c.waiting = true + c.pc = pc +} + +// ConcurrencyLimitScheduler is the default scheduler used by Pebble. It +// simply uses the concurrency limit retrieved from +// DBForCompaction.GetAllowedWithoutPermission to decide the number of +// compactions to schedule. ConcurrencyLimitScheduler must have its Register +// method called at most once -- i.e., it cannot be reused across DBs. +// +// Since the GetAllowedWithoutPermission value changes over time, the +// scheduler needs to be quite current in its sampling, especially if the +// value is increasing, to prevent lag in scheduling compactions. 
Calls to +// ConcurrencyLimitScheduler.Done and ConcurrencyLimitScheduler.TrySchedule +// are obvious places this value is sampled. However, since +// ConcurrencyLimitScheduler does not observe flushes (which can increase the +// value), and there can be situations where compactions last 10+ seconds, +// this sampling is not considered sufficient. Note that calls to +// ConcurrencyLimitScheduler.TrySchedule are dampened in +// DB.maybeScheduleCompaction when there is a waiting compaction (to prevent +// wasted computation of pickedCompaction). If DB.maybeScheduleCompaction +// always called ConcurrencyLimitScheduler.TrySchedule we would have no lag as +// DB.maybeScheduleCompaction is called on flush completion. Hence, we resort +// to having a background thread in ConcurrencyLimitScheduler sample the value +// every 100ms, plus sample in UpdateGetAllowedWithoutPermission. +type ConcurrencyLimitScheduler struct { + ts schedulerTimeSource + // db is set in Register, but not protected by mu since it is strictly + // before any calls to the other methods. + db DBForCompaction + mu struct { + sync.Mutex + runningCompactions int + // unregistered transitions once from false => true. + unregistered bool + // isGranting is used to (a) serialize granting from Done and + // periodicGranter, (b) ensure that granting is stopped before returning + // from Unregister. + isGranting bool + isGrantingCond *sync.Cond + lastAllowedWithoutPermission int + } + stopPeriodicGranterCh chan struct{} + pokePeriodicGranterCh chan struct{} + // Only non-nil in some tests. 
+ periodicGranterRanChForTesting chan struct{} +} + +var _ CompactionScheduler = &ConcurrencyLimitScheduler{} + +func newConcurrencyLimitScheduler(ts schedulerTimeSource) *ConcurrencyLimitScheduler { + s := &ConcurrencyLimitScheduler{ + ts: ts, + stopPeriodicGranterCh: make(chan struct{}), + pokePeriodicGranterCh: make(chan struct{}, 1), + } + s.mu.isGrantingCond = sync.NewCond(&s.mu.Mutex) + return s +} + +func NewConcurrencyLimitSchedulerWithNoPeriodicGrantingForTest() *ConcurrencyLimitScheduler { + s := &ConcurrencyLimitScheduler{ + ts: defaultTimeSource{}, + } + s.mu.isGrantingCond = sync.NewCond(&s.mu.Mutex) + return s +} + +func (s *ConcurrencyLimitScheduler) Register(numGoroutinesPerCompaction int, db DBForCompaction) { + s.db = db + if s.stopPeriodicGranterCh != nil { + go s.periodicGranter() + } +} + +func (s *ConcurrencyLimitScheduler) Unregister() { + if s.stopPeriodicGranterCh != nil { + s.stopPeriodicGranterCh <- struct{}{} + } + s.mu.Lock() + defer s.mu.Unlock() + s.mu.unregistered = true + // Wait until isGranting becomes false. Since unregistered has been set to + // true, once isGranting becomes false, no more granting will happen. 
+ for s.mu.isGranting { + s.mu.isGrantingCond.Wait() + } +} + +func (s *ConcurrencyLimitScheduler) TrySchedule() (bool, CompactionGrantHandle) { + s.mu.Lock() + defer s.mu.Unlock() + if s.mu.unregistered { + return false, nil + } + s.mu.lastAllowedWithoutPermission = s.db.GetAllowedWithoutPermission() + if s.mu.lastAllowedWithoutPermission > s.mu.runningCompactions { + s.mu.runningCompactions++ + return true, s + } + return false, nil +} + +func (s *ConcurrencyLimitScheduler) Started() {} +func (s *ConcurrencyLimitScheduler) MeasureCPU(CompactionGoroutineKind) {} +func (s *ConcurrencyLimitScheduler) CumulativeStats(stats base.CompactionGrantHandleStats) {} + +func (s *ConcurrencyLimitScheduler) Done() { + s.mu.Lock() + s.mu.runningCompactions-- + s.tryGrantLockedAndUnlock() +} + +func (s *ConcurrencyLimitScheduler) UpdateGetAllowedWithoutPermission() { + s.mu.Lock() + allowedWithoutPermission := s.db.GetAllowedWithoutPermission() + tryGrant := allowedWithoutPermission > s.mu.lastAllowedWithoutPermission + s.mu.lastAllowedWithoutPermission = allowedWithoutPermission + s.mu.Unlock() + if tryGrant { + select { + case s.pokePeriodicGranterCh <- struct{}{}: + default: + } + } +} + +func (s *ConcurrencyLimitScheduler) tryGrantLockedAndUnlock() { + defer s.mu.Unlock() + if s.mu.unregistered { + return + } + // Wait for turn to grant. + for s.mu.isGranting { + s.mu.isGrantingCond.Wait() + } + // INVARIANT: !isGranting. + if s.mu.unregistered { + return + } + s.mu.lastAllowedWithoutPermission = s.db.GetAllowedWithoutPermission() + toGrant := s.mu.lastAllowedWithoutPermission - s.mu.runningCompactions + if toGrant > 0 { + s.mu.isGranting = true + } else { + return + } + s.mu.Unlock() + // We call GetWaitingCompaction iff we can successfully grant, so that there + // is no wasted pickedCompaction. + // + // INVARIANT: loop exits with s.mu unlocked. 
+ for toGrant > 0 { + waiting, _ := s.db.GetWaitingCompaction() + if !waiting { + break + } + accepted := s.db.Schedule(s) + if !accepted { + break + } + s.mu.Lock() + s.mu.runningCompactions++ + toGrant-- + s.mu.Unlock() + } + // Will be unlocked by the defer statement. + s.mu.Lock() + s.mu.isGranting = false + s.mu.isGrantingCond.Broadcast() +} + +func (s *ConcurrencyLimitScheduler) periodicGranter() { + ticker := s.ts.newTicker(100 * time.Millisecond) + for { + select { + case <-ticker.ch(): + s.mu.Lock() + s.tryGrantLockedAndUnlock() + case <-s.pokePeriodicGranterCh: + s.mu.Lock() + s.tryGrantLockedAndUnlock() + case <-s.stopPeriodicGranterCh: + ticker.stop() + return + } + if s.periodicGranterRanChForTesting != nil { + s.periodicGranterRanChForTesting <- struct{}{} + } + } +} + +func (s *ConcurrencyLimitScheduler) adjustRunningCompactionsForTesting(delta int) { + s.mu.Lock() + s.mu.runningCompactions += delta + if delta < 0 { + s.tryGrantLockedAndUnlock() + } else { + s.mu.Unlock() + } +} + +func (s *ConcurrencyLimitScheduler) isUnregisteredForTesting() bool { + s.mu.Lock() + defer s.mu.Unlock() + return s.mu.unregistered +} + +// schedulerTimeSource is used to abstract time.NewTicker for +// ConcurrencyLimitScheduler. +type schedulerTimeSource interface { + newTicker(duration time.Duration) schedulerTicker +} + +// schedulerTicker is used to abstract time.Ticker for +// ConcurrencyLimitScheduler. +type schedulerTicker interface { + stop() + ch() <-chan time.Time +} + +// defaultTime is a schedulerTimeSource using the time package. +type defaultTimeSource struct{} + +var _ schedulerTimeSource = defaultTimeSource{} + +func (defaultTimeSource) newTicker(duration time.Duration) schedulerTicker { + return (*defaultTicker)(time.NewTicker(duration)) +} + +// defaultTicker uses time.Ticker. 
+type defaultTicker time.Ticker + +var _ schedulerTicker = &defaultTicker{} + +func (t *defaultTicker) stop() { + (*time.Ticker)(t).Stop() +} + +func (t *defaultTicker) ch() <-chan time.Time { + return (*time.Ticker)(t).C +} diff --git a/vendor/github.com/cockroachdb/pebble/comparer.go b/vendor/github.com/cockroachdb/pebble/v2/comparer.go similarity index 84% rename from vendor/github.com/cockroachdb/pebble/comparer.go rename to vendor/github.com/cockroachdb/pebble/v2/comparer.go index c92cd79..f2fb1d1 100644 --- a/vendor/github.com/cockroachdb/pebble/comparer.go +++ b/vendor/github.com/cockroachdb/pebble/v2/comparer.go @@ -4,7 +4,7 @@ package pebble -import "github.com/cockroachdb/pebble/internal/base" +import "github.com/cockroachdb/pebble/v2/internal/base" // Compare exports the base.Compare type. type Compare = base.Compare @@ -29,3 +29,6 @@ type Comparer = base.Comparer // DefaultComparer exports the base.DefaultComparer variable. var DefaultComparer = base.DefaultComparer + +// CheckComparer exports the base.CheckComparer type. +var CheckComparer = base.CheckComparer diff --git a/vendor/github.com/cockroachdb/pebble/db.go b/vendor/github.com/cockroachdb/pebble/v2/db.go similarity index 67% rename from vendor/github.com/cockroachdb/pebble/db.go rename to vendor/github.com/cockroachdb/pebble/v2/db.go index 67a2065..9209752 100644 --- a/vendor/github.com/cockroachdb/pebble/db.go +++ b/vendor/github.com/cockroachdb/pebble/v2/db.go @@ -3,44 +3,50 @@ // the LICENSE file. // Package pebble provides an ordered key/value store. 
-package pebble // import "github.com/cockroachdb/pebble" +package pebble // import "github.com/cockroachdb/pebble/v2" import ( "context" "fmt" "io" - "os" - "strconv" + "slices" "sync" "sync/atomic" "time" + "unsafe" + "github.com/cockroachdb/crlib/crtime" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/arenaskl" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invalidating" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" - "github.com/cockroachdb/pebble/internal/manual" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/objstorage/remote" - "github.com/cockroachdb/pebble/rangekey" - "github.com/cockroachdb/pebble/record" - "github.com/cockroachdb/pebble/sstable" - "github.com/cockroachdb/pebble/vfs" - "github.com/cockroachdb/pebble/vfs/atomicfs" + "github.com/cockroachdb/pebble/v2/internal/arenaskl" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/cache" + "github.com/cockroachdb/pebble/v2/internal/invalidating" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/manual" + "github.com/cockroachdb/pebble/v2/internal/problemspans" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/remote" + "github.com/cockroachdb/pebble/v2/rangekey" + "github.com/cockroachdb/pebble/v2/record" + "github.com/cockroachdb/pebble/v2/sstable" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/vfs" + "github.com/cockroachdb/pebble/v2/vfs/atomicfs" + "github.com/cockroachdb/pebble/v2/wal" "github.com/cockroachdb/tokenbucket" 
"github.com/prometheus/client_golang/prometheus" ) const ( - // minTableCacheSize is the minimum size of the table cache, for a single db. - minTableCacheSize = 64 + // minFileCacheSize is the minimum size of the file cache, for a single db. + minFileCacheSize = 64 - // numNonTableCacheFiles is an approximation for the number of files - // that we don't use for table caches, for a given db. - numNonTableCacheFiles = 10 + // numNonFileCacheFiles is an approximation for the number of files + // that we don't account for in the file cache, for a given db. + numNonFileCacheFiles = 10 ) var ( @@ -77,6 +83,10 @@ type Reader interface { // SeekLT, First or Last. NewIter(o *IterOptions) (*Iterator, error) + // NewIterWithContext is like NewIter, and additionally accepts a context + // for tracing. + NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) + // Close closes the Reader. It may or may not close any underlying io.Reader // or io.Writer, depending on how the DB was created. // @@ -131,8 +141,28 @@ type Writer interface { // properly. Only use if you have a workload where the performance gain is critical and you // can guarantee that a record is written once and then deleted once. // - // SingleDelete is internally transformed into a Delete if the most recent record for a key is either - // a Merge or Delete record. + // Note that SINGLEDEL, SET, SINGLEDEL, SET, DEL/RANGEDEL, ... from most + // recent to older will work as intended since there is a single SET + // sandwiched between SINGLEDEL/DEL/RANGEDEL. + // + // IMPLEMENTATION WARNING: By offering SingleDelete, Pebble must guarantee + // that there is no duplication of writes inside Pebble. That is, idempotent + // application of writes is insufficient. For example, if a SET operation + // gets duplicated inside Pebble, resulting in say SET#20 and SET#17, the + // caller may issue a SINGLEDEL#25 and it will not have the desired effect. 
+ // A duplication where a SET#20 is duplicated across two sstables will have + // the same correctness problem, since the SINGLEDEL may meet one of the + // SETs. This guarantee is partially achieved by ensuring that a WAL and a + // flushable are usually in one-to-one correspondence, and atomically + // updating the MANIFEST when the flushable is flushed (which ensures the + // WAL will never be replayed). There is one exception: a flushableBatch (a + // batch too large to fit in a memtable) is written to the end of the WAL + // that it shares with the preceding memtable. This is safe because the + // memtable and the flushableBatch are part of the same flush (see DB.flush1 + // where this invariant is maintained). If the memtable were to be flushed + // without the flushableBatch, the WAL cannot yet be deleted and if a crash + // happened, the WAL would be replayed despite the memtable already being + // flushed. // // It is safe to modify the contents of the arguments after SingleDelete returns. SingleDelete(key []byte, o *WriteOptions) error @@ -192,40 +222,6 @@ type Writer interface { RangeKeyDelete(start, end []byte, opts *WriteOptions) error } -// CPUWorkHandle represents a handle used by the CPUWorkPermissionGranter API. -type CPUWorkHandle interface { - // Permitted indicates whether Pebble can use additional CPU resources. - Permitted() bool -} - -// CPUWorkPermissionGranter is used to request permission to opportunistically -// use additional CPUs to speed up internal background work. -type CPUWorkPermissionGranter interface { - // GetPermission returns a handle regardless of whether permission is granted - // or not. In the latter case, the handle is only useful for recording - // the CPU time actually spent on this calling goroutine. - GetPermission(time.Duration) CPUWorkHandle - // CPUWorkDone must be called regardless of whether CPUWorkHandle.Permitted - // returns true or false. 
- CPUWorkDone(CPUWorkHandle) -} - -// Use a default implementation for the CPU work granter to avoid excessive nil -// checks in the code. -type defaultCPUWorkHandle struct{} - -func (d defaultCPUWorkHandle) Permitted() bool { - return false -} - -type defaultCPUWorkGranter struct{} - -func (d defaultCPUWorkGranter) GetPermission(_ time.Duration) CPUWorkHandle { - return defaultCPUWorkHandle{} -} - -func (d defaultCPUWorkGranter) CPUWorkDone(_ CPUWorkHandle) {} - // DB provides a concurrent, persistent ordered key/value store. // // A DB's basic operations (Get, Set, Delete) should be self-explanatory. Get @@ -264,15 +260,19 @@ type DB struct { // recycled. memTableRecycle atomic.Pointer[memTable] - // The size of the current log file (i.e. db.mu.log.queue[len(queue)-1]. + // The logical size of the current WAL. logSize atomic.Uint64 + // The number of input bytes to the log. This is the raw size of the + // batches written to the WAL, without the overhead of the record + // envelopes. + logBytesIn atomic.Uint64 // The number of bytes available on disk. - diskAvailBytes atomic.Uint64 + diskAvailBytes atomic.Uint64 + lowDiskSpaceReporter lowDiskSpaceReporter - cacheID uint64 + cacheHandle *cache.Handle dirname string - walDirname string opts *Options cmp Compare equal Equal @@ -292,11 +292,10 @@ type DB struct { fileLock *Lock dataDir vfs.File - walDir vfs.File - tableCache *tableCacheContainer + fileCache *fileCacheHandle newIters tableNewIters - tableNewRangeKeyIter keyspan.TableNewSpanIter + tableNewRangeKeyIter keyspanimpl.TableNewSpanIter commit *commitPipeline @@ -306,11 +305,6 @@ type DB struct { sync.RWMutex val *readState } - // logRecycler holds a set of log file numbers that are available for - // reuse. Writing to a recycled log file is faster than to a new log file on - // some common filesystems (xfs, and ext3/4) due to avoiding metadata - // updates. 
- logRecycler logRecycler closed *atomic.Value closedCh chan struct{} @@ -329,7 +323,7 @@ type DB struct { // // Care is taken to avoid holding DB.mu during IO operations. Accomplishing // this sometimes requires releasing DB.mu in a method that was called with - // it held. See versionSet.logAndApply() and DB.makeRoomForWrite() for + // it held. See versionSet.UpdateVersionLocked() and DB.makeRoomForWrite() for // examples. This is a common pattern, so be careful about expectations that // DB.mu will be held continuously across a set of calls. mu struct { @@ -362,7 +356,7 @@ type DB struct { // notifications and act as a mechanism for tying together the events and // log messages for a single job such as a flush, compaction, or file // ingestion. Job IDs are not serialized to disk or used for correctness. - nextJobID int + nextJobID JobID // The collection of immutable versions and state about the log and visible // sequence numbers. Use the pointer here to ensure the atomic fields in @@ -370,30 +364,33 @@ type DB struct { versions *versionSet log struct { - // The queue of logs, containing both flushed and unflushed logs. The - // flushed logs will be a prefix, the unflushed logs a suffix. The - // delimeter between flushed and unflushed logs is - // versionSet.minUnflushedLogNum. - queue []fileInfo - // The number of input bytes to the log. This is the raw size of the - // batches written to the WAL, without the overhead of the record - // envelopes. Requires DB.mu to be held when read or written. - bytesIn uint64 - // The LogWriter is protected by commitPipeline.mu. This allows log - // writes to be performed without holding DB.mu, but requires both + // manager is not protected by mu, but calls to Create must be + // serialized, and happen after the previous writer is closed. + manager wal.Manager + // The Writer is protected by commitPipeline.mu. 
This allows log writes + // to be performed without holding DB.mu, but requires both // commitPipeline.mu and DB.mu to be held when rotating the WAL/memtable - // (i.e. makeRoomForWrite). - *record.LogWriter - // Can be nil. + // (i.e. makeRoomForWrite). Can be nil. + writer wal.Writer metrics struct { + // fsyncLatency has its own internal synchronization, and is not + // protected by mu. fsyncLatency prometheus.Histogram + // Updated whenever a wal.Writer is closed. record.LogWriterMetrics } - registerLogWriterForTesting func(w *record.LogWriter) } mem struct { - // The current mutable memTable. + // The current mutable memTable. Readers of the pointer may hold + // either DB.mu or commitPipeline.mu. + // + // Its internal fields are protected by commitPipeline.mu. This + // allows batch commits to be performed without DB.mu as long as no + // memtable rotation is required. + // + // Both commitPipeline.mu and DB.mu must be held when rotating the + // memtable. mutable *memTable // Queue of flushables (the mutable memtable is at end). Elements are // added to the end of the slice and removed from the beginning. Once an @@ -415,20 +412,28 @@ type DB struct { cond sync.Cond // True when a flush is in progress. flushing bool - // The number of ongoing compactions. + // The number of ongoing non-download compactions. compactingCount int + // The number of download compactions. + downloadingCount int // The list of deletion hints, suggesting ranges for delete-only // compactions. deletionHints []deleteCompactionHint // The list of manual compactions. The next manual compaction to perform // is at the start of the list. New entries are added to the end. - manual []*manualCompaction + manual []*manualCompaction + manualLen atomic.Int32 + // manualID is used to identify manualCompactions in the manual slice. + manualID uint64 + // downloads is the list of pending download tasks. The next download to + // perform is at the start of the list. 
New entries are added to the end. + downloads []*downloadSpanTask // inProgress is the set of in-progress flushes and compactions. // It's used in the calculation of some metrics and to initialize L0 // sublevels' state. Some of the compactions contained within this // map may have already committed an edit to the version but are // lingering performing cleanup, like deleting obsolete files. - inProgress map[*compaction]struct{} + inProgress map[compaction]struct{} // rescheduleReadCompaction indicates to an iterator that a read compaction // should be scheduled. @@ -445,13 +450,19 @@ type DB struct { flushWriteThroughput ThroughputMetric // The idle start time for the flush "loop", i.e., when the flushing // bool above transitions to false. - noOngoingFlushStartTime time.Time + noOngoingFlushStartTime crtime.Mono } - // Non-zero when file cleaning is disabled. The disabled count acts as a - // reference count to prohibit file cleaning. See - // DB.{disable,Enable}FileDeletions(). - disableFileDeletions int + fileDeletions struct { + // Non-zero when file cleaning is disableCount. The disableCount + // count acts as a reference count to prohibit file cleaning. See + // DB.{disable,enable}FileDeletions(). + disableCount int + // queuedStats holds cumulative stats for files that have been + // queued for deletion by the cleanup manager. These stats are + // monotonically increasing for the *DB's lifetime. + queuedStats obsoleteObjectStats + } snapshots struct { // The list of active snapshots. @@ -478,7 +489,7 @@ type DB struct { // Compactions, ingests, flushes append files to be processed. An // active stat collection goroutine clears the list and processes // them. - pending []manifest.NewFileEntry + pending []manifest.NewTableEntry } tableValidation struct { @@ -488,12 +499,28 @@ type DB struct { // pending is a slice of metadata for sstables waiting to be // validated. Only physical sstables should be added to the pending // queue. 
- pending []newFileEntry + pending []manifest.NewTableEntry // validating is set to true when validation is running. validating bool } + + // annotators contains various instances of manifest.Annotator which + // should be protected from concurrent access. + annotators struct { + // totalFileSize is the sum of the size of all files in the + // database. This includes local, remote, and external sstables -- + // along with blob files. + totalFileSize *manifest.Annotator[uint64] + remoteSize *manifest.Annotator[uint64] + externalSize *manifest.Annotator[uint64] + } } + // problemSpans keeps track of spans of keys within LSM levels where + // compactions have failed; used to avoid retrying these compactions too + // quickly. + problemSpans problemspans.ByLevel + // Normally equal to time.Now() but may be overridden in tests. timeNow func() time.Time // the time at database Open; may be used to compute metrics like effective @@ -544,7 +571,7 @@ func (d *DB) getInternal(key []byte, b *Batch, s *Snapshot) ([]byte, io.Closer, // Determine the seqnum to read at after grabbing the read state (current and // memtables) above. - var seqNum uint64 + var seqNum base.SeqNum if s != nil { seqNum = s.seqNum } else { @@ -555,15 +582,22 @@ func (d *DB) getInternal(key []byte, b *Batch, s *Snapshot) ([]byte, io.Closer, get := &buf.get *get = getIter{ - logger: d.opts.Logger, comparer: d.opts.Comparer, newIters: d.newIters, snapshot: seqNum, - key: key, - batch: b, - mem: readState.memtables, - l0: readState.current.L0SublevelFiles, - version: readState.current, + iterOpts: IterOptions{ + // TODO(sumeer): replace with a parameter provided by the caller. + Category: categoryGet, + logger: d.opts.Logger, + snapshotForHideObsoletePoints: seqNum, + }, + key: key, + // Compute the key prefix for bloom filtering. 
+ prefix: key[:d.opts.Comparer.Split(key)], + batch: b, + mem: readState.memtables, + l0: readState.current.L0SublevelFiles, + version: readState.current, } // Strip off memtables which cannot possibly contain the seqNum being read @@ -588,6 +622,9 @@ func (d *DB) getInternal(key []byte, b *Batch, s *Snapshot) ([]byte, io.Closer, readState: readState, keyBuf: buf.keyBuf, } + // Set up a blob value fetcher to use for retrieving values from blob files. + i.blobValueFetcher.Init(&readState.current.BlobFiles, d.fileCache, block.NoReadEnv) + get.iiopts.blobValueFetcher = &i.blobValueFetcher if !i.First() { err := i.Close() @@ -610,8 +647,7 @@ func (d *DB) Set(key, value []byte, opts *WriteOptions) error { return err } // Only release the batch on success. - b.release() - return nil + return b.Close() } // Delete deletes the value for the given key. Deletes are blind all will @@ -625,8 +661,7 @@ func (d *DB) Delete(key []byte, opts *WriteOptions) error { return err } // Only release the batch on success. - b.release() - return nil + return b.Close() } // DeleteSized behaves identically to Delete, but takes an additional @@ -649,13 +684,14 @@ func (d *DB) DeleteSized(key []byte, valueSize uint32, opts *WriteOptions) error return err } // Only release the batch on success. - b.release() - return nil + return b.Close() } // SingleDelete adds an action to the batch that single deletes the entry for key. // See Writer.SingleDelete for more details on the semantics of SingleDelete. // +// WARNING: See the detailed warning in Writer.SingleDelete before using this. +// // It is safe to modify the contents of the arguments after SingleDelete returns. func (d *DB) SingleDelete(key []byte, opts *WriteOptions) error { b := newBatch(d) @@ -664,8 +700,7 @@ func (d *DB) SingleDelete(key []byte, opts *WriteOptions) error { return err } // Only release the batch on success. 
- b.release() - return nil + return b.Close() } // DeleteRange deletes all of the keys (and values) in the range [start,end) @@ -680,8 +715,7 @@ func (d *DB) DeleteRange(start, end []byte, opts *WriteOptions) error { return err } // Only release the batch on success. - b.release() - return nil + return b.Close() } // Merge adds an action to the DB that merges the value at key with the new @@ -696,8 +730,7 @@ func (d *DB) Merge(key, value []byte, opts *WriteOptions) error { return err } // Only release the batch on success. - b.release() - return nil + return b.Close() } // LogData adds the specified to the batch. The data will be written to the @@ -712,8 +745,7 @@ func (d *DB) LogData(data []byte, opts *WriteOptions) error { return err } // Only release the batch on success. - b.release() - return nil + return b.Close() } // RangeKeySet sets a range key mapping the key range [start, end) at the MVCC @@ -729,8 +761,7 @@ func (d *DB) RangeKeySet(start, end, suffix, value []byte, opts *WriteOptions) e return err } // Only release the batch on success. - b.release() - return nil + return b.Close() } // RangeKeyUnset removes a range key mapping the key range [start, end) at the @@ -748,8 +779,7 @@ func (d *DB) RangeKeyUnset(start, end, suffix []byte, opts *WriteOptions) error return err } // Only release the batch on success. - b.release() - return nil + return b.Close() } // RangeKeyDelete deletes all of the range keys in the range [start,end) @@ -766,8 +796,7 @@ func (d *DB) RangeKeyDelete(start, end []byte, opts *WriteOptions) error { return err } // Only release the batch on success. - b.release() - return nil + return b.Close() } // Apply the operations contained in the batch to the DB. If the batch is large @@ -776,6 +805,8 @@ func (d *DB) RangeKeyDelete(start, end []byte, opts *WriteOptions) error { // reuse them. // // It is safe to modify the contents of the arguments after Apply returns. 
+// +// Apply returns ErrInvalidBatch if the provided batch is invalid in any way. func (d *DB) Apply(batch *Batch, opts *WriteOptions) error { return d.applyInternal(batch, opts, false) } @@ -822,20 +853,17 @@ func (d *DB) applyInternal(batch *Batch, opts *WriteOptions, noSyncWait bool) er return errors.New("pebble: WAL disabled") } - if batch.minimumFormatMajorVersion != FormatMostCompatible { - if fmv := d.FormatMajorVersion(); fmv < batch.minimumFormatMajorVersion { - panic(fmt.Sprintf( - "pebble: batch requires at least format major version %d (current: %d)", - batch.minimumFormatMajorVersion, fmv, - )) - } + if fmv := d.FormatMajorVersion(); fmv < batch.minimumFormatMajorVersion { + panic(fmt.Sprintf( + "pebble: batch requires at least format major version %d (current: %d)", + batch.minimumFormatMajorVersion, fmv, + )) } if batch.countRangeKeys > 0 { if d.split == nil { return errNoSplit } - // TODO(jackson): Assert that all range key operands are suffixless. } batch.committing = true @@ -922,45 +950,49 @@ func (d *DB) commitWrite(b *Batch, syncWG *sync.WaitGroup, syncErr *error) (*mem b.flushable.setSeqNum(b.SeqNum()) if !d.opts.DisableWAL { var err error - size, err = d.mu.log.SyncRecord(repr, syncWG, syncErr) + size, err = d.mu.log.writer.WriteRecord(repr, wal.SyncOptions{Done: syncWG, Err: syncErr}, b) if err != nil { panic(err) } } } - d.mu.Lock() - var err error + // Grab a reference to the memtable. We don't hold DB.mu, but we do hold + // d.commit.mu. It's okay for readers of d.mu.mem.mutable to only hold one of + // d.commit.mu or d.mu, because memtable rotations require holding both. + mem := d.mu.mem.mutable + // Batches which contain keys of kind InternalKeyKindIngestSST will + // never be applied to the memtable, so we don't need to make room for + // write. if !b.ingestedSSTBatch { - // Batches which contain keys of kind InternalKeyKindIngestSST will - // never be applied to the memtable, so we don't need to make room for - // write. 
For the other cases, switch out the memtable if there was not - // enough room to store the batch. - err = d.makeRoomForWrite(b) - } - - if err == nil && !d.opts.DisableWAL { - d.mu.log.bytesIn += uint64(len(repr)) + // Flushable batches will require a rotation of the memtable regardless, + // so only attempt an optimistic reservation of space in the current + // memtable if this batch is not a large flushable batch. + if b.flushable == nil { + err = d.mu.mem.mutable.prepare(b) + } + if b.flushable != nil || err == arenaskl.ErrArenaFull { + // Slow path. + // We need to acquire DB.mu and rotate the memtable. + func() { + d.mu.Lock() + defer d.mu.Unlock() + err = d.makeRoomForWrite(b) + mem = d.mu.mem.mutable + }() + } } - - // Grab a reference to the memtable while holding DB.mu. Note that for - // non-flushable batches (b.flushable == nil) makeRoomForWrite() added a - // reference to the memtable which will prevent it from being flushed until - // we unreference it. This reference is dropped in DB.commitApply(). - mem := d.mu.mem.mutable - - d.mu.Unlock() if err != nil { return nil, err } - if d.opts.DisableWAL { return mem, nil } + d.logBytesIn.Add(uint64(len(repr))) if b.flushable == nil { - size, err = d.mu.log.SyncRecord(repr, syncWG, syncErr) + size, err = d.mu.log.writer.WriteRecord(repr, wal.SyncOptions{Done: syncWG, Err: syncErr}, b) if err != nil { panic(err) } @@ -997,28 +1029,39 @@ var iterAllocPool = sync.Pool{ // and the specified seqNum will be used as the snapshot seqNum. // - EFOS in file-only state: Only `seqNum` and `vers` are set. All the // relevant SSTs are referenced by the *version. +// - EFOS that has been excised but is in alwaysCreateIters mode (tests only). +// Only `seqNum` and `readState` are set. 
type snapshotIterOpts struct { - seqNum uint64 - vers *version + seqNum base.SeqNum + vers *manifest.Version + readState *readState +} + +type batchIterOpts struct { + batchOnly bool +} +type newIterOpts struct { + snapshot snapshotIterOpts + batch batchIterOpts } // newIter constructs a new iterator, merging in batch iterators as an extra // level. func (d *DB) newIter( - ctx context.Context, batch *Batch, sOpts snapshotIterOpts, o *IterOptions, + ctx context.Context, batch *Batch, newIterOpts newIterOpts, o *IterOptions, ) *Iterator { + if newIterOpts.batch.batchOnly { + if batch == nil { + panic("batchOnly is true, but batch is nil") + } + if newIterOpts.snapshot.vers != nil { + panic("batchOnly is true, but snapshotIterOpts is initialized") + } + } if err := d.closed.Load(); err != nil { panic(err) } - seqNum := sOpts.seqNum - if o.rangeKeys() { - if d.FormatMajorVersion() < FormatRangeKeys { - panic(fmt.Sprintf( - "pebble: range keys require at least format major version %d (current: %d)", - FormatRangeKeys, d.FormatMajorVersion(), - )) - } - } + seqNum := newIterOpts.snapshot.seqNum if o != nil && o.RangeKeyMasking.Suffix != nil && o.KeyTypes != IterKeyTypePointsAndRanges { panic("pebble: range key masking requires IterKeyTypePointsAndRanges") } @@ -1029,22 +1072,33 @@ func (d *DB) newIter( // DB.mem.queue[0].logSeqNum. panic("OnlyReadGuaranteedDurable is not supported for batches or snapshots") } - // Grab and reference the current readState. This prevents the underlying - // files in the associated version from being deleted if there is a current - // compaction. The readState is unref'd by Iterator.Close(). var readState *readState - if sOpts.vers == nil { - // NB: loadReadState() calls readState.ref(). - readState = d.loadReadState() - } else { - // s.vers != nil - sOpts.vers.Ref() - } + var newIters tableNewIters + var newIterRangeKey keyspanimpl.TableNewSpanIter + if !newIterOpts.batch.batchOnly { + // Grab and reference the current readState. 
This prevents the underlying + // files in the associated version from being deleted if there is a current + // compaction. The readState is unref'd by Iterator.Close(). + if newIterOpts.snapshot.vers == nil { + if newIterOpts.snapshot.readState != nil { + readState = newIterOpts.snapshot.readState + readState.ref() + } else { + // NB: loadReadState() calls readState.ref(). + readState = d.loadReadState() + } + } else { + // vers != nil + newIterOpts.snapshot.vers.Ref() + } - // Determine the seqnum to read at after grabbing the read state (current and - // memtables) above. - if seqNum == 0 { - seqNum = d.mu.versions.visibleSeqNum.Load() + // Determine the seqnum to read at after grabbing the read state (current and + // memtables) above. + if seqNum == 0 { + seqNum = d.mu.versions.visibleSeqNum.Load() + } + newIters = d.newIters + newIterRangeKey = d.tableNewRangeKeyIter } // Bundle various structures under a single umbrella in order to allocate @@ -1057,14 +1111,16 @@ func (d *DB) newIter( merge: d.merge, comparer: *d.opts.Comparer, readState: readState, - version: sOpts.vers, + version: newIterOpts.snapshot.vers, keyBuf: buf.keyBuf, prefixOrFullSeekKey: buf.prefixOrFullSeekKey, boundsBuf: buf.boundsBuf, batch: batch, - newIters: d.newIters, - newIterRangeKey: d.tableNewRangeKeyIter, + fc: d.fileCache, + newIters: newIters, + newIterRangeKey: newIterRangeKey, seqNum: seqNum, + batchOnlyIter: newIterOpts.batch.batchOnly, } if o != nil { dbi.opts = *o @@ -1116,7 +1172,7 @@ func finishInitializingIter(ctx context.Context, buf *iterAlloc) *Iterator { } if dbi.opts.rangeKeys() { - dbi.rangeKeyMasking.init(dbi, dbi.comparer.Compare, dbi.comparer.Split) + dbi.rangeKeyMasking.init(dbi, &dbi.comparer) // When iterating over both point and range keys, don't create the // range-key iterator stack immediately if we can avoid it. 
This @@ -1159,7 +1215,6 @@ func finishInitializingIter(ctx context.Context, buf *iterAlloc) *Iterator { } if dbi.rangeKey == nil { dbi.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState) - dbi.rangeKey.init(dbi.comparer.Compare, dbi.comparer.Split, &dbi.opts) dbi.constructRangeKeyIter() } else { dbi.rangeKey.iterConfig.SetBounds(dbi.opts.LowerBound, dbi.opts.UpperBound) @@ -1216,25 +1271,31 @@ func finishInitializingIter(ctx context.Context, buf *iterAlloc) *Iterator { // iteration is invalid in those cases. func (d *DB) ScanInternal( ctx context.Context, + category block.Category, lower, upper []byte, visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error, - visitRangeDel func(start, end []byte, seqNum uint64) error, + visitRangeDel func(start, end []byte, seqNum SeqNum) error, visitRangeKey func(start, end []byte, keys []rangekey.Key) error, visitSharedFile func(sst *SharedSSTMeta) error, + visitExternalFile func(sst *ExternalFile) error, ) error { scanInternalOpts := &scanInternalOptions{ - visitPointKey: visitPointKey, - visitRangeDel: visitRangeDel, - visitRangeKey: visitRangeKey, - visitSharedFile: visitSharedFile, - skipSharedLevels: visitSharedFile != nil, + category: category, + visitPointKey: visitPointKey, + visitRangeDel: visitRangeDel, + visitRangeKey: visitRangeKey, + visitSharedFile: visitSharedFile, + visitExternalFile: visitExternalFile, IterOptions: IterOptions{ KeyTypes: IterKeyTypePointsAndRanges, LowerBound: lower, UpperBound: upper, }, } - iter := d.newInternalIter(snapshotIterOpts{} /* snapshot */, scanInternalOpts) + iter, err := d.newInternalIter(ctx, snapshotIterOpts{} /* snapshot */, scanInternalOpts) + if err != nil { + return err + } defer iter.close() return scanInternalImpl(ctx, lower, upper, iter, scanInternalOpts) } @@ -1246,7 +1307,9 @@ func (d *DB) ScanInternal( // TODO(bilal): This method has a lot of similarities with db.newIter as well as // finishInitializingIter. 
Both pairs of methods should be refactored to reduce // this duplication. -func (d *DB) newInternalIter(sOpts snapshotIterOpts, o *scanInternalOptions) *scanInternalIterator { +func (d *DB) newInternalIter( + ctx context.Context, sOpts snapshotIterOpts, o *scanInternalOptions, +) (*scanInternalIterator, error) { if err := d.closed.Load(); err != nil { panic(err) } @@ -1254,10 +1317,18 @@ func (d *DB) newInternalIter(sOpts snapshotIterOpts, o *scanInternalOptions) *sc // files in the associated version from being deleted if there is a current // compaction. The readState is unref'd by Iterator.Close(). var readState *readState + var vers *manifest.Version if sOpts.vers == nil { - readState = d.loadReadState() - } - if sOpts.vers != nil { + if sOpts.readState != nil { + readState = sOpts.readState + readState.ref() + vers = readState.current + } else { + readState = d.loadReadState() + vers = readState.current + } + } else { + vers = sOpts.vers sOpts.vers.Ref() } @@ -1272,6 +1343,7 @@ func (d *DB) newInternalIter(sOpts snapshotIterOpts, o *scanInternalOptions) *sc // them together. 
buf := iterAllocPool.Get().(*iterAlloc) dbi := &scanInternalIterator{ + ctx: ctx, db: d, comparer: d.opts.Comparer, merge: d.opts.Merger.Merge, @@ -1283,9 +1355,9 @@ func (d *DB) newInternalIter(sOpts snapshotIterOpts, o *scanInternalOptions) *sc seqNum: seqNum, mergingIter: &buf.merging, } - if o != nil { - dbi.opts = *o - } + dbi.blobValueFetcher.Init(&vers.BlobFiles, d.fileCache, block.ReadEnv{}) + + dbi.opts = *o dbi.opts.logger = d.opts.Logger if d.opts.private.disableLazyCombinedIteration { dbi.opts.disableLazyCombinedIteration = true @@ -1293,7 +1365,21 @@ func (d *DB) newInternalIter(sOpts snapshotIterOpts, o *scanInternalOptions) *sc return finishInitializingInternalIter(buf, dbi) } -func finishInitializingInternalIter(buf *iterAlloc, i *scanInternalIterator) *scanInternalIterator { +type internalIterOpts struct { + // if compaction is set, sstable-level iterators will be created using + // NewCompactionIter; these iterators have a more constrained interface + // and are optimized for the sequential scan of a compaction. + compaction bool + readEnv sstable.ReadEnv + boundLimitedFilter sstable.BoundLimitedBlockPropertyFilter + // blobValueFetcher is the base.ValueFetcher to use when constructing + // internal values to represent values stored externally in blob files. + blobValueFetcher base.ValueFetcher +} + +func finishInitializingInternalIter( + buf *iterAlloc, i *scanInternalIterator, +) (*scanInternalIterator, error) { // Short-hand. var memtables flushableList if i.readState != nil { @@ -1309,13 +1395,16 @@ func finishInitializingInternalIter(buf *iterAlloc, i *scanInternalIterator) *sc } i.initializeBoundBufs(i.opts.LowerBound, i.opts.UpperBound) - i.constructPointIter(memtables, buf) + if err := i.constructPointIter(i.opts.category, memtables, buf); err != nil { + return nil, err + } // For internal iterators, we skip the lazy combined iteration optimization // entirely, and create the range key iterator stack directly. 
i.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState) - i.rangeKey.init(i.comparer.Compare, i.comparer.Split, &i.opts.IterOptions) - i.constructRangeKeyIter() + if err := i.constructRangeKeyIter(); err != nil { + return nil, err + } // Wrap the point iterator (currently i.iter) with an interleaving // iterator that interleaves range keys pulled from @@ -1327,7 +1416,7 @@ func finishInitializingInternalIter(buf *iterAlloc, i *scanInternalIterator) *sc }) i.iter = &i.rangeKey.iiter - return i + return i, nil } func (i *Iterator) constructPointIter( @@ -1337,7 +1426,26 @@ func (i *Iterator) constructPointIter( // Already have one. return } - internalOpts := internalIterOpts{stats: &i.stats.InternalStats} + readEnv := block.ReadEnv{ + Stats: &i.stats.InternalStats, + // If the file cache has a sstable stats collector, ask it for an + // accumulator for this iterator's configured category and QoS. All SSTable + // iterators created by this Iterator will accumulate their stats to it as + // they Close during iteration. 
+ IterStats: i.fc.SSTStatsCollector().Accumulator( + uint64(uintptr(unsafe.Pointer(i))), + i.opts.Category, + ), + } + if i.readState != nil { + i.blobValueFetcher.Init(&i.readState.current.BlobFiles, i.fc, readEnv) + } else if i.version != nil { + i.blobValueFetcher.Init(&i.version.BlobFiles, i.fc, readEnv) + } + internalOpts := internalIterOpts{ + readEnv: sstable.ReadEnv{Block: readEnv}, + blobValueFetcher: &i.blobValueFetcher, + } if i.opts.RangeKeyMasking.Filter != nil { internalOpts.boundLimitedFilter = &i.rangeKeyMasking } @@ -1354,20 +1462,24 @@ func (i *Iterator) constructPointIter( if i.batch != nil { numMergingLevels++ } - numMergingLevels += len(memtables) - current := i.version - if current == nil { - current = i.readState.current - } - numMergingLevels += len(current.L0SublevelFiles) - numLevelIters += len(current.L0SublevelFiles) - for level := 1; level < len(current.Levels); level++ { - if current.Levels[level].Empty() { - continue + var current *manifest.Version + if !i.batchOnlyIter { + numMergingLevels += len(memtables) + + current = i.version + if current == nil { + current = i.readState.current + } + numMergingLevels += len(current.L0SublevelFiles) + numLevelIters += len(current.L0SublevelFiles) + for level := 1; level < len(current.Levels); level++ { + if current.Levels[level].Empty() { + continue + } + numMergingLevels++ + numLevelIters++ } - numMergingLevels++ - numLevelIters++ } if numMergingLevels > cap(mlevels) { @@ -1380,12 +1492,8 @@ func (i *Iterator) constructPointIter( // Top-level is the batch, if any. if i.batch != nil { if i.batch.index == nil { - // This isn't an indexed batch. Include an error iterator so that - // the resulting iterator correctly surfaces ErrIndexed. - mlevels = append(mlevels, mergingIterLevel{ - iter: newErrorIter(ErrNotIndexed), - rangeDelIter: newErrorKeyspanIter(ErrNotIndexed), - }) + // This isn't an indexed batch. We shouldn't have gotten this far. 
+ panic(errors.AssertionFailedf("creating an iterator over an unindexed batch")) } else { i.batch.initInternalIter(&i.opts, &i.batchPointIter) i.batch.initRangeDelIter(&i.opts, &i.batchRangeDelIter, i.batchSeqNum) @@ -1405,47 +1513,48 @@ func (i *Iterator) constructPointIter( } } - // Next are the memtables. - for j := len(memtables) - 1; j >= 0; j-- { - mem := memtables[j] - mlevels = append(mlevels, mergingIterLevel{ - iter: mem.newIter(&i.opts), - rangeDelIter: mem.newRangeDelIter(&i.opts), - }) - } + if !i.batchOnlyIter { + // Next are the memtables. + for j := len(memtables) - 1; j >= 0; j-- { + mem := memtables[j] + mlevels = append(mlevels, mergingIterLevel{ + iter: mem.newIter(&i.opts), + rangeDelIter: mem.newRangeDelIter(&i.opts), + }) + } - // Next are the file levels: L0 sub-levels followed by lower levels. - mlevelsIndex := len(mlevels) - levelsIndex := len(levels) - mlevels = mlevels[:numMergingLevels] - levels = levels[:numLevelIters] - i.opts.snapshotForHideObsoletePoints = buf.dbi.seqNum - addLevelIterForFiles := func(files manifest.LevelIterator, level manifest.Level) { - li := &levels[levelsIndex] + // Next are the file levels: L0 sub-levels followed by lower levels. 
+ mlevelsIndex := len(mlevels) + levelsIndex := len(levels) + mlevels = mlevels[:numMergingLevels] + levels = levels[:numLevelIters] + i.opts.snapshotForHideObsoletePoints = buf.dbi.seqNum + addLevelIterForFiles := func(files manifest.LevelIterator, level manifest.Layer) { + li := &levels[levelsIndex] - li.init(ctx, i.opts, &i.comparer, i.newIters, files, level, internalOpts) - li.initRangeDel(&mlevels[mlevelsIndex].rangeDelIter) - li.initBoundaryContext(&mlevels[mlevelsIndex].levelIterBoundaryContext) - li.initCombinedIterState(&i.lazyCombinedIter.combinedIterState) - mlevels[mlevelsIndex].levelIter = li - mlevels[mlevelsIndex].iter = invalidating.MaybeWrapIfInvariants(li) + li.init(ctx, i.opts, &i.comparer, i.newIters, files, level, internalOpts) + li.initRangeDel(&mlevels[mlevelsIndex]) + li.initCombinedIterState(&i.lazyCombinedIter.combinedIterState) + mlevels[mlevelsIndex].levelIter = li + mlevels[mlevelsIndex].iter = invalidating.MaybeWrapIfInvariants(li) - levelsIndex++ - mlevelsIndex++ - } + levelsIndex++ + mlevelsIndex++ + } - // Add level iterators for the L0 sublevels, iterating from newest to - // oldest. - for i := len(current.L0SublevelFiles) - 1; i >= 0; i-- { - addLevelIterForFiles(current.L0SublevelFiles[i].Iter(), manifest.L0Sublevel(i)) - } + // Add level iterators for the L0 sublevels, iterating from newest to + // oldest. + for i := len(current.L0SublevelFiles) - 1; i >= 0; i-- { + addLevelIterForFiles(current.L0SublevelFiles[i].Iter(), manifest.L0Sublevel(i)) + } - // Add level iterators for the non-empty non-L0 levels. - for level := 1; level < len(current.Levels); level++ { - if current.Levels[level].Empty() { - continue + // Add level iterators for the non-empty non-L0 levels. 
+ for level := 1; level < len(current.Levels); level++ { + if current.Levels[level].Empty() { + continue + } + addLevelIterForFiles(current.Levels[level].Iter(), manifest.Level(level)) } - addLevelIterForFiles(current.Levels[level].Iter(), manifest.Level(level)) } buf.merging.init(&i.opts, &i.stats.InternalStats, i.comparer.Compare, i.comparer.Split, mlevels...) if len(mlevels) <= cap(buf.levelsPositioned) { @@ -1454,20 +1563,20 @@ func (i *Iterator) constructPointIter( buf.merging.snapshot = i.seqNum buf.merging.batchSnapshot = i.batchSeqNum buf.merging.combinedIterState = &i.lazyCombinedIter.combinedIterState - i.pointIter = invalidating.MaybeWrapIfInvariants(&buf.merging) + i.pointIter = invalidating.MaybeWrapIfInvariants(&buf.merging).(topLevelIterator) i.merging = &buf.merging } // NewBatch returns a new empty write-only batch. Any reads on the batch will // return an error. If the batch is committed it will be applied to the DB. -func (d *DB) NewBatch() *Batch { - return newBatch(d) +func (d *DB) NewBatch(opts ...BatchOption) *Batch { + return newBatch(d, opts...) } // NewBatchWithSize is mostly identical to NewBatch, but it will allocate the // the specified memory space for the internal slice in advance. -func (d *DB) NewBatchWithSize(size int) *Batch { - return newBatchWithSize(d, size) +func (d *DB) NewBatchWithSize(size int, opts ...BatchOption) *Batch { + return newBatchWithSize(d, size, opts...) } // NewIndexedBatch returns a new empty read-write batch. Any reads on the batch @@ -1480,7 +1589,7 @@ func (d *DB) NewIndexedBatch() *Batch { } // NewIndexedBatchWithSize is mostly identical to NewIndexedBatch, but it will -// allocate the the specified memory space for the internal slice in advance. +// allocate the specified memory space for the internal slice in advance. 
func (d *DB) NewIndexedBatchWithSize(size int) *Batch { return newIndexedBatchWithSize(d, d.opts.Comparer, size) } @@ -1500,7 +1609,7 @@ func (d *DB) NewIter(o *IterOptions) (*Iterator, error) { // NewIterWithContext is like NewIter, and additionally accepts a context for // tracing. func (d *DB) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) { - return d.newIter(ctx, nil /* batch */, snapshotIterOpts{}, o), nil + return d.newIter(ctx, nil /* batch */, newIterOpts{}, o), nil } // NewSnapshot returns a point-in-time view of the current DB state. Iterators @@ -1511,11 +1620,21 @@ func (d *DB) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, // will not prevent memtables from being released or sstables from being // deleted. Instead, a snapshot prevents deletion of sequence numbers // referenced by the snapshot. +// +// There exists one violation of a Snapshot's point-in-time guarantee: An excise +// (see DB.Excise and DB.IngestAndExcise) that occurs after the snapshot's +// creation will be observed by iterators created from the snapshot after the +// excise. See NewEventuallyFileOnlySnapshot for a variant of NewSnapshot that +// provides a full point-in-time guarantee. func (d *DB) NewSnapshot() *Snapshot { + // TODO(jackson): Consider removal of regular, non-eventually-file-only + // snapshots given they no longer provide a true point-in-time snapshot of + // the database due to excises. If we had a mechanism to construct a maximal + // key range, we could implement NewSnapshot in terms of + // NewEventuallyFileOnlySnapshot and provide a true point-in-time guarantee. 
if err := d.closed.Load(); err != nil { panic(err) } - d.mu.Lock() s := &Snapshot{ db: d, @@ -1534,19 +1653,12 @@ func (d *DB) NewEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFile if err := d.closed.Load(); err != nil { panic(err) } - - internalKeyRanges := make([]internalKeyRange, len(keyRanges)) for i := range keyRanges { if i > 0 && d.cmp(keyRanges[i-1].End, keyRanges[i].Start) > 0 { panic("pebble: key ranges for eventually-file-only-snapshot not in order") } - internalKeyRanges[i] = internalKeyRange{ - smallest: base.MakeInternalKey(keyRanges[i].Start, InternalKeySeqNumMax, InternalKeyKindMax), - largest: base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, keyRanges[i].End), - } } - - return d.makeEventuallyFileOnlySnapshot(keyRanges, internalKeyRanges) + return d.makeEventuallyFileOnlySnapshot(keyRanges) } // Close closes the DB. @@ -1555,6 +1667,15 @@ func (d *DB) NewEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFile // or to call Close concurrently with any other DB method. It is not valid // to call any of a DB's methods after the DB has been closed. func (d *DB) Close() error { + if err := d.closed.Load(); err != nil { + panic(err) + } + d.compactionSchedulers.Wait() + // Compactions can be asynchronously started by the CompactionScheduler + // calling d.Schedule. When this Unregister returns, we know that the + // CompactionScheduler will never again call a method on the DB. Note that + // this must be called without holding d.mu. + d.opts.Experimental.CompactionScheduler.Unregister() // Lock the commit pipeline for the duration of Close. This prevents a race // with makeRoomForWrite. Rotating the WAL in makeRoomForWrite requires // dropping d.mu several times for I/O. If Close only holds d.mu, an @@ -1569,10 +1690,14 @@ func (d *DB) Close() error { defer d.commit.mu.Unlock() d.mu.Lock() defer d.mu.Unlock() + // Check that the DB is not closed again. 
If there are two concurrent calls + // to DB.Close, the best-effort check at the top of DB.Close may not fire. + // But since this second check happens after mutex acquisition, the two + // concurrent calls will get serialized and the second one will see the + // effect of the d.closed.Store below. if err := d.closed.Load(); err != nil { panic(err) } - // Clear the finalizer that is used to check that an unreferenced DB has been // closed. We're closing the DB here, so the check performed by that // finalizer isn't necessary. @@ -1583,9 +1708,9 @@ func (d *DB) Close() error { d.closed.Store(errors.WithStack(ErrClosed)) close(d.closedCh) - defer d.opts.Cache.Unref() + defer d.cacheHandle.Close() - for d.mu.compact.compactingCount > 0 || d.mu.compact.flushing { + for d.mu.compact.compactingCount > 0 || d.mu.compact.downloadingCount > 0 || d.mu.compact.flushing { d.mu.compact.cond.Wait() } for d.mu.tableStats.loading { @@ -1600,12 +1725,15 @@ func (d *DB) Close() error { err = errors.Errorf("pebble: %d unexpected in-progress compactions", errors.Safe(n)) } err = firstError(err, d.mu.formatVers.marker.Close()) - err = firstError(err, d.tableCache.close()) if !d.opts.ReadOnly { - err = firstError(err, d.mu.log.Close()) - } else if d.mu.log.LogWriter != nil { + if d.mu.log.writer != nil { + _, err2 := d.mu.log.writer.Close() + err = firstError(err, err2) + } + } else if d.mu.log.writer != nil { panic("pebble: log-writer should be nil in read-only mode") } + err = firstError(err, d.mu.log.manager.Close()) err = firstError(err, d.fileLock.Close()) // Note that versionSet.close() only closes the MANIFEST. 
The versions list @@ -1613,9 +1741,6 @@ func (d *DB) Close() error { err = firstError(err, d.mu.versions.close()) err = firstError(err, d.dataDir.Close()) - if d.dataDir != d.walDir { - err = firstError(err, d.walDir.Close()) - } d.readState.val.unrefLocked() @@ -1650,12 +1775,11 @@ func (d *DB) Close() error { // Since we called d.readState.val.unrefLocked() above, we are expected to // manually schedule deletion of obsolete files. - if len(d.mu.versions.obsoleteTables) > 0 { - d.deleteObsoleteFiles(d.mu.nextJobID) + if len(d.mu.versions.obsoleteTables) > 0 || len(d.mu.versions.obsoleteBlobs) > 0 { + d.deleteObsoleteFiles(d.newJobIDLocked()) } d.mu.Unlock() - d.compactionSchedulers.Wait() // Wait for all cleaning jobs to finish. d.cleanupManager.Close() @@ -1671,11 +1795,16 @@ func (d *DB) Close() error { d.mu.Lock() - // As a sanity check, ensure that there are no zombie tables. A non-zero count - // hints at a reference count leak. - if ztbls := len(d.mu.versions.zombieTables); ztbls > 0 { + // As a sanity check, ensure that there are no zombie tables or blob files. + // A non-zero count hints at a reference count leak. + if ztbls := d.mu.versions.zombieTables.Count(); ztbls > 0 { err = firstError(err, errors.Errorf("non-zero zombie file count: %d", ztbls)) } + if zblobs := d.mu.versions.zombieBlobs.Count(); zblobs > 0 { + err = firstError(err, errors.Errorf("non-zero zombie blob count: %d", zblobs)) + } + + err = firstError(err, d.fileCache.Close()) err = firstError(err, d.objProvider.Close()) @@ -1693,7 +1822,7 @@ func (d *DB) Close() error { } // Compact the specified range of keys in the database. 
-func (d *DB) Compact(start, end []byte, parallelize bool) error { +func (d *DB) Compact(ctx context.Context, start, end []byte, parallelize bool) error { if err := d.closed.Load(); err != nil { panic(err) } @@ -1704,25 +1833,17 @@ func (d *DB) Compact(start, end []byte, parallelize bool) error { return errors.Errorf("Compact start %s is not less than end %s", d.opts.Comparer.FormatKey(start), d.opts.Comparer.FormatKey(end)) } - iStart := base.MakeInternalKey(start, InternalKeySeqNumMax, InternalKeyKindMax) - iEnd := base.MakeInternalKey(end, 0, 0) - m := (&fileMetadata{}).ExtendPointKeyBounds(d.cmp, iStart, iEnd) - meta := []*fileMetadata{m} d.mu.Lock() maxLevelWithFiles := 1 cur := d.mu.versions.currentVersion() for level := 0; level < numLevels; level++ { - overlaps := cur.Overlaps(level, d.cmp, start, end, iEnd.IsExclusiveSentinel()) + overlaps := cur.Overlaps(level, base.UserKeyBoundsInclusive(start, end)) if !overlaps.Empty() { maxLevelWithFiles = level + 1 } } - keyRanges := make([]internalKeyRange, len(meta)) - for i := range meta { - keyRanges[i] = internalKeyRange{smallest: m.Smallest, largest: m.Largest} - } // Determine if any memtable overlaps with the compaction range. We wait for // any such overlap to flush (initiating a flush if necessary). mem, err := func() (*flushableEntry, error) { @@ -1732,25 +1853,31 @@ func (d *DB) Compact(start, end []byte, parallelize bool) error { // overlaps. for i := len(d.mu.mem.queue) - 1; i >= 0; i-- { mem := d.mu.mem.queue[i] - if ingestMemtableOverlaps(d.cmp, mem, keyRanges) { - var err error + var anyOverlaps bool + mem.computePossibleOverlaps(func(b bounded) shouldContinue { + anyOverlaps = true + return stopIteration + }, KeyRange{Start: start, End: end}) + if !anyOverlaps { + continue + } + var err error + if mem.flushable == d.mu.mem.mutable { + // We have to hold both commitPipeline.mu and DB.mu when calling + // makeRoomForWrite(). 
Lock order requirements elsewhere force us to + // unlock DB.mu in order to grab commitPipeline.mu first. + d.mu.Unlock() + d.commit.mu.Lock() + d.mu.Lock() + defer d.commit.mu.Unlock() //nolint:deferloop if mem.flushable == d.mu.mem.mutable { - // We have to hold both commitPipeline.mu and DB.mu when calling - // makeRoomForWrite(). Lock order requirements elsewhere force us to - // unlock DB.mu in order to grab commitPipeline.mu first. - d.mu.Unlock() - d.commit.mu.Lock() - d.mu.Lock() - defer d.commit.mu.Unlock() - if mem.flushable == d.mu.mem.mutable { - // Only flush if the active memtable is unchanged. - err = d.makeRoomForWrite(nil) - } + // Only flush if the active memtable is unchanged. + err = d.makeRoomForWrite(nil) } - mem.flushForced = true - d.maybeScheduleFlush() - return mem, err } + mem.flushForced = true + d.maybeScheduleFlush() + return mem, err } return nil, nil }() @@ -1761,13 +1888,23 @@ func (d *DB) Compact(start, end []byte, parallelize bool) error { return err } if mem != nil { - <-mem.flushed + select { + case <-mem.flushed: + case <-ctx.Done(): + return ctx.Err() + } } for level := 0; level < maxLevelWithFiles; { - if err := d.manualCompact( - iStart.UserKey, iEnd.UserKey, level, parallelize); err != nil { - return err + for { + if err := d.manualCompact( + ctx, start, end, level, parallelize); err != nil { + if errors.Is(err, ErrCancelledCompaction) { + continue + } + return err + } + break } level++ if level == numLevels-1 { @@ -1779,10 +1916,12 @@ func (d *DB) Compact(start, end []byte, parallelize bool) error { return nil } -func (d *DB) manualCompact(start, end []byte, level int, parallelize bool) error { +func (d *DB) manualCompact( + ctx context.Context, start, end []byte, level int, parallelize bool, +) error { d.mu.Lock() curr := d.mu.versions.currentVersion() - files := curr.Overlaps(level, d.cmp, start, end, false) + files := curr.Overlaps(level, base.UserKeyBoundsInclusive(start, end)) if files.Empty() { d.mu.Unlock() return 
nil @@ -1799,10 +1938,52 @@ func (d *DB) manualCompact(start, end []byte, level int, parallelize bool) error end: end, }) } + n := len(compactions) + if n == 0 { + d.mu.Unlock() + return nil + } + for i := range compactions { + d.mu.compact.manualID++ + compactions[i].id = d.mu.compact.manualID + } + // [manualIDStart, manualIDEnd] are the compactions that have been added to + // d.mu.compact.manual. + manualIDStart := compactions[0].id + manualIDEnd := compactions[n-1].id d.mu.compact.manual = append(d.mu.compact.manual, compactions...) + d.mu.compact.manualLen.Store(int32(len(d.mu.compact.manual))) d.maybeScheduleCompaction() d.mu.Unlock() + // On context cancellation, we only cancel the compactions that have not yet + // started. The assumption is that it is relatively harmless to have the + // already started compactions run to completion. We don't wait for the + // ongoing compactions to finish, since the assumption is that the caller + // has already given up on the operation (and the cancellation error is + // going to be returned anyway). + // + // An alternative would be to store the context in each *manualCompaction, + // and have the goroutine that retrieves the *manualCompaction for running + // notice the cancellation and write the cancellation error to + // manualCompaction.done. That approach would require this method to wait + // for all the *manualCompactions it has enqueued to finish before returning + // (to not leak a context). Since there is no timeliness guarantee on when a + // *manualCompaction will be retrieved for running, the wait until a + // cancelled context causes this method to return is not bounded. Hence, we + // don't adopt that approach. 
+ cancelPendingCompactions := func() { + d.mu.Lock() + for i := 0; i < len(d.mu.compact.manual); { + if d.mu.compact.manual[i].id >= manualIDStart && d.mu.compact.manual[i].id <= manualIDEnd { + d.mu.compact.manual = slices.Delete(d.mu.compact.manual, i, i+1) + d.mu.compact.manualLen.Store(int32(len(d.mu.compact.manual))) + } else { + i++ + } + } + d.mu.Unlock() + } // Each of the channels is guaranteed to be eventually sent to once. After a // compaction is possibly picked in d.maybeScheduleCompaction(), either the // compaction is dropped, executed after being scheduled, or retried later. @@ -1811,8 +1992,15 @@ func (d *DB) manualCompact(start, end []byte, level int, parallelize bool) error // necessary to read from each channel, and so we can exit early in the event // of an error. for _, compaction := range compactions { - if err := <-compaction.done; err != nil { - return err + select { + case <-ctx.Done(): + cancelPendingCompactions() + return ctx.Err() + case err := <-compaction.done: + if err != nil { + cancelPendingCompactions() + return err + } } } return nil @@ -1829,43 +2017,19 @@ func (d *DB) splitManualCompaction( if level == 0 { endLevel = baseLevel } - keyRanges := calculateInuseKeyRanges(curr, d.cmp, level, endLevel, start, end) + keyRanges := curr.CalculateInuseKeyRanges(d.mu.versions.latest.l0Organizer, level, endLevel, start, end) for _, keyRange := range keyRanges { splitCompactions = append(splitCompactions, &manualCompaction{ level: level, done: make(chan error, 1), start: keyRange.Start, - end: keyRange.End, + end: keyRange.End.Key, split: true, }) } return splitCompactions } -// DownloadSpan is a key range passed to the Download method. -type DownloadSpan struct { - StartKey []byte - // EndKey is exclusive. - EndKey []byte -} - -// Download ensures that the LSM does not use any external sstables for the -// given key ranges. It does so by performing appropriate compactions so that -// all external data becomes available locally. 
-// -// Note that calling this method does not imply that all other compactions stop; -// it simply informs Pebble of a list of spans for which external data should be -// downloaded with high priority. -// -// The method returns once no external sstasbles overlap the given spans, the -// context is canceled, or an error is hit. -// -// TODO(radu): consider passing a priority/impact knob to express how important -// the download is (versus live traffic performance, LSM health). -func (d *DB) Download(ctx context.Context, spans []DownloadSpan) error { - return errors.Errorf("not implemented") -} - // Flush the memtable to stable storage. func (d *DB) Flush() error { flushDone, err := d.AsyncFlush() @@ -1903,21 +2067,24 @@ func (d *DB) AsyncFlush() (<-chan struct{}, error) { // Metrics returns metrics about the database. func (d *DB) Metrics() *Metrics { metrics := &Metrics{} - recycledLogsCount, recycledLogSize := d.logRecycler.stats() + walStats := d.mu.log.manager.Stats() + completedObsoleteFileStats := d.cleanupManager.CompletedStats() d.mu.Lock() vers := d.mu.versions.currentVersion() *metrics = d.mu.versions.metrics - metrics.Compact.EstimatedDebt = d.mu.versions.picker.estimatedCompactionDebt(0) + metrics.Compact.EstimatedDebt = d.mu.versions.picker.estimatedCompactionDebt() metrics.Compact.InProgressBytes = d.mu.versions.atomicInProgressBytes.Load() - metrics.Compact.NumInProgress = int64(d.mu.compact.compactingCount) + // TODO(radu): split this to separate the download compactions. 
+ metrics.Compact.NumInProgress = int64(d.mu.compact.compactingCount + d.mu.compact.downloadingCount) metrics.Compact.MarkedFiles = vers.Stats.MarkedForCompaction metrics.Compact.Duration = d.mu.compact.duration for c := range d.mu.compact.inProgress { - if c.kind != compactionKindFlush { - metrics.Compact.Duration += d.timeNow().Sub(c.beganAt) + if !c.IsFlush() { + metrics.Compact.Duration += d.timeNow().Sub(c.BeganAt()) } } + metrics.Compact.NumProblemSpans = d.problemSpans.Len() for _, m := range d.mu.mem.queue { metrics.MemTable.Size += m.totalBytes() @@ -1931,80 +2098,117 @@ func (d *DB) Metrics() *Metrics { metrics.MemTable.Count = int64(len(d.mu.mem.queue)) metrics.MemTable.ZombieCount = d.memTableCount.Load() - metrics.MemTable.Count metrics.MemTable.ZombieSize = uint64(d.memTableReserved.Load()) - metrics.MemTable.Size - metrics.WAL.ObsoleteFiles = int64(recycledLogsCount) - metrics.WAL.ObsoletePhysicalSize = recycledLogSize + metrics.WAL.ObsoleteFiles = int64(walStats.ObsoleteFileCount) + metrics.WAL.ObsoletePhysicalSize = walStats.ObsoleteFileSize + metrics.WAL.Files = int64(walStats.LiveFileCount) + // The current WAL's size (d.logSize) is the logical size, which may be less + // than the WAL's physical size if it was recycled. walStats.LiveFileSize + // includes the physical size of all live WALs, but for the current WAL it + // reflects the physical size when it was opened. So it is possible that + // d.atomic.logSize has exceeded that physical size. We allow for this + // anomaly. + metrics.WAL.PhysicalSize = walStats.LiveFileSize + metrics.WAL.BytesIn = d.logBytesIn.Load() metrics.WAL.Size = d.logSize.Load() - // The current WAL size (d.atomic.logSize) is the current logical size, - // which may be less than the WAL's physical size if it was recycled. - // The file sizes in d.mu.log.queue are updated to the physical size - // during WAL rotation. Use the larger of the two for the current WAL. 
All - // the previous WALs's fileSizes in d.mu.log.queue are already updated. - metrics.WAL.PhysicalSize = metrics.WAL.Size - if len(d.mu.log.queue) > 0 && metrics.WAL.PhysicalSize < d.mu.log.queue[len(d.mu.log.queue)-1].fileSize { - metrics.WAL.PhysicalSize = d.mu.log.queue[len(d.mu.log.queue)-1].fileSize - } - for i, n := 0, len(d.mu.log.queue)-1; i < n; i++ { - metrics.WAL.PhysicalSize += d.mu.log.queue[i].fileSize - } - - metrics.WAL.BytesIn = d.mu.log.bytesIn // protected by d.mu for i, n := 0, len(d.mu.mem.queue)-1; i < n; i++ { metrics.WAL.Size += d.mu.mem.queue[i].logSize } - metrics.WAL.BytesWritten = metrics.Levels[0].BytesIn + metrics.WAL.Size + metrics.WAL.BytesWritten = metrics.Levels[0].TableBytesIn + metrics.WAL.Size + metrics.WAL.Failover = walStats.Failover + if p := d.mu.versions.picker; p != nil { compactions := d.getInProgressCompactionInfoLocked(nil) - for level, score := range p.getScores(compactions) { - metrics.Levels[level].Score = score + m := p.getMetrics(compactions) + for level, lm := range m.levels { + metrics.Levels[level].Score = lm.score + metrics.Levels[level].FillFactor = lm.fillFactor + metrics.Levels[level].CompensatedFillFactor = lm.compensatedFillFactor } } - metrics.Table.ZombieCount = int64(len(d.mu.versions.zombieTables)) - for _, size := range d.mu.versions.zombieTables { - metrics.Table.ZombieSize += size - } + metrics.Table.ZombieCount = int64(d.mu.versions.zombieTables.Count()) + metrics.Table.ZombieSize = d.mu.versions.zombieTables.TotalSize() + metrics.Table.Local.ZombieCount, metrics.Table.Local.ZombieSize = d.mu.versions.zombieTables.LocalStats() + + // The obsolete blob/table metrics have a subtle calculation: + // + // (A) The vs.metrics.{Table,BlobFiles}.[Local.]{ObsoleteCount,ObsoleteSize} + // fields reflect the set of files currently sitting in + // vs.obsolete{Tables,Blobs} but not yet enqueued to the cleanup manager. 
+ // + // (B) The d.mu.fileDeletions.queuedStats field holds the set of files that have + // been queued for deletion by the cleanup manager. + // + // (C) The cleanup manager also maintains cumulative stats for the set of + // files that have been deleted. + // + // The value of currently pending obsolete files is (A) + (B) - (C). + pendingObsoleteFileStats := d.mu.fileDeletions.queuedStats + pendingObsoleteFileStats.Sub(completedObsoleteFileStats) + metrics.Table.Local.ObsoleteCount += pendingObsoleteFileStats.tablesLocal.count + metrics.Table.Local.ObsoleteSize += pendingObsoleteFileStats.tablesLocal.size + metrics.Table.ObsoleteCount += int64(pendingObsoleteFileStats.tablesAll.count) + metrics.Table.ObsoleteSize += pendingObsoleteFileStats.tablesAll.size + metrics.BlobFiles.Local.ObsoleteCount += pendingObsoleteFileStats.blobFilesLocal.count + metrics.BlobFiles.Local.ObsoleteSize += pendingObsoleteFileStats.blobFilesLocal.size + metrics.BlobFiles.ObsoleteCount += pendingObsoleteFileStats.blobFilesAll.count + metrics.BlobFiles.ObsoleteSize += pendingObsoleteFileStats.blobFilesAll.size metrics.private.optionsFileSize = d.optionsFileSize // TODO(jackson): Consider making these metrics optional. 
- metrics.Keys.RangeKeySetsCount = countRangeKeySetFragments(vers) - metrics.Keys.TombstoneCount = countTombstones(vers) + metrics.Keys.RangeKeySetsCount = *rangeKeySetsAnnotator.MultiLevelAnnotation(vers.RangeKeyLevels[:]) + metrics.Keys.TombstoneCount = *tombstonesAnnotator.MultiLevelAnnotation(vers.Levels[:]) + + metrics.Table.Garbage.PointDeletionsBytesEstimate = + *pointDeletionsBytesEstimateAnnotator.MultiLevelAnnotation(vers.Levels[:]) + metrics.Table.Garbage.RangeDeletionsBytesEstimate = + *rangeDeletionsBytesEstimateAnnotator.MultiLevelAnnotation(vers.Levels[:]) d.mu.versions.logLock() metrics.private.manifestFileSize = uint64(d.mu.versions.manifest.Size()) - metrics.Table.BackingTableCount = uint64(len(d.mu.versions.backingState.fileBackingMap)) - metrics.Table.BackingTableSize = d.mu.versions.backingState.fileBackingSize - if invariants.Enabled { - var totalSize uint64 - for _, backing := range d.mu.versions.backingState.fileBackingMap { - totalSize += backing.Size - } - if totalSize != metrics.Table.BackingTableSize { - panic("pebble: invalid backing table size accounting") - } - } + backingCount, backingTotalSize := d.mu.versions.latest.virtualBackings.Stats() + metrics.Table.BackingTableCount = uint64(backingCount) + metrics.Table.BackingTableSize = backingTotalSize + blobStats, _ := d.mu.versions.latest.blobFiles.Stats() d.mu.versions.logUnlock() + metrics.BlobFiles.LiveCount = blobStats.Count + metrics.BlobFiles.LiveSize = blobStats.PhysicalSize + metrics.BlobFiles.ValueSize = blobStats.ValueSize + metrics.BlobFiles.ReferencedValueSize = blobStats.ReferencedValueSize metrics.LogWriter.FsyncLatency = d.mu.log.metrics.fsyncLatency if err := metrics.LogWriter.Merge(&d.mu.log.metrics.LogWriterMetrics); err != nil { - d.opts.Logger.Infof("metrics error: %s", err) + d.opts.Logger.Errorf("metrics error: %s", err) } metrics.Flush.WriteThroughput = d.mu.compact.flushWriteThroughput if d.mu.compact.flushing { metrics.Flush.NumInProgress = 1 } for i := 0; i < 
numLevels; i++ { - metrics.Levels[i].Additional.ValueBlocksSize = valueBlocksSizeForLevel(vers, i) + metrics.Levels[i].Additional.ValueBlocksSize = *valueBlockSizeAnnotator.LevelAnnotation(vers.Levels[i]) + compressionTypes := compressionTypeAnnotator.LevelAnnotation(vers.Levels[i]) + metrics.Table.CompressedCountUnknown += int64(compressionTypes.unknown) + metrics.Table.CompressedCountSnappy += int64(compressionTypes.snappy) + metrics.Table.CompressedCountZstd += int64(compressionTypes.zstd) + metrics.Table.CompressedCountMinLZ += int64(compressionTypes.minlz) + metrics.Table.CompressedCountNone += int64(compressionTypes.none) } + metrics.Table.PendingStatsCollectionCount = int64(len(d.mu.tableStats.pending)) + metrics.Table.InitialStatsCollectionComplete = d.mu.tableStats.loadedInitial + d.mu.Unlock() metrics.BlockCache = d.opts.Cache.Metrics() - metrics.TableCache, metrics.Filter = d.tableCache.metrics() - metrics.TableIters = int64(d.tableCache.iterCount()) + metrics.FileCache, metrics.Filter = d.fileCache.Metrics() + metrics.TableIters = d.fileCache.IterCount() + metrics.CategoryStats = d.fileCache.SSTStatsCollector().GetStats() metrics.SecondaryCacheMetrics = d.objProvider.Metrics() metrics.Uptime = d.timeNow().Sub(d.openedAt) + metrics.manualMemory = manual.GetMetrics() + return metrics } @@ -2044,8 +2248,7 @@ func WithKeyRangeFilter(start, end []byte) SSTablesOption { // WithApproximateSpanBytes enables capturing the approximate number of bytes that // overlap the provided key span for each sstable. -// NOTE: this option can only be used with WithKeyRangeFilter and WithProperties -// provided. +// NOTE: This option requires WithKeyRangeFilter. func WithApproximateSpanBytes() SSTablesOption { return func(opt *sstablesOptions) { opt.withApproximateSpanBytes = true @@ -2072,23 +2275,40 @@ const ( // or lifecycle management. An example of an external file is a file restored // from a backup. 
BackingTypeExternal + backingTypeCount ) +var backingTypeToString = [backingTypeCount]string{ + BackingTypeLocal: "local", + BackingTypeShared: "shared", + BackingTypeSharedForeign: "shared-foreign", + BackingTypeExternal: "external", +} + +// String implements fmt.Stringer. +func (b BackingType) String() string { + return backingTypeToString[b] +} + // SSTableInfo export manifest.TableInfo with sstable.Properties alongside // other file backing info. type SSTableInfo struct { manifest.TableInfo + TableStats manifest.TableStats // Virtual indicates whether the sstable is virtual. Virtual bool - // BackingSSTNum is the file number associated with backing sstable which - // backs the sstable associated with this SSTableInfo. If Virtual is false, - // then BackingSSTNum == FileNum. - BackingSSTNum base.FileNum + // BackingSSTNum is the disk file number associated with the backing sstable. + // If Virtual is false, BackingSSTNum == PhysicalTableDiskFileNum(TableNum). + BackingSSTNum base.DiskFileNum // BackingType is the type of storage backing this sstable. BackingType BackingType // Locator is the remote.Locator backing this sstable, if the backing type is // not BackingTypeLocal. Locator remote.Locator + // ApproximateSpanBytes describes the approximate number of bytes within the + // sstable that fall within a particular span. It's populated only when the + // ApproximateSpanBytes option is passed into DB.SSTables. + ApproximateSpanBytes uint64 `json:"ApproximateSpanBytes,omitempty"` // Properties is the sstable properties of this table. If Virtual is true, // then the Properties are associated with the backing sst. 
@@ -2105,11 +2325,8 @@ func (d *DB) SSTables(opts ...SSTablesOption) ([][]SSTableInfo, error) { fn(opt) } - if opt.withApproximateSpanBytes && !opt.withProperties { - return nil, errors.Errorf("Cannot use WithApproximateSpanBytes without WithProperties option.") - } if opt.withApproximateSpanBytes && (opt.start == nil || opt.end == nil) { - return nil, errors.Errorf("Cannot use WithApproximateSpanBytes without WithKeyRangeFilter option.") + return nil, errors.Errorf("cannot use WithApproximateSpanBytes without WithKeyRangeFilter option") } // Grab and reference the current readState. @@ -2117,7 +2334,7 @@ func (d *DB) SSTables(opts ...SSTablesOption) ([][]SSTableInfo, error) { defer readState.unref() // TODO(peter): This is somewhat expensive, especially on a large - // database. It might be worthwhile to unify TableInfo and FileMetadata and + // database. It might be worthwhile to unify TableInfo and TableMetadata and // then we could simply return current.Files. Note that RocksDB is doing // something similar to the current code, so perhaps it isn't too bad. 
srcLevels := readState.current.Levels @@ -2129,25 +2346,38 @@ func (d *DB) SSTables(opts ...SSTablesOption) ([][]SSTableInfo, error) { destTables := make([]SSTableInfo, totalTables) destLevels := make([][]SSTableInfo, len(srcLevels)) for i := range destLevels { - iter := srcLevels[i].Iter() j := 0 - for m := iter.First(); m != nil; m = iter.Next() { - if opt.start != nil && opt.end != nil && !m.Overlaps(d.opts.Comparer.Compare, opt.start, opt.end, true /* exclusive end */) { - continue + for m := range srcLevels[i].All() { + if opt.start != nil && opt.end != nil { + b := base.UserKeyBoundsEndExclusive(opt.start, opt.end) + if !m.Overlaps(d.opts.Comparer.Compare, &b) { + continue + } + } + var tableStats manifest.TableStats + if m.StatsValid() { + tableStats = m.Stats + } + destTables[j] = SSTableInfo{ + TableInfo: m.TableInfo(), + TableStats: tableStats, } - destTables[j] = SSTableInfo{TableInfo: m.TableInfo()} if opt.withProperties { - p, err := d.tableCache.getTableProperties( + p, err := d.fileCache.getTableProperties( m, ) if err != nil { return nil, err } + if m.Virtual { + commonProps := p.GetScaledProperties(m.TableBacking.Size, m.Size) + p = &sstable.Properties{CommonProperties: commonProps} + } destTables[j].Properties = p } destTables[j].Virtual = m.Virtual - destTables[j].BackingSSTNum = m.FileBacking.DiskFileNum.FileNum() - objMeta, err := d.objProvider.Lookup(fileTypeTable, m.FileBacking.DiskFileNum) + destTables[j].BackingSSTNum = m.TableBacking.DiskFileNum + objMeta, err := d.objProvider.Lookup(base.FileTypeTable, m.TableBacking.DiskFileNum) if err != nil { return nil, err } @@ -2167,25 +2397,15 @@ func (d *DB) SSTables(opts ...SSTablesOption) ([][]SSTableInfo, error) { } if opt.withApproximateSpanBytes { - var spanBytes uint64 if m.ContainedWithinSpan(d.opts.Comparer.Compare, opt.start, opt.end) { - spanBytes = m.Size + destTables[j].ApproximateSpanBytes = m.Size } else { - size, err := d.tableCache.estimateSize(m, opt.start, opt.end) + size, err := 
d.fileCache.estimateSize(m, opt.start, opt.end) if err != nil { return nil, err } - spanBytes = size - } - propertiesCopy := *destTables[j].Properties - - // Deep copy user properties so approximate span bytes can be added. - propertiesCopy.UserProperties = make(map[string]string, len(destTables[j].Properties.UserProperties)+1) - for k, v := range destTables[j].Properties.UserProperties { - propertiesCopy.UserProperties[k] = v + destTables[j].ApproximateSpanBytes = size } - propertiesCopy.UserProperties["approximate-span-bytes"] = strconv.FormatUint(spanBytes, 10) - destTables[j].Properties = &propertiesCopy } j++ } @@ -2196,6 +2416,39 @@ func (d *DB) SSTables(opts ...SSTablesOption) ([][]SSTableInfo, error) { return destLevels, nil } +// makeFileSizeAnnotator returns an annotator that computes the total +// storage size of files that meet some criteria defined by filter. When +// applicable, this includes both the sstable size and the size of any +// referenced blob files. +func (d *DB) makeFileSizeAnnotator( + filter func(f *manifest.TableMetadata) bool, +) *manifest.Annotator[uint64] { + return &manifest.Annotator[uint64]{ + Aggregator: manifest.SumAggregator{ + AccumulateFunc: func(f *manifest.TableMetadata) (uint64, bool) { + if filter(f) { + return f.Size + f.EstimatedReferenceSize(), true + } + return 0, true + }, + AccumulatePartialOverlapFunc: func(f *manifest.TableMetadata, bounds base.UserKeyBounds) uint64 { + if filter(f) { + overlappingFileSize, err := d.fileCache.estimateSize(f, bounds.Start, bounds.End.Key) + if err != nil { + return 0 + } + overlapFraction := float64(overlappingFileSize) / float64(f.Size) + // Scale the blob reference size proportionally to the file + // overlap from the bounds to approximate only the blob + // references that overlap with the requested bounds. 
+ return overlappingFileSize + uint64(float64(f.EstimatedReferenceSize())*overlapFraction) + } + return 0 + }, + }, + } +} + // EstimateDiskUsage returns the estimated filesystem space used in bytes for // storing the range `[start, end]`. The estimation is computed as follows: // @@ -2222,7 +2475,9 @@ func (d *DB) EstimateDiskUsageByBackingType( if err := d.closed.Load(); err != nil { panic(err) } - if d.opts.Comparer.Compare(start, end) > 0 { + + bounds := base.UserKeyBoundsInclusive(start, end) + if !bounds.Valid(d.cmp) { return 0, 0, 0, errors.New("invalid key-range specified (start > end)") } @@ -2232,70 +2487,11 @@ func (d *DB) EstimateDiskUsageByBackingType( readState := d.loadReadState() defer readState.unref() - for level, files := range readState.current.Levels { - iter := files.Iter() - if level > 0 { - // We can only use `Overlaps` to restrict `files` at L1+ since at L0 it - // expands the range iteratively until it has found a set of files that - // do not overlap any other L0 files outside that set. - overlaps := readState.current.Overlaps(level, d.opts.Comparer.Compare, start, end, false /* exclusiveEnd */) - iter = overlaps.Iter() - } - for file := iter.First(); file != nil; file = iter.Next() { - if d.opts.Comparer.Compare(start, file.Smallest.UserKey) <= 0 && - d.opts.Comparer.Compare(file.Largest.UserKey, end) <= 0 { - // The range fully contains the file, so skip looking it up in - // table cache/looking at its indexes, and add the full file size. 
- meta, err := d.objProvider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum) - if err != nil { - return 0, 0, 0, err - } - if meta.IsRemote() { - remoteSize += file.Size - if meta.Remote.CleanupMethod == objstorage.SharedNoCleanup { - externalSize += file.Size - } - } - totalSize += file.Size - } else if d.opts.Comparer.Compare(file.Smallest.UserKey, end) <= 0 && - d.opts.Comparer.Compare(start, file.Largest.UserKey) <= 0 { - var size uint64 - var err error - if file.Virtual { - err = d.tableCache.withVirtualReader( - file.VirtualMeta(), - func(r sstable.VirtualReader) (err error) { - size, err = r.EstimateDiskUsage(start, end) - return err - }, - ) - } else { - err = d.tableCache.withReader( - file.PhysicalMeta(), - func(r *sstable.Reader) (err error) { - size, err = r.EstimateDiskUsage(start, end) - return err - }, - ) - } - if err != nil { - return 0, 0, 0, err - } - meta, err := d.objProvider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum) - if err != nil { - return 0, 0, 0, err - } - if meta.IsRemote() { - remoteSize += size - if meta.Remote.CleanupMethod == objstorage.SharedNoCleanup { - externalSize += size - } - } - totalSize += size - } - } - } - return totalSize, remoteSize, externalSize, nil + totalSize = *d.mu.annotators.totalFileSize.VersionRangeAnnotation(readState.current, bounds) + remoteSize = *d.mu.annotators.remoteSize.VersionRangeAnnotation(readState.current, bounds) + externalSize = *d.mu.annotators.externalSize.VersionRangeAnnotation(readState.current, bounds) + + return } func (d *DB) walPreallocateSize() int { @@ -2312,13 +2508,24 @@ func (d *DB) walPreallocateSize() int { return int(size) } -func (d *DB) newMemTable(logNum FileNum, logSeqNum uint64) (*memTable, *flushableEntry) { +func (d *DB) newMemTable( + logNum base.DiskFileNum, logSeqNum base.SeqNum, minSize uint64, +) (*memTable, *flushableEntry) { + targetSize := minSize + uint64(memTableEmptySize) + // The targetSize should be less than MemTableSize, because any batch >= + 
// MemTableSize/2 should be treated as a large flushable batch. + if targetSize > d.opts.MemTableSize { + panic(errors.AssertionFailedf("attempting to allocate memtable larger than MemTableSize")) + } + // Double until the next memtable size is at least large enough to fit + // minSize. + for d.mu.mem.nextSize < targetSize { + d.mu.mem.nextSize = min(2*d.mu.mem.nextSize, d.opts.MemTableSize) + } size := d.mu.mem.nextSize + // The next memtable should be double the size, up to Options.MemTableSize. if d.mu.mem.nextSize < d.opts.MemTableSize { - d.mu.mem.nextSize *= 2 - if d.mu.mem.nextSize > d.opts.MemTableSize { - d.mu.mem.nextSize = d.opts.MemTableSize - } + d.mu.mem.nextSize = min(2*d.mu.mem.nextSize, d.opts.MemTableSize) } memtblOpts := memTableOptions{ @@ -2336,7 +2543,7 @@ func (d *DB) newMemTable(logNum FileNum, logSeqNum uint64) (*memTable, *flushabl // existing memory. var mem *memTable mem = d.memTableRecycle.Swap(nil) - if mem != nil && uint64(len(mem.arenaBuf)) != size { + if mem != nil && uint64(mem.arenaBuf.Len()) != size { d.freeMemTable(mem) mem = nil } @@ -2346,7 +2553,7 @@ func (d *DB) newMemTable(logNum FileNum, logSeqNum uint64) (*memTable, *flushabl memtblOpts.releaseAccountingReservation = mem.releaseAccountingReservation } else { mem = new(memTable) - memtblOpts.arenaBuf = manual.New(int(size)) + memtblOpts.arenaBuf = manual.New(manual.MemTable, uintptr(size)) memtblOpts.releaseAccountingReservation = d.opts.Cache.Reserve(int(size)) d.memTableCount.Add(1) d.memTableReserved.Add(int64(size)) @@ -2379,11 +2586,13 @@ func (d *DB) newMemTable(logNum FileNum, logSeqNum uint64) (*memTable, *flushabl func (d *DB) freeMemTable(m *memTable) { d.memTableCount.Add(-1) - d.memTableReserved.Add(-int64(len(m.arenaBuf))) + d.memTableReserved.Add(-int64(m.arenaBuf.Len())) m.free() } -func (d *DB) newFlushableEntry(f flushable, logNum FileNum, logSeqNum uint64) *flushableEntry { +func (d *DB) newFlushableEntry( + f flushable, logNum base.DiskFileNum, logSeqNum 
base.SeqNum, +) *flushableEntry { fe := &flushableEntry{ flushable: f, flushed: make(chan struct{}), @@ -2396,61 +2605,49 @@ func (d *DB) newFlushableEntry(f flushable, logNum FileNum, logSeqNum uint64) *f return fe } -// makeRoomForWrite ensures that the memtable has room to hold the contents of -// Batch. It reserves the space in the memtable and adds a reference to the -// memtable. The caller must later ensure that the memtable is unreferenced. If -// the memtable is full, or a nil Batch is provided, the current memtable is -// rotated (marked as immutable) and a new mutable memtable is allocated. This -// memtable rotation also causes a log rotation. +// maybeInduceWriteStall is called before performing a memtable rotation in +// makeRoomForWrite. In some conditions, we prefer to stall the user's write +// workload rather than continuing to accept writes that may result in resource +// exhaustion or prohibitively slow reads. // -// Both DB.mu and commitPipeline.mu must be held by the caller. Note that DB.mu -// may be released and reacquired. -func (d *DB) makeRoomForWrite(b *Batch) error { - if b != nil && b.ingestedSSTBatch { - panic("pebble: invalid function call") - } - - force := b == nil || b.flushable != nil +// There are a couple reasons we might wait to rotate the memtable and +// instead induce a write stall: +// 1. If too many memtables have queued, we wait for a flush to finish before +// creating another memtable. +// 2. If L0 read amplification has grown too high, we wait for compactions +// to reduce the read amplification before accepting more writes that will +// increase write pressure. +// +// maybeInduceWriteStall checks these stall conditions, and if present, waits +// for them to abate. +func (d *DB) maybeInduceWriteStall(b *Batch) { stalled := false + // This function will call EventListener.WriteStallBegin at most once. If + // it does call it, it will call EventListener.WriteStallEnd once before + // returning. 
for { - if b != nil && b.flushable == nil { - err := d.mu.mem.mutable.prepare(b) - if err != arenaskl.ErrArenaFull { - if stalled { - d.opts.EventListener.WriteStallEnd() - } - return err - } - } else if !force { - if stalled { - d.opts.EventListener.WriteStallEnd() - } - return nil - } - // force || err == ErrArenaFull, so we need to rotate the current memtable. - { - var size uint64 - for i := range d.mu.mem.queue { - size += d.mu.mem.queue[i].totalBytes() + var size uint64 + for i := range d.mu.mem.queue { + size += d.mu.mem.queue[i].totalBytes() + } + if size >= uint64(d.opts.MemTableStopWritesThreshold)*d.opts.MemTableSize && + !d.mu.log.manager.ElevateWriteStallThresholdForFailover() { + // We have filled up the current memtable, but already queued memtables + // are still flushing, so we wait. + if !stalled { + stalled = true + d.opts.EventListener.WriteStallBegin(WriteStallBeginInfo{ + Reason: "memtable count limit reached", + }) } - if size >= uint64(d.opts.MemTableStopWritesThreshold)*d.opts.MemTableSize { - // We have filled up the current memtable, but already queued memtables - // are still flushing, so we wait. - if !stalled { - stalled = true - d.opts.EventListener.WriteStallBegin(WriteStallBeginInfo{ - Reason: "memtable count limit reached", - }) - } - now := time.Now() - d.mu.compact.cond.Wait() - if b != nil { - b.commitStats.MemTableWriteStallDuration += time.Since(now) - } - continue + beforeWait := crtime.NowMono() + d.mu.compact.cond.Wait() + if b != nil { + b.commitStats.MemTableWriteStallDuration += beforeWait.Elapsed() } + continue } - l0ReadAmp := d.mu.versions.currentVersion().L0Sublevels.ReadAmplification() + l0ReadAmp := d.mu.versions.latest.l0Organizer.ReadAmplification() if l0ReadAmp >= d.opts.L0StopWritesThreshold { // There are too many level-0 files, so we wait. 
if !stalled { @@ -2459,38 +2656,59 @@ func (d *DB) makeRoomForWrite(b *Batch) error { Reason: "L0 file count limit exceeded", }) } - now := time.Now() + beforeWait := crtime.NowMono() d.mu.compact.cond.Wait() if b != nil { - b.commitStats.L0ReadAmpWriteStallDuration += time.Since(now) + b.commitStats.L0ReadAmpWriteStallDuration += beforeWait.Elapsed() } continue } - - var newLogNum base.FileNum - var prevLogSize uint64 - if !d.opts.DisableWAL { - now := time.Now() - newLogNum, prevLogSize = d.rotateWAL() - if b != nil { - b.commitStats.WALRotationDuration += time.Since(now) - } + // Not stalled. + if stalled { + d.opts.EventListener.WriteStallEnd() } + return + } +} - immMem := d.mu.mem.mutable - imm := d.mu.mem.queue[len(d.mu.mem.queue)-1] - imm.logSize = prevLogSize - imm.flushForced = imm.flushForced || (b == nil) +// makeRoomForWrite rotates the current mutable memtable, ensuring that the +// resulting mutable memtable has room to hold the contents of the provided +// Batch. The current memtable is rotated (marked as immutable) and a new +// mutable memtable is allocated. It reserves space in the new memtable and adds +// a reference to the memtable. The caller must later ensure that the memtable +// is unreferenced. This memtable rotation also causes a log rotation. +// +// If the current memtable is not full but the caller wishes to trigger a +// rotation regardless, the caller may pass a nil Batch, and no space in the +// resulting mutable memtable will be reserved. +// +// Both DB.mu and commitPipeline.mu must be held by the caller. Note that DB.mu +// may be released and reacquired. +func (d *DB) makeRoomForWrite(b *Batch) error { + if b != nil && b.ingestedSSTBatch { + panic("pebble: invalid function call") + } + d.maybeInduceWriteStall(b) - // If we are manually flushing and we used less than half of the bytes in - // the memtable, don't increase the size for the next memtable. 
This - // reduces memtable memory pressure when an application is frequently - // manually flushing. - if (b == nil) && uint64(immMem.availBytes()) > immMem.totalBytes()/2 { - d.mu.mem.nextSize = immMem.totalBytes() + var newLogNum base.DiskFileNum + var prevLogSize uint64 + if !d.opts.DisableWAL { + beforeRotate := crtime.NowMono() + newLogNum, prevLogSize = d.rotateWAL() + if b != nil { + b.commitStats.WALRotationDuration += beforeRotate.Elapsed() } + } + immMem := d.mu.mem.mutable + imm := d.mu.mem.queue[len(d.mu.mem.queue)-1] + imm.logSize = prevLogSize - if b != nil && b.flushable != nil { + var logSeqNum base.SeqNum + var minSize uint64 + if b != nil { + logSeqNum = b.SeqNum() + if b.flushable != nil { + logSeqNum += base.SeqNum(b.Count()) // The batch is too large to fit in the memtable so add it directly to // the immutable queue. The flushable batch is associated with the same // log as the immutable memtable, but logically occurs after it in @@ -2507,24 +2725,43 @@ func (d *DB) makeRoomForWrite(b *Batch) error { // for it until it is flushed. entry.releaseMemAccounting = d.opts.Cache.Reserve(int(b.flushable.totalBytes())) d.mu.mem.queue = append(d.mu.mem.queue, entry) - } - - var logSeqNum uint64 - if b != nil { - logSeqNum = b.SeqNum() - if b.flushable != nil { - logSeqNum += uint64(b.Count()) - } } else { - logSeqNum = d.mu.versions.logSeqNum.Load() + minSize = b.memTableSize } - d.rotateMemtable(newLogNum, logSeqNum, immMem) - force = false + } else { + // b == nil + // + // This is a manual forced flush. + logSeqNum = base.SeqNum(d.mu.versions.logSeqNum.Load()) + imm.flushForced = true + // If we are manually flushing and we used less than half of the bytes in + // the memtable, don't increase the size for the next memtable. This + // reduces memtable memory pressure when an application is frequently + // manually flushing. 
+ if uint64(immMem.availBytes()) > immMem.totalBytes()/2 { + d.mu.mem.nextSize = immMem.totalBytes() + } + } + d.rotateMemtable(newLogNum, logSeqNum, immMem, minSize) + if b != nil && b.flushable == nil { + err := d.mu.mem.mutable.prepare(b) + // Reserving enough space for the batch after rotation must never fail. + // We pass in a minSize that's equal to b.memtableSize to ensure that + // memtable rotation allocates a memtable sufficiently large. We also + // held d.commit.mu for the entirety of this function, ensuring that no + // other committers may have reserved memory in the new memtable yet. + if err == arenaskl.ErrArenaFull { + panic(errors.AssertionFailedf("memtable still full after rotation")) + } + return err } + return nil } // Both DB.mu and commitPipeline.mu must be held by the caller. -func (d *DB) rotateMemtable(newLogNum FileNum, logSeqNum uint64, prev *memTable) { +func (d *DB) rotateMemtable( + newLogNum base.DiskFileNum, logSeqNum base.SeqNum, prev *memTable, minSize uint64, +) { // Create a new memtable, scheduling the previous one for flushing. We do // this even if the previous memtable was empty because the DB.Flush // mechanism is dependent on being able to wait for the empty memtable to @@ -2541,7 +2778,7 @@ func (d *DB) rotateMemtable(newLogNum FileNum, logSeqNum uint64, prev *memTable) // // NB: prev should be the current mutable memtable. var entry *flushableEntry - d.mu.mem.mutable, entry = d.newMemTable(newLogNum, logSeqNum) + d.mu.mem.mutable, entry = d.newMemTable(newLogNum, logSeqNum, minSize) d.mu.mem.queue = append(d.mu.mem.queue, entry) // d.logSize tracks the log size of the WAL file corresponding to the most // recent flushable. The log size of the previous mutable memtable no longer @@ -2565,135 +2802,46 @@ func (d *DB) rotateMemtable(newLogNum FileNum, logSeqNum uint64, prev *memTable) // // Both DB.mu and commitPipeline.mu must be held by the caller. Note that DB.mu // may be released and reacquired. 
-func (d *DB) rotateWAL() (newLogNum FileNum, prevLogSize uint64) { +func (d *DB) rotateWAL() (newLogNum base.DiskFileNum, prevLogSize uint64) { if d.opts.DisableWAL { panic("pebble: invalid function call") } + jobID := d.newJobIDLocked() + newLogNum = d.mu.versions.getNextDiskFileNum() - jobID := d.mu.nextJobID - d.mu.nextJobID++ - newLogNum = d.mu.versions.getNextFileNum() - - prevLogSize = uint64(d.mu.log.Size()) - - // The previous log may have grown past its original physical - // size. Update its file size in the queue so we have a proper - // accounting of its file size. - if d.mu.log.queue[len(d.mu.log.queue)-1].fileSize < prevLogSize { - d.mu.log.queue[len(d.mu.log.queue)-1].fileSize = prevLogSize - } d.mu.Unlock() - - var err error // Close the previous log first. This writes an EOF trailer // signifying the end of the file and syncs it to disk. We must // close the previous log before linking the new log file, // otherwise a crash could leave both logs with unclean tails, and // Open will treat the previous log as corrupt. - err = d.mu.log.LogWriter.Close() - metrics := d.mu.log.LogWriter.Metrics() - d.mu.Lock() - if err := d.mu.log.metrics.Merge(metrics); err != nil { - d.opts.Logger.Infof("metrics error: %s", err) - } - d.mu.Unlock() - - newLogName := base.MakeFilepath(d.opts.FS, d.walDirname, fileTypeLog, newLogNum.DiskFileNum()) - - // Try to use a recycled log file. Recycling log files is an important - // performance optimization as it is faster to sync a file that has - // already been written, than one which is being written for the first - // time. This is due to the need to sync file metadata when a file is - // being written for the first time. Note this is true even if file - // preallocation is performed (e.g. fallocate). 
- var recycleLog fileInfo - var recycleOK bool - var newLogFile vfs.File - if err == nil { - recycleLog, recycleOK = d.logRecycler.peek() - if recycleOK { - recycleLogName := base.MakeFilepath(d.opts.FS, d.walDirname, fileTypeLog, recycleLog.fileNum) - newLogFile, err = d.opts.FS.ReuseForWrite(recycleLogName, newLogName) - base.MustExist(d.opts.FS, newLogName, d.opts.Logger, err) - } else { - newLogFile, err = d.opts.FS.Create(newLogName) - base.MustExist(d.opts.FS, newLogName, d.opts.Logger, err) - } - } - - var newLogSize uint64 - if err == nil && recycleOK { - // Figure out the recycled WAL size. This Stat is necessary - // because ReuseForWrite's contract allows for removing the - // old file and creating a new one. We don't know whether the - // WAL was actually recycled. - // TODO(jackson): Adding a boolean to the ReuseForWrite return - // value indicating whether or not the file was actually - // reused would allow us to skip the stat and use - // recycleLog.fileSize. - var finfo os.FileInfo - finfo, err = newLogFile.Stat() - if err == nil { - newLogSize = uint64(finfo.Size()) - } - } - - if err == nil { - // TODO(peter): RocksDB delays sync of the parent directory until the - // first time the log is synced. Is that worthwhile? 
- err = d.walDir.Sync() - } - - if err != nil && newLogFile != nil { - newLogFile.Close() - } else if err == nil { - newLogFile = vfs.NewSyncingFile(newLogFile, vfs.SyncingFileOptions{ - NoSyncOnClose: d.opts.NoSyncOnClose, - BytesPerSync: d.opts.WALBytesPerSync, - PreallocateSize: d.walPreallocateSize(), - }) - } - - if recycleOK { - err = firstError(err, d.logRecycler.pop(recycleLog.fileNum.FileNum())) - } - - d.opts.EventListener.WALCreated(WALCreateInfo{ - JobID: jobID, - Path: newLogName, - FileNum: newLogNum, - RecycledFileNum: recycleLog.fileNum.FileNum(), - Err: err, - }) - - d.mu.Lock() - - d.mu.versions.metrics.WAL.Files++ - + offset, err := d.mu.log.writer.Close() if err != nil { - // TODO(peter): avoid chewing through file numbers in a tight loop if there - // is an error here. - // // What to do here? Stumbling on doesn't seem worthwhile. If we failed to // close the previous log it is possible we lost a write. panic(err) } + prevLogSize = uint64(offset) + metrics := d.mu.log.writer.Metrics() - d.mu.log.queue = append(d.mu.log.queue, fileInfo{fileNum: newLogNum.DiskFileNum(), fileSize: newLogSize}) - d.mu.log.LogWriter = record.NewLogWriter(newLogFile, newLogNum, record.LogWriterConfig{ - WALFsyncLatency: d.mu.log.metrics.fsyncLatency, - WALMinSyncInterval: d.opts.WALMinSyncInterval, - QueueSemChan: d.commit.logSyncQSem, - }) - if d.mu.log.registerLogWriterForTesting != nil { - d.mu.log.registerLogWriterForTesting(d.mu.log.LogWriter) + d.mu.Lock() + if err := d.mu.log.metrics.LogWriterMetrics.Merge(&metrics); err != nil { + d.opts.Logger.Errorf("metrics error: %s", err) } - return + d.mu.Unlock() + writer, err := d.mu.log.manager.Create(wal.NumWAL(newLogNum), int(jobID)) + if err != nil { + panic(err) + } + + d.mu.Lock() + d.mu.log.writer = writer + return newLogNum, prevLogSize } -func (d *DB) getEarliestUnflushedSeqNumLocked() uint64 { - seqNum := InternalKeySeqNumMax +func (d *DB) getEarliestUnflushedSeqNumLocked() base.SeqNum { + seqNum := 
base.SeqNumMax for i := range d.mu.mem.queue { logSeqNum := d.mu.mem.queue[i].logSeqNum if seqNum > logSeqNum { @@ -2703,20 +2851,10 @@ func (d *DB) getEarliestUnflushedSeqNumLocked() uint64 { return seqNum } -func (d *DB) getInProgressCompactionInfoLocked(finishing *compaction) (rv []compactionInfo) { +func (d *DB) getInProgressCompactionInfoLocked(finishing compaction) (rv []compactionInfo) { for c := range d.mu.compact.inProgress { - if len(c.flushing) == 0 && (finishing == nil || c != finishing) { - info := compactionInfo{ - versionEditApplied: c.versionEditApplied, - inputs: c.inputs, - smallest: c.smallest, - largest: c.largest, - outputLevel: -1, - } - if c.outputLevel != nil { - info.outputLevel = c.outputLevel.level - } - rv = append(rv, info) + if !c.IsFlush() && (finishing == nil || c != finishing) { + rv = append(rv, c.Info()) } } return @@ -2741,8 +2879,7 @@ func inProgressL0Compactions(inProgress []compactionInfo) []manifest.L0Compactio continue } compactions = append(compactions, manifest.L0Compaction{ - Smallest: info.smallest, - Largest: info.largest, + Bounds: *info.bounds, IsIntraL0: info.outputLevel == 0, }) } @@ -2838,6 +2975,10 @@ func (d *DB) ScanStatistics( tb := tokenbucket.TokenBucket{} if opts.LimitBytesPerSecond != 0 { + const minBytesPerSec = 100 * 1024 + if opts.LimitBytesPerSecond < minBytesPerSec { + return stats, errors.Newf("pebble: ScanStatistics read bandwidth limit %d is below minimum %d", opts.LimitBytesPerSecond, minBytesPerSec) + } // Each "token" roughly corresponds to a byte that was read. 
tb.Init(tokenbucket.TokensPerSecond(opts.LimitBytesPerSecond), tokenbucket.Tokens(1024)) rateLimitFunc = func(key *InternalKey, val LazyValue) error { @@ -2873,7 +3014,7 @@ func (d *DB) ScanStatistics( stats.BytesRead += uint64(key.Size() + value.Len()) return nil }, - visitRangeDel: func(start, end []byte, seqNum uint64) error { + visitRangeDel: func(start, end []byte, seqNum base.SeqNum) error { stats.Accumulated.KindsCount[InternalKeyKindRangeDelete]++ stats.BytesRead += uint64(len(start) + len(end)) return nil @@ -2894,10 +3035,13 @@ func (d *DB) ScanStatistics( }, rateLimitFunc: rateLimitFunc, } - iter := d.newInternalIter(snapshotIterOpts{}, scanInternalOpts) + iter, err := d.newInternalIter(ctx, snapshotIterOpts{}, scanInternalOpts) + if err != nil { + return LSMKeyStatistics{}, err + } defer iter.close() - err := scanInternalImpl(ctx, lower, upper, iter, scanInternalOpts) + err = scanInternalImpl(ctx, lower, upper, iter, scanInternalOpts) if err != nil { return LSMKeyStatistics{}, err @@ -2912,93 +3056,114 @@ func (d *DB) ObjProvider() objstorage.Provider { return d.objProvider } -func (d *DB) checkVirtualBounds(m *fileMetadata) { +func (d *DB) checkVirtualBounds(m *manifest.TableMetadata) { if !invariants.Enabled { return } - if m.HasPointKeys { - pointIter, rangeDelIter, err := d.newIters(context.TODO(), m, nil, internalIterOpts{}) - if err != nil { - panic(errors.Wrap(err, "pebble: error creating point iterator")) - } + objMeta, err := d.objProvider.Lookup(base.FileTypeTable, m.TableBacking.DiskFileNum) + if err != nil { + panic(err) + } + if objMeta.IsExternal() { + // Nothing to do; bounds are expected to be loose. 
+ return + } - defer pointIter.Close() - if rangeDelIter != nil { - defer rangeDelIter.Close() - } + iters, err := d.newIters(context.TODO(), m, nil, internalIterOpts{}, iterPointKeys|iterRangeDeletions|iterRangeKeys) + if err != nil { + panic(errors.Wrap(err, "pebble: error creating iterators")) + } + defer func() { _ = iters.CloseAll() }() - pointKey, _ := pointIter.First() - var rangeDel *keyspan.Span - if rangeDelIter != nil { - rangeDel = rangeDelIter.First() - } + if m.HasPointKeys { + pointIter := iters.Point() + rangeDelIter := iters.RangeDeletion() // Check that the lower bound is tight. - if (rangeDel == nil || d.cmp(rangeDel.SmallestKey().UserKey, m.SmallestPointKey.UserKey) != 0) && - (pointKey == nil || d.cmp(pointKey.UserKey, m.SmallestPointKey.UserKey) != 0) { - panic(errors.Newf("pebble: virtual sstable %s lower point key bound is not tight", m.FileNum)) + pointKV := pointIter.First() + rangeDel, err := rangeDelIter.First() + if err != nil { + panic(err) } - - pointKey, _ = pointIter.Last() - rangeDel = nil - if rangeDelIter != nil { - rangeDel = rangeDelIter.Last() + if (rangeDel == nil || d.cmp(rangeDel.SmallestKey().UserKey, m.PointKeyBounds.Smallest().UserKey) != 0) && + (pointKV == nil || d.cmp(pointKV.K.UserKey, m.PointKeyBounds.Smallest().UserKey) != 0) { + panic(errors.Newf("pebble: virtual sstable %s lower point key bound is not tight", m.TableNum)) } // Check that the upper bound is tight. 
- if (rangeDel == nil || d.cmp(rangeDel.LargestKey().UserKey, m.LargestPointKey.UserKey) != 0) && - (pointKey == nil || d.cmp(pointKey.UserKey, m.LargestPointKey.UserKey) != 0) { - panic(errors.Newf("pebble: virtual sstable %s upper point key bound is not tight", m.FileNum)) + pointKV = pointIter.Last() + rangeDel, err = rangeDelIter.Last() + if err != nil { + panic(err) + } + if (rangeDel == nil || d.cmp(rangeDel.LargestKey().UserKey, m.PointKeyBounds.LargestUserKey()) != 0) && + (pointKV == nil || d.cmp(pointKV.K.UserKey, m.PointKeyBounds.Largest().UserKey) != 0) { + panic(errors.Newf("pebble: virtual sstable %s upper point key bound is not tight", m.TableNum)) } // Check that iterator keys are within bounds. - for key, _ := pointIter.First(); key != nil; key, _ = pointIter.Next() { - if d.cmp(key.UserKey, m.SmallestPointKey.UserKey) < 0 || d.cmp(key.UserKey, m.LargestPointKey.UserKey) > 0 { - panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, key.UserKey)) + for kv := pointIter.First(); kv != nil; kv = pointIter.Next() { + if d.cmp(kv.K.UserKey, m.PointKeyBounds.Smallest().UserKey) < 0 || d.cmp(kv.K.UserKey, m.PointKeyBounds.LargestUserKey()) > 0 { + panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.TableNum, kv.K.UserKey)) } } - - if rangeDelIter != nil { - for key := rangeDelIter.First(); key != nil; key = rangeDelIter.Next() { - if d.cmp(key.SmallestKey().UserKey, m.SmallestPointKey.UserKey) < 0 { - panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, key.SmallestKey().UserKey)) - } - - if d.cmp(key.LargestKey().UserKey, m.LargestPointKey.UserKey) > 0 { - panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, key.LargestKey().UserKey)) - } + s, err := rangeDelIter.First() + for ; s != nil; s, err = rangeDelIter.Next() { + if d.cmp(s.SmallestKey().UserKey, m.PointKeyBounds.Smallest().UserKey) < 0 { + 
panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.TableNum, s.SmallestKey().UserKey)) + } + if d.cmp(s.LargestKey().UserKey, m.PointKeyBounds.Largest().UserKey) > 0 { + panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.TableNum, s.LargestKey().UserKey)) } } + if err != nil { + panic(err) + } } if !m.HasRangeKeys { return } - - rangeKeyIter, err := d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{}) - defer rangeKeyIter.Close() - - if err != nil { - panic(errors.Wrap(err, "pebble: error creating range key iterator")) - } + rangeKeyIter := iters.RangeKey() // Check that the lower bound is tight. - if d.cmp(rangeKeyIter.First().SmallestKey().UserKey, m.SmallestRangeKey.UserKey) != 0 { - panic(errors.Newf("pebble: virtual sstable %s lower range key bound is not tight", m.FileNum)) + if s, err := rangeKeyIter.First(); err != nil { + panic(err) + } else if m.HasRangeKeys && d.cmp(s.SmallestKey().UserKey, m.RangeKeyBounds.SmallestUserKey()) != 0 { + panic(errors.Newf("pebble: virtual sstable %s lower range key bound is not tight", m.TableNum)) } // Check that upper bound is tight. 
- if d.cmp(rangeKeyIter.Last().LargestKey().UserKey, m.LargestRangeKey.UserKey) != 0 { - panic(errors.Newf("pebble: virtual sstable %s upper range key bound is not tight", m.FileNum)) + if s, err := rangeKeyIter.Last(); err != nil { + panic(err) + } else if d.cmp(s.LargestKey().UserKey, m.RangeKeyBounds.LargestUserKey()) != 0 { + panic(errors.Newf("pebble: virtual sstable %s upper range key bound is not tight", m.TableNum)) } - for key := rangeKeyIter.First(); key != nil; key = rangeKeyIter.Next() { - if d.cmp(key.SmallestKey().UserKey, m.SmallestRangeKey.UserKey) < 0 { - panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, key.SmallestKey().UserKey)) + s, err := rangeKeyIter.First() + for ; s != nil; s, err = rangeKeyIter.Next() { + if d.cmp(s.SmallestKey().UserKey, m.RangeKeyBounds.SmallestUserKey()) < 0 { + panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.TableNum, s.SmallestKey().UserKey)) } - if d.cmp(key.LargestKey().UserKey, m.LargestRangeKey.UserKey) > 0 { - panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, key.LargestKey().UserKey)) + if d.cmp(s.LargestKey().UserKey, m.RangeKeyBounds.LargestUserKey()) > 0 { + panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.TableNum, s.LargestKey().UserKey)) } } + if err != nil { + panic(err) + } +} + +// DebugString returns a debugging string describing the LSM. +func (d *DB) DebugString() string { + return d.DebugCurrentVersion().DebugString() +} + +// DebugCurrentVersion returns the current LSM tree metadata. Should only be +// used for testing/debugging. 
+func (d *DB) DebugCurrentVersion() *manifest.Version { + d.mu.Lock() + defer d.mu.Unlock() + return d.mu.versions.currentVersion() } diff --git a/vendor/github.com/cockroachdb/pebble/v2/db_internals.go b/vendor/github.com/cockroachdb/pebble/v2/db_internals.go new file mode 100644 index 0000000..57432ec --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/db_internals.go @@ -0,0 +1,24 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +// JobID identifies a job (like a compaction). Job IDs are passed to event +// listener notifications and act as a mechanism for tying together the events +// and log messages for a single job such as a flush, compaction, or file +// ingestion. Job IDs are not serialized to disk or used for correctness. +type JobID int + +// newJobIDLocked returns a new JobID; DB.mu must be held. +func (d *DB) newJobIDLocked() JobID { + res := d.mu.nextJobID + d.mu.nextJobID++ + return res +} + +func (d *DB) newJobID() JobID { + d.mu.Lock() + defer d.mu.Unlock() + return d.newJobIDLocked() +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/download.go b/vendor/github.com/cockroachdb/pebble/v2/download.go new file mode 100644 index 0000000..b40ec24 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/download.go @@ -0,0 +1,553 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "cmp" + "context" + "fmt" + "slices" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/objstorage" +) + +// DownloadSpan is a key range passed to the Download method. 
+type DownloadSpan struct { + StartKey []byte + // EndKey is exclusive. + EndKey []byte + // ViaBackingFileDownload, if true, indicates the span should be downloaded by + // downloading any remote backing files byte-for-byte and replacing them with + // the downloaded local files, while otherwise leaving the virtual SSTables + // as-is. If false, a "normal" rewriting compaction of the span, that iterates + // the keys and produces a new SSTable, is used instead. Downloading raw files + // can be faster when the whole file is being downloaded, as it avoids some + // cpu-intensive steps involved in iteration and new file construction such as + // compression, however it can also be wasteful when only a small portion of a + // larger backing file is being used by a virtual file. Additionally, if the + // virtual file has expensive read-time transformations, such as prefix + // replacement, rewriting once can persist the result of these for future use + // while copying only the backing file will obligate future reads to continue + // to compute such transforms. + ViaBackingFileDownload bool +} + +// Download ensures that the LSM does not use any external sstables for the +// given key ranges. It does so by performing appropriate compactions so that +// all external data becomes available locally. +// +// Note that calling this method does not imply that all other compactions stop; +// it simply informs Pebble of a list of spans for which external data should be +// downloaded with high priority. +// +// The method returns once no external sstables overlap the given spans, the +// context is canceled, the db is closed, or an error is hit. +// +// Note that despite the best effort of this method, if external ingestions +// happen in parallel, a new external file can always appear right as we're +// returning. +// +// TODO(radu): consider passing a priority/impact knob to express how important +// the download is (versus live traffic performance, LSM health). 
+func (d *DB) Download(ctx context.Context, spans []DownloadSpan) error { + ctx, cancel := context.WithCancel(ctx) + defer cancel() + if err := d.closed.Load(); err != nil { + panic(err) + } + if d.opts.ReadOnly { + return ErrReadOnly + } + info := DownloadInfo{ + JobID: int(d.newJobID()), + Spans: spans, + } + startTime := d.timeNow() + d.opts.EventListener.DownloadBegin(info) + + for info.RestartCount = 0; ; info.RestartCount++ { + tasks := d.createDownloadTasks(spans) + info.Duration = d.timeNow().Sub(startTime) + if len(tasks) == 0 { + // We are done. + info.Done = true + d.opts.EventListener.DownloadEnd(info) + return nil + } + if info.RestartCount > 0 { + d.opts.EventListener.DownloadBegin(info) + } + + // Install the tasks. + d.mu.Lock() + d.mu.compact.downloads = append(d.mu.compact.downloads, tasks...) + d.maybeScheduleCompaction() + d.mu.Unlock() + + err := d.waitForDownloadTasks(ctx, tasks) + for _, t := range tasks { + info.DownloadCompactionsLaunched += t.numLaunchedDownloads + } + + if err != nil { + info.Err = err + info.Duration = d.timeNow().Sub(startTime) + d.opts.EventListener.DownloadEnd(info) + return err + } + } +} + +// createDownloadTasks creates downloadSpanTasks for the download spans that +// overlap external files in the given version. +func (d *DB) createDownloadTasks(spans []DownloadSpan) []*downloadSpanTask { + d.mu.Lock() + vers := d.mu.versions.currentVersion() + d.mu.Unlock() + + tasks := make([]*downloadSpanTask, 0, len(spans)) + for i := range spans { + task, ok := d.newDownloadSpanTask(vers, spans[i]) + // If !ok, there are no external files in this span. + if ok { + tasks = append(tasks, task) + } + } + return tasks +} + +// waitForDownloadTasks waits until all download tasks complete. 
+func (d *DB) waitForDownloadTasks(ctx context.Context, tasks []*downloadSpanTask) error { + for i := range tasks { + select { + case <-ctx.Done(): + d.removeDownloadTasks(tasks) + return ctx.Err() + + case err := <-tasks[i].taskCompletedChan: + if err != nil { + d.removeDownloadTasks(tasks) + return err + } + } + } + return nil +} + +// removeDownloadTasks removes all tasks in the given slice from +// d.mu.compact.downloads. +func (d *DB) removeDownloadTasks(tasks []*downloadSpanTask) { + d.mu.Lock() + defer d.mu.Unlock() + d.mu.compact.downloads = slices.DeleteFunc(d.mu.compact.downloads, func(t *downloadSpanTask) bool { + return slices.Contains(tasks, t) + }) +} + +// downloadSpanTask tracks the task of downloading external files that overlap +// with a DownloadSpan. +// +// A downloadSpanTask is spawned only if at least one overlapping external file +// is found in the current version. +// +// When a downloadSpanTask completes (i.e. taskCompletedChan is signaled) +// without an error, it is guaranteed that all external files that were +// overlapping the download span at the beginning of the task are downloaded. +// +// == Implementation == +// +// A download span task moves through the LSM within the given bounds in +// top-down level order (L0, L1, etc.), and in Smallest.UserKey order within +// each level (and breaking ties in L0 according to LargestSeqNum). We introduce +// the concept of a "download cursor" to keep track of where we are in this +// process, in a way that is independent of any one version. A cursor stores the +// level, a user key which is a lower bound for Smallest.UserKey within that +// level, and a sequence number which is a lower bound for the LargestSeqNum for +// files on that level starting at exactly that key. 
+// +// While a download task is running, tables with external backings can disappear +// due to excises or compactions; tables can move *down* (to a lower LSM level); +// or tables can have their bounds shrink due to excises (and these will appear +// as new tables, even though they have the same backing). The top-down, +// left-to-right-start-key ordering ensures that we don't miss any table +// (instead, we may examine it multiple times). +// +// We use a cursor that advances as our download task makes progress. Each time +// we encounter a file that needs downloading, we create a "bookmark". A +// bookmark conceptually represents a key range within a level and it +// corresponds to the bounds of the file that we discovered. It is represented +// as a cursor position (corresponding to the start) and an end boundary key. We +// need to remember the bookmark because the download compaction can fail (e.g. +// it can get canceled by an excise) and the file might get excised so we need +// to look again at all files within the original key range. +// +// It is also possible that we encounter files that are already part of a +// compaction. These can be move compaction, or can get canceled, so we can't +// just ignore these files; we create bookmarks for such files as well. +// +// We maintain no more than maxConcurrentDownloads bookmarks - the idea being +// that files that are part of compactions are getting downloaded anyway and we +// can effectively count them toward the limit. When we cannot create any more +// bookmarks, we stop advancing the task cursor. Note that it is not this code's +// job to enforce the maximum concurrency, this is simply a reasonable limit - we +// don't want to accumulate arbitrarily many bookmarks, since we check each one +// whenever tryLaunchDownloadCompaction is called (after every compaction +// completing). 
+// +// This implementation achieves O(maxConcurrentDownloads * N) level iterator +// operations across the entire task, where N is the (average) number of files +// within the bounds. +type downloadSpanTask struct { + downloadSpan DownloadSpan + + // The download task pertains to sstables which *start* (as per + // Smallest.UserKey) within these bounds. + bounds base.UserKeyBounds + + // taskCompletedChan is signaled when we have finished download compactions + // for all external files encountered within the bounds, or when one of these + // compactions reports an error (other than ErrCancelledCompaction). + taskCompletedChan chan error + + numLaunchedDownloads int + + // Keeps track of the current position; all files up to these position were + // examined and were either downloaded or we have bookmarks for them. + cursor downloadCursor + + // Bookmarks remember areas which correspond to downloads that we started or + // files that were undergoing other compactions and which we need to check + // again before completing the task. + bookmarks []downloadBookmark + + // Testing hooks. + testing struct { + launchDownloadCompaction func(f *manifest.TableMetadata) (chan error, bool) + } +} + +// downloadBookmark represents an area that was swept by the task cursor which +// corresponds to a file that was part of a running compaction or download. +type downloadBookmark struct { + start downloadCursor + endBound base.UserKeyBoundary + // downloadDoneCh is set if this bookmark corresponds to a download we + // started; in this case the channel will report the status of that + // compaction. + downloadDoneCh chan error +} + +func (d *DB) newDownloadSpanTask( + vers *manifest.Version, sp DownloadSpan, +) (_ *downloadSpanTask, ok bool) { + bounds := base.UserKeyBoundsEndExclusive(sp.StartKey, sp.EndKey) + // We are interested in all external sstables that *overlap* with + // [sp.StartKey, sp.EndKey). 
Expand the bounds to the left so that we + // include the start keys of any external sstables that overlap with + // sp.StartKey. + for _, ls := range vers.AllLevelsAndSublevels() { + iter := ls.Iter() + if f := iter.SeekGE(d.cmp, sp.StartKey); f != nil && + objstorage.IsExternalTable(d.objProvider, f.TableBacking.DiskFileNum) && + d.cmp(f.Smallest().UserKey, bounds.Start) < 0 { + bounds.Start = f.Smallest().UserKey + } + } + startCursor := downloadCursor{ + level: 0, + key: bounds.Start, + seqNum: 0, + } + f, level := startCursor.NextExternalFile(d.cmp, d.objProvider, bounds, vers) + if f == nil { + // No external files in the given span. + return nil, false + } + + return &downloadSpanTask{ + downloadSpan: sp, + bounds: bounds, + taskCompletedChan: make(chan error, 1), + cursor: makeCursorAtFile(f, level), + }, true +} + +// downloadCursor represents a position in the download process, which does not +// depend on a specific version. +// +// The Download process scans for external files level-by-level (starting with +// L0), and left-to-right (in terms of Smallest.UserKey) within each level. In +// L0, we break ties by the LargestSeqNum. +// +// A cursor can be thought of as a boundary between two files in a version +// (ordered by level, then by Smallest.UserKey, then by LargestSeqNum). A file +// is either "before" or "after" the cursor. +type downloadCursor struct { + // LSM level (0 to NumLevels). When level=NumLevels, the cursor is at the end. + level int + // Inclusive lower bound for Smallest.UserKey for tables on level. + key []byte + // Inclusive lower bound for sequence number for tables on level with + // Smallest.UserKey equaling key. Used to break ties within L0, and also used + // to position a cursor immediately after a given file. + seqNum base.SeqNum +} + +var endCursor = downloadCursor{level: manifest.NumLevels} + +// AtEnd returns true if the cursor is after all relevant files. 
+func (c downloadCursor) AtEnd() bool { + return c.level >= manifest.NumLevels +} + +func (c downloadCursor) String() string { + return fmt.Sprintf("level=%d key=%q seqNum=%d", c.level, c.key, c.seqNum) +} + +// makeCursorAtFile returns a downloadCursor that is immediately before the +// given file. Calling nextExternalFile on the resulting cursor (using the same +// version) should return f. +func makeCursorAtFile(f *manifest.TableMetadata, level int) downloadCursor { + return downloadCursor{ + level: level, + key: f.Smallest().UserKey, + seqNum: f.LargestSeqNum, + } +} + +// makeCursorAfterFile returns a downloadCursor that is immediately +// after the given file. +func makeCursorAfterFile(f *manifest.TableMetadata, level int) downloadCursor { + return downloadCursor{ + level: level, + key: f.Smallest().UserKey, + seqNum: f.LargestSeqNum + 1, + } +} + +func (c downloadCursor) FileIsAfterCursor( + cmp base.Compare, f *manifest.TableMetadata, level int, +) bool { + return c.Compare(cmp, makeCursorAfterFile(f, level)) < 0 +} + +func (c downloadCursor) Compare(keyCmp base.Compare, other downloadCursor) int { + if c := cmp.Compare(c.level, other.level); c != 0 { + return c + } + if c := keyCmp(c.key, other.key); c != 0 { + return c + } + return cmp.Compare(c.seqNum, other.seqNum) +} + +// NextExternalFile returns the first file after the cursor, returning the file +// and the level. If no such file exists, returns nil fileMetadata. +func (c downloadCursor) NextExternalFile( + cmp base.Compare, objProvider objstorage.Provider, bounds base.UserKeyBounds, v *manifest.Version, +) (_ *manifest.TableMetadata, level int) { + for !c.AtEnd() { + if f := c.NextExternalFileOnLevel(cmp, objProvider, bounds.End, v); f != nil { + return f, c.level + } + // Go to the next level. 
+ c.key = bounds.Start + c.seqNum = 0 + c.level++ + } + return nil, manifest.NumLevels +} + +// NextExternalFileOnLevel returns the first external file on c.level which is +// after c and with Smallest.UserKey within the end bound. +func (c downloadCursor) NextExternalFileOnLevel( + cmp base.Compare, + objProvider objstorage.Provider, + endBound base.UserKeyBoundary, + v *manifest.Version, +) *manifest.TableMetadata { + if c.level > 0 { + it := v.Levels[c.level].Iter() + return firstExternalFileInLevelIter(cmp, objProvider, c, it, endBound) + } + // For L0, we look at all sublevel iterators and take the first file. + var first *manifest.TableMetadata + var firstCursor downloadCursor + for _, sublevel := range v.L0SublevelFiles { + f := firstExternalFileInLevelIter(cmp, objProvider, c, sublevel.Iter(), endBound) + if f != nil { + c := makeCursorAtFile(f, c.level) + if first == nil || c.Compare(cmp, firstCursor) < 0 { + first = f + firstCursor = c + } + // Trim the end bound as an optimization. + endBound = base.UserKeyInclusive(f.Smallest().UserKey) + } + } + return first +} + +// firstExternalFileInLevelIter finds the first external file after the cursor +// but which starts before the endBound. It is assumed that the iterator +// corresponds to cursor.level. +func firstExternalFileInLevelIter( + cmp base.Compare, + objProvider objstorage.Provider, + cursor downloadCursor, + it manifest.LevelIterator, + endBound base.UserKeyBoundary, +) *manifest.TableMetadata { + f := it.SeekGE(cmp, cursor.key) + // Skip the file if it starts before cursor.key or is at that same key with lower + // sequence number. 
+ for f != nil && !cursor.FileIsAfterCursor(cmp, f, cursor.level) { + f = it.Next() + } + for ; f != nil && endBound.IsUpperBoundFor(cmp, f.Smallest().UserKey); f = it.Next() { + if f.Virtual && objstorage.IsExternalTable(objProvider, f.TableBacking.DiskFileNum) { + return f + } + } + return nil +} + +// tryLaunchDownloadForFile attempt to launch a download compaction for the +// given file. Returns true on success, or false if the file is already +// involved in a compaction. +func (d *DB) tryLaunchDownloadForFile( + vers *manifest.Version, + l0Organizer *manifest.L0Organizer, + env compactionEnv, + download *downloadSpanTask, + level int, + f *manifest.TableMetadata, +) (doneCh chan error, ok bool) { + if f.IsCompacting() { + return nil, false + } + if download.testing.launchDownloadCompaction != nil { + return download.testing.launchDownloadCompaction(f) + } + kind := compactionKindRewrite + if download.downloadSpan.ViaBackingFileDownload { + kind = compactionKindCopy + } + pc := pickDownloadCompaction(vers, l0Organizer, d.opts, env, d.mu.versions.picker.getBaseLevel(), kind, level, f) + if pc == nil { + // We are not able to run this download compaction at this time. + return nil, false + } + + download.numLaunchedDownloads++ + doneCh = make(chan error, 1) + c := newCompaction(pc, d.opts, d.timeNow(), d.objProvider, noopGrantHandle{}, d.TableFormat(), d.determineCompactionValueSeparation) + c.isDownload = true + d.mu.compact.downloadingCount++ + c.AddInProgressLocked(d) + go d.compact(c, doneCh) + return doneCh, true +} + +type launchDownloadResult int8 + +const ( + launchedCompaction launchDownloadResult = iota + didNotLaunchCompaction + downloadTaskCompleted +) + +func (d *DB) tryLaunchDownloadCompaction( + download *downloadSpanTask, + vers *manifest.Version, + l0Organizer *manifest.L0Organizer, + env compactionEnv, + maxConcurrentDownloads int, +) launchDownloadResult { + // First, check the bookmarks. 
+ for i := 0; i < len(download.bookmarks); i++ { + b := &download.bookmarks[i] + if b.downloadDoneCh != nil { + // First check if the compaction we launched completed. + select { + case compactionErr := <-b.downloadDoneCh: + if compactionErr != nil && !errors.Is(compactionErr, ErrCancelledCompaction) { + download.taskCompletedChan <- compactionErr + return downloadTaskCompleted + } + b.downloadDoneCh = nil + + // Even if the compaction finished without an error, we still want to + // check the rest of the bookmark range for external files. + // + // For example, say that we encounter a file ["a", "f"] and start a + // download (creating a bookmark). Then that file gets excised into new + // files ["a", "b"] and ["e", "f"] and the excise causes the download + // compaction to be cancelled. We will start another download compaction + // for ["a", "c"]; once that is complete, we still need to look at the + // rest of the bookmark range (i.e. up to "f") to discover the + // ["e", "f"] file. + + default: + // The compaction is still running, go to the next bookmark. + continue + } + } + + // If downloadDoneCh was nil, we are waiting on a compaction that we did not + // start. We are effectively polling the status by checking the external + // files within the bookmark. This is ok because this method is called (for + // this download task) at most once every time a compaction completes. + + f := b.start.NextExternalFileOnLevel(d.cmp, d.objProvider, b.endBound, vers) + if f == nil { + // No more external files for this bookmark, remove it. + download.bookmarks = slices.Delete(download.bookmarks, i, i+1) + i-- + continue + } + + // Move up the bookmark position to point at this file. 
+ b.start = makeCursorAtFile(f, b.start.level) + doneCh, ok := d.tryLaunchDownloadForFile(vers, l0Organizer, env, download, b.start.level, f) + if ok { + b.downloadDoneCh = doneCh + return launchedCompaction + } + // We could not launch a download, which means the file is part of another + // compaction. We leave the bookmark in place and will poll the status in + // the code above. + } + + // Try to advance the cursor and launch more downloads. + for len(download.bookmarks) < maxConcurrentDownloads { + f, level := download.cursor.NextExternalFile(d.cmp, d.objProvider, download.bounds, vers) + if f == nil { + download.cursor = endCursor + if len(download.bookmarks) == 0 { + download.taskCompletedChan <- nil + return downloadTaskCompleted + } + return didNotLaunchCompaction + } + download.cursor = makeCursorAfterFile(f, level) + + download.bookmarks = append(download.bookmarks, downloadBookmark{ + start: makeCursorAtFile(f, level), + endBound: base.UserKeyInclusive(f.Largest().UserKey), + }) + doneCh, ok := d.tryLaunchDownloadForFile(vers, l0Organizer, env, download, level, f) + if ok { + // We launched a download for this file. + download.bookmarks[len(download.bookmarks)-1].downloadDoneCh = doneCh + return launchedCompaction + } + } + + return didNotLaunchCompaction +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/error_iter.go b/vendor/github.com/cockroachdb/pebble/v2/error_iter.go new file mode 100644 index 0000000..cab08c8 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/error_iter.go @@ -0,0 +1,97 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "context" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +type errorIter struct { + err error +} + +// errorIter implements the base.InternalIterator interface. +var _ internalIterator = (*errorIter)(nil) + +func (c *errorIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { + return nil +} + +func (c *errorIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV { + return c.SeekPrefixGEStrict(prefix, key, flags) +} + +func (c *errorIter) SeekPrefixGEStrict( + prefix, key []byte, flags base.SeekGEFlags, +) *base.InternalKV { + return nil +} + +func (c *errorIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { + return nil +} + +func (c *errorIter) First() *base.InternalKV { + return nil +} + +func (c *errorIter) Last() *base.InternalKV { + return nil +} + +func (c *errorIter) Next() *base.InternalKV { + return nil +} + +func (c *errorIter) Prev() *base.InternalKV { + return nil +} + +func (c *errorIter) NextPrefix([]byte) *base.InternalKV { + return nil +} + +func (c *errorIter) Error() error { + return c.err +} + +func (c *errorIter) Close() error { + return c.err +} + +func (c *errorIter) String() string { + return "error" +} + +func (c *errorIter) SetBounds(lower, upper []byte) {} + +func (c *errorIter) SetContext(_ context.Context) {} + +func (c *errorIter) DebugTree(tp treeprinter.Node) { + tp.Childf("%T(%p)", c, c) +} + +type errorKeyspanIter struct { + err error +} + +// errorKeyspanIter implements the keyspan.FragmentIterator interface. 
+var _ keyspan.FragmentIterator = (*errorKeyspanIter)(nil) + +func (i *errorKeyspanIter) SeekGE(key []byte) (*keyspan.Span, error) { return nil, i.err } +func (i *errorKeyspanIter) SeekLT(key []byte) (*keyspan.Span, error) { return nil, i.err } +func (i *errorKeyspanIter) First() (*keyspan.Span, error) { return nil, i.err } +func (i *errorKeyspanIter) Last() (*keyspan.Span, error) { return nil, i.err } +func (i *errorKeyspanIter) Next() (*keyspan.Span, error) { return nil, i.err } +func (i *errorKeyspanIter) Prev() (*keyspan.Span, error) { return nil, i.err } +func (i *errorKeyspanIter) SetContext(ctx context.Context) {} +func (i *errorKeyspanIter) Close() {} +func (*errorKeyspanIter) String() string { return "error" } +func (*errorKeyspanIter) WrapChildren(wrap keyspan.WrapFn) {} +func (i *errorKeyspanIter) DebugTree(tp treeprinter.Node) { tp.Childf("%T(%p)", i, i) } diff --git a/vendor/github.com/cockroachdb/pebble/v2/event.go b/vendor/github.com/cockroachdb/pebble/v2/event.go new file mode 100644 index 0000000..a808868 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/event.go @@ -0,0 +1,1396 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "fmt" + "strings" + "sync" + "time" + + "github.com/cockroachdb/crlib/crtime" + "github.com/cockroachdb/errors" + errorsjoin "github.com/cockroachdb/errors/join" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/humanize" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/remote" + "github.com/cockroachdb/pebble/v2/vfs" + "github.com/cockroachdb/redact" +) + +// TableNum is an identifier for a table within a database. 
+type TableNum = base.TableNum + +// TableInfo exports the manifest.TableInfo type. +type TableInfo = manifest.TableInfo + +func tablesTotalSize(tables []TableInfo) uint64 { + var size uint64 + for i := range tables { + size += tables[i].Size + } + return size +} + +func formatFileNums(tables []TableInfo) string { + var buf strings.Builder + for i := range tables { + if i > 0 { + buf.WriteString(" ") + } + buf.WriteString(tables[i].FileNum.String()) + } + return buf.String() +} + +// DataCorruptionInfo contains the information for a DataCorruption event. +type DataCorruptionInfo struct { + // Path of the file that is corrupted. For remote files the path starts with + // "remote://". + Path string + IsRemote bool + // Locator is only set when IsRemote is true (note that an empty Locator is + // valid even then). + Locator remote.Locator + // Bounds indicates the keyspace range that is affected. + Bounds base.UserKeyBounds + // Details of the error. See cockroachdb/error for how to format with or + // without redaction. + Details error +} + +func (i DataCorruptionInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i DataCorruptionInfo) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("on-disk corruption: %s", redact.Safe(i.Path)) + if i.IsRemote { + w.Printf(" (remote locator %q)", redact.Safe(i.Locator)) + } + w.Printf("; bounds: %s; details: %+v", i.Bounds.String(), i.Details) +} + +// LevelInfo contains info pertaining to a particular level. +type LevelInfo struct { + Level int + Tables []TableInfo + Score float64 +} + +func (i LevelInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. 
+func (i LevelInfo) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("L%d [%s] (%s) Score=%.2f", + redact.Safe(i.Level), + redact.Safe(formatFileNums(i.Tables)), + redact.Safe(humanize.Bytes.Uint64(tablesTotalSize(i.Tables))), + redact.Safe(i.Score)) +} + +// BlobFileCreateInfo contains the info for a blob file creation event. +type BlobFileCreateInfo struct { + JobID int + // Reason is the reason for the table creation: "compacting", "flushing", or + // "ingesting". + Reason string + Path string + FileNum base.DiskFileNum +} + +func (i BlobFileCreateInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i BlobFileCreateInfo) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("[JOB %d] %s: blob file created %s", + redact.Safe(i.JobID), redact.Safe(i.Reason), i.FileNum) +} + +// BlobFileDeleteInfo contains the info for a blob file deletion event. +type BlobFileDeleteInfo struct { + JobID int + Path string + FileNum base.DiskFileNum + Err error +} + +func (i BlobFileDeleteInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i BlobFileDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] blob file delete error %s: %s", + redact.Safe(i.JobID), i.FileNum, i.Err) + return + } + w.Printf("[JOB %d] blob file deleted %s", redact.Safe(i.JobID), i.FileNum) +} + +// BlobFileRewriteInfo contains the info for a blob file rewrite event. +type BlobFileRewriteInfo struct { + // JobID is the ID of the job. + JobID int + // Input contains the input tables for the compaction organized by level. + Input BlobFileInfo + // Output contains the output tables generated by the compaction. The output + // info is empty for the compaction begin event. + Output BlobFileInfo + // Duration is the time spent compacting, including reading and writing + // files. 
+ Duration time.Duration + // TotalDuration is the total wall-time duration of the compaction, + // including applying the compaction to the database. TotalDuration is + // always ≥ Duration. + TotalDuration time.Duration + Done bool + // Err is set only if Done is true. If non-nil, indicates that the compaction + // failed. Note that err can be ErrCancelledCompaction, which can happen + // during normal operation. + Err error +} + +func (i BlobFileRewriteInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i BlobFileRewriteInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] blob file (%s, %s) rewrite error: %s", + redact.Safe(i.JobID), i.Input.BlobFileID, i.Input.DiskFileNum, i.Err) + return + } + + if !i.Done { + w.Printf("[JOB %d] rewriting blob file %s (physical file %s)", + redact.Safe(i.JobID), i.Input.BlobFileID, i.Input.DiskFileNum) + return + } + w.Printf("[JOB %d] rewrote blob file (%s, %s) -> (%s, %s), in %.1fs (%.1fs total)", + redact.Safe(i.JobID), i.Input.BlobFileID, i.Input.DiskFileNum, + i.Output.BlobFileID, i.Output.DiskFileNum, + redact.Safe(i.Duration.Seconds()), + redact.Safe(i.TotalDuration.Seconds())) +} + +// BlobFileInfo describes a blob file. +type BlobFileInfo struct { + // BlobFileID is the logical ID of the blob file. + BlobFileID base.BlobFileID + // DiskFileNum is the file number of the blob file on disk. + DiskFileNum base.DiskFileNum + // Size is the physical size of the file in bytes. + Size uint64 + // ValueSize is the pre-compressed size of the values in the blob file in + // bytes. + ValueSize uint64 +} + +// CompactionInfo contains the info for a compaction event. +type CompactionInfo struct { + // JobID is the ID of the compaction job. + JobID int + // Reason is the reason for the compaction. + Reason string + // Input contains the input tables for the compaction organized by level. 
+ Input []LevelInfo + // Output contains the output tables generated by the compaction. The output + // tables are empty for the compaction begin event. + Output LevelInfo + // Duration is the time spent compacting, including reading and writing + // sstables. + Duration time.Duration + // TotalDuration is the total wall-time duration of the compaction, + // including applying the compaction to the database. TotalDuration is + // always ≥ Duration. + TotalDuration time.Duration + Done bool + // Err is set only if Done is true. If non-nil, indicates that the compaction + // failed. Note that err can be ErrCancelledCompaction, which can happen + // during normal operation. + Err error + + SingleLevelOverlappingRatio float64 + MultiLevelOverlappingRatio float64 + + // Annotations specifies additional info to appear in a compaction's event log line + Annotations compactionAnnotations +} + +type compactionAnnotations []string + +// SafeFormat implements redact.SafeFormatter. +func (ca compactionAnnotations) SafeFormat(w redact.SafePrinter, _ rune) { + if len(ca) == 0 { + return + } + for i := range ca { + if i != 0 { + w.Print(" ") + } + w.Printf("%s", redact.SafeString(ca[i])) + } +} + +func (i CompactionInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. 
+func (i CompactionInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] compaction(%s) to L%d error: %s", + redact.Safe(i.JobID), redact.SafeString(i.Reason), redact.Safe(i.Output.Level), i.Err) + return + } + + if !i.Done { + w.Printf("[JOB %d] compacting(%s) ", + redact.Safe(i.JobID), + redact.SafeString(i.Reason)) + if len(i.Annotations) > 0 { + w.Printf("%s ", i.Annotations) + } + w.Printf("%s; ", levelInfos(i.Input)) + w.Printf("OverlappingRatio: Single %.2f, Multi %.2f", i.SingleLevelOverlappingRatio, i.MultiLevelOverlappingRatio) + return + } + outputSize := tablesTotalSize(i.Output.Tables) + w.Printf("[JOB %d] compacted(%s) ", redact.Safe(i.JobID), redact.SafeString(i.Reason)) + if len(i.Annotations) > 0 { + w.Printf("%s ", i.Annotations) + } + w.Print(levelInfos(i.Input)) + w.Printf(" -> L%d [%s] (%s), in %.1fs (%.1fs total), output rate %s/s", + redact.Safe(i.Output.Level), + redact.Safe(formatFileNums(i.Output.Tables)), + redact.Safe(humanize.Bytes.Uint64(outputSize)), + redact.Safe(i.Duration.Seconds()), + redact.Safe(i.TotalDuration.Seconds()), + redact.Safe(humanize.Bytes.Uint64(uint64(float64(outputSize)/i.Duration.Seconds())))) +} + +type levelInfos []LevelInfo + +func (i levelInfos) SafeFormat(w redact.SafePrinter, _ rune) { + for j, levelInfo := range i { + if j > 0 { + w.Printf(" + ") + } + w.Print(levelInfo) + } +} + +// DiskSlowInfo contains the info for a disk slowness event when writing to a +// file. +type DiskSlowInfo = vfs.DiskSlowInfo + +// FlushInfo contains the info for a flush event. +type FlushInfo struct { + // JobID is the ID of the flush job. + JobID int + // Reason is the reason for the flush. + Reason string + // Input contains the count of input memtables that were flushed. + Input int + // InputBytes contains the total in-memory size of the memtable(s) that were + // flushed. This size includes skiplist indexing data structures. 
+ InputBytes uint64 + // Output contains the ouptut table generated by the flush. The output info + // is empty for the flush begin event. + Output []TableInfo + // Duration is the time spent flushing. This duration includes writing and + // syncing all of the flushed keys to sstables. + Duration time.Duration + // TotalDuration is the total wall-time duration of the flush, including + // applying the flush to the database. TotalDuration is always ≥ Duration. + TotalDuration time.Duration + // Ingest is set to true if the flush is handling tables that were added to + // the flushable queue via an ingestion operation. + Ingest bool + // IngestLevels are the output levels for each ingested table in the flush. + // This field is only populated when Ingest is true. + IngestLevels []int + Done bool + Err error +} + +func (i FlushInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i FlushInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] flush error: %s", redact.Safe(i.JobID), i.Err) + return + } + + plural := redact.SafeString("s") + if i.Input == 1 { + plural = "" + } + if !i.Done { + w.Printf("[JOB %d] ", redact.Safe(i.JobID)) + if !i.Ingest { + w.Printf("flushing %d memtable", redact.Safe(i.Input)) + w.SafeString(plural) + w.Printf(" (%s) to L0", redact.Safe(humanize.Bytes.Uint64(i.InputBytes))) + } else { + w.Printf("flushing %d ingested table%s", redact.Safe(i.Input), plural) + } + return + } + + outputSize := tablesTotalSize(i.Output) + if !i.Ingest { + if invariants.Enabled && len(i.IngestLevels) > 0 { + panic(errors.AssertionFailedf("pebble: expected len(IngestedLevels) == 0")) + } + w.Printf("[JOB %d] flushed %d memtable%s (%s) to L0 [%s] (%s), in %.1fs (%.1fs total), output rate %s/s", + redact.Safe(i.JobID), redact.Safe(i.Input), plural, + redact.Safe(humanize.Bytes.Uint64(i.InputBytes)), + redact.Safe(formatFileNums(i.Output)), + 
redact.Safe(humanize.Bytes.Uint64(outputSize)), + redact.Safe(i.Duration.Seconds()), + redact.Safe(i.TotalDuration.Seconds()), + redact.Safe(humanize.Bytes.Uint64(uint64(float64(outputSize)/i.Duration.Seconds())))) + } else { + if invariants.Enabled && len(i.IngestLevels) == 0 { + panic(errors.AssertionFailedf("pebble: expected len(IngestedLevels) > 0")) + } + w.Printf("[JOB %d] flushed %d ingested flushable%s", + redact.Safe(i.JobID), redact.Safe(len(i.Output)), plural) + for j, level := range i.IngestLevels { + file := i.Output[j] + if j > 0 { + w.Printf(" +") + } + w.Printf(" L%d:%s (%s)", level, file.FileNum, humanize.Bytes.Uint64(file.Size)) + } + w.Printf(" in %.1fs (%.1fs total), output rate %s/s", + redact.Safe(i.Duration.Seconds()), + redact.Safe(i.TotalDuration.Seconds()), + redact.Safe(humanize.Bytes.Uint64(uint64(float64(outputSize)/i.Duration.Seconds())))) + } +} + +// DownloadInfo contains the info for a DB.Download() event. +type DownloadInfo struct { + // JobID is the ID of the download job. + JobID int + + Spans []DownloadSpan + + // Duration is the time since the operation was started. + Duration time.Duration + DownloadCompactionsLaunched int + + // RestartCount indicates that the download operation restarted because it + // noticed that new external files were ingested. A DownloadBegin event with + // RestartCount = 0 is the start of the operation; each time we restart it we + // have another DownloadBegin event with RestartCount > 0. + RestartCount int + Done bool + Err error +} + +func (i DownloadInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. 
+func (i DownloadInfo) SafeFormat(w redact.SafePrinter, _ rune) { + switch { + case i.Err != nil: + w.Printf("[JOB %d] download error after %1.fs: %s", redact.Safe(i.JobID), redact.Safe(i.Duration.Seconds()), i.Err) + + case i.Done: + w.Printf("[JOB %d] download finished in %.1fs (launched %d compactions)", + redact.Safe(i.JobID), redact.Safe(i.Duration.Seconds()), redact.Safe(i.DownloadCompactionsLaunched)) + + default: + if i.RestartCount == 0 { + w.Printf("[JOB %d] starting download for %d spans", redact.Safe(i.JobID), redact.Safe(len(i.Spans))) + } else { + w.Printf("[JOB %d] restarting download (restart #%d, time so far %.1fs, launched %d compactions)", + redact.Safe(i.JobID), redact.Safe(i.RestartCount), redact.Safe(i.Duration.Seconds()), + redact.Safe(i.DownloadCompactionsLaunched)) + } + } +} + +// ManifestCreateInfo contains info about a manifest creation event. +type ManifestCreateInfo struct { + // JobID is the ID of the job the caused the manifest to be created. + JobID int + Path string + // The file number of the new Manifest. + FileNum base.DiskFileNum + Err error +} + +func (i ManifestCreateInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i ManifestCreateInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] MANIFEST create error: %s", redact.Safe(i.JobID), i.Err) + return + } + w.Printf("[JOB %d] MANIFEST created %s", redact.Safe(i.JobID), i.FileNum) +} + +// ManifestDeleteInfo contains the info for a Manifest deletion event. +type ManifestDeleteInfo struct { + // JobID is the ID of the job the caused the Manifest to be deleted. + JobID int + Path string + FileNum base.DiskFileNum + Err error +} + +func (i ManifestDeleteInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. 
+func (i ManifestDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] MANIFEST delete error: %s", redact.Safe(i.JobID), i.Err) + return + } + w.Printf("[JOB %d] MANIFEST deleted %s", redact.Safe(i.JobID), i.FileNum) +} + +// TableCreateInfo contains the info for a table creation event. +type TableCreateInfo struct { + JobID int + // Reason is the reason for the table creation: "compacting", "flushing", or + // "ingesting". + Reason string + Path string + FileNum base.DiskFileNum +} + +func (i TableCreateInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i TableCreateInfo) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("[JOB %d] %s: sstable created %s", + redact.Safe(i.JobID), redact.Safe(i.Reason), i.FileNum) +} + +// TableDeleteInfo contains the info for a table deletion event. +type TableDeleteInfo struct { + JobID int + Path string + FileNum base.DiskFileNum + Err error +} + +func (i TableDeleteInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i TableDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] sstable delete error %s: %s", + redact.Safe(i.JobID), i.FileNum, i.Err) + return + } + w.Printf("[JOB %d] sstable deleted %s", redact.Safe(i.JobID), i.FileNum) +} + +// TableIngestInfo contains the info for a table ingestion event. +type TableIngestInfo struct { + // JobID is the ID of the job the caused the table to be ingested. + JobID int + Tables []struct { + TableInfo + Level int + } + // GlobalSeqNum is the sequence number that was assigned to all entries in + // the ingested table. + GlobalSeqNum base.SeqNum + // flushable indicates whether the ingested sstable was treated as a + // flushable. 
+ flushable bool + Err error +} + +func (i TableIngestInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i TableIngestInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] ingest error: %s", redact.Safe(i.JobID), i.Err) + return + } + + if i.flushable { + w.Printf("[JOB %d] ingested as flushable", redact.Safe(i.JobID)) + } else { + w.Printf("[JOB %d] ingested", redact.Safe(i.JobID)) + } + + for j := range i.Tables { + t := &i.Tables[j] + if j > 0 { + w.Printf(",") + } + levelStr := "" + if !i.flushable { + levelStr = fmt.Sprintf("L%d:", t.Level) + } + w.Printf(" %s%s (%s)", redact.Safe(levelStr), t.FileNum, + redact.Safe(humanize.Bytes.Uint64(t.Size))) + } +} + +// TableStatsInfo contains the info for a table stats loaded event. +type TableStatsInfo struct { + // JobID is the ID of the job that finished loading the initial tables' + // stats. + JobID int +} + +func (i TableStatsInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i TableStatsInfo) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("[JOB %d] all initial table stats loaded", redact.Safe(i.JobID)) +} + +// TableValidatedInfo contains information on the result of a validation run +// on an sstable. +type TableValidatedInfo struct { + JobID int + Meta *manifest.TableMetadata +} + +func (i TableValidatedInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i TableValidatedInfo) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("[JOB %d] validated table: %s", redact.Safe(i.JobID), i.Meta) +} + +// WALCreateInfo contains info about a WAL creation event. +type WALCreateInfo struct { + // JobID is the ID of the job the caused the WAL to be created. + JobID int + Path string + // The file number of the new WAL. 
+ FileNum base.DiskFileNum + // The file number of a previous WAL which was recycled to create this + // one. Zero if recycling did not take place. + RecycledFileNum base.DiskFileNum + Err error +} + +func (i WALCreateInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i WALCreateInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] WAL create error: %s", redact.Safe(i.JobID), i.Err) + return + } + + if i.RecycledFileNum == 0 { + w.Printf("[JOB %d] WAL created %s", redact.Safe(i.JobID), i.FileNum) + return + } + + w.Printf("[JOB %d] WAL created %s (recycled %s)", + redact.Safe(i.JobID), i.FileNum, i.RecycledFileNum) +} + +// WALDeleteInfo contains the info for a WAL deletion event. +// +// TODO(sumeer): extend WALDeleteInfo for the failover case in case the path +// is insufficient to infer whether primary or secondary. +type WALDeleteInfo struct { + // JobID is the ID of the job the caused the WAL to be deleted. + JobID int + Path string + FileNum base.DiskFileNum + Err error +} + +func (i WALDeleteInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i WALDeleteInfo) SafeFormat(w redact.SafePrinter, _ rune) { + if i.Err != nil { + w.Printf("[JOB %d] WAL delete error: %s", redact.Safe(i.JobID), i.Err) + return + } + w.Printf("[JOB %d] WAL deleted %s", redact.Safe(i.JobID), i.FileNum) +} + +// WriteStallBeginInfo contains the info for a write stall begin event. +type WriteStallBeginInfo struct { + Reason string +} + +func (i WriteStallBeginInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i WriteStallBeginInfo) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("write stall beginning: %s", redact.Safe(i.Reason)) +} + +// LowDiskSpaceInfo contains the information for a LowDiskSpace +// event. 
+type LowDiskSpaceInfo struct { + // AvailBytes is the disk space available to the current process in bytes. + AvailBytes uint64 + // TotalBytes is the total disk space in bytes. + TotalBytes uint64 + // PercentThreshold is one of a set of fixed percentages in the + // lowDiskSpaceThresholds below. This event was issued because the disk + // space went below this threshold. + PercentThreshold int +} + +func (i LowDiskSpaceInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. +func (i LowDiskSpaceInfo) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf( + "available disk space under %d%% (%s of %s)", + redact.Safe(i.PercentThreshold), + redact.Safe(humanize.Bytes.Uint64(i.AvailBytes)), + redact.Safe(humanize.Bytes.Uint64(i.TotalBytes)), + ) +} + +// PossibleAPIMisuseInfo contains the information for a PossibleAPIMisuse event. +type PossibleAPIMisuseInfo struct { + Kind APIMisuseKind + + // UserKey is set for the following kinds: + // - IneffectualSingleDelete, + // - NondeterministicSingleDelete, + // - MissizedDelete, + // - InvalidValue. + UserKey []byte + + // ExtraInfo is set for the following kinds: + // - MissizedDelete: contains "elidedSize=,expectedSize=" + // - InvalidValue: contains "callback=,value=,err=" + ExtraInfo redact.RedactableString +} + +func (i PossibleAPIMisuseInfo) String() string { + return redact.StringWithoutMarkers(i) +} + +// SafeFormat implements redact.SafeFormatter. 
+func (i PossibleAPIMisuseInfo) SafeFormat(w redact.SafePrinter, _ rune) { + switch i.Kind { + case IneffectualSingleDelete, NondeterministicSingleDelete: + w.Printf("possible API misuse: %s (key=%q)", redact.Safe(i.Kind), i.UserKey) + case MissizedDelete: + w.Printf("possible API misuse: %s (key=%q, %s)", redact.Safe(i.Kind), i.UserKey, i.ExtraInfo) + case InvalidValue: + w.Printf("possible API misuse: %s (key=%q, %s)", redact.Safe(i.Kind), i.UserKey, i.ExtraInfo) + default: + if invariants.Enabled { + panic("invalid API misuse event") + } + w.Printf("invalid API misuse event") + } +} + +// APIMisuseKind identifies the type of API misuse represented by a +// PossibleAPIMisuse event. +type APIMisuseKind int8 + +const ( + // IneffectualSingleDelete is emitted in compactions/flushes if any + // single delete is being elided without deleting a point set/merge. + // + // This event can sometimes be a false positive because of delete-only + // compactions which can cause a recent RANGEDEL to peek below an older + // SINGLEDEL and delete an arbitrary subset of data below that SINGLEDEL. + // + // Example: + // RANGEDEL [a, c)#10 in L0 + // SINGLEDEL b#5 in L1 + // SET b#3 in L6 + // + // If the L6 file containing the SET is narrow and the L1 file containing + // the SINGLEDEL is wide, a delete-only compaction can remove the file in + // L2 before the SINGLEDEL is compacted down. Then when the SINGLEDEL is + // compacted down, it will not find any SET to delete, resulting in the + // ineffectual callback. + IneffectualSingleDelete APIMisuseKind = iota + + // NondeterministicSingleDelete is emitted in compactions/flushes if any + // single delete has consumed a Set/Merge, and there is another immediately + // older Set/SetWithDelete/Merge. The user of Pebble has violated the + // invariant under which SingleDelete can be used correctly. + // + // Consider the sequence SingleDelete#3, Set#2, Set#1. There are three + // ways some of these keys can first meet in a compaction. 
+ // + // - All 3 keys in the same compaction: this callback will detect the + // violation. + // + // - SingleDelete#3, Set#2 meet in a compaction first: Both keys will + // disappear. The violation will not be detected, and the DB will have + // Set#1 which is likely incorrect (from the user's perspective). + // + // - Set#2, Set#1 meet in a compaction first: The output will be Set#2, + // which will later be consumed by SingleDelete#3. The violation will + // not be detected and the DB will be correct. + // + // This event can sometimes be a false positive because of delete-only + // compactions which can cause a recent RANGEDEL to peek below an older + // SINGLEDEL and delete an arbitrary subset of data below that SINGLEDEL. + // + // Example: + // RANGEDEL [a, z)#60 in L0 + // SINGLEDEL g#50 in L1 + // SET g#40 in L2 + // RANGEDEL [g,h)#30 in L3 + // SET g#20 in L6 + // + // In this example, the two SETs represent the same user write, and the + // RANGEDELs are caused by the CockroachDB range being dropped. That is, + // the user wrote to g once, range was dropped, then added back, which + // caused the SET again, then at some point g was validly deleted using a + // SINGLEDEL, and then the range was dropped again. The older RANGEDEL can + // get fragmented due to compactions it has been part of. Say this L3 file + // containing the RANGEDEL is very narrow, while the L1, L2, L6 files are + // wider than the RANGEDEL in L0. Then the RANGEDEL in L3 can be dropped + // using a delete-only compaction, resulting in an LSM with state: + // + // RANGEDEL [a, z)#60 in L0 + // SINGLEDEL g#50 in L1 + // SET g#40 in L2 + // SET g#20 in L6 + // + // A multi-level compaction involving L1, L2, L6 will cause the invariant + // violation callback. This example doesn't need multi-level compactions: + // say there was a Pebble snapshot at g#21 preventing g#20 from being + // dropped when it meets g#40 in a compaction. 
That snapshot will not save + // RANGEDEL [g,h)#30, so we can have: + // + // SINGLEDEL g#50 in L1 + // SET g#40, SET g#20 in L6 + // + // And say the snapshot is removed and then the L1 and L6 compaction + // happens, resulting in the invariant violation callback. + NondeterministicSingleDelete + + // MissizedDelete is emitted when a DELSIZED tombstone is found that did + // not accurately record the size of the value it deleted. This can lead to + // incorrect behavior in compactions. + MissizedDelete + + // InvalidValue is emitted when a user-implemented callback (such as + // ShortAttributeExtractor) returns an error for a committed value. This + // suggests that either the callback is not implemented for all possible + // values or a malformed value was committed to the DB. + InvalidValue +) + +func (k APIMisuseKind) String() string { + switch k { + case IneffectualSingleDelete: + return "ineffectual SINGLEDEL" + case NondeterministicSingleDelete: + return "nondeterministic SINGLEDEL" + case MissizedDelete: + return "missized DELSIZED" + case InvalidValue: + return "invalid value" + default: + return "unknown" + } +} + +// EventListener contains a set of functions that will be invoked when various +// significant DB events occur. Note that the functions should not run for an +// excessive amount of time as they are invoked synchronously by the DB and may +// block continued DB work. For a similar reason it is advisable to not perform +// any synchronous calls back into the DB. +type EventListener struct { + // BackgroundError is invoked whenever an error occurs during a background + // operation such as flush or compaction. + BackgroundError func(error) + + // BlobFileCreated is invoked after a blob file has been created. + BlobFileCreated func(BlobFileCreateInfo) + + // BlobFileDeleted is invoked after a blob file has been deleted. + BlobFileDeleted func(BlobFileDeleteInfo) + + // BlobFileRewriteBegin is invoked when a blob file rewrite compaction begins. 
+ BlobFileRewriteBegin func(BlobFileRewriteInfo) + + // BlobFileRewriteEnd is invoked when a blob file rewrite compaction ends. + BlobFileRewriteEnd func(BlobFileRewriteInfo) + + // DataCorruption is invoked when an on-disk corruption is detected. It should + // not block, as it is called synchronously in read paths. + DataCorruption func(DataCorruptionInfo) + + // CompactionBegin is invoked after the inputs to a compaction have been + // determined, but before the compaction has produced any output. + CompactionBegin func(CompactionInfo) + + // CompactionEnd is invoked after a compaction has completed and the result + // has been installed. + CompactionEnd func(CompactionInfo) + + // DiskSlow is invoked after a disk write operation on a file created with a + // disk health checking vfs.FS (see vfs.DefaultWithDiskHealthChecks) is + // observed to exceed the specified disk slowness threshold duration. DiskSlow + // is called on a goroutine that is monitoring slowness/stuckness. The callee + // MUST return without doing any IO, or blocking on anything (like a mutex) + // that is waiting on IO. This is imperative in order to reliably monitor for + // slowness, since if this goroutine gets stuck, the monitoring will stop + // working. + DiskSlow func(DiskSlowInfo) + + // FlushBegin is invoked after the inputs to a flush have been determined, + // but before the flush has produced any output. + FlushBegin func(FlushInfo) + + // FlushEnd is invoked after a flush has complated and the result has been + // installed. + FlushEnd func(FlushInfo) + + // DownloadBegin is invoked when a db.Download operation starts or restarts + // (restarts are caused by new external tables being ingested during the + // operation). + DownloadBegin func(DownloadInfo) + + // DownloadEnd is invoked when a db.Download operation completes. + DownloadEnd func(DownloadInfo) + + // FormatUpgrade is invoked after the database's FormatMajorVersion + // is upgraded. 
+ FormatUpgrade func(FormatMajorVersion) + + // ManifestCreated is invoked after a manifest has been created. + ManifestCreated func(ManifestCreateInfo) + + // ManifestDeleted is invoked after a manifest has been deleted. + ManifestDeleted func(ManifestDeleteInfo) + + // TableCreated is invoked when a table has been created. + TableCreated func(TableCreateInfo) + + // TableDeleted is invoked after a table has been deleted. + TableDeleted func(TableDeleteInfo) + + // TableIngested is invoked after an externally created table has been + // ingested via a call to DB.Ingest(). + TableIngested func(TableIngestInfo) + + // TableStatsLoaded is invoked at most once, when the table stats + // collector has loaded statistics for all tables that existed at Open. + TableStatsLoaded func(TableStatsInfo) + + // TableValidated is invoked after validation runs on an sstable. + TableValidated func(TableValidatedInfo) + + // WALCreated is invoked after a WAL has been created. + WALCreated func(WALCreateInfo) + + // WALDeleted is invoked after a WAL has been deleted. + WALDeleted func(WALDeleteInfo) + + // WriteStallBegin is invoked when writes are intentionally delayed. + WriteStallBegin func(WriteStallBeginInfo) + + // WriteStallEnd is invoked when delayed writes are released. + WriteStallEnd func() + + // LowDiskSpace is invoked periodically when the disk space is running + // low. + LowDiskSpace func(LowDiskSpaceInfo) + + // PossibleAPIMisuse is invoked when a possible API misuse is detected. + PossibleAPIMisuse func(PossibleAPIMisuseInfo) +} + +// EnsureDefaults ensures that background error events are logged to the +// specified logger if a handler for those events hasn't been otherwise +// specified. Ensure all handlers are non-nil so that we don't have to check +// for nil-ness before invoking. 
+func (l *EventListener) EnsureDefaults(logger Logger) { + if l.BackgroundError == nil { + if logger != nil { + l.BackgroundError = func(err error) { + logger.Errorf("background error: %s", err) + } + } else { + l.BackgroundError = func(error) {} + } + } + if l.BlobFileCreated == nil { + l.BlobFileCreated = func(info BlobFileCreateInfo) {} + } + if l.BlobFileDeleted == nil { + l.BlobFileDeleted = func(info BlobFileDeleteInfo) {} + } + if l.BlobFileRewriteBegin == nil { + l.BlobFileRewriteBegin = func(info BlobFileRewriteInfo) {} + } + if l.BlobFileRewriteEnd == nil { + l.BlobFileRewriteEnd = func(info BlobFileRewriteInfo) {} + } + if l.DataCorruption == nil { + if logger != nil { + l.DataCorruption = func(info DataCorruptionInfo) { + logger.Fatalf("%s", info) + } + } else { + l.DataCorruption = func(info DataCorruptionInfo) {} + } + } + if l.CompactionBegin == nil { + l.CompactionBegin = func(info CompactionInfo) {} + } + if l.CompactionEnd == nil { + l.CompactionEnd = func(info CompactionInfo) {} + } + if l.DiskSlow == nil { + l.DiskSlow = func(info DiskSlowInfo) {} + } + if l.FlushBegin == nil { + l.FlushBegin = func(info FlushInfo) {} + } + if l.FlushEnd == nil { + l.FlushEnd = func(info FlushInfo) {} + } + if l.DownloadBegin == nil { + l.DownloadBegin = func(info DownloadInfo) {} + } + if l.DownloadEnd == nil { + l.DownloadEnd = func(info DownloadInfo) {} + } + if l.FormatUpgrade == nil { + l.FormatUpgrade = func(v FormatMajorVersion) {} + } + if l.ManifestCreated == nil { + l.ManifestCreated = func(info ManifestCreateInfo) {} + } + if l.ManifestDeleted == nil { + l.ManifestDeleted = func(info ManifestDeleteInfo) {} + } + if l.TableCreated == nil { + l.TableCreated = func(info TableCreateInfo) {} + } + if l.TableDeleted == nil { + l.TableDeleted = func(info TableDeleteInfo) {} + } + if l.TableIngested == nil { + l.TableIngested = func(info TableIngestInfo) {} + } + if l.TableStatsLoaded == nil { + l.TableStatsLoaded = func(info TableStatsInfo) {} + } + if 
l.TableValidated == nil { + l.TableValidated = func(validated TableValidatedInfo) {} + } + if l.WALCreated == nil { + l.WALCreated = func(info WALCreateInfo) {} + } + if l.WALDeleted == nil { + l.WALDeleted = func(info WALDeleteInfo) {} + } + if l.WriteStallBegin == nil { + l.WriteStallBegin = func(info WriteStallBeginInfo) {} + } + if l.WriteStallEnd == nil { + l.WriteStallEnd = func() {} + } + if l.LowDiskSpace == nil { + l.LowDiskSpace = func(info LowDiskSpaceInfo) {} + } + if l.PossibleAPIMisuse == nil { + l.PossibleAPIMisuse = func(info PossibleAPIMisuseInfo) {} + } +} + +// MakeLoggingEventListener creates an EventListener that logs all events to the +// specified logger. +func MakeLoggingEventListener(logger Logger) EventListener { + if logger == nil { + logger = DefaultLogger + } + + return EventListener{ + BackgroundError: func(err error) { + logger.Errorf("background error: %s", err) + }, + BlobFileCreated: func(info BlobFileCreateInfo) { + logger.Infof("%s", info) + }, + BlobFileDeleted: func(info BlobFileDeleteInfo) { + logger.Infof("%s", info) + }, + BlobFileRewriteBegin: func(info BlobFileRewriteInfo) { + logger.Infof("%s", info) + }, + BlobFileRewriteEnd: func(info BlobFileRewriteInfo) { + logger.Infof("%s", info) + }, + DataCorruption: func(info DataCorruptionInfo) { + logger.Errorf("%s", info) + }, + CompactionBegin: func(info CompactionInfo) { + logger.Infof("%s", info) + }, + CompactionEnd: func(info CompactionInfo) { + logger.Infof("%s", info) + }, + DiskSlow: func(info DiskSlowInfo) { + logger.Infof("%s", info) + }, + FlushBegin: func(info FlushInfo) { + logger.Infof("%s", info) + }, + FlushEnd: func(info FlushInfo) { + logger.Infof("%s", info) + }, + DownloadBegin: func(info DownloadInfo) { + logger.Infof("%s", info) + }, + DownloadEnd: func(info DownloadInfo) { + logger.Infof("%s", info) + }, + FormatUpgrade: func(v FormatMajorVersion) { + logger.Infof("upgraded to format version: %s", v) + }, + ManifestCreated: func(info ManifestCreateInfo) 
{ + logger.Infof("%s", info) + }, + ManifestDeleted: func(info ManifestDeleteInfo) { + logger.Infof("%s", info) + }, + TableCreated: func(info TableCreateInfo) { + logger.Infof("%s", info) + }, + TableDeleted: func(info TableDeleteInfo) { + logger.Infof("%s", info) + }, + TableIngested: func(info TableIngestInfo) { + logger.Infof("%s", info) + }, + TableStatsLoaded: func(info TableStatsInfo) { + logger.Infof("%s", info) + }, + TableValidated: func(info TableValidatedInfo) { + logger.Infof("%s", info) + }, + WALCreated: func(info WALCreateInfo) { + logger.Infof("%s", info) + }, + WALDeleted: func(info WALDeleteInfo) { + logger.Infof("%s", info) + }, + WriteStallBegin: func(info WriteStallBeginInfo) { + logger.Infof("%s", info) + }, + WriteStallEnd: func() { + logger.Infof("write stall ending") + }, + LowDiskSpace: func(info LowDiskSpaceInfo) { + logger.Infof("%s", info) + }, + PossibleAPIMisuse: func(info PossibleAPIMisuseInfo) { + logger.Infof("%s", info) + }, + } +} + +// TeeEventListener wraps two EventListeners, forwarding all events to both. 
func TeeEventListener(a, b EventListener) EventListener {
	// Fill in any nil callbacks on both listeners up front so every closure
	// below can invoke its handler unconditionally.
	a.EnsureDefaults(nil)
	b.EnsureDefaults(nil)
	return EventListener{
		BackgroundError: func(err error) {
			a.BackgroundError(err)
			b.BackgroundError(err)
		},
		BlobFileCreated: func(info BlobFileCreateInfo) {
			a.BlobFileCreated(info)
			b.BlobFileCreated(info)
		},
		BlobFileDeleted: func(info BlobFileDeleteInfo) {
			a.BlobFileDeleted(info)
			b.BlobFileDeleted(info)
		},
		BlobFileRewriteBegin: func(info BlobFileRewriteInfo) {
			a.BlobFileRewriteBegin(info)
			b.BlobFileRewriteBegin(info)
		},
		BlobFileRewriteEnd: func(info BlobFileRewriteInfo) {
			a.BlobFileRewriteEnd(info)
			b.BlobFileRewriteEnd(info)
		},
		DataCorruption: func(info DataCorruptionInfo) {
			a.DataCorruption(info)
			b.DataCorruption(info)
		},
		CompactionBegin: func(info CompactionInfo) {
			a.CompactionBegin(info)
			b.CompactionBegin(info)
		},
		CompactionEnd: func(info CompactionInfo) {
			a.CompactionEnd(info)
			b.CompactionEnd(info)
		},
		DiskSlow: func(info DiskSlowInfo) {
			a.DiskSlow(info)
			b.DiskSlow(info)
		},
		FlushBegin: func(info FlushInfo) {
			a.FlushBegin(info)
			b.FlushBegin(info)
		},
		FlushEnd: func(info FlushInfo) {
			a.FlushEnd(info)
			b.FlushEnd(info)
		},
		DownloadBegin: func(info DownloadInfo) {
			a.DownloadBegin(info)
			b.DownloadBegin(info)
		},
		DownloadEnd: func(info DownloadInfo) {
			a.DownloadEnd(info)
			b.DownloadEnd(info)
		},
		FormatUpgrade: func(v FormatMajorVersion) {
			a.FormatUpgrade(v)
			b.FormatUpgrade(v)
		},
		ManifestCreated: func(info ManifestCreateInfo) {
			a.ManifestCreated(info)
			b.ManifestCreated(info)
		},
		ManifestDeleted: func(info ManifestDeleteInfo) {
			a.ManifestDeleted(info)
			b.ManifestDeleted(info)
		},
		TableCreated: func(info TableCreateInfo) {
			a.TableCreated(info)
			b.TableCreated(info)
		},
		TableDeleted: func(info TableDeleteInfo) {
			a.TableDeleted(info)
			b.TableDeleted(info)
		},
		TableIngested: func(info TableIngestInfo) {
			a.TableIngested(info)
			b.TableIngested(info)
		},
		TableStatsLoaded: func(info TableStatsInfo) {
			a.TableStatsLoaded(info)
			b.TableStatsLoaded(info)
		},
		TableValidated: func(info TableValidatedInfo) {
			a.TableValidated(info)
			b.TableValidated(info)
		},
		WALCreated: func(info WALCreateInfo) {
			a.WALCreated(info)
			b.WALCreated(info)
		},
		WALDeleted: func(info WALDeleteInfo) {
			a.WALDeleted(info)
			b.WALDeleted(info)
		},
		WriteStallBegin: func(info WriteStallBeginInfo) {
			a.WriteStallBegin(info)
			b.WriteStallBegin(info)
		},
		WriteStallEnd: func() {
			a.WriteStallEnd()
			b.WriteStallEnd()
		},
		LowDiskSpace: func(info LowDiskSpaceInfo) {
			a.LowDiskSpace(info)
			b.LowDiskSpace(info)
		},
		PossibleAPIMisuse: func(info PossibleAPIMisuseInfo) {
			a.PossibleAPIMisuse(info)
			b.PossibleAPIMisuse(info)
		},
	}
}

// lowDiskSpaceReporter contains the logic to report low disk space events.
// Report is called whenever we get the disk usage statistics.
//
// We define a few thresholds (10%, 5%, 3%, 2%, 1%) and we post an event
// whenever we reach a new threshold. We periodically repost the event every 30
// minutes until we are above all thresholds.
type lowDiskSpaceReporter struct {
	mu struct {
		sync.Mutex
		// lastNoticeThreshold is the free-space percentage threshold of the
		// most recently posted event; zero if no event has been posted yet.
		lastNoticeThreshold int
		// lastNoticeTime is when the most recent event was posted; zero if no
		// event has been posted yet.
		lastNoticeTime crtime.Mono
	}
}

// lowDiskSpaceThresholds lists the free-space percentages at which we notify,
// ordered from least to most severe.
var lowDiskSpaceThresholds = []int{10, 5, 3, 2, 1}

// lowDiskSpaceFrequency is how often an event is reposted while free space
// remains below a threshold.
const lowDiskSpaceFrequency = 30 * time.Minute

// Report posts a LowDiskSpace event to el if availBytes/totalBytes is at or
// below one of lowDiskSpaceThresholds and the rate-limiting logic in
// shouldReport allows it. Safe for concurrent use.
func (r *lowDiskSpaceReporter) Report(availBytes, totalBytes uint64, el *EventListener) {
	threshold, ok := r.findThreshold(availBytes, totalBytes)
	if !ok {
		// Normal path.
		return
	}
	if r.shouldReport(threshold, crtime.NowMono()) {
		el.LowDiskSpace(LowDiskSpaceInfo{
			AvailBytes:       availBytes,
			TotalBytes:       totalBytes,
			PercentThreshold: threshold,
		})
	}
}

// shouldReport returns true if we should report an event. Updates
// lastNoticeTime/lastNoticeThreshold appropriately.
+func (r *lowDiskSpaceReporter) shouldReport(threshold int, now crtime.Mono) bool { + r.mu.Lock() + defer r.mu.Unlock() + if threshold < r.mu.lastNoticeThreshold || r.mu.lastNoticeTime == 0 || + now.Sub(r.mu.lastNoticeTime) >= lowDiskSpaceFrequency { + r.mu.lastNoticeThreshold = threshold + r.mu.lastNoticeTime = now + return true + } + return false +} + +// findThreshold returns the largest threshold in lowDiskSpaceThresholds which +// is >= the percentage ratio between availBytes and totalBytes (or ok=false if +// there is more free space than the highest threshold). +func (r *lowDiskSpaceReporter) findThreshold( + availBytes, totalBytes uint64, +) (threshold int, ok bool) { + // Note: in the normal path, we exit the loop during the first iteration. + for i, t := range lowDiskSpaceThresholds { + if availBytes*100 > totalBytes*uint64(lowDiskSpaceThresholds[i]) { + break + } + threshold = t + ok = true + } + return threshold, ok +} + +// reportCorruption reports a corruption of a TableMetadata or BlobFileMetadata +// to the event listener and also adds a DataCorruptionInfo payload to the error. +func (d *DB) reportCorruption(meta any, err error) error { + switch meta := meta.(type) { + case *manifest.TableMetadata: + return d.reportFileCorruption(base.FileTypeTable, meta.TableBacking.DiskFileNum, meta.UserKeyBounds(), err) + case *manifest.PhysicalBlobFile: + // TODO(jackson): Add bounds for blob files. 
+ return d.reportFileCorruption(base.FileTypeBlob, meta.FileNum, base.UserKeyBounds{}, err) + default: + panic(fmt.Sprintf("unknown metadata type: %T", meta)) + } +} + +func (d *DB) reportFileCorruption( + fileType base.FileType, fileNum base.DiskFileNum, userKeyBounds base.UserKeyBounds, err error, +) error { + if invariants.Enabled && !IsCorruptionError(err) { + panic("not a corruption error") + } + + objMeta, lookupErr := d.objProvider.Lookup(fileType, fileNum) + if lookupErr != nil { + // If the object is not known to the provider, it must be a local object + // that was missing when we opened the store. Remote objects have their + // metadata in a catalog, so even if the backing object is deleted, the + // DiskFileNum would still be known. + objMeta = objstorage.ObjectMetadata{DiskFileNum: fileNum, FileType: fileType} + } + path := d.objProvider.Path(objMeta) + if objMeta.IsRemote() { + // Remote path (which include the locator and full path) might not always be + // safe. + err = errors.WithHintf(err, "path: %s", path) + } else { + // Local paths are safe: they start with the store directory and the + // filename is generated by Pebble. + err = errors.WithHintf(err, "path: %s", redact.Safe(path)) + } + info := DataCorruptionInfo{ + Path: path, + IsRemote: objMeta.IsRemote(), + Locator: objMeta.Remote.Locator, + Bounds: userKeyBounds, + Details: err, + } + d.opts.EventListener.DataCorruption(info) + // We don't use errors.Join() because that also annotates with this stack + // trace which would not be useful. + return errorsjoin.Join(err, &corruptionDetailError{info: info}) +} + +type corruptionDetailError struct { + info DataCorruptionInfo +} + +func (e *corruptionDetailError) Error() string { + return "" +} + +// ExtractDataCorruptionInfo extracts the DataCorruptionInfo details from a +// corruption error. Returns nil if there is no such detail. 
func ExtractDataCorruptionInfo(err error) *DataCorruptionInfo {
	var e *corruptionDetailError
	if errors.As(err, &e) {
		return &e.info
	}
	return nil
}
diff --git a/vendor/github.com/cockroachdb/pebble/v2/excise.go b/vendor/github.com/cockroachdb/pebble/v2/excise.go
new file mode 100644
index 0000000..a285b7e
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/excise.go
@@ -0,0 +1,499 @@
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"context"
	"slices"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/v2/internal/base"
	"github.com/cockroachdb/pebble/v2/internal/invariants"
	"github.com/cockroachdb/pebble/v2/internal/manifest"
	"github.com/cockroachdb/pebble/v2/objstorage"
)

// Excise atomically deletes all data overlapping with the provided span. All
// data overlapping with the span is removed, including from open snapshots.
// Only currently-open iterators will still observe the removed data (because an
// open iterator pins all memtables and sstables in its view of the LSM until
// it's closed). Excise may initiate a flush if there exists unflushed data
// overlapping the excise span.
func (d *DB) Excise(ctx context.Context, span KeyRange) error {
	if err := d.closed.Load(); err != nil {
		panic(err)
	}
	if d.opts.ReadOnly {
		return ErrReadOnly
	}
	// Excise is only supported on prefix keys.
	if d.opts.Comparer.Split(span.Start) != len(span.Start) {
		return errors.New("Excise called with suffixed start key")
	}
	if d.opts.Comparer.Split(span.End) != len(span.End) {
		return errors.New("Excise called with suffixed end key")
	}
	if v := d.FormatMajorVersion(); v < FormatVirtualSSTables {
		return errors.Newf(
			"store has format major version %d; Excise requires at least %d",
			v, FormatVirtualSSTables,
		)
	}
	// Excise is implemented as an ingest with no files and an excise span.
	_, err := d.ingest(ctx, ingestArgs{ExciseSpan: span, ExciseBoundsPolicy: tightExciseBoundsIfLocal})
	return err
}

// exciseBoundsPolicy controls whether we open excised files to obtain tight
// bounds for the remaining file(s).
type exciseBoundsPolicy uint8

const (
	// tightExciseBounds means that we will always open the file to find the exact
	// bounds of the remaining file(s).
	tightExciseBounds exciseBoundsPolicy = iota
	// looseExciseBounds means that we will not open the file and will assign bounds
	// pessimistically.
	looseExciseBounds
	// tightExciseBoundsIfLocal means that we will only open the file if it is
	// local; otherwise we will assign loose bounds to the remaining file(s).
	tightExciseBoundsIfLocal
)

// exciseTable initializes up to two virtual tables for what is left over after
// excising the given span from the table.
//
// Returns the left and/or right tables, if they exist. The boundsPolicy controls
// whether we create iterators for m to determine tight bounds. Note that if the
// exciseBounds are end-inclusive, tight bounds will be used regardless of the
// policy.
//
// The file bounds must overlap with the excise span.
//
// This method is agnostic to whether d.mu is held or not. Some cases call it with
// the db mutex held (eg. ingest-time excises), while in the case of compactions
// the mutex is not held.
func (d *DB) exciseTable(
	ctx context.Context,
	exciseBounds base.UserKeyBounds,
	m *manifest.TableMetadata,
	level int,
	boundsPolicy exciseBoundsPolicy,
) (leftTable, rightTable *manifest.TableMetadata, _ error) {
	// Check if there's actually an overlap between m and exciseSpan.
	mBounds := m.UserKeyBounds()
	if !exciseBounds.Overlaps(d.cmp, &mBounds) {
		return nil, nil, base.AssertionFailedf("excise span does not overlap table")
	}
	// Fast path: m sits entirely within the exciseSpan, so just delete it.
	if exciseBounds.ContainsInternalKey(d.cmp, m.Smallest()) && exciseBounds.ContainsInternalKey(d.cmp, m.Largest()) {
		return nil, nil, nil
	}

	// Loose bounds are used when the policy forbids opening the object (either
	// unconditionally, or because the backing table is not local).
	looseBounds := boundsPolicy == looseExciseBounds ||
		(boundsPolicy == tightExciseBoundsIfLocal && !objstorage.IsLocalTable(d.objProvider, m.TableBacking.DiskFileNum))

	if exciseBounds.End.Kind == base.Inclusive {
		// Loose bounds are not allowed with end-inclusive bounds. This can only
		// happen for ingest splits.
		looseBounds = false
	}

	// The file partially overlaps the excise span; unless looseBounds is true, we
	// will need to open it to determine tight bounds for the left-over table(s).
	var iters iterSet
	if !looseBounds {
		var err error
		iters, err = d.newIters(ctx, m, &IterOptions{
			Category: categoryIngest,
			layer:    manifest.Level(level),
		}, internalIterOpts{}, iterPointKeys|iterRangeDeletions|iterRangeKeys)
		if err != nil {
			return nil, nil, err
		}
		defer func() { _ = iters.CloseAll() }()
	}

	// Create a file to the left of the excise span, if necessary.
	// The bounds of this file will be [m.Smallest, lastKeyBefore(exciseSpan.Start)].
	//
	// We create bounds that are tight on user keys, and we make the effort to find
	// the last key in the original sstable that's smaller than exciseSpan.Start
	// even though it requires some sstable reads. We could choose to create
	// virtual sstables on loose userKey bounds, in which case we could just set
	// leftFile.Largest to an exclusive sentinel at exciseSpan.Start. The biggest
	// issue with that approach would be that it'd lead to lots of small virtual
	// sstables in the LSM that have no guarantee on containing even a single user
	// key within the file bounds. This has the potential to increase both read and
	// write-amp as we will be opening up these sstables only to find no relevant
	// keys in the read path, and compacting sstables on top of them instead of
	// directly into the space occupied by them. We choose to incur the cost of
	// calculating tight bounds at this time instead of creating more work in the
	// future.
	//
	// TODO(bilal): Some of this work can happen without grabbing the manifest
	// lock; we could grab one currentVersion, release the lock, calculate excised
	// files, then grab the lock again and recalculate for just the files that
	// have changed since our previous calculation. Do this optimization as part of
	// https://github.com/cockroachdb/pebble/v2/issues/2112 .
	if d.cmp(m.Smallest().UserKey, exciseBounds.Start) < 0 {
		leftTable = &manifest.TableMetadata{
			Virtual:  true,
			TableNum: d.mu.versions.getNextTableNum(),
			// Note that these are loose bounds for smallest/largest seqnums, but they're
			// sufficient for maintaining correctness.
			SmallestSeqNum:           m.SmallestSeqNum,
			LargestSeqNum:            m.LargestSeqNum,
			LargestSeqNumAbsolute:    m.LargestSeqNumAbsolute,
			SyntheticPrefixAndSuffix: m.SyntheticPrefixAndSuffix,
			BlobReferenceDepth:       m.BlobReferenceDepth,
		}
		if looseBounds {
			looseLeftTableBounds(d.cmp, m, leftTable, exciseBounds.Start)
		} else if err := determineLeftTableBounds(d.cmp, m, leftTable, exciseBounds.Start, iters); err != nil {
			return nil, nil, err
		}

		// Only keep the left table if the bounds determination found any point
		// or range keys for it.
		if leftTable.HasRangeKeys || leftTable.HasPointKeys {
			leftTable.AttachVirtualBacking(m.TableBacking)
			if looseBounds {
				// We don't want to access the object; make up a size.
				leftTable.Size = (m.Size + 1) / 2
			} else if err := determineExcisedTableSize(d.fileCache, m, leftTable); err != nil {
				return nil, nil, err
			}
			determineExcisedTableBlobReferences(m.BlobReferences, m.Size, leftTable)
			if err := leftTable.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil {
				return nil, nil, err
			}
			leftTable.ValidateVirtual(m)
		} else {
			leftTable = nil
		}
	}
	// Create a file to the right, if necessary.
	if !exciseBounds.End.IsUpperBoundForInternalKey(d.cmp, m.Largest()) {
		// Create a new file, rightFile, between [firstKeyAfter(exciseSpan.End), m.Largest].
		//
		// See comment before the definition of leftFile for the motivation behind
		// calculating tight user-key bounds.
		rightTable = &manifest.TableMetadata{
			Virtual:  true,
			TableNum: d.mu.versions.getNextTableNum(),
			// Note that these are loose bounds for smallest/largest seqnums, but they're
			// sufficient for maintaining correctness.
			SmallestSeqNum:           m.SmallestSeqNum,
			LargestSeqNum:            m.LargestSeqNum,
			LargestSeqNumAbsolute:    m.LargestSeqNumAbsolute,
			SyntheticPrefixAndSuffix: m.SyntheticPrefixAndSuffix,
			BlobReferenceDepth:       m.BlobReferenceDepth,
		}
		if looseBounds {
			// We already checked that the end bound is exclusive.
			looseRightTableBounds(d.cmp, m, rightTable, exciseBounds.End.Key)
		} else if err := determineRightTableBounds(d.cmp, m, rightTable, exciseBounds.End, iters); err != nil {
			return nil, nil, err
		}
		if rightTable.HasRangeKeys || rightTable.HasPointKeys {
			rightTable.AttachVirtualBacking(m.TableBacking)
			if looseBounds {
				// We don't want to access the object; make up a size.
				rightTable.Size = (m.Size + 1) / 2
			} else if err := determineExcisedTableSize(d.fileCache, m, rightTable); err != nil {
				return nil, nil, err
			}
			determineExcisedTableBlobReferences(m.BlobReferences, m.Size, rightTable)
			if err := rightTable.Validate(d.cmp, d.opts.Comparer.FormatKey); err != nil {
				return nil, nil, err
			}
			rightTable.ValidateVirtual(m)
		} else {
			rightTable = nil
		}
	}
	return leftTable, rightTable, nil
}

// exciseOverlapBounds examines the provided list of snapshots, examining each
// eventually file-only snapshot in the list and its bounds. If the snapshot is
// visible at the excise's sequence number, then it accumulates all of the
// eventually file-only snapshot's protected ranges.
func exciseOverlapBounds(
	cmp Compare, sl *snapshotList, exciseSpan KeyRange, exciseSeqNum base.SeqNum,
) []bounded {
	var extended []bounded
	for s := sl.root.next; s != &sl.root; s = s.next {
		if s.efos == nil {
			continue
		}
		if base.Visible(exciseSeqNum, s.efos.seqNum, base.SeqNumMax) {
			// We only worry about snapshots older than the excise. Any snapshots
			// created after the excise should see the excised view of the LSM
			// anyway.
			//
			// Since we delay publishing the excise seqnum as visible until after
			// the apply step, this case will never be hit in practice until we
			// make excises flushable ingests.
			continue
		}
		if invariants.Enabled {
			if s.efos.hasTransitioned() {
				panic("unexpected transitioned EFOS in snapshots list")
			}
		}
		for i := range s.efos.protectedRanges {
			if !s.efos.protectedRanges[i].OverlapsKeyRange(cmp, exciseSpan) {
				continue
			}
			// Our excise conflicts with this EFOS. We need to add its protected
			// ranges to our extended overlap bounds. Grow extended in one
			// allocation if necessary.
			extended = slices.Grow(extended, len(s.efos.protectedRanges))
			for i := range s.efos.protectedRanges {
				extended = append(extended, &s.efos.protectedRanges[i])
			}
			break
		}
	}
	return extended
}

// looseLeftTableBounds initializes the bounds for the table that remains to the
// left of the excise span after excising originalTable, without consulting the
// contents of originalTable. The resulting bounds are loose.
//
// Sets the smallest and largest keys, as well as HasPointKeys/HasRangeKeys in
// the leftFile.
func looseLeftTableBounds(
	cmp Compare, originalTable, leftTable *manifest.TableMetadata, exciseSpanStart []byte,
) {
	if originalTable.HasPointKeys {
		// Clamp the largest point key to an exclusive sentinel at the excise
		// start if the original bound reaches past it.
		largestPointKey := originalTable.PointKeyBounds.Largest()
		if largestPointKey.IsUpperBoundFor(cmp, exciseSpanStart) {
			largestPointKey = base.MakeRangeDeleteSentinelKey(exciseSpanStart)
		}
		leftTable.ExtendPointKeyBounds(cmp, originalTable.PointKeyBounds.Smallest(), largestPointKey)
	}
	if originalTable.HasRangeKeys {
		largestRangeKey := originalTable.RangeKeyBounds.Largest()
		if largestRangeKey.IsUpperBoundFor(cmp, exciseSpanStart) {
			largestRangeKey = base.MakeExclusiveSentinelKey(InternalKeyKindRangeKeyMin, exciseSpanStart)
		}
		leftTable.ExtendRangeKeyBounds(cmp, originalTable.RangeKeyBounds.Smallest(), largestRangeKey)
	}
}

// looseRightTableBounds initializes the bounds for the table that remains to the
// right of the excise span after excising originalTable, without consulting the
// contents of originalTable. The resulting bounds are loose.
//
// Sets the smallest and largest keys, as well as HasPointKeys/HasRangeKeys in
// the rightFile.
//
// The excise span end bound is assumed to be exclusive; this function cannot be
// used with an inclusive end bound.
func looseRightTableBounds(
	cmp Compare, originalTable, rightTable *manifest.TableMetadata, exciseSpanEnd []byte,
) {
	if originalTable.HasPointKeys {
		// Clamp the smallest point key up to the excise end if the original
		// bound starts before it.
		smallestPointKey := originalTable.PointKeyBounds.Smallest()
		if !smallestPointKey.IsUpperBoundFor(cmp, exciseSpanEnd) {
			smallestPointKey = base.MakeInternalKey(exciseSpanEnd, 0, base.InternalKeyKindMaxForSSTable)
		}
		rightTable.ExtendPointKeyBounds(cmp, smallestPointKey, originalTable.PointKeyBounds.Largest())
	}
	if originalTable.HasRangeKeys {
		smallestRangeKey := originalTable.RangeKeyBounds.Smallest()
		if !smallestRangeKey.IsUpperBoundFor(cmp, exciseSpanEnd) {
			smallestRangeKey = base.MakeInternalKey(exciseSpanEnd, 0, base.InternalKeyKindRangeKeyMax)
		}
		rightTable.ExtendRangeKeyBounds(cmp, smallestRangeKey, originalTable.RangeKeyBounds.Largest())
	}
}

// determineLeftTableBounds calculates the bounds for the table that remains to
// the left of the excise span after excising originalTable. The bounds around
// the excise span are determined precisely by looking inside the file.
//
// Sets the smallest and largest keys, as well as HasPointKeys/HasRangeKeys in
// the leftFile.
func determineLeftTableBounds(
	cmp Compare,
	originalTable, leftTable *manifest.TableMetadata,
	exciseSpanStart []byte,
	iters iterSet,
) error {
	if originalTable.HasPointKeys && cmp(originalTable.PointKeyBounds.Smallest().UserKey, exciseSpanStart) < 0 {
		// This file will probably contain point keys.
		if kv := iters.Point().SeekLT(exciseSpanStart, base.SeekLTFlagsNone); kv != nil {
			leftTable.ExtendPointKeyBounds(cmp, originalTable.PointKeyBounds.Smallest(), kv.K.Clone())
		}
		rdel, err := iters.RangeDeletion().SeekLT(exciseSpanStart)
		if err != nil {
			return err
		}
		if rdel != nil {
			// Use the smaller of exciseSpanStart and rdel.End.
			lastRangeDel := exciseSpanStart
			if cmp(rdel.End, exciseSpanStart) < 0 {
				// The key is owned by the range del iter, so we need to copy it.
				lastRangeDel = slices.Clone(rdel.End)
			}
			leftTable.ExtendPointKeyBounds(cmp, originalTable.PointKeyBounds.Smallest(),
				base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, lastRangeDel))
		}
	}

	if originalTable.HasRangeKeys && cmp(originalTable.RangeKeyBounds.SmallestUserKey(), exciseSpanStart) < 0 {
		rkey, err := iters.RangeKey().SeekLT(exciseSpanStart)
		if err != nil {
			return err
		}
		if rkey != nil {
			// Use the smaller of exciseSpanStart and rkey.End.
			lastRangeKey := exciseSpanStart
			if cmp(rkey.End, exciseSpanStart) < 0 {
				// The key is owned by the range key iter, so we need to copy it.
				lastRangeKey = slices.Clone(rkey.End)
			}
			leftTable.ExtendRangeKeyBounds(cmp, originalTable.RangeKeyBounds.Smallest(),
				base.MakeExclusiveSentinelKey(rkey.LargestKey().Kind(), lastRangeKey))
		}
	}
	return nil
}

// determineRightTableBounds calculates the bounds for the table that remains to
// the right of the excise span after excising originalTable. The bounds around
// the excise span are determined precisely by looking inside the file.
//
// Sets the smallest and largest keys, as well as HasPointKeys/HasRangeKeys in
// the right.
//
// Note that the case where exciseSpanEnd is Inclusive is very restrictive; we
// are only allowed to excise if the original table has no keys or ranges
// overlapping exciseSpanEnd.Key.
func determineRightTableBounds(
	cmp Compare,
	originalTable, rightTable *manifest.TableMetadata,
	exciseSpanEnd base.UserKeyBoundary,
	iters iterSet,
) error {
	if originalTable.HasPointKeys && !exciseSpanEnd.IsUpperBoundForInternalKey(cmp, originalTable.PointKeyBounds.Largest()) {
		if kv := iters.Point().SeekGE(exciseSpanEnd.Key, base.SeekGEFlagsNone); kv != nil {
			if exciseSpanEnd.Kind == base.Inclusive && cmp(exciseSpanEnd.Key, kv.K.UserKey) == 0 {
				return base.AssertionFailedf("cannot excise with an inclusive end key and data overlap at end key")
			}
			rightTable.ExtendPointKeyBounds(cmp, kv.K.Clone(), originalTable.PointKeyBounds.Largest())
		}
		rdel, err := iters.RangeDeletion().SeekGE(exciseSpanEnd.Key)
		if err != nil {
			return err
		}
		if rdel != nil {
			// Use the larger of exciseSpanEnd.Key and rdel.Start.
			firstRangeDel := exciseSpanEnd.Key
			if cmp(rdel.Start, exciseSpanEnd.Key) > 0 {
				// The key is owned by the range del iter, so we need to copy it.
				firstRangeDel = slices.Clone(rdel.Start)
			} else if exciseSpanEnd.Kind != base.Exclusive {
				return base.AssertionFailedf("cannot truncate rangedel during excise with an inclusive upper bound")
			}
			rightTable.ExtendPointKeyBounds(cmp, base.InternalKey{
				UserKey: firstRangeDel,
				Trailer: rdel.SmallestKey().Trailer,
			}, originalTable.PointKeyBounds.Largest())
		}
	}
	if originalTable.HasRangeKeys && !exciseSpanEnd.IsUpperBoundForInternalKey(cmp, originalTable.RangeKeyBounds.Largest()) {
		rkey, err := iters.RangeKey().SeekGE(exciseSpanEnd.Key)
		if err != nil {
			return err
		}
		if rkey != nil {
			// Use the larger of exciseSpanEnd.Key and rkey.Start.
			firstRangeKey := exciseSpanEnd.Key
			if cmp(rkey.Start, exciseSpanEnd.Key) > 0 {
				// The key is owned by the range key iter, so we need to copy it.
				firstRangeKey = slices.Clone(rkey.Start)
			} else if exciseSpanEnd.Kind != base.Exclusive {
				return base.AssertionFailedf("cannot truncate range key during excise with an inclusive upper bound")
			}
			rightTable.ExtendRangeKeyBounds(cmp, base.InternalKey{
				UserKey: firstRangeKey,
				Trailer: rkey.SmallestKey().Trailer,
			}, originalTable.RangeKeyBounds.Largest())
		}
	}
	return nil
}

// determineExcisedTableSize estimates the on-disk size of the excised virtual
// table by asking the file cache to estimate the size of the original table's
// data within the excised table's bounds.
func determineExcisedTableSize(
	fc *fileCacheHandle, originalTable, excisedTable *manifest.TableMetadata,
) error {
	size, err := fc.estimateSize(originalTable, excisedTable.Smallest().UserKey, excisedTable.Largest().UserKey)
	if err != nil {
		return err
	}
	excisedTable.Size = size
	if size == 0 {
		// On occasion, estimateSize gives us a low estimate, i.e. a 0 file size,
		// such as if the excised file only has range keys/dels and no point
		// keys. This can cause panics in places where we divide by file sizes.
		// Correct for it here.
		excisedTable.Size = 1
	}
	return nil
}

// determineExcisedTableBlobReferences copies blob references from the original
// table to the excised table, scaling each blob reference's value size
// proportionally based on the ratio of the excised table's size to the original
// table's size.
func determineExcisedTableBlobReferences(
	originalBlobReferences manifest.BlobReferences,
	originalSize uint64,
	excisedTable *manifest.TableMetadata,
) {
	if len(originalBlobReferences) == 0 {
		return
	}
	newBlobReferences := make(manifest.BlobReferences, len(originalBlobReferences))
	for i, bf := range originalBlobReferences {
		// Scale proportionally, but never below 1 so the reference remains
		// accounted for. NOTE(review): ValueSize*Size is computed in uint64
		// before dividing; presumably sizes stay far below the overflow range —
		// confirm if very large tables are possible.
		bf.ValueSize = max(bf.ValueSize*excisedTable.Size/originalSize, 1)
		newBlobReferences[i] = bf
	}
	excisedTable.BlobReferences = newBlobReferences
}

// applyExciseToVersionEdit updates ve with a table deletion for the original
// table and table additions for the left and/or right table.
//
// Either or both of leftTable/rightTable can be nil.
+func applyExciseToVersionEdit( + ve *manifest.VersionEdit, originalTable, leftTable, rightTable *manifest.TableMetadata, level int, +) (newFiles []manifest.NewTableEntry) { + ve.DeletedTables[manifest.DeletedTableEntry{ + Level: level, + FileNum: originalTable.TableNum, + }] = originalTable + if leftTable == nil && rightTable == nil { + return + } + if !originalTable.Virtual { + // If the original table was virtual, then its file backing is already known + // to the manifest; we don't need to create another file backing. Note that + // there must be only one CreatedBackingTables entry per backing sstable. + // This is indicated by the VersionEdit.CreatedBackingTables invariant. + ve.CreatedBackingTables = append(ve.CreatedBackingTables, originalTable.TableBacking) + } + originalLen := len(ve.NewTables) + if leftTable != nil { + ve.NewTables = append(ve.NewTables, manifest.NewTableEntry{Level: level, Meta: leftTable}) + } + if rightTable != nil { + ve.NewTables = append(ve.NewTables, manifest.NewTableEntry{Level: level, Meta: rightTable}) + } + return ve.NewTables[originalLen:] +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/external_iterator.go b/vendor/github.com/cockroachdb/pebble/v2/external_iterator.go new file mode 100644 index 0000000..43fda3f --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/external_iterator.go @@ -0,0 +1,318 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "context" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/sstable" + "github.com/cockroachdb/pebble/v2/sstable/block" +) + +// NewExternalIter takes an input 2d array of sstable files which may overlap +// across subarrays but not within a subarray (at least as far as points are +// concerned; range keys are allowed to overlap arbitrarily even within a +// subarray), and returns an Iterator over the merged contents of the sstables. +// Input sstables may contain point keys, range keys, range deletions, etc. The +// input files slice must be sorted in reverse chronological ordering. A key in a +// file at a lower index subarray will shadow a key with an identical user key +// contained within a file at a higher index subarray. Each subarray must be +// sorted in internal key order, where lower index files contain keys that sort +// left of files with higher indexes. +// +// Input sstables must only contain keys with the zero sequence number and must +// not contain references to values in external blob files. +// +// Iterators constructed through NewExternalIter do not support all iterator +// options, including block-property and table filters. NewExternalIter errors +// if an incompatible option is set. +func NewExternalIter( + o *Options, iterOpts *IterOptions, files [][]sstable.ReadableFile, +) (it *Iterator, err error) { + return NewExternalIterWithContext(context.Background(), o, iterOpts, files) +} + +// NewExternalIterWithContext is like NewExternalIter, and additionally +// accepts a context for tracing. 
+func NewExternalIterWithContext( + ctx context.Context, o *Options, iterOpts *IterOptions, files [][]sstable.ReadableFile, +) (it *Iterator, err error) { + if iterOpts != nil { + if err := validateExternalIterOpts(iterOpts); err != nil { + return nil, err + } + } + + ro := o.MakeReaderOptions() + var readers [][]*sstable.Reader + for _, levelFiles := range files { + subReaders, err := openExternalTables(ctx, levelFiles, ro) + readers = append(readers, subReaders) + if err != nil { + // Close all the opened readers. + for i := range readers { + for j := range readers[i] { + _ = readers[i][j].Close() + } + } + return nil, err + } + } + + buf := iterAllocPool.Get().(*iterAlloc) + dbi := &buf.dbi + *dbi = Iterator{ + ctx: ctx, + alloc: buf, + merge: o.Merger.Merge, + comparer: *o.Comparer, + readState: nil, + keyBuf: buf.keyBuf, + prefixOrFullSeekKey: buf.prefixOrFullSeekKey, + boundsBuf: buf.boundsBuf, + batch: nil, + // Add the external iter state to the Iterator so that Close closes it, + // and SetOptions can re-construct iterators using its state. + externalIter: &externalIterState{readers: readers}, + newIters: func(context.Context, *manifest.TableMetadata, *IterOptions, + internalIterOpts, iterKinds) (iterSet, error) { + // NB: External iterators are currently constructed without any + // `levelIters`. newIters should never be called. When we support + // organizing multiple non-overlapping files into a single level + // (see TODO below), we'll need to adjust this tableNewIters + // implementation to open iterators by looking up f in a map + // of readers indexed by *fileMetadata. 
+ panic("unreachable") + }, + seqNum: base.SeqNumMax, + } + dbi.externalIter.bufferPool.Init(2) + + if iterOpts != nil { + dbi.opts = *iterOpts + dbi.processBounds(iterOpts.LowerBound, iterOpts.UpperBound) + } + if err := finishInitializingExternal(ctx, dbi); err != nil { + _ = dbi.Close() + return nil, err + } + return dbi, nil +} + +// externalIterState encapsulates state that is specific to external iterators. +// An external *pebble.Iterator maintains a pointer to the externalIterState and +// calls Close when the Iterator is Closed, providing an opportuntity for the +// external iterator to release resources particular to external iterators. +type externalIterState struct { + bufferPool block.BufferPool + readers [][]*sstable.Reader +} + +func (e *externalIterState) Close() (err error) { + for _, readers := range e.readers { + for _, r := range readers { + err = firstError(err, r.Close()) + } + } + e.bufferPool.Release() + return err +} + +func validateExternalIterOpts(iterOpts *IterOptions) error { + switch { + case iterOpts.PointKeyFilters != nil: + return errors.Errorf("pebble: external iterator: PointKeyFilters unsupported") + case iterOpts.RangeKeyFilters != nil: + return errors.Errorf("pebble: external iterator: RangeKeyFilters unsupported") + case iterOpts.OnlyReadGuaranteedDurable: + return errors.Errorf("pebble: external iterator: OnlyReadGuaranteedDurable unsupported") + case iterOpts.UseL6Filters: + return errors.Errorf("pebble: external iterator: UseL6Filters unsupported") + } + return nil +} + +func createExternalPointIter( + ctx context.Context, it *Iterator, readEnv sstable.ReadEnv, +) (topLevelIterator, error) { + // TODO(jackson): In some instances we could generate fewer levels by using + // L0Sublevels code to organize nonoverlapping files into the same level. + // This would allow us to use levelIters and keep a smaller set of data and + // files in-memory. 
However, it would also require us to identify the bounds + // of all the files upfront. + + if !it.opts.pointKeys() { + return emptyIter, nil + } else if it.pointIter != nil { + return it.pointIter, nil + } + mlevels := it.alloc.mlevels[:0] + + if len(it.externalIter.readers) > cap(mlevels) { + mlevels = make([]mergingIterLevel, 0, len(it.externalIter.readers)) + } + // We set a synthetic sequence number, with lower levels having higer numbers. + seqNum := 0 + for _, readers := range it.externalIter.readers { + seqNum += len(readers) + } + for _, readers := range it.externalIter.readers { + for _, r := range readers { + var ( + rangeDelIter keyspan.FragmentIterator + pointIter internalIterator + err error + ) + // We could set hideObsoletePoints=true, since we are reading at + // InternalKeySeqNumMax, but we don't bother since these sstables should + // not have obsolete points (so the performance optimization is + // unnecessary), and we don't want to bother constructing a + // BlockPropertiesFilterer that includes obsoleteKeyBlockPropertyFilter. 
+ transforms := sstable.IterTransforms{SyntheticSeqNum: sstable.SyntheticSeqNum(seqNum)} + seqNum-- + pointIter, err = r.NewPointIter(ctx, sstable.IterOptions{ + Lower: it.opts.LowerBound, + Upper: it.opts.UpperBound, + Transforms: transforms, + FilterBlockSizeLimit: sstable.NeverUseFilterBlock, + Env: readEnv, + ReaderProvider: sstable.MakeTrivialReaderProvider(r), + }) + if err == nil { + rangeDelIter, err = r.NewRawRangeDelIter(ctx, sstable.FragmentIterTransforms{ + SyntheticSeqNum: sstable.SyntheticSeqNum(seqNum), + }, readEnv) + } + if err != nil { + if pointIter != nil { + _ = pointIter.Close() + } + for i := range mlevels { + _ = mlevels[i].iter.Close() + if mlevels[i].rangeDelIter != nil { + mlevels[i].rangeDelIter.Close() + } + } + return nil, err + } + mlevels = append(mlevels, mergingIterLevel{ + iter: pointIter, + rangeDelIter: rangeDelIter, + }) + } + } + + it.alloc.merging.init(&it.opts, &it.stats.InternalStats, it.comparer.Compare, it.comparer.Split, mlevels...) + it.alloc.merging.snapshot = base.SeqNumMax + if len(mlevels) <= cap(it.alloc.levelsPositioned) { + it.alloc.merging.levelsPositioned = it.alloc.levelsPositioned[:len(mlevels)] + } + return &it.alloc.merging, nil +} + +func finishInitializingExternal(ctx context.Context, it *Iterator) error { + readEnv := sstable.ReadEnv{ + Block: block.ReadEnv{ + Stats: &it.stats.InternalStats, + // TODO(jackson): External iterators never provide categorized iterator + // stats today because they exist outside the context of a *DB. If the + // sstables being read are on the physical filesystem, we may still want to + // thread a CategoryStatsCollector through so that we collect their stats. 
+ IterStats: nil, + BufferPool: &it.externalIter.bufferPool, + }, + } + pointIter, err := createExternalPointIter(ctx, it, readEnv) + if err != nil { + return err + } + it.pointIter = pointIter + it.iter = it.pointIter + + if it.opts.rangeKeys() { + it.rangeKeyMasking.init(it, &it.comparer) + var rangeKeyIters []keyspan.FragmentIterator + if it.rangeKey == nil { + // We could take advantage of the lack of overlaps in range keys within + // each slice in it.externalReaders, and generate keyspanimpl.LevelIters + // out of those. However, since range keys are expected to be sparse to + // begin with, the performance gain might not be significant enough to + // warrant it. + // + // TODO(bilal): Explore adding a simpleRangeKeyLevelIter that does not + // operate on TableMetadatas (similar to simpleLevelIter), and implements + // this optimization. + // We set a synthetic sequence number, with lower levels having higer numbers. + seqNum := 0 + for _, readers := range it.externalIter.readers { + seqNum += len(readers) + } + for _, readers := range it.externalIter.readers { + for _, r := range readers { + transforms := sstable.FragmentIterTransforms{SyntheticSeqNum: sstable.SyntheticSeqNum(seqNum)} + seqNum-- + rki, err := r.NewRawRangeKeyIter(ctx, transforms, readEnv) + if err != nil { + for _, iter := range rangeKeyIters { + iter.Close() + } + return err + } + if rki != nil { + rangeKeyIters = append(rangeKeyIters, rki) + } + } + } + if len(rangeKeyIters) > 0 { + it.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState) + it.rangeKey.rangeKeyIter = it.rangeKey.iterConfig.Init( + &it.comparer, + base.SeqNumMax, + it.opts.LowerBound, it.opts.UpperBound, + &it.hasPrefix, &it.prefixOrFullSeekKey, + false /* internalKeys */, &it.rangeKey.internal, + ) + for i := range rangeKeyIters { + it.rangeKey.iterConfig.AddLevel(rangeKeyIters[i]) + } + } + } + if it.rangeKey != nil { + it.rangeKey.iiter.Init(&it.comparer, it.iter, it.rangeKey.rangeKeyIter, + 
keyspan.InterleavingIterOpts{ + Mask: &it.rangeKeyMasking, + LowerBound: it.opts.LowerBound, + UpperBound: it.opts.UpperBound, + }) + it.iter = &it.rangeKey.iiter + } + } + return nil +} + +func openExternalTables( + ctx context.Context, files []sstable.ReadableFile, readerOpts sstable.ReaderOptions, +) (readers []*sstable.Reader, err error) { + readers = make([]*sstable.Reader, 0, len(files)) + for i := range files { + readable, err := sstable.NewSimpleReadable(files[i]) + if err != nil { + return readers, err + } + r, err := sstable.NewReader(ctx, readable, readerOpts) + if err != nil { + return readers, errors.CombineErrors(err, readable.Close()) + } + if r.Attributes.Has(sstable.AttributeBlobValues) { + return readers, errors.Newf("pebble: NewExternalIter does not support blob references") + } + readers = append(readers, r) + } + return readers, err +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/file_cache.go b/vendor/github.com/cockroachdb/pebble/v2/file_cache.go new file mode 100644 index 0000000..38acd1c --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/file_cache.go @@ -0,0 +1,1069 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "bytes" + "context" + "fmt" + "io" + "runtime/debug" + "sync" + "sync/atomic" + "unsafe" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/cache" + "github.com/cockroachdb/pebble/v2/internal/genericcache" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/sstableinternal" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/objiotracing" + "github.com/cockroachdb/pebble/v2/sstable" + "github.com/cockroachdb/pebble/v2/sstable/blob" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/sstable/valblk" + "github.com/cockroachdb/pebble/v2/vfs" + "github.com/cockroachdb/redact" +) + +// FileCacheMetrics contains metrics for the file cache. Note that the file +// cache is normally shared between all the stores on a node. +type FileCacheMetrics struct { + // The number of bytes inuse by the cache. + Size int64 + TableCount int64 + BlobFileCount int64 + Hits int64 + Misses int64 +} + +var emptyIter = &errorIter{err: nil} +var emptyKeyspanIter = &errorKeyspanIter{err: nil} + +// tableNewIters creates new iterators (point, range deletion and/or range key) +// for the given table metadata. Which of the various iterator kinds the user is +// requesting is specified with the iterKinds bitmap. +// +// On success, the requested subset of iters.{point,rangeDel,rangeKey} are +// populated with iterators. +// +// If a point iterator is requested and the operation was successful, +// iters.point is guaranteed to be non-nil and must be closed when the caller is +// finished. 
+// +// If a range deletion or range key iterator is requested, the corresponding +// iterator may be nil if the table does not contain any keys of the +// corresponding kind. The returned iterSet type provides RangeDeletion() and +// RangeKey() convenience methods that return non-nil empty iterators that may +// be used if the caller requires a non-nil iterator. +// +// On error, all iterators are nil. +// +// The only (non-test) implementation of tableNewIters is +// fileCacheHandle.newIters(). +type tableNewIters func( + ctx context.Context, + file *manifest.TableMetadata, + opts *IterOptions, + internalOpts internalIterOpts, + kinds iterKinds, +) (iterSet, error) + +// tableNewRangeDelIter takes a tableNewIters and returns a TableNewSpanIter +// for the rangedel iterator returned by tableNewIters. +func tableNewRangeDelIter(newIters tableNewIters) keyspanimpl.TableNewSpanIter { + return func(ctx context.Context, file *manifest.TableMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) { + iters, err := newIters(ctx, file, nil, internalIterOpts{}, iterRangeDeletions) + if err != nil { + return nil, err + } + return iters.RangeDeletion(), nil + } +} + +// tableNewRangeKeyIter takes a tableNewIters and returns a TableNewSpanIter +// for the range key iterator returned by tableNewIters. +func tableNewRangeKeyIter(newIters tableNewIters) keyspanimpl.TableNewSpanIter { + return func(ctx context.Context, file *manifest.TableMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) { + iters, err := newIters(ctx, file, nil, internalIterOpts{}, iterRangeKeys) + if err != nil { + return nil, err + } + return iters.RangeKey(), nil + } +} + +// fileCacheHandle is used to access the file cache. Each DB has its own handle. +type fileCacheHandle struct { + fileCache *FileCache + + // The handle contains fields which are unique to each DB. 
Note that these get + // accessed from all shards, so keep read-only fields separate for read-write + // fields. + loggerAndTracer LoggerAndTracer + blockCacheHandle *cache.Handle + objProvider objstorage.Provider + readerOpts sstable.ReaderOptions + + // iterCount keeps track of how many iterators are open. It is used to keep + // track of leaked iterators on a per-db level. + iterCount atomic.Int32 + sstStatsCollector block.CategoryStatsCollector + + // reportCorruptionFn is used for block.ReadEnv.ReportCorruptionFn. It expects + // the first argument to be a `*TableMetadata`. It returns an error that + // contains more details. + reportCorruptionFn func(any, error) error + + // This struct is only populated in race builds. + raceMu struct { + sync.Mutex + // nextRefID is the next ID to allocate for a new reference. + nextRefID uint64 + // openRefs maps reference IDs to the stack trace recorded at creation + // time. It's used to track which call paths leaked open references to + // files. + openRefs map[uint64][]byte + } +} + +// Assert that *fileCacheHandle implements blob.ReaderProvider. +var _ blob.ReaderProvider = (*fileCacheHandle)(nil) + +// newHandle creates a handle for the FileCache which has its own options. Each +// handle has its own set of files in the cache, separate from those of other +// handles. 
+func (c *FileCache) newHandle( + cacheHandle *cache.Handle, + objProvider objstorage.Provider, + loggerAndTracer LoggerAndTracer, + readerOpts sstable.ReaderOptions, + reportCorruptionFn func(any, error) error, +) *fileCacheHandle { + c.Ref() + + t := &fileCacheHandle{ + fileCache: c, + loggerAndTracer: loggerAndTracer, + blockCacheHandle: cacheHandle, + objProvider: objProvider, + } + t.readerOpts = readerOpts + t.readerOpts.FilterMetricsTracker = &sstable.FilterMetricsTracker{} + t.reportCorruptionFn = reportCorruptionFn + if invariants.RaceEnabled { + t.raceMu.openRefs = make(map[uint64][]byte) + } + return t +} + +// Close the handle, make sure that there will be no further need +// to access any of the files associated with the store. +func (h *fileCacheHandle) Close() error { + // We want to do some cleanup work here. Check for leaked iterators + // by the DB using this container. Note that we'll still perform cleanup + // below in the case that there are leaked iterators. + var err error + if v := h.iterCount.Load(); v > 0 { + if !invariants.RaceEnabled { + err = errors.Errorf("leaked iterators: %d", errors.Safe(v)) + } else { + var buf bytes.Buffer + for _, stack := range h.raceMu.openRefs { + fmt.Fprintf(&buf, "%s\n", stack) + } + err = errors.Errorf("leaked iterators: %d\n%s", errors.Safe(v), buf.String()) + } + } + + // EvictAll would panic if there are still outstanding references. + if err == nil { + keys := h.fileCache.c.EvictAll(func(key fileCacheKey) bool { + return key.handle == h + }) + // Evict any associated blocks in the cache. + for i := range keys { + h.blockCacheHandle.EvictFile(keys[i].fileNum) + } + } + + h.fileCache.Unref() + // TODO(radu): we have to tolerate metrics() calls after close (see + // https://github.com/cockroachdb/cockroach/issues/140454). + // *h = fileCacheHandle{} + return err +} + +// openFile is called when we insert a new entry in the file cache. 
+func (h *fileCacheHandle) openFile( + ctx context.Context, fileNum base.DiskFileNum, fileType base.FileType, +) (io.Closer, objstorage.ObjectMetadata, error) { + f, err := h.objProvider.OpenForReading( + ctx, fileType, fileNum, objstorage.OpenOptions{MustExist: true}, + ) + if err != nil { + return nil, objstorage.ObjectMetadata{}, err + } + objMeta, err := h.objProvider.Lookup(fileType, fileNum) + if err != nil { + return nil, objstorage.ObjectMetadata{}, err + } + + o := h.readerOpts + o.CacheOpts = sstableinternal.CacheOptions{ + CacheHandle: h.blockCacheHandle, + FileNum: fileNum, + } + switch fileType { + case base.FileTypeTable: + r, err := sstable.NewReader(ctx, f, o) + if err != nil { + // If opening the sstable reader fails, we're responsible for + // closing the objstorage.Readable. + return nil, objMeta, errors.CombineErrors(err, f.Close()) + } + return r, objMeta, nil + case base.FileTypeBlob: + r, err := blob.NewFileReader(ctx, f, blob.FileReaderOptions{ + ReaderOptions: o.ReaderOptions, + }) + if err != nil { + // If opening the blob file reader fails, we're responsible for + // closing the objstorage.Readable. + return nil, objMeta, errors.CombineErrors(err, f.Close()) + } + return r, objMeta, nil + default: + panic(errors.AssertionFailedf("pebble: unexpected file cache file type: %s", fileType)) + } +} + +// findOrCreateTable retrieves an existing sstable reader or creates a new one +// for the backing file of the given table. If a corruption error is +// encountered, reportCorruptionFn() is called. 
+func (h *fileCacheHandle) findOrCreateTable( + ctx context.Context, meta *manifest.TableMetadata, +) (genericcache.ValueRef[fileCacheKey, fileCacheValue], error) { + key := fileCacheKey{ + handle: h, + fileNum: meta.TableBacking.DiskFileNum, + fileType: base.FileTypeTable, + } + valRef, err := h.fileCache.c.FindOrCreate(ctx, key) + if err != nil && IsCorruptionError(err) { + err = h.reportCorruptionFn(meta, err) + } + return valRef, err +} + +// findOrCreateBlob retrieves an existing blob reader or creates a new one for +// the given blob file. If a corruption error is encountered, +// reportCorruptionFn() is called. +func (h *fileCacheHandle) findOrCreateBlob( + ctx context.Context, fileNum base.DiskFileNum, +) (genericcache.ValueRef[fileCacheKey, fileCacheValue], error) { + key := fileCacheKey{ + handle: h, + fileNum: fileNum, + fileType: base.FileTypeBlob, + } + valRef, err := h.fileCache.c.FindOrCreate(ctx, key) + // TODO(jackson): Propagate a blob metadata object here. + if err != nil && IsCorruptionError(err) { + err = h.reportCorruptionFn(nil, err) + } + return valRef, err +} + +// Evict the given file from the file cache and the block cache. +func (h *fileCacheHandle) Evict(fileNum base.DiskFileNum, fileType base.FileType) { + defer func() { + if p := recover(); p != nil { + panic(fmt.Sprintf("pebble: evicting in-use file %s(%s): %v", fileNum, fileType, p)) + } + }() + h.fileCache.c.Evict(fileCacheKey{handle: h, fileNum: fileNum, fileType: fileType}) + h.blockCacheHandle.EvictFile(fileNum) +} + +func (h *fileCacheHandle) SSTStatsCollector() *block.CategoryStatsCollector { + return &h.sstStatsCollector +} + +// Metrics returns metrics for the file cache. Note that the CacheMetrics track +// the global cache which is shared between multiple handles (stores). The +// FilterMetrics are per-handle. 
+func (h *fileCacheHandle) Metrics() (FileCacheMetrics, FilterMetrics) { + m := h.fileCache.c.Metrics() + + // The generic cache maintains a count of entries, but it doesn't know which + // entries are sstables and which are blob files, which affects the memory + // footprint of the table cache. So the FileCache maintains its own counts, + // incremented when initializing a new value and decremented by the + // releasing func. + countSSTables := h.fileCache.counts.sstables.Load() + countBlobFiles := h.fileCache.counts.blobFiles.Load() + + cm := FileCacheMetrics{ + TableCount: countSSTables, + BlobFileCount: countBlobFiles, + Hits: m.Hits, + Misses: m.Misses, + Size: m.Size + countSSTables*int64(unsafe.Sizeof(sstable.Reader{})) + + countBlobFiles*int64(unsafe.Sizeof(blob.FileReader{})), + } + fm := h.readerOpts.FilterMetricsTracker.Load() + return cm, fm +} + +func (h *fileCacheHandle) estimateSize( + meta *manifest.TableMetadata, lower, upper []byte, +) (size uint64, err error) { + err = h.withReader(context.TODO(), block.NoReadEnv, meta, func(r *sstable.Reader, env sstable.ReadEnv) error { + size, err = r.EstimateDiskUsage(lower, upper, env) + return err + }) + return size, err +} + +func createReader( + v *fileCacheValue, meta *manifest.TableMetadata, +) (*sstable.Reader, sstable.ReadEnv) { + r := v.mustSSTableReader() + env := sstable.ReadEnv{} + if meta.Virtual { + if invariants.Enabled { + if meta.VirtualParams.FileNum == 0 || meta.VirtualParams.Lower.UserKey == nil || meta.VirtualParams.Upper.UserKey == nil { + panic("virtual params not initialized") + } + } + env.Virtual = meta.VirtualParams + env.IsSharedIngested = v.isShared && meta.SyntheticSeqNum() != 0 + } + return r, env +} + +func (h *fileCacheHandle) withReader( + ctx context.Context, + blockEnv block.ReadEnv, + meta *manifest.TableMetadata, + fn func(*sstable.Reader, sstable.ReadEnv) error, +) error { + ref, err := h.findOrCreateTable(ctx, meta) + if err != nil { + return err + } + defer ref.Unref() 
+ v := ref.Value() + blockEnv.ReportCorruptionFn = h.reportCorruptionFn + blockEnv.ReportCorruptionArg = meta + env := sstable.ReadEnv{Block: blockEnv} + + r := v.mustSSTableReader() + if meta.Virtual { + if invariants.Enabled { + if meta.VirtualParams.FileNum == 0 || meta.VirtualParams.Lower.UserKey == nil || meta.VirtualParams.Upper.UserKey == nil { + panic("virtual params not initialized") + } + } + env.Virtual = meta.VirtualParams + env.IsSharedIngested = v.isShared && meta.SyntheticSeqNum() != 0 + } + + return fn(r, env) + +} + +func (h *fileCacheHandle) IterCount() int64 { + return int64(h.iterCount.Load()) +} + +// GetValueReader returns a blob.ValueReader for blob file identified by fileNum. +func (h *fileCacheHandle) GetValueReader( + ctx context.Context, fileNum base.DiskFileNum, +) (r blob.ValueReader, closeFunc func(), err error) { + ref, err := h.findOrCreateBlob(ctx, fileNum) + if err != nil { + return nil, nil, err + } + v := ref.Value() + r = v.mustBlob() + // NB: The call to findOrCreateBlob incremented the value's reference count. + // The closeHook (v.closeHook) takes responsibility for unreferencing the + // value. Take care to avoid introducing an allocation here by adding a + // closure. + closeHook := h.addReference(v) + return r, closeHook, nil +} + +// FileCache is a shareable cache for open files. Open files are exclusively +// sstable files today. +type FileCache struct { + refs atomic.Int64 + counts struct { + sstables atomic.Int64 + blobFiles atomic.Int64 + } + + c genericcache.Cache[fileCacheKey, fileCacheValue] +} + +// Ref adds a reference to the file cache. Once a file cache is constructed, the +// cache only remains valid if there is at least one reference to it. +func (c *FileCache) Ref() { + v := c.refs.Add(1) + // We don't want the reference count to ever go from 0 -> 1, + // cause a reference count of 0 implies that we've closed the cache. 
+ if v <= 1 { + panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) + } +} + +// Unref removes a reference to the file cache. +func (c *FileCache) Unref() { + v := c.refs.Add(-1) + switch { + case v < 0: + panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) + case v == 0: + c.c.Close() + c.c = genericcache.Cache[fileCacheKey, fileCacheValue]{} + } +} + +// NewFileCache will create a new file cache with one outstanding reference. It +// is the callers responsibility to call Unref if they will no longer hold a +// reference to the file cache. +func NewFileCache(numShards int, size int) *FileCache { + if size == 0 { + panic("pebble: cannot create a file cache of size 0") + } else if numShards == 0 { + panic("pebble: cannot create a file cache with 0 shards") + } + + c := &FileCache{} + + // initFn is used whenever a new entry is added to the file cache. + initFn := func(ctx context.Context, key fileCacheKey, vRef genericcache.ValueRef[fileCacheKey, fileCacheValue]) error { + v := vRef.Value() + handle := key.handle + v.readerProvider.init(c, key) + v.closeHook = func() { + // closeHook is called when an iterator is closed; the initialization of + // an iterator with this value will happen after a FindOrCreate() call + // with returns the same vRef. 
+ vRef.Unref() + handle.iterCount.Add(-1) + } + reader, objMeta, err := handle.openFile(ctx, key.fileNum, key.fileType) + if err != nil { + return errors.Wrapf(err, "pebble: backing file %s error", redact.Safe(key.fileNum)) + } + v.reader = reader + v.isShared = objMeta.IsShared() + switch key.fileType { + case base.FileTypeTable: + c.counts.sstables.Add(1) + case base.FileTypeBlob: + c.counts.blobFiles.Add(1) + default: + panic("unexpected file type") + } + return nil + } + + releaseFn := func(v *fileCacheValue) { + if v.reader != nil { + switch v.reader.(type) { + case *sstable.Reader: + c.counts.sstables.Add(-1) + case *blob.FileReader: + c.counts.blobFiles.Add(-1) + } + _ = v.reader.Close() + v.reader = nil + } + } + + c.c.Init(size, numShards, initFn, releaseFn) + + // Hold a ref to the cache here. + c.refs.Store(1) + + return c +} + +type fileCacheKey struct { + handle *fileCacheHandle + fileNum base.DiskFileNum + // fileType describes the type of file being cached (blob or sstable). A + // file number alone uniquely identifies every file within a DB, but we need + // to propagate the type so the file cache looks for the correct file in + // object storage / the filesystem. + fileType base.FileType +} + +// Shard implements the genericcache.Key interface. +func (k fileCacheKey) Shard(numShards int) int { + // TODO(radu): maybe incorporate a handle ID. + return int(uint64(k.fileNum) % uint64(numShards)) +} + +// checkAndIntersectFilters checks the specific table and block property filters +// for intersection with any available table and block-level properties. Returns +// true for ok if this table should be read by this iterator. 
+func checkAndIntersectFilters( + r *sstable.Reader, + blockPropertyFilters []BlockPropertyFilter, + boundLimitedFilter sstable.BoundLimitedBlockPropertyFilter, + syntheticSuffix sstable.SyntheticSuffix, +) (ok bool, filterer *sstable.BlockPropertiesFilterer, err error) { + if boundLimitedFilter != nil || len(blockPropertyFilters) > 0 { + filterer, err = sstable.IntersectsTable( + blockPropertyFilters, + boundLimitedFilter, + r.UserProperties, + syntheticSuffix, + ) + // NB: IntersectsTable will return a nil filterer if the table-level + // properties indicate there's no intersection with the provided filters. + if filterer == nil || err != nil { + return false, nil, err + } + } + return true, filterer, nil +} + +func (h *fileCacheHandle) newIters( + ctx context.Context, + file *manifest.TableMetadata, + opts *IterOptions, + internalOpts internalIterOpts, + kinds iterKinds, +) (iterSet, error) { + // Calling findOrCreate gives us the responsibility of Unref()ing vRef. + vRef, err := h.findOrCreateTable(ctx, file) + if err != nil { + return iterSet{}, err + } + + internalOpts.readEnv.Block.ReportCorruptionFn = h.reportCorruptionFn + internalOpts.readEnv.Block.ReportCorruptionArg = file + + v := vRef.Value() + r, env := createReader(v, file) + internalOpts.readEnv.Virtual = env.Virtual + internalOpts.readEnv.IsSharedIngested = env.IsSharedIngested + + var iters iterSet + if kinds.RangeKey() && file.HasRangeKeys { + iters.rangeKey, err = newRangeKeyIter(ctx, file, r, opts.SpanIterOptions(), internalOpts) + } + if kinds.RangeDeletion() && file.HasPointKeys && err == nil { + iters.rangeDeletion, err = newRangeDelIter(ctx, file, r, h, internalOpts) + } + if kinds.Point() && err == nil { + iters.point, err = h.newPointIter(ctx, v, file, r, opts, internalOpts, h) + } + if err != nil { + // NB: There's a subtlety here: Because the point iterator is the last + // iterator we attempt to create, it's not possible for: + // err != nil && iters.point != nil + // If it were 
possible, we'd need to account for it to avoid double + // unref-ing here, once during CloseAll and once during `unrefValue`. + _ = iters.CloseAll() + vRef.Unref() + return iterSet{}, err + } + // Only point iterators ever require the reader stay pinned in the cache. If + // we're not returning a point iterator to the caller, we need to unref v. + // + // For point iterators, v.closeHook will be called which will release the ref. + if iters.point == nil { + vRef.Unref() + } + return iters, nil +} + +// For flushable ingests, we decide whether to use the bloom filter base on +// size. +const filterBlockSizeLimitForFlushableIngests = 64 * 1024 + +// newPointIter is an internal helper that constructs a point iterator over a +// sstable. This function is for internal use only, and callers should use +// newIters instead. +func (h *fileCacheHandle) newPointIter( + ctx context.Context, + v *fileCacheValue, + file *manifest.TableMetadata, + reader *sstable.Reader, + opts *IterOptions, + internalOpts internalIterOpts, + handle *fileCacheHandle, +) (internalIterator, error) { + var ( + hideObsoletePoints bool = false + pointKeyFilters []BlockPropertyFilter + filterer *sstable.BlockPropertiesFilterer + ) + r := v.mustSSTableReader() + if opts != nil { + // This code is appending (at most one filter) in-place to + // opts.PointKeyFilters even though the slice is shared for iterators in + // the same iterator tree. This is acceptable since all the following + // properties are true: + // - The iterator tree is single threaded, so the shared backing for the + // slice is being mutated in a single threaded manner. + // - Each shallow copy of the slice has its own notion of length. + // - The appended element is always the obsoleteKeyBlockPropertyFilter + // struct, which is stateless, so overwriting that struct when creating + // one sstable iterator is harmless to other sstable iterators that are + // relying on that struct. 
+ // + // An alternative would be to have different slices for different sstable + // iterators, but that requires more work to avoid allocations. + // + // TODO(bilal): for compaction reads of foreign sstables, we do hide + // obsolete points (see sstable.Reader.newCompactionIter) but we don't + // apply the obsolete block property filter. We could optimize this by + // applying the filter. + hideObsoletePoints, pointKeyFilters = + r.TryAddBlockPropertyFilterForHideObsoletePoints( + opts.snapshotForHideObsoletePoints, file.LargestSeqNum, opts.PointKeyFilters) + + var ok bool + var err error + ok, filterer, err = checkAndIntersectFilters(r, pointKeyFilters, + internalOpts.boundLimitedFilter, file.SyntheticPrefixAndSuffix.Suffix()) + if err != nil { + return nil, err + } else if !ok { + // No point keys within the table match the filters. + return nil, nil + } + } + + var iter sstable.Iterator + filterBlockSizeLimit := sstable.AlwaysUseFilterBlock + if opts != nil { + // By default, we don't use block filters for L6 and restrict the size for + // flushable ingests, as these blocks can be very big. + if !opts.UseL6Filters { + if opts.layer == manifest.Level(6) { + filterBlockSizeLimit = sstable.NeverUseFilterBlock + } else if opts.layer.IsFlushableIngests() { + filterBlockSizeLimit = filterBlockSizeLimitForFlushableIngests + } + } + if opts.layer.IsSet() && !opts.layer.IsFlushableIngests() { + ctx = objiotracing.WithLevel(ctx, opts.layer.Level()) + } + } + + if v.isShared && file.SyntheticSeqNum() != 0 { + // The table is shared and ingested. 
+ hideObsoletePoints = true + } + transforms := file.IterTransforms() + transforms.HideObsoletePoints = hideObsoletePoints + if internalOpts.readEnv.Block.IterStats == nil && opts != nil { + internalOpts.readEnv.Block.IterStats = handle.SSTStatsCollector().Accumulator(uint64(uintptr(unsafe.Pointer(r))), opts.Category) + } + var blobReferences sstable.BlobReferences + if r.Attributes.Has(sstable.AttributeBlobValues) { + if len(file.BlobReferences) == 0 { + return nil, errors.AssertionFailedf("pebble: sstable %s has blob values but no blob references", file.TableNum) + } + blobReferences = &file.BlobReferences + } + var err error + if internalOpts.compaction { + iter, err = reader.NewCompactionIter(transforms, internalOpts.readEnv, + &v.readerProvider, sstable.TableBlobContext{ + ValueFetcher: internalOpts.blobValueFetcher, + References: blobReferences, + }) + } else { + iter, err = reader.NewPointIter(ctx, sstable.IterOptions{ + Lower: opts.GetLowerBound(), + Upper: opts.GetUpperBound(), + Transforms: transforms, + FilterBlockSizeLimit: filterBlockSizeLimit, + Filterer: filterer, + Env: internalOpts.readEnv, + ReaderProvider: &v.readerProvider, + BlobContext: sstable.TableBlobContext{ + ValueFetcher: internalOpts.blobValueFetcher, + References: blobReferences, + }, + }) + } + if err != nil { + return nil, err + } + // NB: closeHook (v.closeHook) takes responsibility for calling + // unrefValue(v) here. Take care to avoid introducing an allocation here by + // adding a closure. + closeHook := h.addReference(v) + iter.SetCloseHook(closeHook) + return iter, nil +} + +func (h *fileCacheHandle) addReference(v *fileCacheValue) (closeHook func()) { + h.iterCount.Add(1) + closeHook = v.closeHook + if invariants.RaceEnabled { + stack := debug.Stack() + h.raceMu.Lock() + refID := h.raceMu.nextRefID + h.raceMu.openRefs[refID] = stack + h.raceMu.nextRefID++ + h.raceMu.Unlock() + // In race builds, this closeHook closure will force an allocation. 
+ // Race builds are already unperformant (and allocate a stack trace), so + // we don't care. + closeHook = func() { + v.closeHook() + h.raceMu.Lock() + defer h.raceMu.Unlock() + delete(h.raceMu.openRefs, refID) + } + } + return closeHook +} + +// SetupBlobReaderProvider creates a fileCachHandle blob.ReaderProvider for +// reading blob files. The caller is responsible for calling the returned cleanup +// function. +// +// NB: This function is intended for testing and tooling purposes only. It +// provides blob file access outside of normal database operations and is not +// used by databases opened through Open(). +func SetupBlobReaderProvider( + fs vfs.FS, path string, opts *Options, readOpts sstable.ReaderOptions, +) (blob.ReaderProvider, func(), error) { + var fc *FileCache + var c *cache.Cache + var ch *cache.Handle + var objProvider objstorage.Provider + var provider *fileCacheHandle + + // Helper to clean up resources in case of error. + cleanup := func() { + if provider != nil { + _ = provider.Close() + } + if objProvider != nil { + _ = objProvider.Close() + } + if ch != nil { + ch.Close() + } + if c != nil { + c.Unref() + } + if fc != nil { + fc.Unref() + } + } + + fileCacheSize := FileCacheSize(opts.MaxOpenFiles) + if opts.FileCache == nil { + fc = NewFileCache(opts.Experimental.FileCacheShards, fileCacheSize) + } else { + fc = opts.FileCache + fc.Ref() + } + + if opts.Cache == nil { + c = cache.New(opts.CacheSize) + } else { + c = opts.Cache + c.Ref() + } + ch = c.NewHandle() + + var err error + objProvider, err = objstorageprovider.Open(objstorageprovider.DefaultSettings(fs, path)) + if err != nil { + cleanup() + return nil, nil, err + } + + provider = fc.newHandle( + ch, + objProvider, + opts.LoggerAndTracer, + readOpts, + func(any, error) error { return nil }, + ) + + return provider, cleanup, nil +} + +// newRangeDelIter is an internal helper that constructs an iterator over a +// sstable's range deletions. 
This function is for table-cache internal use +// only, and callers should use newIters instead. +func newRangeDelIter( + ctx context.Context, + file *manifest.TableMetadata, + r *sstable.Reader, + handle *fileCacheHandle, + internalOpts internalIterOpts, +) (keyspan.FragmentIterator, error) { + // NB: range-del iterator does not maintain a reference to the table, nor + // does it need to read from it after creation. + rangeDelIter, err := r.NewRawRangeDelIter(ctx, file.FragmentIterTransforms(), internalOpts.readEnv) + if err != nil { + return nil, err + } + // Assert expected bounds in tests. + if invariants.Sometimes(50) && rangeDelIter != nil { + cmp := base.DefaultComparer.Compare + if handle.readerOpts.Comparer != nil { + cmp = handle.readerOpts.Comparer.Compare + } + rangeDelIter = keyspan.AssertBounds( + rangeDelIter, file.PointKeyBounds.Smallest(), file.PointKeyBounds.LargestUserKey(), cmp, + ) + } + return rangeDelIter, nil +} + +// newRangeKeyIter is an internal helper that constructs an iterator over a +// sstable's range keys. This function is for table-cache internal use only, and +// callers should use newIters instead. +func newRangeKeyIter( + ctx context.Context, + file *manifest.TableMetadata, + r *sstable.Reader, + opts keyspan.SpanIterOptions, + internalOpts internalIterOpts, +) (keyspan.FragmentIterator, error) { + transforms := file.FragmentIterTransforms() + // Don't filter a table's range keys if the file contains RANGEKEYDELs. + // The RANGEKEYDELs may delete range keys in other levels. Skipping the + // file's range key blocks may surface deleted range keys below. This is + // done here, rather than deferring to the block-property collector in order + // to maintain parity with point keys and the treatment of RANGEDELs. 
+ if !r.Attributes.Has(sstable.AttributeRangeKeyDels) && len(opts.RangeKeyFilters) > 0 { + ok, _, err := checkAndIntersectFilters(r, opts.RangeKeyFilters, nil, transforms.SyntheticSuffix()) + if err != nil { + return nil, err + } else if !ok { + return nil, nil + } + } + // TODO(radu): wrap in an AssertBounds. + return r.NewRawRangeKeyIter(ctx, transforms, internalOpts.readEnv) +} + +// tableCacheShardReaderProvider implements sstable.ReaderProvider for a +// specific table. +type tableCacheShardReaderProvider struct { + c *genericcache.Cache[fileCacheKey, fileCacheValue] + key fileCacheKey + + mu struct { + sync.Mutex + // r is the result of c.FindOrCreate(), only set iff refCount > 0. + r genericcache.ValueRef[fileCacheKey, fileCacheValue] + // refCount is the number of GetReader() calls that have not received a + // corresponding Close(). + refCount int + } +} + +var _ valblk.ReaderProvider = &tableCacheShardReaderProvider{} + +func (rp *tableCacheShardReaderProvider) init(fc *FileCache, key fileCacheKey) { + rp.c = &fc.c + rp.key = key + rp.mu.r = genericcache.ValueRef[fileCacheKey, fileCacheValue]{} + rp.mu.refCount = 0 +} + +// GetReader implements sstable.ReaderProvider. Note that it is not the +// responsibility of tableCacheShardReaderProvider to ensure that the file +// continues to exist. The ReaderProvider is used in iterators where the +// top-level iterator is pinning the read state and preventing the files from +// being deleted. +// +// The caller must call tableCacheShardReaderProvider.Close. +// +// Note that currently the Reader returned here is only used to read value +// blocks. This reader shouldn't be used for other purposes like reading keys +// outside of virtual sstable bounds. +// +// TODO(bananabrick): We could return a wrapper over the Reader to ensure +// that the reader isn't used for other purposes. 
+func (rp *tableCacheShardReaderProvider) GetReader( + ctx context.Context, +) (valblk.ExternalBlockReader, error) { + rp.mu.Lock() + defer rp.mu.Unlock() + + if rp.mu.refCount > 0 { + // We already have a value. + rp.mu.refCount++ + return rp.mu.r.Value().mustSSTableReader(), nil + } + + // Calling FindOrCreate gives us the responsibility of Unref()ing r, which + // will happen when rp.mu.refCount reaches 0. Note that if the table is no + // longer in the cache, FindOrCreate will need to do IO (through initFn in + // NewFileCache) to initialize a new Reader. We hold rp.mu during this time so + // that concurrent GetReader calls block until the Reader is created. + r, err := rp.c.FindOrCreate(ctx, rp.key) + if err != nil { + return nil, err + } + rp.mu.r = r + rp.mu.refCount = 1 + return r.Value().mustSSTableReader(), nil +} + +// Close implements sstable.ReaderProvider. +func (rp *tableCacheShardReaderProvider) Close() { + rp.mu.Lock() + defer rp.mu.Unlock() + rp.mu.refCount-- + if rp.mu.refCount <= 0 { + if rp.mu.refCount < 0 { + panic("pebble: sstable.ReaderProvider misuse") + } + rp.mu.r.Unref() + rp.mu.r = genericcache.ValueRef[fileCacheKey, fileCacheValue]{} + } +} + +// getTableProperties returns sst table properties for the backing file. +// +// WARNING! If file is a virtual table, we return the properties of the physical +// table. 
+func (h *fileCacheHandle) getTableProperties( + file *manifest.TableMetadata, +) (*sstable.Properties, error) { + // Calling findOrCreateTable gives us the responsibility of decrementing v's + // refCount here + v, err := h.findOrCreateTable(context.TODO(), file) + if err != nil { + return nil, err + } + defer v.Unref() + + r := v.Value().mustSSTableReader() + props, err := r.ReadPropertiesBlock(context.TODO(), nil /* buffer pool */) + if err != nil { + return nil, err + } + return &props, nil +} + +type fileCacheValue struct { + closeHook func() + reader io.Closer // *sstable.Reader or *blob.FileReader + isShared bool + + // readerProvider is embedded here so that we only allocate it once as long as + // the table stays in the cache. Its state is not always logically tied to + // this specific fileCacheShard - if a table goes out of the cache and then + // comes back in, the readerProvider in a now-defunct fileCacheValue can + // still be used and will internally refer to the new fileCacheValue. + readerProvider tableCacheShardReaderProvider +} + +// mustSSTable retrieves the value's *sstable.Reader. It panics if the cached +// file is not a sstable (i.e., it is a blob file). +func (v *fileCacheValue) mustSSTableReader() *sstable.Reader { + return v.reader.(*sstable.Reader) +} + +// mustBlob retrieves the value's *blob.FileReader. It panics if the cached file +// is not a blob file. +func (v *fileCacheValue) mustBlob() *blob.FileReader { + return v.reader.(*blob.FileReader) +} + +// iterSet holds a set of iterators of various key kinds, all constructed over +// the same data structure (eg, an sstable). A subset of the fields may be +// populated depending on the `iterKinds` passed to newIters. 
+type iterSet struct { + point internalIterator + rangeDeletion keyspan.FragmentIterator + rangeKey keyspan.FragmentIterator +} + +// TODO(jackson): Consider adding methods for fast paths that check whether an +// iterator of a particular kind is nil, so that these call sites don't need to +// reach into the struct's fields directly. + +// Point returns the contained point iterator. If there is no point iterator, +// Point returns a non-nil empty point iterator. +func (s *iterSet) Point() internalIterator { + if s.point == nil { + return emptyIter + } + return s.point +} + +// RangeDeletion returns the contained range deletion iterator. If there is no +// range deletion iterator, RangeDeletion returns a non-nil empty keyspan +// iterator. +func (s *iterSet) RangeDeletion() keyspan.FragmentIterator { + if s.rangeDeletion == nil { + return emptyKeyspanIter + } + return s.rangeDeletion +} + +// RangeKey returns the contained range key iterator. If there is no range key +// iterator, RangeKey returns a non-nil empty keyspan iterator. +func (s *iterSet) RangeKey() keyspan.FragmentIterator { + if s.rangeKey == nil { + return emptyKeyspanIter + } + return s.rangeKey +} + +// CloseAll closes all of the held iterators. If CloseAll is called, then Close +// must be not be called on the constituent iterators. +func (s *iterSet) CloseAll() error { + var err error + if s.point != nil { + err = s.point.Close() + s.point = nil + } + if s.rangeDeletion != nil { + s.rangeDeletion.Close() + s.rangeDeletion = nil + } + if s.rangeKey != nil { + s.rangeKey.Close() + s.rangeKey = nil + } + return err +} + +// iterKinds is a bitmap indicating a set of kinds of iterators. Callers may +// bitwise-OR iterPointKeys, iterRangeDeletions and/or iterRangeKeys together to +// represent a set of desired iterator kinds. 
+type iterKinds uint8 + +func (t iterKinds) Point() bool { return (t & iterPointKeys) != 0 } +func (t iterKinds) RangeDeletion() bool { return (t & iterRangeDeletions) != 0 } +func (t iterKinds) RangeKey() bool { return (t & iterRangeKeys) != 0 } + +const ( + iterPointKeys iterKinds = 1 << iota + iterRangeDeletions + iterRangeKeys +) diff --git a/vendor/github.com/cockroachdb/pebble/v2/flushable.go b/vendor/github.com/cockroachdb/pebble/v2/flushable.go new file mode 100644 index 0000000..f8fd6e6 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/flushable.go @@ -0,0 +1,474 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "context" + "fmt" + "sync/atomic" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl" + "github.com/cockroachdb/pebble/v2/internal/manifest" +) + +// flushable defines the interface for immutable memtables. +type flushable interface { + newIter(o *IterOptions) internalIterator + newFlushIter(o *IterOptions) internalIterator + newRangeDelIter(o *IterOptions) keyspan.FragmentIterator + newRangeKeyIter(o *IterOptions) keyspan.FragmentIterator + containsRangeKeys() bool + // inuseBytes returns the number of inuse bytes by the flushable. + inuseBytes() uint64 + // totalBytes returns the total number of bytes allocated by the flushable. + totalBytes() uint64 + // readyForFlush returns true when the flushable is ready for flushing. See + // memTable.readyForFlush for one implementation which needs to check whether + // there are any outstanding write references. 
+ readyForFlush() bool + // computePossibleOverlaps determines whether the flushable's keys overlap + // with the bounds of any of the provided bounded items. If an item overlaps + // or might overlap but it's not possible to determine overlap cheaply, + // computePossibleOverlaps invokes the provided function with the object + // that might overlap. computePossibleOverlaps must not perform any I/O and + // implementations should invoke the provided function for items that would + // require I/O to determine overlap. + computePossibleOverlaps(overlaps func(bounded) shouldContinue, bounded ...bounded) +} + +type shouldContinue bool + +const ( + continueIteration shouldContinue = true + stopIteration = false +) + +type bounded interface { + UserKeyBounds() base.UserKeyBounds +} + +var _ bounded = (*manifest.TableMetadata)(nil) +var _ bounded = KeyRange{} + +func sliceAsBounded[B bounded](s []B) []bounded { + ret := make([]bounded, len(s)) + for i := 0; i < len(s); i++ { + ret[i] = s[i] + } + return ret +} + +// flushableEntry wraps a flushable and adds additional metadata and +// functionality that is common to all flushables. +type flushableEntry struct { + flushable + // Channel which is closed when the flushable has been flushed. + flushed chan struct{} + // flushForced indicates whether a flush was forced on this memtable (either + // manual, or due to ingestion). Protected by DB.mu. + flushForced bool + // delayedFlushForcedAt indicates whether a timer has been set to force a + // flush on this memtable at some point in the future. Protected by DB.mu. + // Holds the timestamp of when the flush will be issued. + delayedFlushForcedAt time.Time + // logNum corresponds to the WAL that contains the records present in the + // receiver. + logNum base.DiskFileNum + // logSize is the size in bytes of the associated WAL. Protected by DB.mu. + logSize uint64 + // The current logSeqNum at the time the memtable was created. 
This is + // guaranteed to be less than or equal to any seqnum stored in the memtable. + logSeqNum base.SeqNum + // readerRefs tracks the read references on the flushable. The two sources of + // reader references are DB.mu.mem.queue and readState.memtables. The memory + // reserved by the flushable in the cache is released when the reader refs + // drop to zero. If the flushable is referencing sstables, then the file + // refount is also decreased once the reader refs drops to 0. If the + // flushable is a memTable, when the reader refs drops to zero, the writer + // refs will already be zero because the memtable will have been flushed and + // that only occurs once the writer refs drops to zero. + readerRefs atomic.Int32 + // Closure to invoke to release memory accounting. + releaseMemAccounting func() + // unrefFiles, if not nil, should be invoked to decrease the ref count of + // files which are backing the flushable. + unrefFiles func(*manifest.ObsoleteFiles) + // deleteFnLocked should be called if the caller is holding DB.mu. + deleteFnLocked func(manifest.ObsoleteFiles) + // deleteFn should be called if the caller is not holding DB.mu. + deleteFn func(manifest.ObsoleteFiles) +} + +func (e *flushableEntry) readerRef() { + switch v := e.readerRefs.Add(1); { + case v <= 1: + panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) + } +} + +// db.mu must not be held when this is called. +func (e *flushableEntry) readerUnref(deleteFiles bool) { + e.readerUnrefHelper(deleteFiles, e.deleteFn) +} + +// db.mu must be held when this is called. 
+func (e *flushableEntry) readerUnrefLocked(deleteFiles bool) { + e.readerUnrefHelper(deleteFiles, e.deleteFnLocked) +} + +func (e *flushableEntry) readerUnrefHelper( + deleteFiles bool, deleteFn func(manifest.ObsoleteFiles), +) { + switch v := e.readerRefs.Add(-1); { + case v < 0: + panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) + case v == 0: + if e.releaseMemAccounting == nil { + panic("pebble: memtable reservation already released") + } + e.releaseMemAccounting() + e.releaseMemAccounting = nil + if e.unrefFiles != nil { + var obsolete manifest.ObsoleteFiles + e.unrefFiles(&obsolete) + e.unrefFiles = nil + if deleteFiles { + deleteFn(obsolete) + } + } + } +} + +type flushableList []*flushableEntry + +// ingestedFlushable is the implementation of the flushable interface for the +// ingesting sstables which are added to the flushable list. +type ingestedFlushable struct { + // files are non-overlapping and ordered (according to their bounds). + files []*manifest.TableMetadata + comparer *Comparer + newIters tableNewIters + newRangeKeyIters keyspanimpl.TableNewSpanIter + + // Since the level slice is immutable, we construct and set it once. It + // should be safe to read from slice in future reads. + slice manifest.LevelSlice + // hasRangeKeys is set on ingestedFlushable construction. + hasRangeKeys bool + // exciseSpan is populated if an excise operation should be performed during + // flush. 
+ exciseSpan KeyRange + exciseSeqNum base.SeqNum +} + +func newIngestedFlushable( + files []*manifest.TableMetadata, + comparer *Comparer, + newIters tableNewIters, + newRangeKeyIters keyspanimpl.TableNewSpanIter, + exciseSpan KeyRange, + seqNum base.SeqNum, +) *ingestedFlushable { + if invariants.Enabled { + for i := 1; i < len(files); i++ { + prev := files[i-1].UserKeyBounds() + this := files[i].UserKeyBounds() + if prev.End.IsUpperBoundFor(comparer.Compare, this.Start) { + panic(errors.AssertionFailedf("ingested flushable files overlap: %s %s", prev, this)) + } + } + } + var physicalFiles []*manifest.TableMetadata + var hasRangeKeys bool + for _, f := range files { + if f.HasRangeKeys { + hasRangeKeys = true + } + physicalFiles = append(physicalFiles, f.PhysicalMeta()) + } + + ret := &ingestedFlushable{ + files: physicalFiles, + comparer: comparer, + newIters: newIters, + newRangeKeyIters: newRangeKeyIters, + // slice is immutable and can be set once and used many times. + slice: manifest.NewLevelSliceKeySorted(comparer.Compare, files), + hasRangeKeys: hasRangeKeys, + exciseSpan: exciseSpan, + exciseSeqNum: seqNum, + } + + return ret +} + +// TODO(sumeer): ingestedFlushable iters also need to plumb context for +// tracing. + +// newIter is part of the flushable interface. +func (s *ingestedFlushable) newIter(o *IterOptions) internalIterator { + var opts IterOptions + if o != nil { + opts = *o + } + return newLevelIter( + context.Background(), opts, s.comparer, s.newIters, s.slice.Iter(), manifest.FlushableIngestsLayer(), + internalIterOpts{}, + ) +} + +// newFlushIter is part of the flushable interface. +func (s *ingestedFlushable) newFlushIter(*IterOptions) internalIterator { + // newFlushIter is only used for writing memtables to disk as sstables. + // Since ingested sstables are already present on disk, they don't need to + // make use of a flush iter. 
+ panic("pebble: not implemented") +} + +func (s *ingestedFlushable) constructRangeDelIter( + ctx context.Context, file *manifest.TableMetadata, _ keyspan.SpanIterOptions, +) (keyspan.FragmentIterator, error) { + iters, err := s.newIters(ctx, file, nil, internalIterOpts{}, iterRangeDeletions) + if err != nil { + return nil, err + } + return iters.RangeDeletion(), nil +} + +// newRangeDelIter is part of the flushable interface. +// TODO(bananabrick): Using a level iter instead of a keyspan level iter to +// surface range deletes is more efficient. +// +// TODO(sumeer): *IterOptions are being ignored, so the index block load for +// the point iterator in constructRangeDeIter is not tracked. +func (s *ingestedFlushable) newRangeDelIter(_ *IterOptions) keyspan.FragmentIterator { + liter := keyspanimpl.NewLevelIter( + context.TODO(), + keyspan.SpanIterOptions{}, s.comparer.Compare, + s.constructRangeDelIter, s.slice.Iter(), manifest.FlushableIngestsLayer(), + manifest.KeyTypePoint, + ) + if !s.exciseSpan.Valid() { + return liter + } + // We have an excise span to weave into the rangedel iterators. + // + // TODO(bilal): should this be pooled? + miter := &keyspanimpl.MergingIter{} + rdel := keyspan.Span{ + Start: s.exciseSpan.Start, + End: s.exciseSpan.End, + Keys: []keyspan.Key{{Trailer: base.MakeTrailer(s.exciseSeqNum, base.InternalKeyKindRangeDelete)}}, + } + rdelIter := keyspan.NewIter(s.comparer.Compare, []keyspan.Span{rdel}) + miter.Init(s.comparer, keyspan.NoopTransform, new(keyspanimpl.MergingBuffers), liter, rdelIter) + return miter +} + +// newRangeKeyIter is part of the flushable interface. +func (s *ingestedFlushable) newRangeKeyIter(o *IterOptions) keyspan.FragmentIterator { + var rkeydelIter keyspan.FragmentIterator + if s.exciseSpan.Valid() { + // We have an excise span to weave into the rangekey iterators. 
+ rkeydel := keyspan.Span{ + Start: s.exciseSpan.Start, + End: s.exciseSpan.End, + Keys: []keyspan.Key{{Trailer: base.MakeTrailer(s.exciseSeqNum, base.InternalKeyKindRangeKeyDelete)}}, + } + rkeydelIter = keyspan.NewIter(s.comparer.Compare, []keyspan.Span{rkeydel}) + } + + if !s.hasRangeKeys { + if rkeydelIter == nil { + // NB: we have to return the nil literal as opposed to the nil + // value of rkeydelIter, otherwise callers of this function will + // have the return value fail == nil checks. + return nil + } + return rkeydelIter + } + + liter := keyspanimpl.NewLevelIter( + context.TODO(), + keyspan.SpanIterOptions{}, s.comparer.Compare, s.newRangeKeyIters, + s.slice.Iter(), manifest.FlushableIngestsLayer(), manifest.KeyTypeRange, + ) + if rkeydelIter == nil { + return liter + } + // TODO(bilal): should this be pooled? + miter := &keyspanimpl.MergingIter{} + miter.Init(s.comparer, keyspan.NoopTransform, new(keyspanimpl.MergingBuffers), liter, rkeydelIter) + return miter +} + +// containsRangeKeys is part of the flushable interface. +func (s *ingestedFlushable) containsRangeKeys() bool { + return s.hasRangeKeys || s.exciseSpan.Valid() +} + +// inuseBytes is part of the flushable interface. +func (s *ingestedFlushable) inuseBytes() uint64 { + // inuseBytes is only used when memtables are flushed to disk as sstables. + panic("pebble: not implemented") +} + +// totalBytes is part of the flushable interface. +func (s *ingestedFlushable) totalBytes() uint64 { + // We don't allocate additional bytes for the ingestedFlushable. + return 0 +} + +// readyForFlush is part of the flushable interface. +func (s *ingestedFlushable) readyForFlush() bool { + // ingestedFlushable should always be ready to flush. However, note that + // memtables before the ingested sstables in the memtable queue must be + // flushed before an ingestedFlushable can be flushed. 
This is because the + // ingested sstables need an updated view of the Version to + // determine where to place the files in the lsm. + return true +} + +// computePossibleOverlaps is part of the flushable interface. +func (s *ingestedFlushable) computePossibleOverlaps( + fn func(bounded) shouldContinue, bounded ...bounded, +) { + for _, b := range bounded { + if s.anyFileOverlaps(b.UserKeyBounds()) { + // Some file overlaps in key boundaries. The file doesn't necessarily + // contain any keys within the key range, but we would need to perform I/O + // to know for sure. The flushable interface dictates that we're not + // permitted to perform I/O here, so err towards assuming overlap. + if !fn(b) { + return + } + } + } +} + +// anyFileBoundsOverlap returns true if there is at least a file in s.files with +// bounds that overlap the given bounds. +func (s *ingestedFlushable) anyFileOverlaps(bounds base.UserKeyBounds) bool { + // Note that s.files are non-overlapping and sorted. + for _, f := range s.files { + fileBounds := f.UserKeyBounds() + if !fileBounds.End.IsUpperBoundFor(s.comparer.Compare, bounds.Start) { + // The file ends before the bounds start. Go to the next file. + continue + } + if !bounds.End.IsUpperBoundFor(s.comparer.Compare, fileBounds.Start) { + // The file starts after the bounds end. There is no overlap, and + // further files will not overlap either (the files are sorted). + break + } + // There is overlap. Note that UserKeyBounds.Overlaps() performs exactly the + // checks above. + return true + } + if s.exciseSpan.Valid() { + uk := s.exciseSpan.UserKeyBounds() + return uk.Overlaps(s.comparer.Compare, &bounds) + } + return false +} + +// computePossibleOverlapsGenericImpl is an implementation of the flushable +// interface's computePossibleOverlaps function for flushable implementations +// with only in-memory state that do not have special requirements and should +// read through the ordinary flushable iterators. 
+// +// This function must only be used with implementations that are infallible (eg, +// memtable iterators) and will panic if an error is encountered. +func computePossibleOverlapsGenericImpl[F flushable]( + f F, cmp Compare, fn func(bounded) shouldContinue, bounded []bounded, +) { + iter := f.newIter(nil) + rangeDelIter := f.newRangeDelIter(nil) + rangeKeyIter := f.newRangeKeyIter(nil) + for _, b := range bounded { + overlap, err := determineOverlapAllIters(cmp, b.UserKeyBounds(), iter, rangeDelIter, rangeKeyIter) + if invariants.Enabled && err != nil { + panic(errors.AssertionFailedf("expected iterator to be infallible: %v", err)) + } + if overlap { + if !fn(b) { + break + } + } + } + + if iter != nil { + if err := iter.Close(); err != nil { + // This implementation must be used in circumstances where + // reading through the iterator is infallible. + panic(err) + } + } + if rangeDelIter != nil { + rangeDelIter.Close() + } + if rangeKeyIter != nil { + rangeKeyIter.Close() + } +} + +// determineOverlapAllIters checks for overlap in a point iterator, range +// deletion iterator and range key iterator. 
+func determineOverlapAllIters( + cmp base.Compare, + bounds base.UserKeyBounds, + pointIter base.InternalIterator, + rangeDelIter, rangeKeyIter keyspan.FragmentIterator, +) (bool, error) { + if pointIter != nil { + if pointOverlap, err := determineOverlapPointIterator(cmp, bounds, pointIter); pointOverlap || err != nil { + return pointOverlap, err + } + } + if rangeDelIter != nil { + if rangeDelOverlap, err := determineOverlapKeyspanIterator(cmp, bounds, rangeDelIter); rangeDelOverlap || err != nil { + return rangeDelOverlap, err + } + } + if rangeKeyIter != nil { + return determineOverlapKeyspanIterator(cmp, bounds, rangeKeyIter) + } + return false, nil +} + +func determineOverlapPointIterator( + cmp base.Compare, bounds base.UserKeyBounds, iter internalIterator, +) (bool, error) { + kv := iter.SeekGE(bounds.Start, base.SeekGEFlagsNone) + if kv == nil { + return false, iter.Error() + } + return bounds.End.IsUpperBoundForInternalKey(cmp, kv.K), nil +} + +func determineOverlapKeyspanIterator( + cmp base.Compare, bounds base.UserKeyBounds, iter keyspan.FragmentIterator, +) (bool, error) { + // NB: The spans surfaced by the fragment iterator are non-overlapping. + span, err := iter.SeekGE(bounds.Start) + if err != nil { + return false, err + } + for ; span != nil; span, err = iter.Next() { + if !bounds.End.IsUpperBoundFor(cmp, span.Start) { + // The span starts after our bounds. 
+ return false, nil + } + if !span.Empty() { + return true, nil + } + } + return false, err +} diff --git a/vendor/github.com/cockroachdb/pebble/format_major_version.go b/vendor/github.com/cockroachdb/pebble/v2/format_major_version.go similarity index 53% rename from vendor/github.com/cockroachdb/pebble/format_major_version.go rename to vendor/github.com/cockroachdb/pebble/v2/format_major_version.go index 45f1bc9..0ea5dc7 100644 --- a/vendor/github.com/cockroachdb/pebble/format_major_version.go +++ b/vendor/github.com/cockroachdb/pebble/v2/format_major_version.go @@ -9,11 +9,11 @@ import ( "strconv" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/manifest" - "github.com/cockroachdb/pebble/sstable" - "github.com/cockroachdb/pebble/vfs" - "github.com/cockroachdb/pebble/vfs/atomicfs" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/objstorage/remote" + "github.com/cockroachdb/pebble/v2/sstable" + "github.com/cockroachdb/pebble/v2/vfs" + "github.com/cockroachdb/pebble/v2/vfs/atomicfs" ) // FormatMajorVersion is a constant controlling the format of persisted @@ -43,15 +43,18 @@ func (v FormatMajorVersion) String() string { } const ( + // FormatDefault leaves the format version unspecified. When used to create a + // new store, Pebble will choose the earliest format version it supports. + FormatDefault FormatMajorVersion = iota + // 21.2 versions. - // FormatDefault leaves the format version unspecified. The - // FormatDefault constant may be ratcheted upwards over time. - FormatDefault FormatMajorVersion = iota // FormatMostCompatible maintains the most backwards compatibility, // maintaining bi-directional compatibility with RocksDB 6.2.1 in // the particular configuration described in the Pebble README. - FormatMostCompatible + // Deprecated. 
+ _ // FormatMostCompatible + // formatVersionedManifestMarker is the first // backwards-incompatible change made to Pebble, introducing the // format-version marker file for handling backwards-incompatible @@ -63,28 +66,36 @@ const ( // format major version. Clients should use FormatVersioned which // also ensures earlier versions of Pebble fail to open a database // written in a future format major version. - formatVersionedManifestMarker + // Deprecated. + _ // formatVersionedManifestMarker + // FormatVersioned is a new format major version that replaces the // old `CURRENT` file with a new 'marker' file scheme. Previous // Pebble versions will be unable to open the database unless // they're aware of format versions. - FormatVersioned + // Deprecated. + _ // FormatVersioned + // FormatSetWithDelete is a format major version that introduces a new key // kind, base.InternalKeyKindSetWithDelete. Previous Pebble versions will be // unable to open this database. - FormatSetWithDelete + // Deprecated. + _ // FormatSetWithDelete // 22.1 versions. // FormatBlockPropertyCollector is a format major version that introduces // BlockPropertyCollectors. - FormatBlockPropertyCollector + // Deprecated. + _ // FormatBlockPropertyCollector + // FormatSplitUserKeysMarked is a format major version that guarantees that // all files that share user keys with neighbors are marked for compaction // in the manifest. Ratcheting to FormatSplitUserKeysMarked will block // (without holding mutexes) until the scan of the LSM is complete and the // manifest has been rotated. - FormatSplitUserKeysMarked + // Deprecated. + _ // FormatSplitUserKeysMarked // 22.2 versions. @@ -95,21 +106,28 @@ const ( // across multiple files within a level L1+. Ratcheting to this format version // will block (without holding mutexes) until all necessary compactions for // files marked for compaction are complete. - FormatSplitUserKeysMarkedCompacted + // Deprecated. 
+ _ // FormatSplitUserKeysMarkedCompacted + // FormatRangeKeys is a format major version that introduces range keys. - FormatRangeKeys + // Deprecated. + _ // FormatRangeKeys + // FormatMinTableFormatPebblev1 is a format major version that guarantees that // tables created by or ingested into the DB at or above this format major // version will have a table format version of at least Pebblev1 (Block // Properties). - FormatMinTableFormatPebblev1 + // Deprecated. + _ // FormatMinTableFormatPebblev1 + // FormatPrePebblev1Marked is a format major version that guarantees that all // sstables with a table format version pre-Pebblev1 (i.e. those that are // guaranteed to not contain block properties) are marked for compaction in // the manifest. Ratcheting to FormatPrePebblev1Marked will block (without // holding mutexes) until the scan of the LSM is complete and the manifest has // been rotated. - FormatPrePebblev1Marked + // Deprecated. + _ // FormatPrePebblev1Marked // 23.1 versions. @@ -118,21 +136,13 @@ const ( // release. It was later decided that this should be deferred until a // subsequent release. The original ordering is preserved so as not to // introduce breaking changes in Cockroach. - formatUnusedPrePebblev1MarkedCompacted + _ // formatUnusedPrePebblev1MarkedCompacted // FormatSSTableValueBlocks is a format major version that adds support for // storing values in value blocks in the sstable. Value block support is not // necessarily enabled when writing sstables, when running with this format // major version. - // - // WARNING: In development, so no production code should upgrade to this - // format, since a DB with this format major version will not actually - // interoperate correctly with another DB with the same format major - // version. This format major version is introduced so that tests can start - // being executed up to this version. 
Note that these tests succeed despite - // the incomplete support since they do not enable value blocks and use - // TableFormatPebblev2. - FormatSSTableValueBlocks + _ // FormatSSTableValueBlocks // FormatFlushableIngest is a format major version that enables lazy // addition of ingested sstables into the LSM structure. When an ingest @@ -169,32 +179,105 @@ const ( // a format major version. FormatVirtualSSTables - // internalFormatNewest holds the newest format major version, including - // experimental ones excluded from the exported FormatNewest constant until - // they've stabilized. Used in tests. - internalFormatNewest FormatMajorVersion = iota - 1 + // FormatSyntheticPrefixSuffix is a format major version that adds support for + // sstables to have their content exposed in a different prefix or suffix of + // keyspace than the actual prefix/suffix persisted in the keys in such + // sstables. The prefix and suffix replacement information is stored in new + // fields in the Manifest and thus requires a format major version. + FormatSyntheticPrefixSuffix + + // FormatFlushableIngestExcises is a format major version that adds support for + // having excises unconditionally being written as flushable ingestions. This + // is implemented through adding a new key kind that can go in the same batches + // as flushable ingested sstables. + FormatFlushableIngestExcises + + // FormatColumnarBlocks is a format major version enabling use of the + // TableFormatPebblev5 table format, that encodes sstable data blocks, index + // blocks and keyspan blocks by organizing the KVs into columns within the + // block. + FormatColumnarBlocks + + // FormatWALSyncChunks is a format major version enabling the writing of + // WAL sync chunks. These new chunks are used to disambiguate between corruption + // and logical EOF during WAL replay. 
This is implemented by adding a new + // chunk wire format that encodes an additional "Synced Offset" field which acts + // as a commitment that the WAL should have been synced up until the offset. + FormatWALSyncChunks + + // FormatTableFormatV6 is a format major version enabling the sstable table + // format TableFormatPebblev6. + // + // The TableFormatPebblev6 sstable format introduces a checksum within the + // sstable footer, allows inclusion of blob handle references within the + // value column of a sstable block, and supports columnar meta index + + // properties blocks. + // + // This format major version does not yet enable use of value separation. + FormatTableFormatV6 - // FormatNewest always contains the most recent format major version. - FormatNewest FormatMajorVersion = internalFormatNewest + // formatDeprecatedExperimentalValueSeparation was used to enable an + // experimental version of value separation, separating values into external + // blob files that do not participate in every compaction. + // + // Value separation now depends on TableFormatPebblev7 which this format + // major version precedes. This format major version is deprecated and + // unexported, and value separation now requires FormatValueSeparation. + formatDeprecatedExperimentalValueSeparation + + // formatFooterAttributes is a format major version that adds support for + // writing sstable.Attributes in the footer of sstables. + formatFooterAttributes + + // FormatValueSeparation is a format major version that adds support for + // value separation, separating values into external blob files that do not + // participate in every compaction. + FormatValueSeparation + + // -- Add new versions here -- + + // FormatNewest is the most recent format major version. + FormatNewest FormatMajorVersion = iota - 1 + + // Experimental versions, which are excluded by FormatNewest (but can be used + // in tests) can be defined here. 
+ + // -- Add experimental versions here -- + + // internalFormatNewest is the most recent, possibly experimental format major + // version. + internalFormatNewest FormatMajorVersion = iota - 2 ) +// FormatMinSupported is the minimum format version that is supported by this +// Pebble version. +const FormatMinSupported = FormatFlushableIngest + +// FormatMinForSharedObjects it the minimum format version that supports shared +// objects (see CreateOnShared option). +const FormatMinForSharedObjects = FormatVirtualSSTables + +// IsSupported returns true if the version is supported by the current Pebble +// version. +func (v FormatMajorVersion) IsSupported() bool { + return v == FormatDefault && v >= FormatMinSupported && v <= internalFormatNewest +} + // MaxTableFormat returns the maximum sstable.TableFormat that can be used at // this FormatMajorVersion. func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat { switch v { - case FormatDefault, FormatMostCompatible, formatVersionedManifestMarker, - FormatVersioned, FormatSetWithDelete: - return sstable.TableFormatRocksDBv2 - case FormatBlockPropertyCollector, FormatSplitUserKeysMarked, - FormatSplitUserKeysMarkedCompacted: - return sstable.TableFormatPebblev1 - case FormatRangeKeys, FormatMinTableFormatPebblev1, FormatPrePebblev1Marked, - formatUnusedPrePebblev1MarkedCompacted: - return sstable.TableFormatPebblev2 - case FormatSSTableValueBlocks, FormatFlushableIngest, FormatPrePebblev1MarkedCompacted: + case FormatDefault, FormatFlushableIngest, FormatPrePebblev1MarkedCompacted: return sstable.TableFormatPebblev3 - case FormatDeleteSizedAndObsolete, FormatVirtualSSTables: + case FormatDeleteSizedAndObsolete, FormatVirtualSSTables, FormatSyntheticPrefixSuffix, + FormatFlushableIngestExcises: return sstable.TableFormatPebblev4 + case FormatColumnarBlocks, FormatWALSyncChunks: + return sstable.TableFormatPebblev5 + case FormatTableFormatV6, formatDeprecatedExperimentalValueSeparation: + return 
sstable.TableFormatPebblev6 + case formatFooterAttributes, FormatValueSeparation: + return sstable.TableFormatPebblev7 default: panic(fmt.Sprintf("pebble: unsupported format major version: %s", v)) } @@ -204,33 +287,17 @@ func (v FormatMajorVersion) MaxTableFormat() sstable.TableFormat { // this FormatMajorVersion. func (v FormatMajorVersion) MinTableFormat() sstable.TableFormat { switch v { - case FormatDefault, FormatMostCompatible, formatVersionedManifestMarker, - FormatVersioned, FormatSetWithDelete, FormatBlockPropertyCollector, - FormatSplitUserKeysMarked, FormatSplitUserKeysMarkedCompacted, - FormatRangeKeys: - return sstable.TableFormatLevelDB - case FormatMinTableFormatPebblev1, FormatPrePebblev1Marked, - formatUnusedPrePebblev1MarkedCompacted, FormatSSTableValueBlocks, - FormatFlushableIngest, FormatPrePebblev1MarkedCompacted, - FormatDeleteSizedAndObsolete, FormatVirtualSSTables: + case FormatDefault, FormatFlushableIngest, FormatPrePebblev1MarkedCompacted, + FormatDeleteSizedAndObsolete, FormatVirtualSSTables, FormatSyntheticPrefixSuffix, + FormatFlushableIngestExcises, FormatColumnarBlocks, FormatWALSyncChunks, + FormatTableFormatV6, formatDeprecatedExperimentalValueSeparation, formatFooterAttributes, + FormatValueSeparation: return sstable.TableFormatPebblev1 default: panic(fmt.Sprintf("pebble: unsupported format major version: %s", v)) } } -// orderingInvariants returns an enum encoding the set of invariants that must -// hold within the receiver format major version. Invariants only get stricter -// as the format major version advances, so it is okay to retrieve the -// invariants from the current format major version and by the time the -// invariants are enforced, the format major version has advanced. 
-func (v FormatMajorVersion) orderingInvariants() manifest.OrderingInvariants { - if v < FormatSplitUserKeysMarkedCompacted { - return manifest.AllowSplitUserKeys - } - return manifest.ProhibitSplitUserKeys -} - // formatMajorVersionMigrations defines the migrations from one format // major version to the next. Each migration is defined as a closure // which will be invoked on the database before the new format major @@ -242,140 +309,65 @@ func (v FormatMajorVersion) orderingInvariants() manifest.OrderingInvariants { // panic if a migration returns a nil error but fails to finalize the // new format major version. var formatMajorVersionMigrations = map[FormatMajorVersion]func(*DB) error{ - FormatMostCompatible: func(d *DB) error { return nil }, - formatVersionedManifestMarker: func(d *DB) error { - // formatVersionedManifestMarker introduces the use of a marker - // file for pointing to the current MANIFEST file. - - // Lock the manifest. - d.mu.versions.logLock() - defer d.mu.versions.logUnlock() - - // Construct the filename of the currently active manifest and - // move the manifest marker to that filename. The marker is - // guaranteed to exist, because we unconditionally locate it - // during Open. - manifestFileNum := d.mu.versions.manifestFileNum - filename := base.MakeFilename(fileTypeManifest, manifestFileNum.DiskFileNum()) - if err := d.mu.versions.manifestMarker.Move(filename); err != nil { - return errors.Wrap(err, "moving manifest marker") - } - - // Now that we have a manifest marker file in place and pointing - // to the current MANIFEST, finalize the upgrade. If we fail for - // some reason, a retry of this migration is guaranteed to again - // move the manifest marker file to the latest manifest. If - // we're unable to finalize the upgrade, a subsequent call to - // Open will ignore the manifest marker. - if err := d.finalizeFormatVersUpgrade(formatVersionedManifestMarker); err != nil { - return err - } - - // We've finalized the upgrade. 
All subsequent Open calls will - // ignore the CURRENT file and instead read the manifest marker. - // Before we unlock the manifest, we need to update versionSet - // to use the manifest marker on future rotations. - d.mu.versions.setCurrent = setCurrentFuncMarker( - d.mu.versions.manifestMarker, - d.mu.versions.fs, - d.mu.versions.dirname) - return nil - }, - // The FormatVersioned version is split into two, each with their - // own migration to ensure the post-migration cleanup happens even - // if there's a crash immediately after finalizing the version. Once - // a new format major version is finalized, its migration will never - // run again. Post-migration cleanup like the one in the migration - // below must be performed in a separate migration or every time the - // database opens. - FormatVersioned: func(d *DB) error { - // Replace the `CURRENT` file with one that points to the - // nonexistent `MANIFEST-000000` file. If an earlier Pebble - // version that does not know about format major versions - // attempts to open the database, it will error avoiding - // accidental corruption. - if err := setCurrentFile(d.mu.versions.dirname, d.mu.versions.fs, base.FileNum(0).DiskFileNum()); err != nil { - return err - } - return d.finalizeFormatVersUpgrade(FormatVersioned) - }, - // As SetWithDelete is a new key kind, there is nothing to migrate. We can - // simply finalize the format version and we're done. - FormatSetWithDelete: func(d *DB) error { - return d.finalizeFormatVersUpgrade(FormatSetWithDelete) - }, - FormatBlockPropertyCollector: func(d *DB) error { - return d.finalizeFormatVersUpgrade(FormatBlockPropertyCollector) - }, - FormatSplitUserKeysMarked: func(d *DB) error { - // Mark any unmarked files with split-user keys. Note all format major - // versions migrations are invoked with DB.mu locked. 
- if err := d.markFilesLocked(markFilesWithSplitUserKeys(d.opts.Comparer.Equal)); err != nil { - return err - } - return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarked) - }, - FormatSplitUserKeysMarkedCompacted: func(d *DB) error { + FormatFlushableIngest: func(d *DB) error { return nil }, + FormatPrePebblev1MarkedCompacted: func(d *DB) error { // Before finalizing the format major version, rewrite any sstables // still marked for compaction. Note all format major versions // migrations are invoked with DB.mu locked. if err := d.compactMarkedFilesLocked(); err != nil { return err } - return d.finalizeFormatVersUpgrade(FormatSplitUserKeysMarkedCompacted) + return d.finalizeFormatVersUpgrade(FormatPrePebblev1MarkedCompacted) }, - FormatRangeKeys: func(d *DB) error { - return d.finalizeFormatVersUpgrade(FormatRangeKeys) + FormatDeleteSizedAndObsolete: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatDeleteSizedAndObsolete) }, - FormatMinTableFormatPebblev1: func(d *DB) error { - return d.finalizeFormatVersUpgrade(FormatMinTableFormatPebblev1) + FormatVirtualSSTables: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatVirtualSSTables) }, - FormatPrePebblev1Marked: func(d *DB) error { - // Mark any unmarked files that contain only table properties. Note all - // format major versions migrations are invoked with DB.mu locked. - if err := d.markFilesLocked(markFilesPrePebblev1(d.tableCache)); err != nil { - return err - } - return d.finalizeFormatVersUpgrade(FormatPrePebblev1Marked) + FormatSyntheticPrefixSuffix: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatSyntheticPrefixSuffix) }, - formatUnusedPrePebblev1MarkedCompacted: func(d *DB) error { - // Intentional no-op. 
- return d.finalizeFormatVersUpgrade(formatUnusedPrePebblev1MarkedCompacted) + FormatFlushableIngestExcises: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatFlushableIngestExcises) }, - FormatSSTableValueBlocks: func(d *DB) error { - return d.finalizeFormatVersUpgrade(FormatSSTableValueBlocks) + FormatColumnarBlocks: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatColumnarBlocks) }, - FormatFlushableIngest: func(d *DB) error { - return d.finalizeFormatVersUpgrade(FormatFlushableIngest) + FormatWALSyncChunks: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatWALSyncChunks) }, - FormatPrePebblev1MarkedCompacted: func(d *DB) error { - // Before finalizing the format major version, rewrite any sstables - // still marked for compaction. Note all format major versions - // migrations are invoked with DB.mu locked. - if err := d.compactMarkedFilesLocked(); err != nil { - return err - } - return d.finalizeFormatVersUpgrade(FormatPrePebblev1MarkedCompacted) + FormatTableFormatV6: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatTableFormatV6) }, - FormatDeleteSizedAndObsolete: func(d *DB) error { - return d.finalizeFormatVersUpgrade(FormatDeleteSizedAndObsolete) + formatDeprecatedExperimentalValueSeparation: func(d *DB) error { + return d.finalizeFormatVersUpgrade(formatDeprecatedExperimentalValueSeparation) }, - FormatVirtualSSTables: func(d *DB) error { - return d.finalizeFormatVersUpgrade(FormatVirtualSSTables) + formatFooterAttributes: func(d *DB) error { + return d.finalizeFormatVersUpgrade(formatFooterAttributes) + }, + FormatValueSeparation: func(d *DB) error { + return d.finalizeFormatVersUpgrade(FormatValueSeparation) }, } const formatVersionMarkerName = `format-version` +// lookupFormatMajorVersion retrieves the format version from the format version +// marker file. +// +// If such a file does not exist, returns FormatDefault. 
Note that this case is +// only acceptable if we are creating a new store (we no longer support +// FormatMostCompatible which is the only one with no version marker file). func lookupFormatMajorVersion( - fs vfs.FS, dirname string, + fs vfs.FS, dirname string, ls []string, ) (FormatMajorVersion, *atomicfs.Marker, error) { - m, versString, err := atomicfs.LocateMarker(fs, dirname, formatVersionMarkerName) + m, versString, err := atomicfs.LocateMarkerInListing(fs, dirname, formatVersionMarkerName, ls) if err != nil { return 0, nil, err } if versString == "" { - return FormatMostCompatible, m, nil + return FormatDefault, m, nil } v, err := strconv.ParseUint(versString, 10, 64) if err != nil { @@ -386,7 +378,10 @@ func lookupFormatMajorVersion( return 0, nil, errors.Newf("pebble: default format major version should not persisted", vers) } if vers > internalFormatNewest { - return 0, nil, errors.Newf("pebble: database %q written in format major version %d", dirname, vers) + return 0, nil, errors.Newf("pebble: database %q written in unknown format major version %d", dirname, vers) + } + if vers < FormatMinSupported { + return 0, nil, errors.Newf("pebble: database %q written in format major version %d which is no longer supported", dirname, vers) } return vers, m, nil } @@ -399,6 +394,39 @@ func (d *DB) FormatMajorVersion() FormatMajorVersion { return FormatMajorVersion(d.mu.formatVers.vers.Load()) } +// TableFormat returns the TableFormat that the database is currently using when +// writing sstables. The table format is determined by the database's format +// major version, as well as experimental settings like EnableValueBlocks and +// EnableColumnarBlocks. +func (d *DB) TableFormat() sstable.TableFormat { + // The table is typically written at the maximum allowable format implied by + // the current format major version of the DB. 
+ f := d.FormatMajorVersion().MaxTableFormat() + switch f { + case sstable.TableFormatPebblev3: + // In format major versions with maximum table formats of Pebblev3, + // value blocks were conditional on an experimental setting. In format + // major versions with maximum table formats of Pebblev4 and higher, + // value blocks are always enabled. + if d.opts.Experimental.EnableValueBlocks == nil || !d.opts.Experimental.EnableValueBlocks() { + f = sstable.TableFormatPebblev2 + } + default: + if f.BlockColumnar() && (d.opts.Experimental.EnableColumnarBlocks == nil || + !d.opts.Experimental.EnableColumnarBlocks()) { + f = sstable.TableFormatPebblev4 + } + } + return f +} + +// shouldCreateShared returns true if the database should use shared objects +// when creating new objects on the given level. +func (d *DB) shouldCreateShared(targetLevel int) bool { + return remote.ShouldCreateShared(d.opts.Experimental.CreateOnShared, targetLevel) && + d.FormatMajorVersion() >= FormatMinForSharedObjects +} + // RatchetFormatMajorVersion ratchets the opened database's format major // version to the provided version. It errors if the provided format // major version is below the database's current version. Once a @@ -456,11 +484,7 @@ func (d *DB) ratchetFormatMajorVersionLocked(formatVers FormatMajorVersion) erro // // See formatMajorVersionMigrations. func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error { - // We use the marker to encode the active format version in the - // marker filename. Unlike other uses of the atomic marker, there is - // no file with the filename `formatVers.String()` on the - // filesystem. 
- if err := d.mu.formatVers.marker.Move(formatVers.String()); err != nil { + if err := d.writeFormatVersionMarker(formatVers); err != nil { return err } d.mu.formatVers.vers.Store(uint64(formatVers)) @@ -468,6 +492,14 @@ func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error { return nil } +func (d *DB) writeFormatVersionMarker(formatVers FormatMajorVersion) error { + // We use the marker to encode the active format version in the + // marker filename. Unlike other uses of the atomic marker, there is + // no file with the filename `formatVers.String()` on the + // filesystem. + return d.mu.formatVers.marker.Move(formatVers.String()) +} + // compactMarkedFilesLocked performs a migration that schedules rewrite // compactions to compact away any sstables marked for compaction. // compactMarkedFilesLocked is run while ratcheting the database's format major @@ -478,19 +510,23 @@ func (d *DB) finalizeFormatVersUpgrade(formatVers FormatMajorVersion) error { // waiting for compactions to complete (or for slots to free up). func (d *DB) compactMarkedFilesLocked() error { curr := d.mu.versions.currentVersion() + if curr.Stats.MarkedForCompaction == 0 { + return nil + } + // Attempt to schedule a compaction to rewrite a file marked for compaction. + // We simply call maybeScheduleCompaction since it also picks rewrite + // compactions. Note that we don't need to call this repeatedly in the for + // loop below since the completion of a compaction either starts a new one + // or ensures a compaction is queued for scheduling. By calling + // maybeScheduleCompaction here we are simply kicking off this behavior. + d.maybeScheduleCompaction() + + // The above attempt might succeed and schedule a rewrite compaction. Or + // there might not be available compaction concurrency to schedule the + // compaction. Or compaction of the file might have already been in + // progress. In any scenario, wait until there's some change in the + // state of active compactions. 
for curr.Stats.MarkedForCompaction > 0 { - // Attempt to schedule a compaction to rewrite a file marked for - // compaction. - d.maybeScheduleCompactionPicker(func(picker compactionPicker, env compactionEnv) *pickedCompaction { - return picker.pickRewriteCompaction(env) - }) - - // The above attempt might succeed and schedule a rewrite compaction. Or - // there might not be available compaction concurrency to schedule the - // compaction. Or compaction of the file might have already been in - // progress. In any scenario, wait until there's some change in the - // state of active compactions. - // Before waiting, check that the database hasn't been closed. Trying to // schedule the compaction may have dropped d.mu while waiting for a // manifest write to complete. In that dropped interim, the database may @@ -507,9 +543,10 @@ func (d *DB) compactMarkedFilesLocked() error { // Only wait on compactions if there are files still marked for compaction. // NB: Waiting on this condition variable drops d.mu while blocked. if curr.Stats.MarkedForCompaction > 0 { - if d.mu.compact.compactingCount == 0 { - panic("expected a compaction of marked files in progress") - } + // NB: we cannot assert that d.mu.compact.compactingCount > 0, since + // with a CompactionScheduler a DB may not have even one ongoing + // compaction (if other competing activities are being preferred by the + // scheduler). d.mu.compact.cond.Wait() // Refresh the current version again. curr = d.mu.versions.currentVersion() @@ -521,79 +558,16 @@ func (d *DB) compactMarkedFilesLocked() error { // findFilesFunc scans the LSM for files, returning true if at least one // file was found. The returned array contains the matched files, if any, per // level. -type findFilesFunc func(v *version) (found bool, files [numLevels][]*fileMetadata, _ error) - -// markFilesWithSplitUserKeys scans the LSM's levels 1 through 6 for adjacent -// files that contain the same user key. 
Such arrangements of files were -// permitted in RocksDB and in Pebble up to SHA a860bbad. -var markFilesWithSplitUserKeys = func(equal Equal) findFilesFunc { - return func(v *version) (found bool, files [numLevels][]*fileMetadata, _ error) { - // Files with split user keys are expected to be rare and performing key - // comparisons for every file within the LSM is expensive, so drop the - // database lock while scanning the file metadata. - for l := numLevels - 1; l > 0; l-- { - iter := v.Levels[l].Iter() - var prevFile *fileMetadata - var prevUserKey []byte - for f := iter.First(); f != nil; f = iter.Next() { - if prevUserKey != nil && equal(prevUserKey, f.Smallest.UserKey) { - // NB: We may append a file twice, once as prevFile and once - // as f. That's okay, and handled below. - files[l] = append(files[l], prevFile, f) - found = true - } - if f.Largest.IsExclusiveSentinel() { - prevUserKey = nil - prevFile = nil - } else { - prevUserKey = f.Largest.UserKey - prevFile = f - } - } - } - return - } -} +type findFilesFunc func(v *manifest.Version) (found bool, files [numLevels][]*manifest.TableMetadata, _ error) -// markFilesPrePebblev1 scans the LSM for files that do not support block -// properties (i.e. a table format version pre-Pebblev1). -var markFilesPrePebblev1 = func(tc *tableCacheContainer) findFilesFunc { - return func(v *version) (found bool, files [numLevels][]*fileMetadata, err error) { - for l := numLevels - 1; l > 0; l-- { - iter := v.Levels[l].Iter() - for f := iter.First(); f != nil; f = iter.Next() { - if f.Virtual { - // Any physical sstable which has been virtualized must - // have already undergone this migration, and we don't - // need to worry about the virtual sstable themselves. 
- panic("pebble: unexpected virtual sstable during migration") - } - err = tc.withReader( - f.PhysicalMeta(), func(r *sstable.Reader) error { - tf, err := r.TableFormat() - if err != nil { - return err - } - if tf < sstable.TableFormatPebblev1 { - found = true - files[l] = append(files[l], f) - } - return nil - }) - if err != nil { - return - } - } - } - return - } -} +// This method is not used currently, but it will be useful the next time we need +// to mark files for compaction. +var _ = (*DB)(nil).markFilesLocked -// markFilesLock durably marks the files that match the given findFilesFunc for +// markFilesLocked durably marks the files that match the given findFilesFunc for // compaction. func (d *DB) markFilesLocked(findFn findFilesFunc) error { - jobID := d.mu.nextJobID - d.mu.nextJobID++ + jobID := d.newJobIDLocked() // Acquire a read state to have a view of the LSM and a guarantee that none // of the referenced files will be deleted until we've unreferenced the read @@ -602,7 +576,7 @@ func (d *DB) markFilesLocked(findFn findFilesFunc) error { rs := d.loadReadState() var ( found bool - files [numLevels][]*fileMetadata + files [numLevels][]*manifest.TableMetadata err error ) func() { @@ -632,47 +606,45 @@ func (d *DB) markFilesLocked(findFn findFilesFunc) error { // Lock the manifest for a coherent view of the LSM. The database lock has // been re-acquired by the defer within the above anonymous function. - d.mu.versions.logLock() - vers := d.mu.versions.currentVersion() - for l, filesToMark := range files { - if len(filesToMark) == 0 { - continue - } - for _, f := range filesToMark { - // Ignore files to be marked that have already been compacted or marked. 
- if f.CompactionState == manifest.CompactionStateCompacted || - f.MarkedForCompaction { + return d.mu.versions.UpdateVersionLocked(func() (versionUpdate, error) { + vers := d.mu.versions.currentVersion() + for l, filesToMark := range files { + if len(filesToMark) == 0 { continue } - // Else, mark the file for compaction in this version. - vers.Stats.MarkedForCompaction++ - f.MarkedForCompaction = true + for _, f := range filesToMark { + // Ignore files to be marked that have already been compacted or marked. + if f.CompactionState == manifest.CompactionStateCompacted || + f.MarkedForCompaction { + continue + } + // Else, mark the file for compaction in this version. + vers.Stats.MarkedForCompaction++ + f.MarkedForCompaction = true + } + // The compaction picker uses the markedForCompactionAnnotator to + // quickly find files marked for compaction, or to quickly determine + // that there are no such files marked for compaction within a level. + // A b-tree node may be annotated with an annotation recording that + // there are no files marked for compaction within the node's subtree, + // based on the assumption that it's static. + // + // Since we're marking files for compaction, these b-tree nodes' + // annotations will be out of date. Clear the compaction-picking + // annotation, so that it's recomputed the next time the compaction + // picker looks for a file marked for compaction. + markedForCompactionAnnotator.InvalidateLevelAnnotation(vers.Levels[l]) } - // The compaction picker uses the markedForCompactionAnnotator to - // quickly find files marked for compaction, or to quickly determine - // that there are no such files marked for compaction within a level. - // A b-tree node may be annotated with an annotation recording that - // there are no files marked for compaction within the node's subtree, - // based on the assumption that it's static. - // - // Since we're marking files for compaction, these b-tree nodes' - // annotations will be out of date. 
Clear the compaction-picking - // annotation, so that it's recomputed the next time the compaction - // picker looks for a file marked for compaction. - vers.Levels[l].InvalidateAnnotation(markedForCompactionAnnotator{}) - } - - // The 'marked-for-compaction' bit is persisted in the MANIFEST file - // metadata. We've already modified the in-memory file metadata, but the - // manifest hasn't been updated. Force rotation to a new MANIFEST file, - // which will write every file metadata to the new manifest file and ensure - // that the now marked-for-compaction file metadata are persisted as marked. - // NB: This call to logAndApply will unlockthe MANIFEST, which we locked up - // above before obtaining `vers`. - return d.mu.versions.logAndApply( - jobID, - &manifest.VersionEdit{}, - map[int]*LevelMetrics{}, - true, /* forceRotation */ - func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) }) + // The 'marked-for-compaction' bit is persisted in the MANIFEST file + // metadata. We've already modified the in-memory table metadata, but the + // manifest hasn't been updated. Force rotation to a new MANIFEST file, + // which will write every table metadata to the new manifest file and ensure + // that the now marked-for-compaction table metadata are persisted as marked. + return versionUpdate{ + VE: &manifest.VersionEdit{}, + JobID: jobID, + ForceManifestRotation: true, + InProgressCompactionsFn: func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) }, + }, nil + }) } diff --git a/vendor/github.com/cockroachdb/pebble/v2/get_iter.go b/vendor/github.com/cockroachdb/pebble/v2/get_iter.go new file mode 100644 index 0000000..371f541 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/get_iter.go @@ -0,0 +1,303 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "context" + "fmt" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +// getIter is an internal iterator used to perform gets. It iterates through +// the values for a particular key, level by level. It is not a general purpose +// internalIterator, but specialized for Get operations so that it loads data +// lazily. +type getIter struct { + comparer *Comparer + newIters tableNewIters + snapshot base.SeqNum + iterOpts IterOptions + iiopts internalIterOpts + key []byte + prefix []byte + iter internalIterator + level int + batch *Batch + mem flushableList + l0 []manifest.LevelSlice + version *manifest.Version + iterKV *base.InternalKV + // tombstoned and tombstonedSeqNum track whether the key has been deleted by + // a range delete tombstone. The first visible (at getIter.snapshot) range + // deletion encounterd transitions tombstoned to true. The tombstonedSeqNum + // field is updated to hold the sequence number of the tombstone. + tombstoned bool + tombstonedSeqNum base.SeqNum + err error +} + +// TODO(sumeer): CockroachDB code doesn't use getIter, but, for completeness, +// make this implement InternalIteratorWithStats. + +// getIter implements the base.InternalIterator interface. 
+var _ base.InternalIterator = (*getIter)(nil) + +func (g *getIter) String() string { + return fmt.Sprintf("len(l0)=%d, len(mem)=%d, level=%d", len(g.l0), len(g.mem), g.level) +} + +func (g *getIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { + panic("pebble: SeekGE unimplemented") +} + +func (g *getIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV { + return g.SeekPrefixGEStrict(prefix, key, flags) +} + +func (g *getIter) SeekPrefixGEStrict(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV { + panic("pebble: SeekPrefixGE unimplemented") +} + +func (g *getIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { + panic("pebble: SeekLT unimplemented") +} + +func (g *getIter) First() *base.InternalKV { + return g.Next() +} + +func (g *getIter) Last() *base.InternalKV { + panic("pebble: Last unimplemented") +} + +func (g *getIter) Next() *base.InternalKV { + // If g.iter != nil, we're already iterating through a level. Next. Note + // that it's possible the next key within the level is still relevant (eg, + // MERGE keys written in the presence of an LSM snapshot). + // + // NB: We can't perform this Next below, in the for loop, because when we + // open an iterator into the next level, we need to seek to the key. + if g.iter != nil { + g.iterKV = g.iter.Next() + if err := g.iter.Error(); err != nil { + g.err = err + return nil + } + } + + // This for loop finds the next internal key in the LSM that is equal to + // g.key, visible at g.snapshot and not shadowed by a range deletion. If it + // exhausts a level, it initializes iterators for the next level. + for { + if g.iter != nil { + if g.iterKV != nil { + // Check if the current KV pair is deleted by a range deletion. + if g.tombstoned && g.tombstonedSeqNum > g.iterKV.SeqNum() { + // We have a range tombstone covering this key. 
Rather than + // return a point or range deletion here, we return nil and + // close our internal iterator stopping iteration. + g.err = g.iter.Close() + g.iter = nil + return nil + } + + // Is this the correct user key? + if g.comparer.Equal(g.key, g.iterKV.K.UserKey) { + // If the KV pair is not visible at the get's snapshot, + // Next. The level may still contain older keys with the + // same user key that are visible. + if !g.iterKV.Visible(g.snapshot, base.SeqNumMax) { + g.iterKV = g.iter.Next() + continue + } + return g.iterKV + } + } + // We've advanced the iterator passed the desired key. Move on to the + // next memtable / level. + g.err = g.iter.Close() + g.iter = nil + if g.err != nil { + return nil + } + } + // g.iter == nil; we need to initialize the next iterator. + if !g.initializeNextIterator() { + return nil + } + g.iterKV = g.iter.SeekPrefixGE(g.prefix, g.key, base.SeekGEFlagsNone) + } +} + +func (g *getIter) Prev() *base.InternalKV { + panic("pebble: Prev unimplemented") +} + +func (g *getIter) NextPrefix([]byte) *base.InternalKV { + panic("pebble: NextPrefix unimplemented") +} + +func (g *getIter) Error() error { + return g.err +} + +func (g *getIter) Close() error { + if g.iter != nil { + if err := g.iter.Close(); err != nil && g.err == nil { + g.err = err + } + g.iter = nil + } + return g.err +} + +func (g *getIter) SetBounds(lower, upper []byte) { + panic("pebble: SetBounds unimplemented") +} + +func (g *getIter) SetContext(_ context.Context) {} + +// DebugTree is part of the InternalIterator interface. +func (g *getIter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", g, g) + if g.iter != nil { + g.iter.DebugTree(n) + } +} + +func (g *getIter) initializeNextIterator() (ok bool) { + // A batch's keys shadow all other keys, so we visit the batch first. 
+ if g.batch != nil { + if g.batch.index == nil { + g.err = ErrNotIndexed + g.iterKV = nil + return false + } + g.iter = g.batch.newInternalIter(nil) + if !g.maybeSetTombstone(g.batch.newRangeDelIter(nil, + // Get always reads the entirety of the batch's history, so no + // batch keys should be filtered. + base.SeqNumMax, + )) { + return false + } + g.batch = nil + return true + } + + // If we're trying to initialize the next level of the iterator stack but + // have a tombstone from a previous level, it is guaranteed to delete keys + // in lower levels. This key is deleted. + if g.tombstoned { + return false + } + + // Create iterators from memtables from newest to oldest. + if n := len(g.mem); n > 0 { + m := g.mem[n-1] + g.iter = m.newIter(nil) + if !g.maybeSetTombstone(m.newRangeDelIter(nil)) { + return false + } + g.mem = g.mem[:n-1] + return true + } + + // Visit each sublevel of L0 individually, so that we only need to read + // at most one file per sublevel. + if g.level == 0 { + // Create iterators from L0 from newest to oldest. + if n := len(g.l0); n > 0 { + files := g.l0[n-1].Iter() + g.l0 = g.l0[:n-1] + + iter, rangeDelIter, err := g.getSSTableIterators(files, manifest.L0Sublevel(n)) + if err != nil { + g.err = firstError(g.err, err) + return false + } + if !g.maybeSetTombstone(rangeDelIter) { + return false + } + g.iter = iter + return true + } + // We've exhausted all the sublevels of L0. Progress to L1. + g.level++ + } + for g.level < numLevels { + if g.version.Levels[g.level].Empty() { + g.level++ + continue + } + // Open the next level of the LSM. + iter, rangeDelIter, err := g.getSSTableIterators(g.version.Levels[g.level].Iter(), manifest.Level(g.level)) + if err != nil { + g.err = firstError(g.err, err) + return false + } + if !g.maybeSetTombstone(rangeDelIter) { + return false + } + g.level++ + g.iter = iter + return true + } + // We've exhausted all levels of the LSM. 
+ return false +} + +// getSSTableIterators returns a point iterator and a range deletion iterator +// for the sstable in files that overlaps with the key g.key. Pebble does not +// split user keys across adjacent sstables within a level, ensuring that at +// most one sstable overlaps g.key. +func (g *getIter) getSSTableIterators( + files manifest.LevelIterator, level manifest.Layer, +) (internalIterator, keyspan.FragmentIterator, error) { + files = files.Filter(manifest.KeyTypePoint) + m := files.SeekGE(g.comparer.Compare, g.key) + if m == nil { + return emptyIter, nil, nil + } + // m is now positioned at the file containing the first point key ≥ `g.key`. + // Does it exist and possibly contain point keys with the user key 'g.key'? + if m == nil || !m.HasPointKeys || g.comparer.Compare(m.PointKeyBounds.SmallestUserKey(), g.key) > 0 { + return emptyIter, nil, nil + } + // m may possibly contain point (or range deletion) keys relevant to g.key. + g.iterOpts.layer = level + iters, err := g.newIters(context.Background(), m, &g.iterOpts, g.iiopts, iterPointKeys|iterRangeDeletions) + if err != nil { + return emptyIter, nil, err + } + return iters.Point(), iters.RangeDeletion(), nil +} + +// maybeSetTombstone updates g.tombstoned[SeqNum] to reflect the presence of a +// range deletion covering g.key, if there are any. It returns true if +// successful, or false if an error occurred and the caller should abort +// iteration. +func (g *getIter) maybeSetTombstone(rangeDelIter keyspan.FragmentIterator) (ok bool) { + if rangeDelIter == nil { + // Nothing to do. + return true + } + // Find the range deletion that covers the sought key, if any. + t, err := keyspan.Get(g.comparer.Compare, rangeDelIter, g.key) + if err != nil { + g.err = firstError(g.err, err) + return false + } + // Find the most recent visible range deletion's sequence number. We only + // care about the most recent range deletion that's visible because it's the + // "most powerful." 
+ g.tombstonedSeqNum, g.tombstoned = t.LargestVisibleSeqNum(g.snapshot) + rangeDelIter.Close() + return true +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/ingest.go b/vendor/github.com/cockroachdb/pebble/v2/ingest.go new file mode 100644 index 0000000..d692648 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/ingest.go @@ -0,0 +1,2328 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "context" + "fmt" + "slices" + "sort" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/cache" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/overlap" + "github.com/cockroachdb/pebble/v2/internal/sstableinternal" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/remote" + "github.com/cockroachdb/pebble/v2/sstable" + "github.com/cockroachdb/pebble/v2/sstable/block" +) + +func sstableKeyCompare(userCmp Compare, a, b InternalKey) int { + c := userCmp(a.UserKey, b.UserKey) + if c != 0 { + return c + } + if a.IsExclusiveSentinel() { + if !b.IsExclusiveSentinel() { + return -1 + } + } else if b.IsExclusiveSentinel() { + return +1 + } + return 0 +} + +func ingestValidateKey(opts *Options, key *InternalKey) error { + if key.Kind() == InternalKeyKindInvalid { + return base.CorruptionErrorf("pebble: external sstable has corrupted key: %s", + key.Pretty(opts.Comparer.FormatKey)) + } + if key.SeqNum() != 0 { + return base.CorruptionErrorf("pebble: external sstable has non-zero seqnum: %s", + key.Pretty(opts.Comparer.FormatKey)) + } + if err := opts.Comparer.ValidateKey.Validate(key.UserKey); err != nil { + return 
base.CorruptionErrorf("pebble: external sstable has corrupted key: %s, %w", + key.Pretty(opts.Comparer.FormatKey), err) + } + return nil +} + +// ingestSynthesizeShared constructs a fileMetadata for one shared sstable owned +// or shared by another node. +func ingestSynthesizeShared( + opts *Options, sm SharedSSTMeta, tableNum base.TableNum, +) (*manifest.TableMetadata, error) { + if sm.Size == 0 { + // Disallow 0 file sizes + return nil, errors.New("pebble: cannot ingest shared file with size 0") + } + // Don't load table stats. Doing a round trip to shared storage, one SST + // at a time is not worth it as it slows down ingestion. + meta := &manifest.TableMetadata{ + TableNum: tableNum, + CreationTime: time.Now().Unix(), + Virtual: true, + Size: sm.Size, + } + if sm.LargestPointKey.Valid() && sm.LargestPointKey.UserKey != nil { + // Initialize meta.{HasPointKeys,Smallest,Largest}, etc. + // + // NB: We create new internal keys and pass them into ExtendPointKeyBounds + // so that we can sub a zero sequence number into the bounds. We can set + // the sequence number to anything here; it'll be reset in ingestUpdateSeqNum + // anyway. However, we do need to use the same sequence number across all + // bound keys at this step so that we end up with bounds that are consistent + // across point/range keys. + // + // Because of the sequence number rewriting, we cannot use the Kind of + // sm.SmallestPointKey. For example, the original SST might start with + // a.SET.2 and a.RANGEDEL.1 (with a.SET.2 being the smallest key); after + // rewriting the sequence numbers, these keys become a.SET.100 and + // a.RANGEDEL.100, with a.RANGEDEL.100 being the smallest key. To create a + // correct bound, we just use the maximum key kind (which sorts first). + // Similarly, we use the smallest key kind for the largest key. 
+ smallestPointKey := base.MakeInternalKey(sm.SmallestPointKey.UserKey, 0, base.InternalKeyKindMaxForSSTable) + largestPointKey := base.MakeInternalKey(sm.LargestPointKey.UserKey, 0, 0) + if sm.LargestPointKey.IsExclusiveSentinel() { + largestPointKey = base.MakeRangeDeleteSentinelKey(sm.LargestPointKey.UserKey) + } + if opts.Comparer.Equal(smallestPointKey.UserKey, largestPointKey.UserKey) && + smallestPointKey.Trailer < largestPointKey.Trailer { + // We get kinds from the sender, however we substitute our own sequence + // numbers. This can result in cases where an sstable [b#5,SET-b#4,DELSIZED] + // becomes [b#0,SET-b#0,DELSIZED] when we synthesize it here, but the + // kinds need to be reversed now because DelSized > Set. + smallestPointKey, largestPointKey = largestPointKey, smallestPointKey + } + meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallestPointKey, largestPointKey) + } + if sm.LargestRangeKey.Valid() && sm.LargestRangeKey.UserKey != nil { + // Initialize meta.{HasRangeKeys,Smallest,Largest}, etc. + // + // See comment above on why we use a zero sequence number and these key + // kinds here. + smallestRangeKey := base.MakeInternalKey(sm.SmallestRangeKey.UserKey, 0, base.InternalKeyKindRangeKeyMax) + largestRangeKey := base.MakeExclusiveSentinelKey(base.InternalKeyKindRangeKeyMin, sm.LargestRangeKey.UserKey) + meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallestRangeKey, largestRangeKey) + } + + // For simplicity, we use the same number for both the FileNum and the + // DiskFileNum (even though this is a virtual sstable). Pass the underlying + // TableBacking's size to the same size as the virtualized view of the sstable. + // This ensures that we don't over-prioritize this sstable for compaction just + // yet, as we do not have a clear sense of what parts of this sstable are + // referenced by other nodes. 
+ meta.InitVirtualBacking(base.DiskFileNum(tableNum), sm.Size) + + if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil { + return nil, err + } + return meta, nil +} + +// ingestLoad1External loads the fileMetadata for one external sstable. +// Sequence number and target level calculation happens during prepare/apply. +func ingestLoad1External( + opts *Options, e ExternalFile, tableNum base.TableNum, +) (*manifest.TableMetadata, error) { + if e.Size == 0 { + return nil, errors.New("pebble: cannot ingest external file with size 0") + } + if !e.HasRangeKey && !e.HasPointKey { + return nil, errors.New("pebble: cannot ingest external file with no point or range keys") + } + + if opts.Comparer.Compare(e.StartKey, e.EndKey) > 0 { + return nil, errors.Newf("pebble: external file bounds [%q, %q) are invalid", e.StartKey, e.EndKey) + } + if opts.Comparer.Compare(e.StartKey, e.EndKey) == 0 && !e.EndKeyIsInclusive { + return nil, errors.Newf("pebble: external file bounds [%q, %q) are invalid", e.StartKey, e.EndKey) + } + if n := opts.Comparer.Split(e.StartKey); n != len(e.StartKey) { + return nil, errors.Newf("pebble: external file bounds start key %q has suffix", e.StartKey) + } + if n := opts.Comparer.Split(e.EndKey); n != len(e.EndKey) { + return nil, errors.Newf("pebble: external file bounds end key %q has suffix", e.EndKey) + } + + // Don't load table stats. Doing a round trip to shared storage, one SST + // at a time is not worth it as it slows down ingestion. + meta := &manifest.TableMetadata{ + TableNum: tableNum, + CreationTime: time.Now().Unix(), + Size: e.Size, + Virtual: true, + } + // In the name of keeping this ingestion as fast as possible, we avoid *all* + // existence checks and synthesize a table metadata with smallest/largest + // keys that overlap whatever the passed-in span was. 
+	smallestCopy := slices.Clone(e.StartKey)
+	largestCopy := slices.Clone(e.EndKey)
+	if e.HasPointKey {
+		// Sequence numbers are updated later by ingestUpdateSeqNum, which
+		// applies a single sequence number to all keys in the sstable.
+		if e.EndKeyIsInclusive {
+			meta.ExtendPointKeyBounds(
+				opts.Comparer.Compare,
+				base.MakeInternalKey(smallestCopy, 0, base.InternalKeyKindMaxForSSTable),
+				base.MakeInternalKey(largestCopy, 0, 0))
+		} else {
+			meta.ExtendPointKeyBounds(
+				opts.Comparer.Compare,
+				base.MakeInternalKey(smallestCopy, 0, base.InternalKeyKindMaxForSSTable),
+				base.MakeRangeDeleteSentinelKey(largestCopy))
+		}
+	}
+	if e.HasRangeKey {
+		meta.ExtendRangeKeyBounds(
+			opts.Comparer.Compare,
+			base.MakeInternalKey(smallestCopy, 0, InternalKeyKindRangeKeyMax),
+			base.MakeExclusiveSentinelKey(InternalKeyKindRangeKeyMin, largestCopy),
+		)
+	}
+
+	meta.SyntheticPrefixAndSuffix = sstable.MakeSyntheticPrefixAndSuffix(e.SyntheticPrefix, e.SyntheticSuffix)
+
+	return meta, nil
+}
+
+// rangeKeyIngestValidator checks that suffixed range key boundaries
+// defragment cleanly across consecutively ingested sstables.
+type rangeKeyIngestValidator struct {
+	// lastRangeKey is the last range key seen in the previous file.
+	lastRangeKey keyspan.Span
+	// comparer, if unset, disables range key validation.
+	comparer *base.Comparer
+}
+
+// disableRangeKeyChecks returns a validator that performs no range key
+// boundary checks (its comparer is nil, so Validate is a no-op).
+func disableRangeKeyChecks() rangeKeyIngestValidator {
+	return rangeKeyIngestValidator{}
+}
+
+// validateSuffixedBoundaries returns a validator that checks the next file's
+// smallest range key against lastRangeKey, the largest range key of the
+// previous file.
+func validateSuffixedBoundaries(
+	cmp *base.Comparer, lastRangeKey keyspan.Span,
+) rangeKeyIngestValidator {
+	return rangeKeyIngestValidator{
+		lastRangeKey: lastRangeKey,
+		comparer:     cmp,
+	}
+}
+
+// Validate validates whether the stored state of this rangeKeyIngestValidator allows for
+// a file with the given nextFileSmallestKey to be ingested, such that the stored
+// last file's largest range key defragments cleanly with the next file's smallest
+// key if it was suffixed. If a value of nil is passed in for nextFileSmallestKey,
+// that denotes the next file does not have a range key or there is no next file.
+func (r *rangeKeyIngestValidator) Validate(nextFileSmallestKey *keyspan.Span) error {
+	// A nil comparer means range key checks are disabled entirely.
+	if r.comparer == nil {
+		return nil
+	}
+	if r.lastRangeKey.Valid() {
+		if r.comparer.Split.HasSuffix(r.lastRangeKey.End) {
+			if nextFileSmallestKey == nil || !r.comparer.Equal(r.lastRangeKey.End, nextFileSmallestKey.Start) {
+				// The last range key has a suffix, and there is no next range key
+				// abutting it at exactly the same user key, so it cannot
+				// defragment cleanly.
+				return errors.AssertionFailedf("pebble: ingest sstable has suffixed largest range key that does not match the start key of the next sstable: %s",
+					r.comparer.FormatKey(r.lastRangeKey.End))
+			} else if !keyspan.DefragmentInternal.ShouldDefragment(r.comparer.CompareRangeSuffixes, &r.lastRangeKey, nextFileSmallestKey) {
+				// The spans abut at the same user key, but their range key
+				// contents disagree, so they still won't defragment cleanly.
+				return errors.AssertionFailedf("pebble: ingest sstable has suffixed range key that won't defragment with next sstable: %s",
+					r.comparer.FormatKey(r.lastRangeKey.End))
+			}
+		}
+	} else if nextFileSmallestKey != nil && r.comparer.Split.HasSuffix(nextFileSmallestKey.Start) {
+		// There is no previous range key, so a suffixed start key in the next
+		// file has nothing to defragment with.
+		return errors.Newf("pebble: ingest sstable has suffixed range key start that won't defragment: %s",
+			r.comparer.FormatKey(nextFileSmallestKey.Start))
+	}
+	return nil
+}
+
+// ingestLoad1 creates the TableMetadata for one file. This file will be owned
+// by this store.
+//
+// prevLastRangeKey is the last range key from the previous file. It is used to
+// ensure that the range keys defragment cleanly across files. These checks
+// are disabled if disableRangeKeyChecks is true.
+func ingestLoad1( + ctx context.Context, + opts *Options, + fmv FormatMajorVersion, + readable objstorage.Readable, + cacheHandle *cache.Handle, + tableNum base.TableNum, + rangeKeyValidator rangeKeyIngestValidator, +) (meta *manifest.TableMetadata, lastRangeKey keyspan.Span, err error) { + o := opts.MakeReaderOptions() + o.CacheOpts = sstableinternal.CacheOptions{ + CacheHandle: cacheHandle, + FileNum: base.PhysicalTableDiskFileNum(tableNum), + } + r, err := sstable.NewReader(ctx, readable, o) + if err != nil { + return nil, keyspan.Span{}, errors.CombineErrors(err, readable.Close()) + } + defer func() { _ = r.Close() }() + + // Avoid ingesting tables with format versions this DB doesn't support. + tf, err := r.TableFormat() + if err != nil { + return nil, keyspan.Span{}, err + } + if tf < fmv.MinTableFormat() || tf > fmv.MaxTableFormat() { + return nil, keyspan.Span{}, errors.Newf( + "pebble: table format %s is not within range supported at DB format major version %d, (%s,%s)", + tf, fmv, fmv.MinTableFormat(), fmv.MaxTableFormat(), + ) + } + + if r.Attributes.Has(sstable.AttributeBlobValues) { + return nil, keyspan.Span{}, errors.Newf( + "pebble: ingesting tables with blob references is not supported") + } + + props, err := r.ReadPropertiesBlock(ctx, nil /* buffer pool */) + if err != nil { + return nil, keyspan.Span{}, err + } + + // If this is a columnar block, read key schema name from properties block. + if tf.BlockColumnar() { + if _, ok := opts.KeySchemas[props.KeySchemaName]; !ok { + return nil, keyspan.Span{}, errors.Newf( + "pebble: table uses key schema %q unknown to the database", + props.KeySchemaName) + } + } + + meta = &manifest.TableMetadata{} + meta.TableNum = tableNum + meta.Size = max(uint64(readable.Size()), 1) + meta.CreationTime = time.Now().Unix() + meta.InitPhysicalBacking() + + // Avoid loading into the file cache for collecting stats if we + // don't need to. 
If there are no range deletions, we have all the + // information to compute the stats here. + // + // This is helpful in tests for avoiding awkwardness around deletion of + // ingested files from MemFS. MemFS implements the Windows semantics of + // disallowing removal of an open file. Under MemFS, if we don't populate + // meta.Stats here, the file will be loaded into the file cache for + // calculating stats before we can remove the original link. + maybeSetStatsFromProperties(meta.PhysicalMeta(), &props.CommonProperties, opts.Logger) + + { + iter, err := r.NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */, sstable.AssertNoBlobHandles) + if err != nil { + return nil, keyspan.Span{}, err + } + defer func() { _ = iter.Close() }() + var smallest InternalKey + if kv := iter.First(); kv != nil { + if err := ingestValidateKey(opts, &kv.K); err != nil { + return nil, keyspan.Span{}, err + } + smallest = kv.K.Clone() + } + if err := iter.Error(); err != nil { + return nil, keyspan.Span{}, err + } + if kv := iter.Last(); kv != nil { + if err := ingestValidateKey(opts, &kv.K); err != nil { + return nil, keyspan.Span{}, err + } + meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, kv.K.Clone()) + } + if err := iter.Error(); err != nil { + return nil, keyspan.Span{}, err + } + } + + iter, err := r.NewRawRangeDelIter(ctx, sstable.NoFragmentTransforms, sstable.NoReadEnv) + if err != nil { + return nil, keyspan.Span{}, err + } + if iter != nil { + defer iter.Close() + var smallest InternalKey + if s, err := iter.First(); err != nil { + return nil, keyspan.Span{}, err + } else if s != nil { + key := s.SmallestKey() + if err := ingestValidateKey(opts, &key); err != nil { + return nil, keyspan.Span{}, err + } + smallest = key.Clone() + } + if s, err := iter.Last(); err != nil { + return nil, keyspan.Span{}, err + } else if s != nil { + k := s.SmallestKey() + if err := ingestValidateKey(opts, &k); err != nil { + return nil, keyspan.Span{}, err + } + largest := 
s.LargestKey().Clone() + meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest) + } + } + + // Update the range-key bounds for the table. + { + iter, err := r.NewRawRangeKeyIter(ctx, sstable.NoFragmentTransforms, sstable.NoReadEnv) + if err != nil { + return nil, keyspan.Span{}, err + } + if iter != nil { + defer iter.Close() + var smallest InternalKey + if s, err := iter.First(); err != nil { + return nil, keyspan.Span{}, err + } else if s != nil { + key := s.SmallestKey() + if err := ingestValidateKey(opts, &key); err != nil { + return nil, keyspan.Span{}, err + } + smallest = key.Clone() + // Range keys need some additional validation as we need to ensure they + // defragment cleanly with the lastRangeKey from the previous file. + if err := rangeKeyValidator.Validate(s); err != nil { + return nil, keyspan.Span{}, err + } + } + lastRangeKey = keyspan.Span{} + if s, err := iter.Last(); err != nil { + return nil, keyspan.Span{}, err + } else if s != nil { + k := s.SmallestKey() + if err := ingestValidateKey(opts, &k); err != nil { + return nil, keyspan.Span{}, err + } + // As range keys are fragmented, the end key of the last range key in + // the table provides the upper bound for the table. + largest := s.LargestKey().Clone() + meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallest, largest) + lastRangeKey = s.Clone() + } else { + // s == nil. + if err := rangeKeyValidator.Validate(nil /* nextFileSmallestKey */); err != nil { + return nil, keyspan.Span{}, err + } + } + } else { + if err := rangeKeyValidator.Validate(nil /* nextFileSmallestKey */); err != nil { + return nil, keyspan.Span{}, err + } + lastRangeKey = keyspan.Span{} + } + } + + if !meta.HasPointKeys && !meta.HasRangeKeys { + return nil, keyspan.Span{}, nil + } + + // Sanity check that the various bounds on the file were set consistently. 
+ if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil { + return nil, keyspan.Span{}, err + } + + return meta, lastRangeKey, nil +} + +type ingestLoadResult struct { + local []ingestLocalMeta + shared []ingestSharedMeta + external []ingestExternalMeta + + externalFilesHaveLevel bool +} + +type ingestLocalMeta struct { + *manifest.TableMetadata + path string +} + +type ingestSharedMeta struct { + *manifest.TableMetadata + shared SharedSSTMeta +} + +type ingestExternalMeta struct { + *manifest.TableMetadata + external ExternalFile + // usedExistingBacking is true if the external file is reusing a backing + // that existed before this ingestion. In this case, we called + // VirtualBackings.Protect() on that backing; we will need to call + // Unprotect() after the ingestion. + usedExistingBacking bool +} + +func (r *ingestLoadResult) fileCount() int { + return len(r.local) + len(r.shared) + len(r.external) +} + +func ingestLoad( + ctx context.Context, + opts *Options, + fmv FormatMajorVersion, + paths []string, + shared []SharedSSTMeta, + external []ExternalFile, + cacheHandle *cache.Handle, + pending []base.TableNum, +) (ingestLoadResult, error) { + localFileNums := pending[:len(paths)] + sharedFileNums := pending[len(paths) : len(paths)+len(shared)] + externalFileNums := pending[len(paths)+len(shared) : len(paths)+len(shared)+len(external)] + + var result ingestLoadResult + result.local = make([]ingestLocalMeta, 0, len(paths)) + var lastRangeKey keyspan.Span + // NB: we disable range key boundary assertions if we have shared or external files + // present in this ingestion. This is because a suffixed range key in a local file + // can possibly defragment with a suffixed range key in a shared or external file. 
+ // We also disable range key boundary assertions if we have CreateOnShared set to + // true, as that means we could have suffixed RangeKeyDels or Unsets in the local + // files that won't ever be surfaced, even if there are no shared or external files + // in the ingestion. + shouldDisableRangeKeyChecks := len(shared) > 0 || len(external) > 0 || opts.Experimental.CreateOnShared != remote.CreateOnSharedNone + for i := range paths { + f, err := opts.FS.Open(paths[i]) + if err != nil { + return ingestLoadResult{}, err + } + + readable, err := sstable.NewSimpleReadable(f) + if err != nil { + return ingestLoadResult{}, err + } + var m *manifest.TableMetadata + rangeKeyValidator := disableRangeKeyChecks() + if !shouldDisableRangeKeyChecks { + rangeKeyValidator = validateSuffixedBoundaries(opts.Comparer, lastRangeKey) + } + m, lastRangeKey, err = ingestLoad1(ctx, opts, fmv, readable, cacheHandle, localFileNums[i], rangeKeyValidator) + if err != nil { + return ingestLoadResult{}, err + } + if m != nil { + result.local = append(result.local, ingestLocalMeta{ + TableMetadata: m, + path: paths[i], + }) + } + } + + if !shouldDisableRangeKeyChecks { + rangeKeyValidator := validateSuffixedBoundaries(opts.Comparer, lastRangeKey) + if err := rangeKeyValidator.Validate(nil /* nextFileSmallestKey */); err != nil { + return ingestLoadResult{}, err + } + } + + // Sort the shared files according to level. 
+ sort.Sort(sharedByLevel(shared)) + + result.shared = make([]ingestSharedMeta, 0, len(shared)) + for i := range shared { + m, err := ingestSynthesizeShared(opts, shared[i], sharedFileNums[i]) + if err != nil { + return ingestLoadResult{}, err + } + if shared[i].Level < sharedLevelsStart { + return ingestLoadResult{}, errors.New("cannot ingest shared file in level below sharedLevelsStart") + } + result.shared = append(result.shared, ingestSharedMeta{ + TableMetadata: m, + shared: shared[i], + }) + } + result.external = make([]ingestExternalMeta, 0, len(external)) + for i := range external { + m, err := ingestLoad1External(opts, external[i], externalFileNums[i]) + if err != nil { + return ingestLoadResult{}, err + } + result.external = append(result.external, ingestExternalMeta{ + TableMetadata: m, + external: external[i], + }) + if external[i].Level > 0 { + if i != 0 && !result.externalFilesHaveLevel { + return ingestLoadResult{}, base.AssertionFailedf("pebble: external sstables must all have level set or unset") + } + result.externalFilesHaveLevel = true + } else if result.externalFilesHaveLevel { + return ingestLoadResult{}, base.AssertionFailedf("pebble: external sstables must all have level set or unset") + } + } + return result, nil +} + +func ingestSortAndVerify(cmp Compare, lr ingestLoadResult, exciseSpan KeyRange) error { + // Verify that all the shared files (i.e. files in sharedMeta) + // fit within the exciseSpan. 
+ for _, f := range lr.shared { + if !exciseSpan.Contains(cmp, f.Smallest()) || !exciseSpan.Contains(cmp, f.Largest()) { + return errors.Newf("pebble: shared file outside of excise span, span [%s-%s), file = %s", exciseSpan.Start, exciseSpan.End, f.String()) + } + } + + if lr.externalFilesHaveLevel { + for _, f := range lr.external { + if !exciseSpan.Contains(cmp, f.Smallest()) || !exciseSpan.Contains(cmp, f.Largest()) { + return base.AssertionFailedf("pebble: external file outside of excise span, span [%s-%s), file = %s", exciseSpan.Start, exciseSpan.End, f.String()) + } + } + } + + if len(lr.external) > 0 { + if len(lr.shared) > 0 { + // If external files are present alongside shared files, + // return an error. + return base.AssertionFailedf("pebble: external files cannot be ingested atomically alongside shared files") + } + + // Sort according to the smallest key. + slices.SortFunc(lr.external, func(a, b ingestExternalMeta) int { + return cmp(a.Smallest().UserKey, b.Smallest().UserKey) + }) + for i := 1; i < len(lr.external); i++ { + if sstableKeyCompare(cmp, lr.external[i-1].Largest(), lr.external[i].Smallest()) >= 0 { + return errors.Newf("pebble: external sstables have overlapping ranges") + } + } + return nil + } + if len(lr.local) <= 1 { + return nil + } + + // Sort according to the smallest key. 
+ slices.SortFunc(lr.local, func(a, b ingestLocalMeta) int { + return cmp(a.Smallest().UserKey, b.Smallest().UserKey) + }) + + for i := 1; i < len(lr.local); i++ { + if sstableKeyCompare(cmp, lr.local[i-1].Largest(), lr.local[i].Smallest()) >= 0 { + return errors.Newf("pebble: local ingestion sstables have overlapping ranges") + } + } + if len(lr.shared) == 0 { + return nil + } + filesInLevel := make([]*manifest.TableMetadata, 0, len(lr.shared)) + for l := sharedLevelsStart; l < numLevels; l++ { + filesInLevel = filesInLevel[:0] + for i := range lr.shared { + if lr.shared[i].shared.Level == uint8(l) { + filesInLevel = append(filesInLevel, lr.shared[i].TableMetadata) + } + } + for i := range lr.external { + if lr.external[i].external.Level == uint8(l) { + filesInLevel = append(filesInLevel, lr.external[i].TableMetadata) + } + } + slices.SortFunc(filesInLevel, func(a, b *manifest.TableMetadata) int { + return cmp(a.Smallest().UserKey, b.Smallest().UserKey) + }) + for i := 1; i < len(filesInLevel); i++ { + if sstableKeyCompare(cmp, filesInLevel[i-1].Largest(), filesInLevel[i].Smallest()) >= 0 { + return base.AssertionFailedf("pebble: external shared sstables have overlapping ranges") + } + } + } + return nil +} + +func ingestCleanup(objProvider objstorage.Provider, meta []ingestLocalMeta) error { + var firstErr error + for i := range meta { + if err := objProvider.Remove(base.FileTypeTable, meta[i].TableBacking.DiskFileNum); err != nil { + firstErr = firstError(firstErr, err) + } + } + return firstErr +} + +// ingestLinkLocal creates new objects which are backed by either hardlinks to or +// copies of the ingested files. 
+func ingestLinkLocal( + ctx context.Context, + jobID JobID, + opts *Options, + objProvider objstorage.Provider, + localMetas []ingestLocalMeta, +) error { + for i := range localMetas { + objMeta, err := objProvider.LinkOrCopyFromLocal( + ctx, opts.FS, localMetas[i].path, base.FileTypeTable, localMetas[i].TableBacking.DiskFileNum, + objstorage.CreateOptions{PreferSharedStorage: true}, + ) + if err != nil { + if err2 := ingestCleanup(objProvider, localMetas[:i]); err2 != nil { + opts.Logger.Errorf("ingest cleanup failed: %v", err2) + } + return err + } + if opts.EventListener.TableCreated != nil { + opts.EventListener.TableCreated(TableCreateInfo{ + JobID: int(jobID), + Reason: "ingesting", + Path: objProvider.Path(objMeta), + FileNum: base.PhysicalTableDiskFileNum(localMetas[i].TableNum), + }) + } + } + return nil +} + +// ingestAttachRemote attaches remote objects to the storage provider. +// +// For external objects, we reuse existing FileBackings from the current version +// when possible. +// +// ingestUnprotectExternalBackings() must be called after this function (even in +// error cases). +func (d *DB) ingestAttachRemote(jobID JobID, lr ingestLoadResult) error { + remoteObjs := make([]objstorage.RemoteObjectToAttach, 0, len(lr.shared)+len(lr.external)) + for i := range lr.shared { + backing, err := lr.shared[i].shared.Backing.Get() + if err != nil { + return err + } + remoteObjs = append(remoteObjs, objstorage.RemoteObjectToAttach{ + FileNum: lr.shared[i].TableBacking.DiskFileNum, + FileType: base.FileTypeTable, + Backing: backing, + }) + } + + d.findExistingBackingsForExternalObjects(lr.external) + + newTableBackings := make(map[remote.ObjectKey]*manifest.TableBacking, len(lr.external)) + for i := range lr.external { + meta := lr.external[i].TableMetadata + if meta.TableBacking != nil { + // The backing was filled in by findExistingBackingsForExternalObjects(). 
+ continue + } + key := remote.MakeObjectKey(lr.external[i].external.Locator, lr.external[i].external.ObjName) + if backing, ok := newTableBackings[key]; ok { + // We already created the same backing in this loop. Update its size. + backing.Size += lr.external[i].external.Size + meta.AttachVirtualBacking(backing) + continue + } + providerBacking, err := d.objProvider.CreateExternalObjectBacking(key.Locator, key.ObjectName) + if err != nil { + return err + } + // We have to attach the remote object (and assign it a DiskFileNum). For + // simplicity, we use the same number for both the FileNum and the + // DiskFileNum (even though this is a virtual sstable). + size := max(lr.external[i].external.Size, 1) + meta.InitVirtualBacking(base.DiskFileNum(meta.TableNum), size) + + // Set the underlying TableBacking's size to the same size as the virtualized + // view of the sstable. This ensures that we don't over-prioritize this + // sstable for compaction just yet, as we do not have a clear sense of + // what parts of this sstable are referenced by other nodes. + meta.TableBacking.Size = size + newTableBackings[key] = meta.TableBacking + + remoteObjs = append(remoteObjs, objstorage.RemoteObjectToAttach{ + FileNum: meta.TableBacking.DiskFileNum, + FileType: base.FileTypeTable, + Backing: providerBacking, + }) + } + + for i := range lr.external { + if err := lr.external[i].Validate(d.opts.Comparer.Compare, d.opts.Comparer.FormatKey); err != nil { + return err + } + } + + remoteObjMetas, err := d.objProvider.AttachRemoteObjects(remoteObjs) + if err != nil { + return err + } + + for i := range lr.shared { + // One corner case around file sizes we need to be mindful of, is that + // if one of the shareObjs was initially created by us (and has boomeranged + // back from another node), we'll need to update the TableBacking's size + // to be the true underlying size. 
Otherwise, we could hit errors when we + // open the db again after a crash/restart (see checkConsistency in open.go), + // plus it more accurately allows us to prioritize compactions of files + // that were originally created by us. + if remoteObjMetas[i].IsShared() && !d.objProvider.IsSharedForeign(remoteObjMetas[i]) { + size, err := d.objProvider.Size(remoteObjMetas[i]) + if err != nil { + return err + } + lr.shared[i].TableBacking.Size = max(uint64(size), 1) + } + } + + if d.opts.EventListener.TableCreated != nil { + for i := range remoteObjMetas { + d.opts.EventListener.TableCreated(TableCreateInfo{ + JobID: int(jobID), + Reason: "ingesting", + Path: d.objProvider.Path(remoteObjMetas[i]), + FileNum: remoteObjMetas[i].DiskFileNum, + }) + } + } + + return nil +} + +// findExistingBackingsForExternalObjects populates the TableBacking for external +// files which are already in use by the current version. +// +// We take a Ref and LatestRef on populated backings. +func (d *DB) findExistingBackingsForExternalObjects(metas []ingestExternalMeta) { + d.mu.Lock() + defer d.mu.Unlock() + + for i := range metas { + diskFileNums := d.objProvider.GetExternalObjects(metas[i].external.Locator, metas[i].external.ObjName) + // We cross-check against fileBackings in the current version because it is + // possible that the external object is referenced by an sstable which only + // exists in a previous version. In that case, that object could be removed + // at any time so we cannot reuse it. + for _, n := range diskFileNums { + if backing, ok := d.mu.versions.latest.virtualBackings.Get(n); ok { + // Protect this backing from being removed from the latest version. We + // will unprotect in ingestUnprotectExternalBackings. + d.mu.versions.latest.virtualBackings.Protect(n) + metas[i].usedExistingBacking = true + metas[i].AttachVirtualBacking(backing) + + // We can't update the size of the backing here, so make sure the + // virtual size is sane. 
+
+ // TODO(radu): investigate what would it take to update the backing size.
+ metas[i].Size = min(metas[i].Size, backing.Size)
+ break
+ }
+ }
+ }
+}
+
+// ingestUnprotectExternalBackings unprotects the file backings that were reused
+// for external objects when the ingestion fails.
+func (d *DB) ingestUnprotectExternalBackings(lr ingestLoadResult) {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+
+ for _, meta := range lr.external {
+ if meta.usedExistingBacking {
+ // If the backing is not used anywhere else and the ingest failed (or the
+ // ingested tables were already compacted away), this call will cause
+ // the next version update to remove the backing.
+ d.mu.versions.latest.virtualBackings.Unprotect(meta.TableBacking.DiskFileNum)
+ }
+ }
+}
+
+func setSeqNumInMetadata(
+ m *manifest.TableMetadata, seqNum base.SeqNum, cmp Compare, format base.FormatKey,
+) error {
+ setSeqFn := func(k base.InternalKey) base.InternalKey {
+ return base.MakeInternalKey(k.UserKey, seqNum, k.Kind())
+ }
+ // NB: we set the fields directly here, rather than via their Extend*
+ // methods, as we are updating sequence numbers.
+ if m.HasPointKeys {
+ m.PointKeyBounds.SetSmallest(setSeqFn(m.PointKeyBounds.Smallest()))
+ }
+ if m.HasRangeKeys {
+ m.RangeKeyBounds.SetSmallest(setSeqFn(m.RangeKeyBounds.Smallest()))
+ }
+ // Only update the seqnum for the largest key if that key is not an
+ // "exclusive sentinel" (i.e. a range deletion sentinel or a range key
+ // boundary), as doing so effectively drops the exclusive sentinel (by
+ // lowering the seqnum from the max value), and extends the bounds of the
+ // table.
+ // NB: as the largest range key is always an exclusive sentinel, it is never
+ // updated.
+ if m.HasPointKeys && !m.PointKeyBounds.Largest().IsExclusiveSentinel() {
+ m.PointKeyBounds.SetLargest(setSeqFn(m.PointKeyBounds.Largest()))
+ }
+ // Setting smallestSeqNum == largestSeqNum triggers the setting of
+ // Properties.GlobalSeqNum when an sstable is loaded. 
+ m.SmallestSeqNum = seqNum + m.LargestSeqNum = seqNum + m.LargestSeqNumAbsolute = seqNum + // Ensure the new bounds are consistent. + if err := m.Validate(cmp, format); err != nil { + return err + } + return nil +} + +func ingestUpdateSeqNum( + cmp Compare, format base.FormatKey, seqNum base.SeqNum, loadResult ingestLoadResult, +) error { + // Shared sstables are required to be sorted by level ascending. We then + // iterate the shared sstables in reverse, assigning the lower sequence + // numbers to the shared sstables that will be ingested into the lower + // (larger numbered) levels first. This ensures sequence number shadowing is + // correct. + for i := len(loadResult.shared) - 1; i >= 0; i-- { + if i-1 >= 0 && loadResult.shared[i-1].shared.Level > loadResult.shared[i].shared.Level { + panic(errors.AssertionFailedf("shared files %s, %s out of order", loadResult.shared[i-1], loadResult.shared[i])) + } + if err := setSeqNumInMetadata(loadResult.shared[i].TableMetadata, seqNum, cmp, format); err != nil { + return err + } + seqNum++ + } + for i := range loadResult.external { + if err := setSeqNumInMetadata(loadResult.external[i].TableMetadata, seqNum, cmp, format); err != nil { + return err + } + seqNum++ + } + for i := range loadResult.local { + if err := setSeqNumInMetadata(loadResult.local[i].TableMetadata, seqNum, cmp, format); err != nil { + return err + } + seqNum++ + } + return nil +} + +// ingestTargetLevel returns the target level for a file being ingested. +// If suggestSplit is true, it accounts for ingest-time splitting as part of +// its target level calculation, and if a split candidate is found, that file +// is returned as the splitFile. 
+func ingestTargetLevel( + ctx context.Context, + cmp base.Compare, + lsmOverlap overlap.WithLSM, + baseLevel int, + compactions map[compaction]struct{}, + meta *manifest.TableMetadata, + suggestSplit bool, +) (targetLevel int, splitFile *manifest.TableMetadata, err error) { + // Find the lowest level which does not have any files which overlap meta. We + // search from L0 to L6 looking for whether there are any files in the level + // which overlap meta. We want the "lowest" level (where lower means + // increasing level number) in order to reduce write amplification. + // + // There are 2 kinds of overlap we need to check for: file boundary overlap + // and data overlap. Data overlap implies file boundary overlap. Note that it + // is always possible to ingest into L0. + // + // To place meta at level i where i > 0: + // - there must not be any data overlap with levels <= i, since that will + // violate the sequence number invariant. + // - no file boundary overlap with level i, since that will violate the + // invariant that files do not overlap in levels i > 0. + // - if there is only a file overlap at a given level, and no data overlap, + // we can still slot a file at that level. We return the fileMetadata with + // which we have file boundary overlap (must be only one file, as sstable + // bounds are usually tight on user keys) and the caller is expected to split + // that sstable into two virtual sstables, allowing this file to go into that + // level. Note that if we have file boundary overlap with two files, which + // should only happen on rare occasions, we treat it as data overlap and + // don't use this optimization. + // + // The file boundary overlap check is simpler to conceptualize. Consider the + // following example, in which the ingested file lies completely before or + // after the file being considered. 
+ // + // |--| |--| ingested file: [a,b] or [f,g] + // |-----| existing file: [c,e] + // _____________________ + // a b c d e f g + // + // In both cases the ingested file can move to considering the next level. + // + // File boundary overlap does not necessarily imply data overlap. The check + // for data overlap is a little more nuanced. Consider the following examples: + // + // 1. No data overlap: + // + // |-| |--| ingested file: [cc-d] or [ee-ff] + // |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g] + // _____________________ + // a b c d e f g + // + // In this case the ingested files can "fall through" this level. The checks + // continue at the next level. + // + // 2. Data overlap: + // + // |--| ingested file: [d-e] + // |*--*--*----*------*| existing file: [a-g], points: [a, b, c, dd, g] + // _____________________ + // a b c d e f g + // + // In this case the file cannot be ingested into this level as the point 'dd' + // is in the way. + // + // It is worth noting that the check for data overlap is only approximate. In + // the previous example, the ingested table [d-e] could contain only the + // points 'd' and 'e', in which case the table would be eligible for + // considering lower levels. However, such a fine-grained check would need to + // be exhaustive (comparing points and ranges in both the ingested existing + // tables) and such a check is prohibitively expensive. Thus Pebble treats any + // existing point that falls within the ingested table bounds as being "data + // overlap". + + if lsmOverlap[0].Result == overlap.Data { + return 0, nil, nil + } + targetLevel = 0 + splitFile = nil + metaBounds := meta.UserKeyBounds() + for level := baseLevel; level < numLevels; level++ { + var candidateSplitFile *manifest.TableMetadata + switch lsmOverlap[level].Result { + case overlap.Data: + // We cannot ingest into or under this level; return the best target level + // so far. 
+ return targetLevel, splitFile, nil + + case overlap.OnlyBoundary: + if !suggestSplit || lsmOverlap[level].SplitFile == nil { + // We can ingest under this level, but not into this level. + continue + } + // We can ingest into this level if we split this file. + candidateSplitFile = lsmOverlap[level].SplitFile + + case overlap.None: + // We can ingest into this level. + + default: + return 0, nil, base.AssertionFailedf("unexpected WithLevel.Result: %v", lsmOverlap[level].Result) + } + + // Check boundary overlap with any ongoing compactions. We consider an + // overlapping compaction that's writing files to an output level as + // equivalent to boundary overlap with files in that output level. + // + // We cannot check for data overlap with the new SSTs compaction will produce + // since compaction hasn't been done yet. However, there's no need to check + // since all keys in them will be from levels in [c.startLevel, + // c.outputLevel], and all those levels have already had their data overlap + // tested negative (else we'd have returned earlier). + // + // An alternative approach would be to cancel these compactions and proceed + // with an ingest-time split on this level if necessary. However, compaction + // cancellation can result in significant wasted effort and is best avoided + // unless necessary. + overlaps := false + for c := range compactions { + tblCompaction, ok := c.(*tableCompaction) + if !ok { + continue + } + if tblCompaction.outputLevel == nil || level != tblCompaction.outputLevel.level { + continue + } + bounds := tblCompaction.Bounds() + if bounds != nil && metaBounds.Overlaps(cmp, bounds) { + overlaps = true + break + } + } + if !overlaps { + targetLevel = level + splitFile = candidateSplitFile + } + } + return targetLevel, splitFile, nil +} + +// Ingest ingests a set of sstables into the DB. Ingestion of the files is +// atomic and semantically equivalent to creating a single batch containing all +// of the mutations in the sstables. 
Ingestion may require the memtable to be +// flushed. The ingested sstable files are moved into the DB and must reside on +// the same filesystem as the DB. Sstables can be created for ingestion using +// sstable.Writer. On success, Ingest removes the input paths. +// +// Ingested sstables must have been created with a known KeySchema (when written +// with columnar blocks) and Comparer. They must not contain any references to +// external blob files. +// +// Two types of sstables are accepted for ingestion(s): one is sstables present +// in the instance's vfs.FS and can be referenced locally. The other is sstables +// present in remote.Storage, referred to as shared or foreign sstables. These +// shared sstables can be linked through objstorageprovider.Provider, and do not +// need to already be present on the local vfs.FS. Foreign sstables must all fit +// in an excise span, and are destined for a level specified in SharedSSTMeta. +// +// All sstables *must* be Sync()'d by the caller after all bytes are written +// and before its file handle is closed; failure to do so could violate +// durability or lead to corrupted on-disk state. This method cannot, in a +// platform-and-FS-agnostic way, ensure that all sstables in the input are +// properly synced to disk. Opening new file handles and Sync()-ing them +// does not always guarantee durability; see the discussion here on that: +// https://github.com/cockroachdb/pebble/v2/pull/835#issuecomment-663075379 +// +// Ingestion loads each sstable into the lowest level of the LSM which it +// doesn't overlap (see ingestTargetLevel). If an sstable overlaps a memtable, +// ingestion forces the memtable to flush, and then waits for the flush to +// occur. In some cases, such as with no foreign sstables and no excise span, +// ingestion that gets blocked on a memtable can join the flushable queue and +// finish even before the memtable has been flushed. +// +// The steps for ingestion are: +// +// 1. 
Allocate table numbers for every sstable being ingested. +// 2. Load the metadata for all sstables being ingested. +// 3. Sort the sstables by smallest key, verifying non overlap (for local +// sstables). +// 4. Hard link (or copy) the local sstables into the DB directory. +// 5. Allocate a sequence number to use for all of the entries in the +// local sstables. This is the step where overlap with memtables is +// determined. If there is overlap, we remember the most recent memtable +// that overlaps. +// 6. Update the sequence number in the ingested local sstables. (Remote +// sstables get fixed sequence numbers that were determined at load time.) +// 7. Wait for the most recent memtable that overlaps to flush (if any). +// 8. Add the ingested sstables to the version (DB.ingestApply). +// 8.1. If an excise span was specified, figure out what sstables in the +// current version overlap with the excise span, and create new virtual +// sstables out of those sstables that exclude the excised span (DB.excise). +// 9. Publish the ingestion sequence number. +// +// Note that if the mutable memtable overlaps with ingestion, a flush of the +// memtable is forced equivalent to DB.Flush. Additionally, subsequent +// mutations that get sequence numbers larger than the ingestion sequence +// number get queued up behind the ingestion waiting for it to complete. This +// can produce a noticeable hiccup in performance. See +// https://github.com/cockroachdb/pebble/v2/issues/25 for an idea for how to fix +// this hiccup. +func (d *DB) Ingest(ctx context.Context, paths []string) error { + if err := d.closed.Load(); err != nil { + panic(err) + } + if d.opts.ReadOnly { + return ErrReadOnly + } + _, err := d.ingest(ctx, ingestArgs{Local: paths}) + return err +} + +// IngestOperationStats provides some information about where in the LSM the +// bytes were ingested. +type IngestOperationStats struct { + // Bytes is the total bytes in the ingested sstables. 
+ Bytes uint64 + // ApproxIngestedIntoL0Bytes is the approximate number of bytes ingested + // into L0. This value is approximate when flushable ingests are active and + // an ingest overlaps an entry in the flushable queue. Currently, this + // approximation is very rough, only including tables that overlapped the + // memtable. This estimate may be improved with #2112. + ApproxIngestedIntoL0Bytes uint64 + // MemtableOverlappingFiles is the count of ingested sstables + // that overlapped keys in the memtables. + MemtableOverlappingFiles int +} + +// ExternalFile are external sstables that can be referenced through +// objprovider and ingested as remote files that will not be refcounted or +// cleaned up. For use with online restore. Note that the underlying sstable +// could contain keys outside the [Smallest,Largest) bounds; however Pebble +// is expected to only read the keys within those bounds. +type ExternalFile struct { + // Locator is the shared.Locator that can be used with objProvider to + // resolve a reference to this external sstable. + Locator remote.Locator + + // ObjName is the unique name of this sstable on Locator. + ObjName string + + // Size of the referenced proportion of the virtualized sstable. An estimate + // is acceptable in lieu of the backing file size. + Size uint64 + + // StartKey and EndKey define the bounds of the sstable; the ingestion + // of this file will only result in keys within [StartKey, EndKey) if + // EndKeyIsInclusive is false or [StartKey, EndKey] if it is true. + // These bounds are loose i.e. it's possible for keys to not span the + // entirety of this range. + // + // StartKey and EndKey user keys must not have suffixes. + // + // Multiple ExternalFiles in one ingestion must all have non-overlapping + // bounds. + StartKey, EndKey []byte + + // EndKeyIsInclusive is true if EndKey should be treated as inclusive. 
+ EndKeyIsInclusive bool + + // HasPointKey and HasRangeKey denote whether this file contains point keys + // or range keys. If both structs are false, an error is returned during + // ingestion. + HasPointKey, HasRangeKey bool + + // SyntheticPrefix will prepend this suffix to all keys in the file during + // iteration. Note that the backing file itself is not modified. + // + // SyntheticPrefix must be a prefix of both Bounds.Start and Bounds.End. + SyntheticPrefix []byte + + // SyntheticSuffix will replace the suffix of every key in the file during + // iteration. Note that the file itself is not modified, rather, every key + // returned by an iterator will have the synthetic suffix. + // + // SyntheticSuffix can only be used under the following conditions: + // - the synthetic suffix must sort before any non-empty suffixes in the + // backing sst (the entire sst, not just the part restricted to Bounds). + // - the backing sst must not contain multiple keys with the same prefix. + SyntheticSuffix []byte + + // Level denotes the level at which this file was present at read time + // if the external file was returned by a scan of an existing Pebble + // instance. If Level is 0, this field is ignored. + Level uint8 +} + +// IngestWithStats does the same as Ingest, and additionally returns +// IngestOperationStats. +func (d *DB) IngestWithStats(ctx context.Context, paths []string) (IngestOperationStats, error) { + if err := d.closed.Load(); err != nil { + panic(err) + } + if d.opts.ReadOnly { + return IngestOperationStats{}, ErrReadOnly + } + return d.ingest(ctx, ingestArgs{Local: paths}) +} + +// IngestExternalFiles does the same as IngestWithStats, and additionally +// accepts external files (with locator info that can be resolved using +// d.opts.SharedStorage). These files must also be non-overlapping with +// each other, and must be resolvable through d.objProvider. 
+func (d *DB) IngestExternalFiles( + ctx context.Context, external []ExternalFile, +) (IngestOperationStats, error) { + if err := d.closed.Load(); err != nil { + panic(err) + } + + if d.opts.ReadOnly { + return IngestOperationStats{}, ErrReadOnly + } + if d.opts.Experimental.RemoteStorage == nil { + return IngestOperationStats{}, errors.New("pebble: cannot ingest external files without shared storage configured") + } + return d.ingest(ctx, ingestArgs{External: external}) +} + +// IngestAndExcise does the same as IngestWithStats, and additionally accepts a +// list of shared files to ingest that can be read from a remote.Storage through +// a Provider. All the shared files must live within exciseSpan, and any existing +// keys in exciseSpan are deleted by turning existing sstables into virtual +// sstables (if not virtual already) and shrinking their spans to exclude +// exciseSpan. See the comment at Ingest for a more complete picture of the +// ingestion process. +// +// Panics if this DB instance was not instantiated with a remote.Storage and +// shared sstables are present. +func (d *DB) IngestAndExcise( + ctx context.Context, + paths []string, + shared []SharedSSTMeta, + external []ExternalFile, + exciseSpan KeyRange, +) (IngestOperationStats, error) { + if err := d.closed.Load(); err != nil { + panic(err) + } + if d.opts.ReadOnly { + return IngestOperationStats{}, ErrReadOnly + } + // Excise is only supported on prefix keys. 
+ if d.opts.Comparer.Split(exciseSpan.Start) != len(exciseSpan.Start) { + return IngestOperationStats{}, errors.New("IngestAndExcise called with suffixed start key") + } + if d.opts.Comparer.Split(exciseSpan.End) != len(exciseSpan.End) { + return IngestOperationStats{}, errors.New("IngestAndExcise called with suffixed end key") + } + if v := d.FormatMajorVersion(); v < FormatMinForSharedObjects { + return IngestOperationStats{}, errors.Newf( + "store has format major version %d; IngestAndExcise requires at least %d", + v, FormatMinForSharedObjects, + ) + } + args := ingestArgs{ + Local: paths, + Shared: shared, + External: external, + ExciseSpan: exciseSpan, + ExciseBoundsPolicy: tightExciseBounds, + } + return d.ingest(ctx, args) +} + +// Both DB.mu and commitPipeline.mu must be held while this is called. +func (d *DB) newIngestedFlushableEntry( + meta []*manifest.TableMetadata, seqNum base.SeqNum, logNum base.DiskFileNum, exciseSpan KeyRange, +) (*flushableEntry, error) { + // If there's an excise being done atomically with the same ingest, we + // assign the lowest sequence number in the set of sequence numbers for this + // ingestion to the excise. Note that we've already allocated fileCount+1 + // sequence numbers in this case. + // + // This mimics the behaviour in the non-flushable ingest case (see the callsite + // for ingestUpdateSeqNum). + fileSeqNumStart := seqNum + if exciseSpan.Valid() { + fileSeqNumStart = seqNum + 1 // the first seqNum is reserved for the excise. + // The excise span will be retained by the flushable, outliving the + // caller's ingestion call. Copy it. + exciseSpan = KeyRange{ + Start: slices.Clone(exciseSpan.Start), + End: slices.Clone(exciseSpan.End), + } + } + // Update the sequence number for all of the sstables in the + // metadata. Writing the metadata to the manifest when the + // version edit is applied is the mechanism that persists the + // sequence number. The sstables themselves are left unmodified. 
+ // In this case, a version edit will only be written to the manifest + // when the flushable is eventually flushed. If Pebble restarts in that + // time, then we'll lose the ingest sequence number information. But this + // information will also be reconstructed on node restart. + for i, m := range meta { + if err := setSeqNumInMetadata(m, fileSeqNumStart+base.SeqNum(i), d.cmp, d.opts.Comparer.FormatKey); err != nil { + return nil, err + } + } + + f := newIngestedFlushable(meta, d.opts.Comparer, d.newIters, d.tableNewRangeKeyIter, exciseSpan, seqNum) + + // NB: The logNum/seqNum are the WAL number which we're writing this entry + // to and the sequence number within the WAL which we'll write this entry + // to. + entry := d.newFlushableEntry(f, logNum, seqNum) + // The flushable entry starts off with a single reader ref, so increment + // the TableMetadata.Refs. + for _, file := range f.files { + file.Ref() + } + entry.unrefFiles = func(of *manifest.ObsoleteFiles) { + // Invoke Unref on each table. If any files become obsolete, they'll be + // added to the set of obsolete files. + for _, file := range f.files { + file.Unref(of) + } + } + + entry.flushForced = true + entry.releaseMemAccounting = func() {} + return entry, nil +} + +// Both DB.mu and commitPipeline.mu must be held while this is called. Since +// we're holding both locks, the order in which we rotate the memtable or +// recycle the WAL in this function is irrelevant as long as the correct log +// numbers are assigned to the appropriate flushable. +func (d *DB) handleIngestAsFlushable( + meta []*manifest.TableMetadata, seqNum base.SeqNum, exciseSpan KeyRange, +) error { + b := d.NewBatch() + if exciseSpan.Valid() { + b.excise(exciseSpan.Start, exciseSpan.End) + } + for _, m := range meta { + b.ingestSST(m.TableNum) + } + b.setSeqNum(seqNum) + + // If the WAL is disabled, then the logNum used to create the flushable + // entry doesn't matter. 
We just use the logNum assigned to the current + // mutable memtable. If the WAL is enabled, then this logNum will be + // overwritten by the logNum of the log which will contain the log entry + // for the ingestedFlushable. + logNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum + if !d.opts.DisableWAL { + // We create a new WAL for the flushable instead of reusing the end of + // the previous WAL. This simplifies the increment of the minimum + // unflushed log number, and also simplifies WAL replay. + var prevLogSize uint64 + logNum, prevLogSize = d.rotateWAL() + // As the rotator of the WAL, we're responsible for updating the + // previous flushable queue tail's log size. + d.mu.mem.queue[len(d.mu.mem.queue)-1].logSize = prevLogSize + + d.mu.Unlock() + err := d.commit.directWrite(b) + if err != nil { + d.opts.Logger.Fatalf("%v", err) + } + d.mu.Lock() + } + + entry, err := d.newIngestedFlushableEntry(meta, seqNum, logNum, exciseSpan) + if err != nil { + return err + } + nextSeqNum := seqNum + base.SeqNum(b.Count()) + + // Set newLogNum to the logNum of the previous flushable. This value is + // irrelevant if the WAL is disabled. If the WAL is enabled, then we set + // the appropriate value below. + newLogNum := d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum + if !d.opts.DisableWAL { + // newLogNum will be the WAL num of the next mutable memtable which + // comes after the ingestedFlushable in the flushable queue. The mutable + // memtable will be created below. + // + // The prevLogSize returned by rotateWAL is the WAL to which the + // flushable ingest keys were appended. This intermediary WAL is only + // used to record the flushable ingest and nothing else. + newLogNum, entry.logSize = d.rotateWAL() + } + + d.mu.versions.metrics.Ingest.Count++ + currMem := d.mu.mem.mutable + // NB: Placing ingested sstables above the current memtables + // requires rotating of the existing memtables/WAL. 
There is + // some concern of churning through tiny memtables due to + // ingested sstables being placed on top of them, but those + // memtables would have to be flushed anyways. + d.mu.mem.queue = append(d.mu.mem.queue, entry) + d.rotateMemtable(newLogNum, nextSeqNum, currMem, 0 /* minSize */) + d.updateReadStateLocked(d.opts.DebugCheck) + // TODO(aaditya): is this necessary? we call this already in rotateMemtable above + d.maybeScheduleFlush() + return nil +} + +type ingestArgs struct { + // Local sstables to ingest. + Local []string + // Shared sstables to ingest. + Shared []SharedSSTMeta + // External sstables to ingest. + External []ExternalFile + // ExciseSpan (unset if not excising). + ExciseSpan KeyRange + ExciseBoundsPolicy exciseBoundsPolicy +} + +// See comment at Ingest() for details on how this works. +func (d *DB) ingest(ctx context.Context, args ingestArgs) (IngestOperationStats, error) { + paths := args.Local + shared := args.Shared + external := args.External + if len(shared) > 0 && d.opts.Experimental.RemoteStorage == nil { + panic("cannot ingest shared sstables with nil SharedStorage") + } + if (args.ExciseSpan.Valid() || len(shared) > 0 || len(external) > 0) && d.FormatMajorVersion() < FormatVirtualSSTables { + return IngestOperationStats{}, errors.New("pebble: format major version too old for excise, shared or external sstable ingestion") + } + if len(external) > 0 && d.FormatMajorVersion() < FormatSyntheticPrefixSuffix { + for i := range external { + if len(external[i].SyntheticPrefix) > 0 { + return IngestOperationStats{}, errors.New("pebble: format major version too old for synthetic prefix ingestion") + } + if len(external[i].SyntheticSuffix) > 0 { + return IngestOperationStats{}, errors.New("pebble: format major version too old for synthetic suffix ingestion") + } + } + } + // Allocate table numbers for all files being ingested and mark them as + // pending in order to prevent them from being deleted. 
Note that this causes + // the file number ordering to be out of alignment with sequence number + // ordering. The sorting of L0 tables by sequence number avoids relying on + // that (busted) invariant. + pendingOutputs := make([]base.TableNum, len(paths)+len(shared)+len(external)) + for i := 0; i < len(paths)+len(shared)+len(external); i++ { + pendingOutputs[i] = d.mu.versions.getNextTableNum() + } + + jobID := d.newJobID() + + // Load the metadata for all the files being ingested. This step detects + // and elides empty sstables. + loadResult, err := ingestLoad(ctx, d.opts, d.FormatMajorVersion(), paths, shared, external, d.cacheHandle, pendingOutputs) + if err != nil { + return IngestOperationStats{}, err + } + + if loadResult.fileCount() == 0 && !args.ExciseSpan.Valid() { + // All of the sstables to be ingested were empty. Nothing to do. + return IngestOperationStats{}, nil + } + + // Verify the sstables do not overlap. + if err := ingestSortAndVerify(d.cmp, loadResult, args.ExciseSpan); err != nil { + return IngestOperationStats{}, err + } + + // Hard link the sstables into the DB directory. Since the sstables aren't + // referenced by a version, they won't be used. If the hard linking fails + // (e.g. because the files reside on a different filesystem), ingestLinkLocal + // will fall back to copying, and if that fails we undo our work and return an + // error. + if err := ingestLinkLocal(ctx, jobID, d.opts, d.objProvider, loadResult.local); err != nil { + return IngestOperationStats{}, err + } + + err = d.ingestAttachRemote(jobID, loadResult) + defer d.ingestUnprotectExternalBackings(loadResult) + if err != nil { + return IngestOperationStats{}, err + } + + // Make the new tables durable. We need to do this at some point before we + // update the MANIFEST (via UpdateVersionLocked), otherwise a crash can have + // the tables referenced in the MANIFEST, but not present in the provider. 
+ if err := d.objProvider.Sync(); err != nil { + return IngestOperationStats{}, err + } + + // metaFlushableOverlaps is a map indicating which of the ingested sstables + // overlap some table in the flushable queue. It's used to approximate + // ingest-into-L0 stats when using flushable ingests. + metaFlushableOverlaps := make(map[base.TableNum]bool, loadResult.fileCount()) + var mem *flushableEntry + var mut *memTable + // asFlushable indicates whether the sstable was ingested as a flushable. + var asFlushable bool + prepare := func(seqNum base.SeqNum) { + // Note that d.commit.mu is held by commitPipeline when calling prepare. + + // Determine the set of bounds we care about for the purpose of checking + // for overlap among the flushables. If there's an excise span, we need + // to check for overlap with its bounds as well. + overlapBounds := make([]bounded, 0, loadResult.fileCount()+1) + for _, m := range loadResult.local { + overlapBounds = append(overlapBounds, m.TableMetadata) + } + for _, m := range loadResult.shared { + overlapBounds = append(overlapBounds, m.TableMetadata) + } + for _, m := range loadResult.external { + overlapBounds = append(overlapBounds, m.TableMetadata) + } + if args.ExciseSpan.Valid() { + overlapBounds = append(overlapBounds, &args.ExciseSpan) + } + + d.mu.Lock() + defer d.mu.Unlock() + + if args.ExciseSpan.Valid() { + // Check if any of the currently-open EventuallyFileOnlySnapshots + // overlap in key ranges with the excise span. If so, we need to + // check for memtable overlaps with all bounds of that + // EventuallyFileOnlySnapshot in addition to the ingestion's own + // bounds too. + overlapBounds = append(overlapBounds, exciseOverlapBounds( + d.cmp, &d.mu.snapshots.snapshotList, args.ExciseSpan, seqNum)...) + } + + // Check to see if any files overlap with any of the memtables. The queue + // is ordered from oldest to newest with the mutable memtable being the + // last element in the slice. 
We want to wait for the newest table that + // overlaps. + + for i := len(d.mu.mem.queue) - 1; i >= 0; i-- { + m := d.mu.mem.queue[i] + m.computePossibleOverlaps(func(b bounded) shouldContinue { + // If this is the first table to overlap a flushable, save + // the flushable. This ingest must be ingested or flushed + // after it. + if mem == nil { + mem = m + } + + switch v := b.(type) { + case *manifest.TableMetadata: + // NB: False positives are possible if `m` is a flushable + // ingest that overlaps the file `v` in bounds but doesn't + // contain overlapping data. This is considered acceptable + // because it's rare (in CockroachDB a bound overlap likely + // indicates a data overlap), and blocking the commit + // pipeline while we perform I/O to check for overlap may be + // more disruptive than enqueueing this ingestion on the + // flushable queue and switching to a new memtable. + metaFlushableOverlaps[v.TableNum] = true + case *KeyRange: + // An excise span or an EventuallyFileOnlySnapshot protected range; + // not a file. + default: + panic("unreachable") + } + return continueIteration + }, overlapBounds...) + } + + if mem == nil { + // No overlap with any of the queued flushables, so no need to queue + // after them. + + // New writes with higher sequence numbers may be concurrently + // committed. We must ensure they don't flush before this ingest + // completes. To do that, we ref the mutable memtable as a writer, + // preventing its flushing (and the flushing of all subsequent + // flushables in the queue). Once we've acquired the manifest lock + // to add the ingested sstables to the LSM, we can unref as we're + // guaranteed that the flush won't edit the LSM before this ingest. + mut = d.mu.mem.mutable + mut.writerRef() + return + } + + // The ingestion overlaps with some entry in the flushable queue. 
If the + // pre-conditions are met below, we can treat this ingestion as a flushable + // ingest, otherwise we wait on the memtable flush before ingestion. + // + // TODO(aaditya): We should make flushableIngest compatible with remote + // files. + hasRemoteFiles := len(shared) > 0 || len(external) > 0 + canIngestFlushable := d.FormatMajorVersion() >= FormatFlushableIngest && + // We require that either the queue of flushables is below the + // stop-writes threshold (note that this is typically a conservative + // check, since not every element of this queue will contribute the full + // memtable memory size that could result in a write stall), or WAL + // failover is permitting an unlimited queue without causing a write + // stall. The latter condition is important to avoid delays in + // visibility of concurrent writes that happen to get a sequence number + // after this ingest and then must wait for this ingest that is itself + // waiting on a large flush. See + // https://github.com/cockroachdb/pebble/v2/issues/4944 for an illustration + // of this problem. + (len(d.mu.mem.queue) < d.opts.MemTableStopWritesThreshold || + d.mu.log.manager.ElevateWriteStallThresholdForFailover()) && + !d.opts.Experimental.DisableIngestAsFlushable() && !hasRemoteFiles && + (!args.ExciseSpan.Valid() || d.FormatMajorVersion() >= FormatFlushableIngestExcises) + + if !canIngestFlushable { + // We're not able to ingest as a flushable, + // so we must synchronously flush. + // + // TODO(bilal): Currently, if any of the files being ingested are shared, + // we cannot use flushable ingests and need + // to wait synchronously. + if mem.flushable == d.mu.mem.mutable { + err = d.makeRoomForWrite(nil) + } + // New writes with higher sequence numbers may be concurrently + // committed. We must ensure they don't flush before this ingest + // completes. 
To do that, we ref the mutable memtable as a writer, + // preventing its flushing (and the flushing of all subsequent + // flushables in the queue). Once we've acquired the manifest lock + // to add the ingested sstables to the LSM, we can unref as we're + // guaranteed that the flush won't edit the LSM before this ingest. + mut = d.mu.mem.mutable + mut.writerRef() + mem.flushForced = true + d.maybeScheduleFlush() + return + } + // Since there aren't too many memtables already queued up, we can + // slide the ingested sstables on top of the existing memtables. + asFlushable = true + fileMetas := make([]*manifest.TableMetadata, len(loadResult.local)) + for i := range fileMetas { + fileMetas[i] = loadResult.local[i].TableMetadata + } + err = d.handleIngestAsFlushable(fileMetas, seqNum, args.ExciseSpan) + } + + var ve *manifest.VersionEdit + apply := func(seqNum base.SeqNum) { + if err != nil || asFlushable { + // An error occurred during prepare. + if mut != nil { + if mut.writerUnref() { + d.mu.Lock() + d.maybeScheduleFlush() + d.mu.Unlock() + } + } + return + } + + // If there's an excise being done atomically with the same ingest, we + // assign the lowest sequence number in the set of sequence numbers for this + // ingestion to the excise. Note that we've already allocated fileCount+1 + // sequence numbers in this case. + if args.ExciseSpan.Valid() { + seqNum++ // the first seqNum is reserved for the excise. + } + // Update the sequence numbers for all ingested sstables' + // metadata. When the version edit is applied, the metadata is + // written to the manifest, persisting the sequence number. + // The sstables themselves are left unmodified. + if err = ingestUpdateSeqNum( + d.cmp, d.opts.Comparer.FormatKey, seqNum, loadResult, + ); err != nil { + if mut != nil { + if mut.writerUnref() { + d.mu.Lock() + d.maybeScheduleFlush() + d.mu.Unlock() + } + } + return + } + + // If we overlapped with a memtable in prepare wait for the flush to + // finish. 
+ if mem != nil { + <-mem.flushed + } + + // Assign the sstables to the correct level in the LSM and apply the + // version edit. + ve, err = d.ingestApply(ctx, jobID, loadResult, mut, args.ExciseSpan, args.ExciseBoundsPolicy, seqNum) + } + + // Only one ingest can occur at a time because if not, one would block waiting + // for the other to finish applying. This blocking would happen while holding + // the commit mutex which would prevent unrelated batches from writing their + // changes to the WAL and memtable. This will cause a bigger commit hiccup + // during ingestion. + seqNumCount := loadResult.fileCount() + if args.ExciseSpan.Valid() { + seqNumCount++ + } + d.commit.ingestSem <- struct{}{} + d.commit.AllocateSeqNum(seqNumCount, prepare, apply) + <-d.commit.ingestSem + + if err != nil { + if err2 := ingestCleanup(d.objProvider, loadResult.local); err2 != nil { + d.opts.Logger.Errorf("ingest cleanup failed: %v", err2) + } + } else { + // Since we either created a hard link to the ingesting files, or copied + // them over, it is safe to remove the originals paths. + for i := range loadResult.local { + path := loadResult.local[i].path + if err2 := d.opts.FS.Remove(path); err2 != nil { + d.opts.Logger.Errorf("ingest failed to remove original file: %s", err2) + } + } + } + + // TODO(jackson): Refactor this so that the case where there are no files + // but a valid excise span is not so exceptional. 
+ + var stats IngestOperationStats + if loadResult.fileCount() > 0 { + info := TableIngestInfo{ + JobID: int(jobID), + Err: err, + flushable: asFlushable, + } + if len(loadResult.local) > 0 { + info.GlobalSeqNum = loadResult.local[0].SmallestSeqNum + } else if len(loadResult.shared) > 0 { + info.GlobalSeqNum = loadResult.shared[0].SmallestSeqNum + } else { + info.GlobalSeqNum = loadResult.external[0].SmallestSeqNum + } + if ve != nil { + info.Tables = make([]struct { + TableInfo + Level int + }, len(ve.NewTables)) + for i := range ve.NewTables { + e := &ve.NewTables[i] + info.Tables[i].Level = e.Level + info.Tables[i].TableInfo = e.Meta.TableInfo() + stats.Bytes += e.Meta.Size + if e.Level == 0 { + stats.ApproxIngestedIntoL0Bytes += e.Meta.Size + } + if metaFlushableOverlaps[e.Meta.TableNum] { + stats.MemtableOverlappingFiles++ + } + } + } else if asFlushable { + // NB: If asFlushable == true, there are no shared sstables. + info.Tables = make([]struct { + TableInfo + Level int + }, len(loadResult.local)) + for i, f := range loadResult.local { + info.Tables[i].Level = -1 + info.Tables[i].TableInfo = f.TableInfo() + stats.Bytes += f.Size + // We don't have exact stats on which files will be ingested into + // L0, because actual ingestion into the LSM has been deferred until + // flush time. Instead, we infer based on memtable overlap. + // + // TODO(jackson): If we optimistically compute data overlap (#2112) + // before entering the commit pipeline, we can use that overlap to + // improve our approximation by incorporating overlap with L0, not + // just memtables. + if metaFlushableOverlaps[f.TableNum] { + stats.ApproxIngestedIntoL0Bytes += f.Size + stats.MemtableOverlappingFiles++ + } + } + } + d.opts.EventListener.TableIngested(info) + } + + return stats, err +} + +type ingestSplitFile struct { + // ingestFile is the file being ingested. 
+ ingestFile *manifest.TableMetadata + // splitFile is the file that needs to be split to allow ingestFile to slot + // into `level` level. + splitFile *manifest.TableMetadata + // The level where ingestFile will go (and where splitFile already is). + level int +} + +// ingestSplit splits files specified in `files` and updates ve in-place to +// account for existing files getting split into two virtual sstables. The map +// `replacedFiles` contains an in-progress map of all files that have been +// replaced with new virtual sstables in this version edit so far, which is also +// updated in-place. +// +// d.mu as well as the manifest lock must be held when calling this method. +func (d *DB) ingestSplit( + ctx context.Context, + ve *manifest.VersionEdit, + updateMetrics func(*manifest.TableMetadata, int, []manifest.NewTableEntry), + files []ingestSplitFile, + replacedTables map[base.TableNum][]manifest.NewTableEntry, +) error { + for _, s := range files { + ingestFileBounds := s.ingestFile.UserKeyBounds() + // replacedFiles can be thought of as a tree, where we start iterating with + // s.splitFile and run its fileNum through replacedFiles, then find which of + // the replaced files overlaps with s.ingestFile, which becomes the new + // splitFile, then we check splitFile's replacements in replacedFiles again + // for overlap with s.ingestFile, and so on until we either can't find the + // current splitFile in replacedFiles (i.e. that's the file that now needs to + // be split), or we don't find a file that overlaps with s.ingestFile, which + // means a prior ingest split already produced enough room for s.ingestFile + // to go into this level without necessitating another ingest split. 
+ splitFile := s.splitFile + for splitFile != nil { + replaced, ok := replacedTables[splitFile.TableNum] + if !ok { + break + } + updatedSplitFile := false + for i := range replaced { + if replaced[i].Meta.Overlaps(d.cmp, &ingestFileBounds) { + if updatedSplitFile { + // This should never happen because the earlier ingestTargetLevel + // function only finds split file candidates that are guaranteed to + // have no data overlap, only boundary overlap. See the comments + // in that method to see the definitions of data vs boundary + // overlap. That, plus the fact that files in `replaced` are + // guaranteed to have file bounds that are tight on user keys + // (as that's what `d.excise` produces), means that the only case + // where we overlap with two or more files in `replaced` is if we + // actually had data overlap all along, or if the ingestion files + // were overlapping, either of which is an invariant violation. + panic("updated with two files in ingestSplit") + } + splitFile = replaced[i].Meta + updatedSplitFile = true + } + } + if !updatedSplitFile { + // None of the replaced files overlapped with the file being ingested. + // This can happen if we've already excised a span overlapping with + // this file, or if we have consecutive ingested files that can slide + // within the same gap between keys in an existing file. For instance, + // if an existing file has keys a and g and we're ingesting b-c, d-e, + // the first loop iteration will split the existing file into one that + // ends in a and another that starts at g, and the second iteration will + // fall into this case and require no splitting. + // + // No splitting necessary. + splitFile = nil + } + } + if splitFile == nil { + continue + } + // NB: excise operates on [start, end). We're splitting at [start, end] + // (assuming !s.ingestFile.Largest.IsExclusiveSentinel()). 
The conflation + // of exclusive vs inclusive end bounds should not make a difference here + // as we're guaranteed to not have any data overlap between splitFile and + // s.ingestFile. d.excise will return an error if we pass an inclusive user + // key bound _and_ we end up seeing data overlap at the end key. + exciseBounds := base.UserKeyBoundsFromInternal(s.ingestFile.Smallest(), s.ingestFile.Largest()) + leftTable, rightTable, err := d.exciseTable(ctx, exciseBounds, splitFile, s.level, tightExciseBounds) + if err != nil { + return err + } + added := applyExciseToVersionEdit(ve, splitFile, leftTable, rightTable, s.level) + replacedTables[splitFile.TableNum] = added + for i := range added { + addedBounds := added[i].Meta.UserKeyBounds() + if s.ingestFile.Overlaps(d.cmp, &addedBounds) { + panic("ingest-time split produced a file that overlaps with ingested file") + } + } + updateMetrics(splitFile, s.level, added) + } + // Flatten the version edit by removing any entries from ve.NewFiles that + // are also in ve.DeletedFiles. 
+ newNewFiles := ve.NewTables[:0] + for i := range ve.NewTables { + fn := ve.NewTables[i].Meta.TableNum + deEntry := manifest.DeletedTableEntry{Level: ve.NewTables[i].Level, FileNum: fn} + if _, ok := ve.DeletedTables[deEntry]; ok { + delete(ve.DeletedTables, deEntry) + } else { + newNewFiles = append(newNewFiles, ve.NewTables[i]) + } + } + ve.NewTables = newNewFiles + return nil +} + +func (d *DB) ingestApply( + ctx context.Context, + jobID JobID, + lr ingestLoadResult, + mut *memTable, + exciseSpan KeyRange, + exciseBoundsPolicy exciseBoundsPolicy, + exciseSeqNum base.SeqNum, +) (*manifest.VersionEdit, error) { + d.mu.Lock() + defer d.mu.Unlock() + + ve := &manifest.VersionEdit{ + NewTables: make([]manifest.NewTableEntry, lr.fileCount()), + } + if exciseSpan.Valid() || (d.opts.Experimental.IngestSplit != nil && d.opts.Experimental.IngestSplit()) { + ve.DeletedTables = map[manifest.DeletedTableEntry]*manifest.TableMetadata{} + } + var metrics levelMetricsDelta + + // Determine the target level inside UpdateVersionLocked. This prevents two + // concurrent ingestion jobs from using the same version to determine the + // target level, and also provides serialization with concurrent compaction + // and flush jobs. + err := d.mu.versions.UpdateVersionLocked(func() (versionUpdate, error) { + if mut != nil { + // Unref the mutable memtable to allows its flush to proceed. Now that we've + // acquired the manifest lock, we can be certain that if the mutable + // memtable has received more recent conflicting writes, the flush won't + // beat us to applying to the manifest resulting in sequence number + // inversion. Even though we call maybeScheduleFlush right now, this flush + // will apply after our ingestion. 
+ if mut.writerUnref() { + d.maybeScheduleFlush() + } + } + + current := d.mu.versions.currentVersion() + overlapChecker := &overlapChecker{ + comparer: d.opts.Comparer, + newIters: d.newIters, + opts: IterOptions{ + logger: d.opts.Logger, + Category: categoryIngest, + }, + v: current, + } + shouldIngestSplit := d.opts.Experimental.IngestSplit != nil && + d.opts.Experimental.IngestSplit() && d.FormatMajorVersion() >= FormatVirtualSSTables + baseLevel := d.mu.versions.picker.getBaseLevel() + // filesToSplit is a list where each element is a pair consisting of a file + // being ingested and a file being split to make room for an ingestion into + // that level. Each ingested file will appear at most once in this list. It + // is possible for split files to appear twice in this list. + filesToSplit := make([]ingestSplitFile, 0) + checkCompactions := false + for i := 0; i < lr.fileCount(); i++ { + // Determine the lowest level in the LSM for which the sstable doesn't + // overlap any existing files in the level. + var m *manifest.TableMetadata + specifiedLevel := -1 + isShared := false + isExternal := false + if i < len(lr.local) { + // local file. + m = lr.local[i].TableMetadata + } else if (i - len(lr.local)) < len(lr.shared) { + // shared file. + isShared = true + sharedIdx := i - len(lr.local) + m = lr.shared[sharedIdx].TableMetadata + specifiedLevel = int(lr.shared[sharedIdx].shared.Level) + } else { + // external file. + isExternal = true + externalIdx := i - (len(lr.local) + len(lr.shared)) + m = lr.external[externalIdx].TableMetadata + if lr.externalFilesHaveLevel { + specifiedLevel = int(lr.external[externalIdx].external.Level) + } + } + + // Add to CreatedBackingTables if this is a new backing. + // + // Shared files always have a new backing. External files have new backings + // iff the backing disk file num and the file num match (see ingestAttachRemote). 
+ if isShared || (isExternal && m.TableBacking.DiskFileNum == base.DiskFileNum(m.TableNum)) { + ve.CreatedBackingTables = append(ve.CreatedBackingTables, m.TableBacking) + } + + f := &ve.NewTables[i] + var err error + if specifiedLevel != -1 { + f.Level = specifiedLevel + } else { + var splitTable *manifest.TableMetadata + if exciseSpan.Valid() && exciseSpan.Contains(d.cmp, m.Smallest()) && exciseSpan.Contains(d.cmp, m.Largest()) { + // This file fits perfectly within the excise span. We can slot it at + // L6, or sharedLevelsStart - 1 if we have shared files. + if len(lr.shared) > 0 || lr.externalFilesHaveLevel { + f.Level = sharedLevelsStart - 1 + if baseLevel > f.Level { + f.Level = 0 + } + } else { + f.Level = 6 + } + } else { + // We check overlap against the LSM without holding DB.mu. Note that we + // are still holding the log lock, so the version cannot change. + // TODO(radu): perform this check optimistically outside of the log lock. + var lsmOverlap overlap.WithLSM + lsmOverlap, err = func() (overlap.WithLSM, error) { + d.mu.Unlock() + defer d.mu.Lock() + return overlapChecker.DetermineLSMOverlap(ctx, m.UserKeyBounds()) + }() + if err == nil { + f.Level, splitTable, err = ingestTargetLevel( + ctx, d.cmp, lsmOverlap, baseLevel, d.mu.compact.inProgress, m, shouldIngestSplit, + ) + } + } + + if splitTable != nil { + if invariants.Enabled { + if lf := current.Levels[f.Level].Find(d.cmp, splitTable); lf.Empty() { + panic("splitFile returned is not in level it should be") + } + } + // We take advantage of the fact that we won't drop the db mutex + // between now and the call to UpdateVersionLocked. So, no files should + // get added to a new in-progress compaction at this point. We can + // avoid having to iterate on in-progress compactions to cancel them + // if none of the files being split have a compacting state. 
+ if splitTable.IsCompacting() { + checkCompactions = true + } + filesToSplit = append(filesToSplit, ingestSplitFile{ingestFile: m, splitFile: splitTable, level: f.Level}) + } + } + if err != nil { + return versionUpdate{}, err + } + if isShared && f.Level < sharedLevelsStart { + panic(fmt.Sprintf("cannot slot a shared file higher than the highest shared level: %d < %d", + f.Level, sharedLevelsStart)) + } + f.Meta = m + levelMetrics := metrics[f.Level] + if levelMetrics == nil { + levelMetrics = &LevelMetrics{} + metrics[f.Level] = levelMetrics + } + levelMetrics.TablesCount++ + levelMetrics.TablesSize += int64(m.Size) + levelMetrics.EstimatedReferencesSize += m.EstimatedReferenceSize() + levelMetrics.TableBytesIngested += m.Size + levelMetrics.TablesIngested++ + } + // replacedTables maps files excised due to exciseSpan (or splitFiles returned + // by ingestTargetLevel), to files that were created to replace it. This map + // is used to resolve references to split files in filesToSplit, as it is + // possible for a file that we want to split to no longer exist or have a + // newer fileMetadata due to a split induced by another ingestion file, or an + // excise. 
+ replacedTables := make(map[base.TableNum][]manifest.NewTableEntry) + updateLevelMetricsOnExcise := func(m *manifest.TableMetadata, level int, added []manifest.NewTableEntry) { + levelMetrics := metrics[level] + if levelMetrics == nil { + levelMetrics = &LevelMetrics{} + metrics[level] = levelMetrics + } + levelMetrics.TablesCount-- + levelMetrics.TablesSize -= int64(m.Size) + levelMetrics.EstimatedReferencesSize -= m.EstimatedReferenceSize() + for i := range added { + levelMetrics.TablesCount++ + levelMetrics.TablesSize += int64(added[i].Meta.Size) + levelMetrics.EstimatedReferencesSize += added[i].Meta.EstimatedReferenceSize() + } + } + var exciseBounds base.UserKeyBounds + if exciseSpan.Valid() { + exciseBounds = exciseSpan.UserKeyBounds() + // Iterate through all levels and find files that intersect with exciseSpan. + // + // TODO(bilal): We could drop the DB mutex here as we don't need it for + // excises; we only need to hold the version lock which we already are + // holding. However releasing the DB mutex could mess with the + // ingestTargetLevel calculation that happened above, as it assumed that it + // had a complete view of in-progress compactions that wouldn't change + // until UpdateVersionLocked is called. If we were to drop the mutex now, + // we could schedule another in-progress compaction that would go into the + // chosen target level and lead to file overlap within level (which would + // panic in UpdateVersionLocked). We should drop the db mutex here, do the + // excise, then re-grab the DB mutex and rerun just the in-progress + // compaction check to see if any new compactions are conflicting with our + // chosen target levels for files, and if they are, we should signal those + // compactions to error out. 
+ for layer, ls := range current.AllLevelsAndSublevels() { + for m := range ls.Overlaps(d.cmp, exciseSpan.UserKeyBounds()).All() { + leftTable, rightTable, err := d.exciseTable(ctx, exciseBounds, m, layer.Level(), exciseBoundsPolicy) + if err != nil { + return versionUpdate{}, err + } + newFiles := applyExciseToVersionEdit(ve, m, leftTable, rightTable, layer.Level()) + replacedTables[m.TableNum] = newFiles + updateLevelMetricsOnExcise(m, layer.Level(), newFiles) + } + } + } + if len(filesToSplit) > 0 { + // For the same reasons as the above call to excise, we hold the db mutex + // while calling this method. + if err := d.ingestSplit(ctx, ve, updateLevelMetricsOnExcise, filesToSplit, replacedTables); err != nil { + return versionUpdate{}, err + } + } + if len(filesToSplit) > 0 || exciseSpan.Valid() { + for c := range d.mu.compact.inProgress { + if c.VersionEditApplied() { + continue + } + // Check if this compaction overlaps with the excise span. Note that just + // checking if the inputs individually overlap with the excise span + // isn't sufficient; for instance, a compaction could have [a,b] and [e,f] + // as inputs and write it all out as [a,b,e,f] in one sstable. If we're + // doing a [c,d) excise at the same time as this compaction, we will have + // to error out the whole compaction as we can't guarantee it hasn't/won't + // write a file overlapping with the excise span. + bounds := c.Bounds() + if bounds != nil && bounds.Overlaps(d.cmp, &exciseBounds) { + c.Cancel() + } + // Check if this compaction's inputs have been replaced due to an + // ingest-time split. In that case, cancel the compaction as a newly picked + // compaction would need to include any new files that slid in between + // previously-existing files. Note that we cancel any compaction that has a + // file that was ingest-split as an input, even if it started before this + // ingestion. 
+ if checkCompactions { + for _, table := range c.Tables() { + if _, ok := replacedTables[table.TableNum]; ok { + c.Cancel() + break + } + } + } + } + } + + return versionUpdate{ + VE: ve, + JobID: jobID, + Metrics: metrics, + InProgressCompactionsFn: func() []compactionInfo { return d.getInProgressCompactionInfoLocked(nil) }, + }, nil + }) + if err != nil { + return nil, err + } + + // Check for any EventuallyFileOnlySnapshots that could be watching for + // an excise on this span. There should be none as the + // computePossibleOverlaps steps should have forced these EFOS to transition + // to file-only snapshots by now. If we see any that conflict with this + // excise, panic. + if exciseSpan.Valid() { + for s := d.mu.snapshots.root.next; s != &d.mu.snapshots.root; s = s.next { + // Skip non-EFOS snapshots, and also skip any EFOS that were created + // *after* the excise. + if s.efos == nil || base.Visible(exciseSeqNum, s.efos.seqNum, base.SeqNumMax) { + continue + } + efos := s.efos + // TODO(bilal): We can make this faster by taking advantage of the sorted + // nature of protectedRanges to do a sort.Search, or even maintaining a + // global list of all protected ranges instead of having to peer into every + // snapshot. + for i := range efos.protectedRanges { + if efos.protectedRanges[i].OverlapsKeyRange(d.cmp, exciseSpan) { + panic("unexpected excise of an EventuallyFileOnlySnapshot's bounds") + } + } + } + } + + d.mu.versions.metrics.Ingest.Count++ + + d.updateReadStateLocked(d.opts.DebugCheck) + // updateReadStateLocked could have generated obsolete tables, schedule a + // cleanup job if necessary. + d.deleteObsoleteFiles(jobID) + d.updateTableStatsLocked(ve.NewTables) + // The ingestion may have pushed a level over the threshold for compaction, + // so check to see if one is necessary and schedule it. 
+ d.maybeScheduleCompaction() + var toValidate []manifest.NewTableEntry + dedup := make(map[base.DiskFileNum]struct{}) + for _, entry := range ve.NewTables { + if _, ok := dedup[entry.Meta.TableBacking.DiskFileNum]; !ok { + toValidate = append(toValidate, entry) + dedup[entry.Meta.TableBacking.DiskFileNum] = struct{}{} + } + } + d.maybeValidateSSTablesLocked(toValidate) + return ve, nil +} + +// maybeValidateSSTablesLocked adds the slice of newTableEntrys to the pending +// queue of files to be validated, when the feature is enabled. +// +// Note that if two entries with the same backing file are added twice, then the +// block checksums for the backing file will be validated twice. +// +// DB.mu must be locked when calling. +func (d *DB) maybeValidateSSTablesLocked(newFiles []manifest.NewTableEntry) { + // Only add to the validation queue when the feature is enabled. + if !d.opts.Experimental.ValidateOnIngest { + return + } + + d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, newFiles...) + if d.shouldValidateSSTablesLocked() { + go d.validateSSTables() + } +} + +// shouldValidateSSTablesLocked returns true if SSTable validation should run. +// DB.mu must be locked when calling. +func (d *DB) shouldValidateSSTablesLocked() bool { + return !d.mu.tableValidation.validating && + d.closed.Load() == nil && + d.opts.Experimental.ValidateOnIngest && + len(d.mu.tableValidation.pending) > 0 +} + +// validateSSTables runs a round of validation on the tables in the pending +// queue. +func (d *DB) validateSSTables() { + d.mu.Lock() + if !d.shouldValidateSSTablesLocked() { + d.mu.Unlock() + return + } + + pending := d.mu.tableValidation.pending + d.mu.tableValidation.pending = nil + d.mu.tableValidation.validating = true + jobID := d.newJobIDLocked() + rs := d.loadReadState() + + // Drop DB.mu before performing IO. + d.mu.Unlock() + + // Validate all tables in the pending queue. 
This could lead to a situation + // where we are starving IO from other tasks due to having to page through + // all the blocks in all the sstables in the queue. + // TODO(travers): Add some form of pacing to avoid IO starvation. + + // If we fail to validate any files due to reasons other than uncovered + // corruption, accumulate them and re-queue them for another attempt. + var retry []manifest.NewTableEntry + + for _, f := range pending { + // The file may have been moved or deleted since it was ingested, in + // which case we skip. + if !rs.current.Contains(f.Level, f.Meta) { + // Assume the file was moved to a lower level. It is rare enough + // that a table is moved or deleted between the time it was ingested + // and the time the validation routine runs that the overall cost of + // this inner loop is tolerably low, when amortized over all + // ingested tables. + found := false + for i := f.Level + 1; i < numLevels; i++ { + if rs.current.Contains(i, f.Meta) { + found = true + break + } + } + if !found { + continue + } + } + + // TOOD(radu): plumb a ReadEnv with a CategoryIngest stats collector through + // to ValidateBlockChecksums. + err := d.fileCache.withReader(context.TODO(), block.NoReadEnv, + f.Meta, func(r *sstable.Reader, _ sstable.ReadEnv) error { + return r.ValidateBlockChecksums() + }) + + if err != nil { + if IsCorruptionError(err) { + // TODO(travers): Hook into the corruption reporting pipeline, once + // available. See pebble#1192. + d.opts.Logger.Fatalf("pebble: encountered corruption during ingestion: %s", err) + } else { + // If there was some other, possibly transient, error that + // caused table validation to fail inform the EventListener and + // move on. We remember the table so that we can retry it in a + // subsequent table validation job. + // + // TODO(jackson): If the error is not transient, this will retry + // validation indefinitely. While not great, it's the same + // behavior as erroring flushes and compactions. 
We should + // address this as a part of #270. + d.opts.EventListener.BackgroundError(err) + retry = append(retry, f) + continue + } + } + + d.opts.EventListener.TableValidated(TableValidatedInfo{ + JobID: int(jobID), + Meta: f.Meta, + }) + } + rs.unref() + d.mu.Lock() + defer d.mu.Unlock() + d.mu.tableValidation.pending = append(d.mu.tableValidation.pending, retry...) + d.mu.tableValidation.validating = false + d.mu.tableValidation.cond.Broadcast() + if d.shouldValidateSSTablesLocked() { + go d.validateSSTables() + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal.go b/vendor/github.com/cockroachdb/pebble/v2/internal.go new file mode 100644 index 0000000..13daab2 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal.go @@ -0,0 +1,81 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import "github.com/cockroachdb/pebble/v2/internal/base" + +// SeqNum exports the base.SeqNum type. +type SeqNum = base.SeqNum + +// InternalKeyKind exports the base.InternalKeyKind type. +type InternalKeyKind = base.InternalKeyKind + +// These constants are part of the file format, and should not be changed. 
+const ( + InternalKeyKindDelete = base.InternalKeyKindDelete + InternalKeyKindSet = base.InternalKeyKindSet + InternalKeyKindMerge = base.InternalKeyKindMerge + InternalKeyKindLogData = base.InternalKeyKindLogData + InternalKeyKindSingleDelete = base.InternalKeyKindSingleDelete + InternalKeyKindRangeDelete = base.InternalKeyKindRangeDelete + InternalKeyKindMax = base.InternalKeyKindMax + InternalKeyKindSetWithDelete = base.InternalKeyKindSetWithDelete + InternalKeyKindRangeKeySet = base.InternalKeyKindRangeKeySet + InternalKeyKindRangeKeyUnset = base.InternalKeyKindRangeKeyUnset + InternalKeyKindRangeKeyDelete = base.InternalKeyKindRangeKeyDelete + InternalKeyKindRangeKeyMin = base.InternalKeyKindRangeKeyMin + InternalKeyKindRangeKeyMax = base.InternalKeyKindRangeKeyMax + InternalKeyKindIngestSST = base.InternalKeyKindIngestSST + InternalKeyKindDeleteSized = base.InternalKeyKindDeleteSized + InternalKeyKindExcise = base.InternalKeyKindExcise + InternalKeyKindInvalid = base.InternalKeyKindInvalid +) + +// InternalKeyTrailer exports the base.InternalKeyTrailer type. +type InternalKeyTrailer = base.InternalKeyTrailer + +// InternalKey exports the base.InternalKey type. +type InternalKey = base.InternalKey + +// KeyRange exports the base.KeyRange type. +type KeyRange = base.KeyRange + +// MakeInternalKey constructs an internal key from a specified user key, +// sequence number and kind. +func MakeInternalKey(userKey []byte, seqNum SeqNum, kind InternalKeyKind) InternalKey { + return base.MakeInternalKey(userKey, seqNum, kind) +} + +// MakeInternalKeyTrailer constructs a trailer from a specified sequence number +// and kind. +func MakeInternalKeyTrailer(seqNum SeqNum, kind InternalKeyKind) InternalKeyTrailer { + return base.MakeTrailer(seqNum, kind) +} + +type internalIterator = base.InternalIterator + +type topLevelIterator = base.TopLevelIterator + +// IsCorruptionError returns true if the given error indicates database +// corruption. 
+func IsCorruptionError(err error) bool { + return base.IsCorruptionError(err) +} + +// ErrCorruption is a marker to indicate that data in a file (WAL, MANIFEST, +// sstable) isn't in the expected format. +// DEPRECATED: should use IsCorruptionError() instead. +var ErrCorruption = base.ErrCorruption + +// AttributeAndLen exports the base.AttributeAndLen type. +type AttributeAndLen = base.AttributeAndLen + +// ShortAttribute exports the base.ShortAttribute type. +type ShortAttribute = base.ShortAttribute + +// LazyFetcher exports the base.LazyFetcher type. This export is needed since +// LazyValue.Clone requires a pointer to a LazyFetcher struct to avoid +// allocations. No code outside Pebble needs to peer into a LazyFetcher. +type LazyFetcher = base.LazyFetcher diff --git a/vendor/github.com/cockroachdb/pebble/internal/arenaskl/LICENSE b/vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/LICENSE similarity index 100% rename from vendor/github.com/cockroachdb/pebble/internal/arenaskl/LICENSE rename to vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/LICENSE diff --git a/vendor/github.com/cockroachdb/pebble/internal/arenaskl/README.md b/vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/README.md similarity index 100% rename from vendor/github.com/cockroachdb/pebble/internal/arenaskl/README.md rename to vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/README.md diff --git a/vendor/github.com/cockroachdb/pebble/internal/arenaskl/arena.go b/vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/arena.go similarity index 93% rename from vendor/github.com/cockroachdb/pebble/internal/arenaskl/arena.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/arena.go index 011c3b0..ddf8662 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/arenaskl/arena.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/arena.go @@ -22,8 +22,8 @@ import ( "unsafe" "github.com/cockroachdb/errors" - 
"github.com/cockroachdb/pebble/internal/constants" - "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/constants" + "github.com/cockroachdb/pebble/v2/internal/invariants" ) // Arena is lock-free. @@ -80,14 +80,14 @@ func (a *Arena) Capacity() uint32 { // If overflow is not 0, it also ensures that many bytes after the buffer are // inside the arena (this is used for structures that are larger than the // requested size but don't use those extra bytes). -func (a *Arena) alloc(size, alignment, overflow uint32) (uint32, uint32, error) { +func (a *Arena) alloc(size, alignment, overflow uint32) (uint32, error) { if invariants.Enabled && (alignment&(alignment-1)) != 0 { panic(errors.AssertionFailedf("invalid alignment %d", alignment)) } // Verify that the arena isn't already full. origSize := a.n.Load() if int(origSize) > len(a.buf) { - return 0, 0, ErrArenaFull + return 0, ErrArenaFull } // Pad the allocation with enough bytes to ensure the requested alignment. @@ -95,12 +95,12 @@ func (a *Arena) alloc(size, alignment, overflow uint32) (uint32, uint32, error) newSize := a.n.Add(padded) if newSize+uint64(overflow) > uint64(len(a.buf)) { - return 0, 0, ErrArenaFull + return 0, ErrArenaFull } // Return the aligned offset. 
offset := (uint32(newSize) - size) & ^(alignment - 1) - return offset, uint32(padded), nil + return offset, nil } func (a *Arena) getBytes(offset uint32, size uint32) []byte { diff --git a/vendor/github.com/cockroachdb/pebble/internal/arenaskl/flush_iterator.go b/vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/flush_iterator.go similarity index 66% rename from vendor/github.com/cockroachdb/pebble/internal/arenaskl/flush_iterator.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/flush_iterator.go index 2a7ea03..f6279ee 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/arenaskl/flush_iterator.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/flush_iterator.go @@ -17,14 +17,13 @@ package arenaskl -import "github.com/cockroachdb/pebble/internal/base" +import "github.com/cockroachdb/pebble/v2/internal/base" // flushIterator is an iterator over the skiplist object. Use Skiplist.NewFlushIter // to construct an iterator. The current state of the iterator can be cloned by // simply value copying the struct. type flushIterator struct { Iterator - bytesIterated *uint64 } // flushIterator implements the base.InternalIterator interface. 
@@ -34,21 +33,15 @@ func (it *flushIterator) String() string { return "memtable" } -func (it *flushIterator) SeekGE( - key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { +func (it *flushIterator) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { panic("pebble: SeekGE unimplemented") } -func (it *flushIterator) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { +func (it *flushIterator) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV { panic("pebble: SeekPrefixGE unimplemented") } -func (it *flushIterator) SeekLT( - key []byte, flags base.SeekLTFlags, -) (*base.InternalKey, base.LazyValue) { +func (it *flushIterator) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { panic("pebble: SeekLT unimplemented") } @@ -56,33 +49,28 @@ func (it *flushIterator) SeekLT( // if the iterator is pointing at a valid entry, and (nil, nil) otherwise. Note // that First only checks the upper bound. It is up to the caller to ensure // that key is greater than or equal to the lower bound. -func (it *flushIterator) First() (*base.InternalKey, base.LazyValue) { - key, val := it.Iterator.First() - if key == nil { - return nil, base.LazyValue{} - } - *it.bytesIterated += uint64(it.nd.allocSize) - return key, val +func (it *flushIterator) First() *base.InternalKV { + return it.Iterator.First() } // Next advances to the next position. Returns the key and value if the // iterator is pointing at a valid entry, and (nil, nil) otherwise. // Note: flushIterator.Next mirrors the implementation of Iterator.Next // due to performance. Keep the two in sync. 
-func (it *flushIterator) Next() (*base.InternalKey, base.LazyValue) { +func (it *flushIterator) Next() *base.InternalKV { it.nd = it.list.getNext(it.nd, 0) if it.nd == it.list.tail { - return nil, base.LazyValue{} + return nil } it.decodeKey() - *it.bytesIterated += uint64(it.nd.allocSize) - return &it.key, base.MakeInPlaceValue(it.value()) + it.kv.V = base.MakeInPlaceValue(it.value()) + return &it.kv } -func (it *flushIterator) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) { +func (it *flushIterator) NextPrefix(succKey []byte) *base.InternalKV { panic("pebble: NextPrefix unimplemented") } -func (it *flushIterator) Prev() (*base.InternalKey, base.LazyValue) { +func (it *flushIterator) Prev() *base.InternalKV { panic("pebble: Prev unimplemented") } diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/iterator.go b/vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/iterator.go new file mode 100644 index 0000000..0fb1523 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/iterator.go @@ -0,0 +1,287 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * Modifications copyright (C) 2017 Andy Kimball and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package arenaskl + +import ( + "context" + "sync" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +type splice struct { + prev *node + next *node +} + +func (s *splice) init(prev, next *node) { + s.prev = prev + s.next = next +} + +// Iterator is an iterator over the skiplist object. Use Skiplist.NewIter +// to construct an iterator. The current state of the iterator can be cloned by +// simply value copying the struct. All iterator methods are thread-safe. +type Iterator struct { + list *Skiplist + nd *node + kv base.InternalKV + lower []byte + upper []byte + // {lower|upper}Node are lazily populated with an arbitrary node that is + // beyond the lower or upper bound respectively. Note the node is + // "arbitrary" because it may not be the first node that exceeds the bound. + // Concurrent insertions into the skiplist may introduce new nodes with keys + // that exceed the bounds but are closer to the bounds than the current + // values of [lower|upper]Node. + // + // Once populated, [lower|upper]Node may be used to detect when iteration + // has reached a bound without performing a key comparison. This may be + // beneficial when performing repeated SeekGEs with TrySeekUsingNext and an + // upper bound set. Once the upper bound has been met, no additional key + // comparisons are necessary. + lowerNode *node + upperNode *node +} + +// Iterator implements the base.InternalIterator interface. +var _ base.InternalIterator = (*Iterator)(nil) + +var iterPool = sync.Pool{ + New: func() interface{} { + return &Iterator{} + }, +} + +// Close resets the iterator. +func (it *Iterator) Close() error { + *it = Iterator{} + iterPool.Put(it) + return nil +} + +func (it *Iterator) String() string { + return "memtable" +} + +// Error returns any accumulated error. 
+func (it *Iterator) Error() error { + return nil +} + +// SeekGE moves the iterator to the first entry whose key is greater than or +// equal to the given key. Returns the KV pair if the iterator is pointing at a +// valid entry, and nil otherwise. Note that SeekGE only checks the upper bound. +// It is up to the caller to ensure that key is greater than or equal to the +// lower bound. +func (it *Iterator) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { + if flags.TrySeekUsingNext() { + if it.nd == it.list.tail || it.nd == it.upperNode { + // Iterator is done. + return nil + } + less := it.list.cmp(it.kv.K.UserKey, key) < 0 + // Arbitrary constant. By measuring the seek cost as a function of the + // number of elements in the skip list, and fitting to a model, we + // could adjust the number of nexts based on the current size of the + // skip list. + const numNexts = 5 + kv := &it.kv + for i := 0; less && i < numNexts; i++ { + if kv = it.Next(); kv == nil { + // Iterator is done. + return nil + } + less = it.list.cmp(kv.K.UserKey, key) < 0 + } + if !less { + return kv + } + } + _, it.nd, _ = it.seekForBaseSplice(key) + if it.nd == it.list.tail || it.nd == it.upperNode { + return nil + } + it.decodeKey() + if it.upper != nil && it.list.cmp(it.upper, it.kv.K.UserKey) <= 0 { + it.upperNode = it.nd + return nil + } + it.kv.V = base.MakeInPlaceValue(it.value()) + return &it.kv +} + +// SeekPrefixGE moves the iterator to the first entry whose key is greater than +// or equal to the given key. This method is equivalent to SeekGE and is +// provided so that an arenaskl.Iterator implements the +// internal/base.InternalIterator interface. +func (it *Iterator) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV { + return it.SeekGE(key, flags) +} + +// SeekLT moves the iterator to the last entry whose key is less than the given +// key. Returns the KV pair if the iterator is pointing at a valid entry, and +// nil otherwise. 
Note that SeekLT only checks the lower bound. It is up to the +// caller to ensure that key is less than the upper bound. +func (it *Iterator) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { + // NB: the top-level Iterator has already adjusted key based on + // the upper-bound. + it.nd, _, _ = it.seekForBaseSplice(key) + if it.nd == it.list.head || it.nd == it.lowerNode { + return nil + } + it.decodeKey() + if it.lower != nil && it.list.cmp(it.lower, it.kv.K.UserKey) > 0 { + it.lowerNode = it.nd + return nil + } + it.kv.V = base.MakeInPlaceValue(it.value()) + return &it.kv +} + +// First seeks position at the first entry in list. Returns the KV pair if the +// iterator is pointing at a valid entry, and nil otherwise. Note that First +// only checks the upper bound. It is up to the caller to ensure that key is +// greater than or equal to the lower bound (e.g. via a call to SeekGE(lower)). +func (it *Iterator) First() *base.InternalKV { + it.nd = it.list.getNext(it.list.head, 0) + if it.nd == it.list.tail || it.nd == it.upperNode { + return nil + } + it.decodeKey() + if it.upper != nil && it.list.cmp(it.upper, it.kv.K.UserKey) <= 0 { + it.upperNode = it.nd + return nil + } + it.kv.V = base.MakeInPlaceValue(it.value()) + return &it.kv +} + +// Last seeks position at the last entry in list. Returns the KV pair if the +// iterator is pointing at a valid entry, and nil otherwise. Note that Last only +// checks the lower bound. It is up to the caller to ensure that key is less +// than the upper bound (e.g. via a call to SeekLT(upper)). +func (it *Iterator) Last() *base.InternalKV { + it.nd = it.list.getPrev(it.list.tail, 0) + if it.nd == it.list.head || it.nd == it.lowerNode { + return nil + } + it.decodeKey() + if it.lower != nil && it.list.cmp(it.lower, it.kv.K.UserKey) > 0 { + it.lowerNode = it.nd + return nil + } + it.kv.V = base.MakeInPlaceValue(it.value()) + return &it.kv +} + +// Next advances to the next position. 
Returns the KV pair if the iterator is +// pointing at a valid entry, and nil otherwise. +// Note: flushIterator.Next mirrors the implementation of Iterator.Next +// due to performance. Keep the two in sync. +func (it *Iterator) Next() *base.InternalKV { + it.nd = it.list.getNext(it.nd, 0) + if it.nd == it.list.tail || it.nd == it.upperNode { + return nil + } + it.decodeKey() + if it.upper != nil && it.list.cmp(it.upper, it.kv.K.UserKey) <= 0 { + it.upperNode = it.nd + return nil + } + it.kv.V = base.MakeInPlaceValue(it.value()) + return &it.kv +} + +// NextPrefix advances to the next position with a new prefix. Returns the KV +// pair if the iterator is pointing at a valid entry and nil otherwise. +func (it *Iterator) NextPrefix(succKey []byte) *base.InternalKV { + return it.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext()) +} + +// Prev moves to the previous position. Returns the KV pair if the iterator is +// pointing at a valid entry and nil otherwise. +func (it *Iterator) Prev() *base.InternalKV { + it.nd = it.list.getPrev(it.nd, 0) + if it.nd == it.list.head || it.nd == it.lowerNode { + return nil + } + it.decodeKey() + if it.lower != nil && it.list.cmp(it.lower, it.kv.K.UserKey) > 0 { + it.lowerNode = it.nd + return nil + } + it.kv.V = base.MakeInPlaceValue(it.value()) + return &it.kv +} + +// value returns the value at the current position. +func (it *Iterator) value() []byte { + return it.nd.getValue(it.list.arena) +} + +// SetBounds sets the lower and upper bounds for the iterator. Note that the +// result of Next and Prev will be undefined until the iterator has been +// repositioned with SeekGE, SeekPrefixGE, SeekLT, First, or Last. +func (it *Iterator) SetBounds(lower, upper []byte) { + it.lower = lower + it.upper = upper + it.lowerNode = nil + it.upperNode = nil +} + +// SetContext implements base.InternalIterator. +func (it *Iterator) SetContext(_ context.Context) {} + +// DebugTree is part of the InternalIterator interface. 
+func (it *Iterator) DebugTree(tp treeprinter.Node) { + tp.Childf("%T(%p)", it, it) +} + +func (it *Iterator) decodeKey() { + it.kv.K.UserKey = it.list.arena.getBytes(it.nd.keyOffset, it.nd.keySize) + it.kv.K.Trailer = it.nd.keyTrailer +} + +func (it *Iterator) seekForBaseSplice(key []byte) (prev, next *node, found bool) { + ikey := base.MakeSearchKey(key) + level := int(it.list.Height() - 1) + + prev = it.list.head + for { + prev, next, found = it.list.findSpliceForLevel(ikey, level, prev) + + if found { + if level != 0 { + // next is pointing at the target node, but we need to find previous on + // the bottom level. + prev = it.list.getPrev(next, 0) + } + break + } + + if level == 0 { + break + } + + level-- + } + + return +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/arenaskl/node.go b/vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/node.go similarity index 89% rename from vendor/github.com/cockroachdb/pebble/internal/arenaskl/node.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/node.go index d464bc5..25b1402 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/arenaskl/node.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/node.go @@ -21,7 +21,7 @@ import ( "math" "sync/atomic" - "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/v2/internal/base" ) // MaxNodeSize returns the maximum space needed for a node with the specified @@ -47,9 +47,14 @@ type node struct { // Immutable fields, so no need to lock to access key. keyOffset uint32 keySize uint32 - keyTrailer uint64 + keyTrailer base.InternalKeyTrailer valueSize uint32 - allocSize uint32 + + // Padding to align tower on an 8-byte boundary, so that 32-bit and 64-bit + // architectures use the same memory layout for node. Needed for tests which + // expect a certain struct size. The padding can be removed if we add or + // remove a field from the node. 
+ _ [4]byte // Most nodes do not need to use the full height of the tower, since the // probability of each successive level decreases exponentially. Because @@ -95,7 +100,7 @@ func newRawNode(arena *Arena, height uint32, keySize, valueSize uint32) (nd *nod unusedSize := uint32((maxHeight - int(height)) * linksSize) nodeSize := uint32(maxNodeSize) - unusedSize - nodeOffset, allocSize, err := arena.alloc(nodeSize+keySize+valueSize, nodeAlignment, unusedSize) + nodeOffset, err := arena.alloc(nodeSize+keySize+valueSize, nodeAlignment, unusedSize) if err != nil { return } @@ -104,7 +109,6 @@ func newRawNode(arena *Arena, height uint32, keySize, valueSize uint32) (nd *nod nd.keyOffset = nodeOffset + nodeSize nd.keySize = keySize nd.valueSize = valueSize - nd.allocSize = allocSize return } diff --git a/vendor/github.com/cockroachdb/pebble/internal/arenaskl/skl.go b/vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/skl.go similarity index 97% rename from vendor/github.com/cockroachdb/pebble/internal/arenaskl/skl.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/skl.go index ef1ebfc..186ebf2 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/arenaskl/skl.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/arenaskl/skl.go @@ -41,17 +41,17 @@ Key differences: - Iterator includes mutator functions. 
*/ -package arenaskl // import "github.com/cockroachdb/pebble/internal/arenaskl" +package arenaskl // import "github.com/cockroachdb/pebble/v2/internal/arenaskl" import ( "math" + "math/rand/v2" "runtime" "sync/atomic" "unsafe" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/fastrand" + "github.com/cockroachdb/pebble/v2/internal/base" ) const ( @@ -307,10 +307,9 @@ func (s *Skiplist) NewIter(lower, upper []byte) *Iterator { // NewFlushIter returns a new flushIterator, which is similar to an Iterator // but also sets the current number of the bytes that have been iterated // through. -func (s *Skiplist) NewFlushIter(bytesFlushed *uint64) base.InternalIterator { +func (s *Skiplist) NewFlushIter() base.InternalIterator { return &flushIterator{ - Iterator: Iterator{list: s, nd: s.head}, - bytesIterated: bytesFlushed, + Iterator: Iterator{list: s, nd: s.head}, } } @@ -338,7 +337,7 @@ func (s *Skiplist) newNode( } func (s *Skiplist) randomHeight() uint32 { - rnd := fastrand.Uint32() + rnd := rand.Uint32() h := uint32(1) for h < maxHeight && rnd <= probabilities[h] { diff --git a/vendor/github.com/cockroachdb/pebble/internal/base/cleaner.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/cleaner.go similarity index 83% rename from vendor/github.com/cockroachdb/pebble/internal/base/cleaner.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/base/cleaner.go index b86d455..0800572 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/base/cleaner.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/cleaner.go @@ -4,7 +4,7 @@ package base -import "github.com/cockroachdb/pebble/vfs" +import "github.com/cockroachdb/pebble/v2/vfs" // Cleaner cleans obsolete files. type Cleaner interface { @@ -35,9 +35,12 @@ type ArchiveCleaner struct{} var _ NeedsFileContents = ArchiveCleaner{} // Clean archives file. 
+// +// TODO(sumeer): for log files written to the secondary FS, the archiving will +// also write to the secondary. We should consider archiving to the primary. func (ArchiveCleaner) Clean(fs vfs.FS, fileType FileType, path string) error { switch fileType { - case FileTypeLog, FileTypeManifest, FileTypeTable: + case FileTypeLog, FileTypeManifest, FileTypeTable, FileTypeBlob: destDir := fs.PathJoin(fs.PathDir(path), "archive") if err := fs.MkdirAll(destDir, 0755); err != nil { diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/base/close_helper.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/close_helper.go new file mode 100644 index 0000000..f9da4f1 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/close_helper.go @@ -0,0 +1,30 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import "io" + +// CloseHelper wraps an io.Closer in a wrapper that ignores extra calls to +// Close. It is useful to ensure cleanup in error paths (using defer) without +// double-closing. +func CloseHelper(closer io.Closer) io.Closer { + return &closeHelper{ + Closer: closer, + } +} + +type closeHelper struct { + Closer io.Closer +} + +// Close the underlying Closer, unless it was already closed. +func (h *closeHelper) Close() error { + closer := h.Closer + if closer == nil { + return nil + } + h.Closer = nil + return closer.Close() +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/base/compaction_grant_handle.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/compaction_grant_handle.go new file mode 100644 index 0000000..8bcebd6 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/compaction_grant_handle.go @@ -0,0 +1,67 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +// CompactionGrantHandleStats contains stats provided to a CompactionGrantHandle. +type CompactionGrantHandleStats struct { + // CumWriteBytes is the cumulative bytes written to disk. + CumWriteBytes uint64 + // TODO(sumeer): add more stats like: + // cumReadBytes uint64 + // cumReadBytesInCache uint64 +} + +// CompactionGrantHandle is used to frequently update the CompactionScheduler +// about resource consumption. The MeasureCPU and CumulativeStats methods must +// be called frequently. +type CompactionGrantHandle interface { + // Started is called once and must precede calls to MeasureCPU and + // CumulativeStats. + Started() + CPUMeasurer + // CumulativeStats reports the current cumulative stats. This method may + // block if the scheduler wants to pace the compaction (say to moderate its + // consumption of disk write bandwidth). + CumulativeStats(stats CompactionGrantHandleStats) + // Done must be called when the compaction completes (whether success or + // failure). It may synchronously result in a call to + // DBForCompaction.Schedule so this must be called without holding any + // locks, *and* after the new version (if the compaction was successful) has + // been installed. + Done() +} + +// CompactionGoroutineKind identifies the kind of compaction goroutine. +type CompactionGoroutineKind uint8 + +const ( + // CompactionGoroutinePrimary is the primary compaction goroutine that + // iterates over key-value pairs in the input and calls the current sstable + // writer and blob file writer. + CompactionGoroutinePrimary CompactionGoroutineKind = iota + // CompactionGoroutineSSTableSecondary is the secondary goroutine in the + // current sstable writer that writes blocks to the sstable. 
+ CompactionGoroutineSSTableSecondary + // CompactionGoroutineBlobFileSecondary is the secondary goroutine in the + // current blob file writer that writes blocks to the blob file. + CompactionGoroutineBlobFileSecondary +) + +// CPUMeasurer is used to measure the CPU consumption of goroutines involved +// in a compaction. +type CPUMeasurer interface { + // MeasureCPU allows the measurer to keep track of CPU usage while a + // compaction is ongoing. It is to be called regularly from the compaction + // goroutine corresponding to the argument. The first call from a goroutine + // must be done before any significant CPU consumption, since it is used to + // initialize the measurer for the goroutine making the call. If a + // compaction is not using a certain kind of goroutine, it can skip calling + // this method with the corresponding argument. + MeasureCPU(CompactionGoroutineKind) +} + +type NoopCPUMeasurer struct{} + +func (NoopCPUMeasurer) MeasureCPU(CompactionGoroutineKind) {} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/base/comparer.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/comparer.go new file mode 100644 index 0000000..92713ba --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/comparer.go @@ -0,0 +1,586 @@ +// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import ( + "bytes" + "encoding/binary" + "fmt" + "slices" + "strconv" + "unicode/utf8" + + "github.com/cockroachdb/crlib/crbytes" + "github.com/cockroachdb/errors" +) + +// Compare returns -1, 0, or +1 depending on whether a is 'less than', 'equal +// to' or 'greater than' b. +// +// Both a and b must be valid keys. 
Note that because of synthetic prefix +// functionality, the Compare function can be called on a key (either from the +// database or passed as an argument for an iterator operation) after the +// synthetic prefix has been removed. In general, this implies that removing any +// leading bytes from a prefix must yield another valid prefix. +// +// A key a is less than b if a's prefix is byte-wise less than b's prefix, or if +// the prefixes are equal and a's suffix is less than b's suffix (according to +// ComparePointSuffixes). +// +// In other words, if prefix(a) = a[:Split(a)] and suffix(a) = a[Split(a):]: +// +// Compare(a, b) = bytes.Compare(prefix(a), prefix(b)) if not 0, +// otherwise ComparePointSuffixes(suffix(a), suffix(b)) +// +// Compare defaults to using the formula above but it can be customized if there +// is a (potentially faster) specialization or it has to compare suffixes +// differently. +type Compare func(a, b []byte) int + +// CompareRangeSuffixes compares two suffixes where either or both suffix +// originates from a range key and returns -1, 0, or +1. +// +// For historical reasons (see +// https://github.com/cockroachdb/cockroach/issues/130533 for a summary), we +// allow this function to be more strict than Compare. Specifically, Compare may +// treat two suffixes as equal whereas CompareRangeSuffixes might not. +// +// CompareRangeSuffixes is allowed to be more strict than see +// ComparePointSuffixes, meaning it may return -1 or +1 when +// ComparePointSuffixes would return 0. +// +// The empty slice suffix must be 'less than' any non-empty suffix. +type CompareRangeSuffixes func(a, b []byte) int + +// ComparePointSuffixes compares two point key suffixes and returns -1, 0, or +1. +// +// For historical reasons (see +// https://github.com/cockroachdb/cockroach/issues/130533 for a summary), this +// function is distinct from CompareRangeSuffixes. 
Specifically, +// ComparePointSuffixes may treat two suffixes as equal whereas +// CompareRangeSuffixes might not. Unlike CompareRangeSuffixes, this function +// must agree with Compare. +// +// The empty slice suffix must be 'less than' any non-empty suffix. +// +// A full key k is composed of a prefix k[:Split(k)] and suffix k[Split(k):]. +// Suffixes are compared to break ties between equal prefixes. +type ComparePointSuffixes func(a, b []byte) int + +// defaultCompare implements Compare in terms of Split and ComparePointSuffixes, as +// mentioned above. +func defaultCompare(split Split, compareSuffixes ComparePointSuffixes, a, b []byte) int { + an := split(a) + bn := split(b) + if prefixCmp := bytes.Compare(a[:an], b[:bn]); prefixCmp != 0 { + return prefixCmp + } + return compareSuffixes(a[an:], b[bn:]) +} + +// Equal returns true if a and b are equivalent. +// +// For a given Compare, Equal(a,b)=true iff Compare(a,b)=0; that is, Equal is a +// (potentially faster) specialization of Compare. +type Equal func(a, b []byte) bool + +// AbbreviatedKey returns a fixed length prefix of a user key such that +// +// AbbreviatedKey(a) < AbbreviatedKey(b) implies a < b, and +// AbbreviatedKey(a) > AbbreviatedKey(b) implies a > b. +// +// If AbbreviatedKey(a) == AbbreviatedKey(b), an additional comparison is +// required to determine if the two keys are actually equal. +// +// This helps optimize indexed batch comparisons for cache locality. If a Split +// function is specified, AbbreviatedKey usually returns the first eight bytes +// of the user key prefix in the order that gives the correct ordering. +type AbbreviatedKey func(key []byte) uint64 + +// FormatKey returns a formatter for the user key. +type FormatKey func(key []byte) fmt.Formatter + +// DefaultFormatter is the default implementation of user key formatting: +// non-ASCII data is formatted as escaped hexadecimal values. 
+var DefaultFormatter FormatKey = func(key []byte) fmt.Formatter { + return FormatBytes(key) +} + +// FormatValue returns a formatter for the user value. The key is also specified +// for the value formatter in order to support value formatting that is +// dependent on the key. +type FormatValue func(key, value []byte) fmt.Formatter + +// Separator is used to construct SSTable index blocks. A trivial implementation +// is `return append(dst, a...)`, but appending fewer bytes leads to smaller +// SSTables. +// +// Given keys a, b for which Compare(a, b) < 0, Separator produces a key k such +// that: +// +// 1. Compare(a, k) <= 0, and +// 2. Compare(k, b) < 0. +// +// For example, if a and b are the []byte equivalents of the strings "black" and +// "blue", then the function may append "blb" to dst. +// +// Callers must guarantee that len(a) > 0 and len(b) > 0. +type Separator func(dst, a, b []byte) []byte + +// Successor appends to dst a shortened key k given a key a such that +// Compare(a, k) <= 0. A simple implementation may return a unchanged. +// The appended key k must be valid to pass to Compare. +// +// The parameter a may be an empty slice even if an empty slice was never +// committed to Pebble and is not a valid key representation. If a is the empty +// slice, Successor must return a valid key but otherwise has no other +// constraints. +type Successor func(dst, a []byte) []byte + +// ImmediateSuccessor is invoked with a prefix key ([Split(a) == len(a)]) and +// appends to dst the smallest prefix key that is larger than the given prefix a. +// +// ImmediateSuccessor must generate a prefix key k such that: +// +// Split(k) == len(k) and Compare(a, k) < 0 +// +// and there exists no representable prefix key k2 such that: +// +// Split(k2) == len(k2) and Compare(a, k2) < 0 and Compare(k2, k) < 0 +// +// As an example, an implementation built on the natural byte ordering using +// bytes.Compare could append a `\0` to `a`. 
+// +// The appended key must be valid to pass to Compare. +type ImmediateSuccessor func(dst, a []byte) []byte + +// Split returns the length of the prefix of the user key that corresponds to +// the key portion of an MVCC encoding scheme to enable the use of prefix bloom +// filters. +// +// The method will only ever be called with valid MVCC keys, that is, keys that +// the user could potentially store in the database. Pebble does not know which +// keys are MVCC keys and which are not, and may call Split on both MVCC keys +// and non-MVCC keys. +// +// A trivial MVCC scheme is one in which Split() returns len(a). This +// corresponds to assigning a constant version to each key in the database. For +// performance reasons, it is preferable to use a `nil` split in this case. +// +// Let prefix(a) = a[:Split(a)] and suffix(a) = a[Split(a):]. The following +// properties must hold: +// +// 1. A key consisting of just a prefix must sort before all other keys with +// that prefix: +// +// If len(suffix(a)) > 0, then Compare(prefix(a), a) < 0. +// +// 2. Prefixes must be used to order keys before suffixes: +// +// If Compare(a, b) <= 0, then Compare(prefix(a), prefix(b)) <= 0. +// If Compare(prefix(a), prefix(b)) < 0, then Compare(a, b) < 0 +// +// 3. Suffixes themselves must be valid keys and comparable, respecting the same +// ordering as within a key: +// +// If Compare(prefix(a), prefix(b)) = 0, then +// Compare(a, b) = ComparePointSuffixes(suffix(a), suffix(b)) +type Split func(a []byte) int + +// Prefix returns the prefix of the key k, using s to split the key. +func (s Split) Prefix(k []byte) []byte { + i := s(k) + return k[:i:i] +} + +// HasSuffix returns true if the key k has a suffix remaining after +// Split is called on it. For keys where the entirety of the key is +// returned by Split, HasSuffix will return false. 
+func (s Split) HasSuffix(k []byte) bool { + return s(k) < len(k) +} + +// DefaultSplit is a trivial implementation of Split which always returns the +// full key. +var DefaultSplit Split = func(key []byte) int { return len(key) } + +// Comparer defines a total ordering over the space of []byte keys: a 'less +// than' relationship. +type Comparer struct { + // The following must always be specified. + AbbreviatedKey AbbreviatedKey + Separator Separator + Successor Successor + + // ImmediateSuccessor must be specified if range keys are used. + ImmediateSuccessor ImmediateSuccessor + + // Split defaults to a trivial implementation that returns the full key length + // if it is not specified. + Split Split + + // CompareRangeSuffixes defaults to bytes.Compare if it is not specified. + CompareRangeSuffixes CompareRangeSuffixes + // ComparePointSuffixes defaults to bytes.Compare if it is not specified. + ComparePointSuffixes ComparePointSuffixes + + // Compare defaults to a generic implementation that uses Split, + // bytes.Compare, and ComparePointSuffixes if it is not specified. + Compare Compare + // Equal defaults to using Compare() == 0 if it is not specified. + Equal Equal + // FormatKey defaults to the DefaultFormatter if it is not specified. + FormatKey FormatKey + + // FormatValue is optional. + FormatValue FormatValue + + // ValidateKey is an optional function that determines whether a key is + // valid according to this Comparer's key encoding. + ValidateKey ValidateKey + + // Name is the name of the comparer. + // + // The on-disk format stores the comparer name, and opening a database with a + // different comparer from the one it was created with will result in an + // error. + Name string +} + +// EnsureDefaults ensures that all non-optional fields are set. +// +// If c is nil, returns DefaultComparer. +// +// If any fields need to be set, returns a modified copy of c. 
+func (c *Comparer) EnsureDefaults() *Comparer { + if c == nil { + return DefaultComparer + } + if c.AbbreviatedKey == nil || c.Separator == nil || c.Successor == nil || c.Name == "" { + panic("invalid Comparer: mandatory field not set") + } + if c.CompareRangeSuffixes != nil && c.Compare != nil && c.Equal != nil && c.Split != nil && c.FormatKey != nil { + return c + } + n := &Comparer{} + *n = *c + + if n.Split == nil { + n.Split = DefaultSplit + } + if n.CompareRangeSuffixes == nil && n.Compare == nil && n.Equal == nil { + n.CompareRangeSuffixes = bytes.Compare + n.Compare = bytes.Compare + n.Equal = bytes.Equal + } else { + if n.CompareRangeSuffixes == nil { + n.CompareRangeSuffixes = bytes.Compare + } + if n.Compare == nil { + n.Compare = func(a, b []byte) int { + return defaultCompare(n.Split, n.ComparePointSuffixes, a, b) + } + } + if n.Equal == nil { + n.Equal = func(a, b []byte) bool { + return n.Compare(a, b) == 0 + } + } + } + if n.FormatKey == nil { + n.FormatKey = DefaultFormatter + } + return n +} + +// DefaultComparer is the default implementation of the Comparer interface. +// It uses the natural ordering, consistent with bytes.Compare. +var DefaultComparer = &Comparer{ + ComparePointSuffixes: bytes.Compare, + CompareRangeSuffixes: bytes.Compare, + Compare: bytes.Compare, + Equal: bytes.Equal, + + AbbreviatedKey: func(key []byte) uint64 { + if len(key) >= 8 { + return binary.BigEndian.Uint64(key) + } + var v uint64 + for _, b := range key { + v <<= 8 + v |= uint64(b) + } + return v << uint(8*(8-len(key))) + }, + + Split: DefaultSplit, + + FormatKey: DefaultFormatter, + + Separator: func(dst, a, b []byte) []byte { + if len(a) == 0 || len(b) == 0 { + panic(errors.AssertionFailedf("empty keys")) + } + + i := crbytes.CommonPrefix(a, b) + n := len(dst) + dst = append(dst, a...) + + if i == len(a) || i == len(b) { + // Do not shorten if one string is a prefix of the other. 
+ return dst + } + + if a[i] >= b[i] { + // b is smaller than a or a is already the shortest possible. + return dst + } + + if i < len(b)-1 || a[i]+1 < b[i] { + i += n + dst[i]++ + return dst[:i+1] + } + + i += n + 1 + for ; i < len(dst); i++ { + if dst[i] != 0xff { + dst[i]++ + return dst[:i+1] + } + } + return dst + }, + + Successor: func(dst, a []byte) (ret []byte) { + for i := 0; i < len(a); i++ { + if a[i] != 0xff { + dst = append(dst, a[:i+1]...) + dst[len(dst)-1]++ + return dst + } + } + // a is a run of 0xffs, leave it alone. + return append(dst, a...) + }, + + ImmediateSuccessor: func(dst, a []byte) (ret []byte) { + return append(append(dst, a...), 0x00) + }, + + // This name is part of the C++ Level-DB implementation's default file + // format, and should not be changed. + Name: "leveldb.BytewiseComparator", +} + +// MinUserKey returns the smaller of two user keys. If one of the keys is nil, +// the other one is returned. +func MinUserKey(cmp Compare, a, b []byte) []byte { + if a != nil && (b == nil || cmp(a, b) < 0) { + return a + } + return b +} + +// FormatBytes formats a byte slice using hexadecimal escapes for non-ASCII +// data. +type FormatBytes []byte + +const lowerhex = "0123456789abcdef" + +// Format implements the fmt.Formatter interface. +func (p FormatBytes) Format(s fmt.State, c rune) { + buf := make([]byte, 0, len(p)) + for _, b := range p { + if b < utf8.RuneSelf && strconv.IsPrint(rune(b)) { + buf = append(buf, b) + continue + } + buf = append(buf, `\x`...) + buf = append(buf, lowerhex[b>>4]) + buf = append(buf, lowerhex[b&0xF]) + } + s.Write(buf) +} + +// MakeAssertComparer creates a Comparer that is the same with the given +// Comparer except that it asserts that the Compare and Equal functions adhere +// to their specifications. +func MakeAssertComparer(c Comparer) Comparer { + return Comparer{ + Compare: func(a []byte, b []byte) int { + res := c.Compare(a, b) + // Verify that Compare is consistent with the default implementation. 
+ if expected := defaultCompare(c.Split, c.ComparePointSuffixes, a, b); res != expected { + panic(AssertionFailedf("%s: Compare(%s, %s)=%d, expected %d", + c.Name, c.FormatKey(a), c.FormatKey(b), res, expected)) + } + return res + }, + + Equal: func(a []byte, b []byte) bool { + eq := c.Equal(a, b) + // Verify that Equal is consistent with Compare. + if expected := c.Compare(a, b); eq != (expected == 0) { + panic("Compare and Equal are not consistent") + } + return eq + }, + + // TODO(radu): add more checks. + ComparePointSuffixes: c.ComparePointSuffixes, + CompareRangeSuffixes: c.CompareRangeSuffixes, + AbbreviatedKey: c.AbbreviatedKey, + Separator: func(dst, a, b []byte) []byte { + if len(a) == 0 || len(b) == 0 { + panic(errors.AssertionFailedf("empty keys")) + } + ret := c.Separator(dst, a, b) + // The Separator func must return a valid key. + c.ValidateKey.MustValidate(ret) + return ret + }, + Successor: func(dst, a []byte) []byte { + ret := c.Successor(dst, a) + // The Successor func must return a valid key. + c.ValidateKey.MustValidate(ret) + return ret + }, + ImmediateSuccessor: func(dst, a []byte) []byte { + ret := c.ImmediateSuccessor(dst, a) + // The ImmediateSuccessor func must return a valid key. + c.ValidateKey.MustValidate(ret) + return ret + }, + FormatKey: c.FormatKey, + Split: c.Split, + FormatValue: c.FormatValue, + ValidateKey: c.ValidateKey, + Name: c.Name, + } +} + +// ValidateKey is a func that determines whether a key is valid according to a +// particular key encoding. Returns nil if the provided key is a valid, full +// user key. Implementations must be careful to not mutate the provided key. +type ValidateKey func([]byte) error + +// Validate validates the provided user key. If the func is nil, Validate +// returns nil. +func (v ValidateKey) Validate(key []byte) error { + if v == nil { + return nil + } + return v(key) +} + +// MustValidate validates the provided user key, panicking if the key is +// invalid. 
+func (v ValidateKey) MustValidate(key []byte) { + if err := v.Validate(key); err != nil { + panic(err) + } +} + +// CheckComparer is a mini test suite that verifies a comparer implementation. +// +// It takes lists of valid prefixes and suffixes. It is recommended that both +// lists have at least three elements. +func CheckComparer(c *Comparer, prefixes [][]byte, suffixes [][]byte) error { + // Empty slice is always a valid suffix. + suffixes = append(suffixes, nil) + + // Verify the suffixes have a consistent ordering. + slices.SortFunc(suffixes, c.CompareRangeSuffixes) + if !slices.IsSortedFunc(suffixes, c.CompareRangeSuffixes) { + return errors.Errorf("CompareRangeSuffixes is inconsistent") + } + // Verify the ordering imposed by CompareRangeSuffixes is considered a valid + // ordering for point suffixes. CompareRangeSuffixes imposes a stricter + // ordering than ComaprePointSuffixes, but a CompareRangesSuffixes ordering + // must be a valid ordering for point suffixes. + if !slices.IsSortedFunc(suffixes, c.ComparePointSuffixes) { + return errors.Errorf("ComparePointSuffixes is inconsistent") + } + + n := len(prefixes) + // Removing leading bytes from prefixes must yield valid prefixes. + for i := 0; i < n; i++ { + for j := 1; j < len(prefixes[i]); j++ { + prefixes = append(prefixes, prefixes[i][j:]) + } + } + + // Check the split function. + for _, p := range prefixes { + for _, s := range suffixes { + key := slices.Concat(p, s) + if n := c.Split(key); n != len(p) { + return errors.Errorf("incorrect Split result %d on '%x' (prefix '%x' suffix '%x')", n, key, p, s) + } + } + for i := 1; i < len(suffixes); i++ { + a := slices.Concat(p, suffixes[i-1]) + b := slices.Concat(p, suffixes[i]) + if err := c.ValidateKey.Validate(a); err != nil { + return err + } + if err := c.ValidateKey.Validate(b); err != nil { + return err + } + + // Make sure the Compare function agrees with ComparePointSuffixes. 
+ if cmp := c.Compare(a, b); cmp > 0 { + return errors.Errorf("Compare(%s, %s)=%d, expected <= 0", c.FormatKey(a), c.FormatKey(b), cmp) + } + } + } + + // Check the Compare/Equals functions on all possible combinations. + for _, ap := range prefixes { + for _, as := range suffixes { + a := slices.Concat(ap, as) + if err := c.ValidateKey.Validate(a); err != nil { + return err + } + for _, bp := range prefixes { + for _, bs := range suffixes { + b := slices.Concat(bp, bs) + if err := c.ValidateKey.Validate(b); err != nil { + return err + } + result := c.Compare(a, b) + if (result == 0) != c.Equal(a, b) { + return errors.Errorf("Equal(%s, %s) doesn't agree with Compare", c.FormatKey(a), c.FormatKey(b)) + } + + if prefixCmp := bytes.Compare(ap, bp); prefixCmp != 0 { + if result != prefixCmp { + return errors.Errorf("Compare(%s, %s)=%d, expected %d", c.FormatKey(a), c.FormatKey(b), result, prefixCmp) + } + } else { + // The prefixes are equal, so Compare's result should + // agree with ComparePointSuffixes. + if suffixCmp := c.ComparePointSuffixes(as, bs); result != suffixCmp { + return errors.Errorf("Compare(%s, %s)=%d but ComparePointSuffixes(%q, %q)=%d", + c.FormatKey(a), c.FormatKey(b), result, as, bs, suffixCmp) + } + // If result == 0, CompareRangeSuffixes may not agree + // with ComparePointSuffixes, but otherwise it should. + if result != 0 { + if suffixCmp := c.CompareRangeSuffixes(as, bs); result != suffixCmp { + return errors.Errorf("Compare(%s, %s)=%d, expected %d", + c.FormatKey(a), c.FormatKey(b), result, suffixCmp) + } + } + } + } + } + } + } + + // TODO(radu): check more methods. + return nil +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/base/error.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/error.go new file mode 100644 index 0000000..dbcc1e3 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/error.go @@ -0,0 +1,62 @@ +// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import ( + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/invariants" +) + +// ErrNotFound means that a get or delete call did not find the requested key. +var ErrNotFound = errors.New("pebble: not found") + +// ErrCorruption is a marker to indicate that data in a file (WAL, MANIFEST, +// sstable) isn't in the expected format (or the file is missing). +var ErrCorruption = errors.New("pebble: corruption") + +// MarkCorruptionError marks given error as a corruption error. +func MarkCorruptionError(err error) error { + if errors.Is(err, ErrCorruption) { + return err + } + return errors.Mark(err, ErrCorruption) +} + +// IsCorruptionError returns true if the given error indicates corruption. +func IsCorruptionError(err error) bool { + return errors.Is(err, ErrCorruption) +} + +// CorruptionErrorf formats according to a format specifier and returns +// the string as an error value that is marked as a corruption error. +func CorruptionErrorf(format string, args ...interface{}) error { + return errors.Mark(errors.Newf(format, args...), ErrCorruption) +} + +// AssertionFailedf creates an assertion error and panics in invariants.Enabled +// builds. It should only be used when it indicates a bug. +func AssertionFailedf(format string, args ...interface{}) error { + err := errors.AssertionFailedf(format, args...) + if invariants.Enabled { + panic(err) + } + return err +} + +// CatchErrorPanic runs a function and catches any panic that contains an +// error, returning that error. Used in tests, in particular to catch panics +// threw by AssertionFailedf. 
+func CatchErrorPanic(f func() error) (err error) { + defer func() { + if r := recover(); r != nil { + if e, ok := r.(error); ok { + err = e + } else { + panic(r) + } + } + }() + return f() +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/base/filenames.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/filenames.go new file mode 100644 index 0000000..3a82ce6 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/filenames.go @@ -0,0 +1,271 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import ( + "fmt" + "path/filepath" + "strconv" + "strings" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/v2/vfs" + "github.com/cockroachdb/redact" +) + +// TableNum is an internal DB identifier for a table. Tables can be physical (in +// which case the numeric TableNum value coincides with the DiskFileNum of the +// backing object) or virtual. +type TableNum uint64 + +// FileNum is a deprecated alias for TableNum. +type FileNum = TableNum + +// String returns a string representation of the file number. +func (tn TableNum) String() string { return fmt.Sprintf("%06d", tn) } + +// SafeFormat implements redact.SafeFormatter. +func (tn TableNum) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("%06d", redact.SafeUint(tn)) +} + +// PhysicalTableDiskFileNum converts the TableNum of a physical table to the +// backing DiskFileNum. The underlying numbers always match for physical tables. +func PhysicalTableDiskFileNum(n TableNum) DiskFileNum { + return DiskFileNum(n) +} + +// PhysicalTableFileNum converts the DiskFileNum backing a physical table into +// the table's TableNum. The underlying numbers always match for physical tables. 
+func PhysicalTableFileNum(f DiskFileNum) TableNum { + return TableNum(f) +} + +// BlobFileID is an internal identifier for a blob file. +// +// Initially there exists a physical blob file with a DiskFileNum that equals +// the value of the BlobFileID. However, if the blob file is replaced, the +// manifest.Version may re-map the BlobFileID to a new DiskFileNum. +type BlobFileID uint64 + +// String returns a string representation of the blob file ID. +func (id BlobFileID) String() string { return fmt.Sprintf("B%06d", id) } + +// SafeFormat implements redact.SafeFormatter. +func (id BlobFileID) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("B%06d", redact.SafeUint(id)) +} + +// A DiskFileNum identifies a file or object with exists on disk. +type DiskFileNum uint64 + +func (dfn DiskFileNum) String() string { return fmt.Sprintf("%06d", dfn) } + +// SafeFormat implements redact.SafeFormatter. +func (dfn DiskFileNum) SafeFormat(w redact.SafePrinter, verb rune) { + w.Printf("%06d", redact.SafeUint(dfn)) +} + +// FileType enumerates the types of files found in a DB. +type FileType int + +// The FileType enumeration. +const ( + FileTypeLog FileType = iota + FileTypeLock + FileTypeTable + FileTypeManifest + FileTypeOptions + FileTypeOldTemp + FileTypeTemp + FileTypeBlob +) + +var fileTypeStrings = [...]string{ + FileTypeLog: "log", + FileTypeLock: "lock", + FileTypeTable: "sstable", + FileTypeManifest: "manifest", + FileTypeOptions: "options", + FileTypeOldTemp: "old-temp", + FileTypeTemp: "temp", + FileTypeBlob: "blob", +} + +// FileTypeFromName parses a FileType from its string representation. +func FileTypeFromName(name string) FileType { + for i, s := range fileTypeStrings { + if s == name { + return FileType(i) + } + } + panic(fmt.Sprintf("unknown file type: %q", name)) +} + +// SafeFormat implements redact.SafeFormatter. 
+func (ft FileType) SafeFormat(w redact.SafePrinter, _ rune) { + if ft < 0 || int(ft) >= len(fileTypeStrings) { + w.Print(redact.SafeString("unknown")) + return + } + w.Print(redact.SafeString(fileTypeStrings[ft])) +} + +// String implements fmt.Stringer. +func (ft FileType) String() string { + return redact.StringWithoutMarkers(ft) +} + +// MakeFilename builds a filename from components. +func MakeFilename(fileType FileType, dfn DiskFileNum) string { + switch fileType { + case FileTypeLog: + panic("the pebble/wal pkg is responsible for constructing WAL filenames") + case FileTypeLock: + return "LOCK" + case FileTypeTable: + return fmt.Sprintf("%s.sst", dfn) + case FileTypeManifest: + return fmt.Sprintf("MANIFEST-%s", dfn) + case FileTypeOptions: + return fmt.Sprintf("OPTIONS-%s", dfn) + case FileTypeOldTemp: + return fmt.Sprintf("CURRENT.%s.dbtmp", dfn) + case FileTypeTemp: + return fmt.Sprintf("temporary.%s.dbtmp", dfn) + case FileTypeBlob: + return fmt.Sprintf("%s.blob", dfn) + } + panic("unreachable") +} + +// MakeFilepath builds a filepath from components. +func MakeFilepath(fs vfs.FS, dirname string, fileType FileType, dfn DiskFileNum) string { + return fs.PathJoin(dirname, MakeFilename(fileType, dfn)) +} + +// ParseFilename parses the components from a filename. 
+func ParseFilename(fs vfs.FS, filename string) (fileType FileType, dfn DiskFileNum, ok bool) { + filename = fs.PathBase(filename) + switch { + case filename == "LOCK": + return FileTypeLock, 0, true + case strings.HasPrefix(filename, "MANIFEST-"): + dfn, ok = ParseDiskFileNum(filename[len("MANIFEST-"):]) + if !ok { + break + } + return FileTypeManifest, dfn, true + case strings.HasPrefix(filename, "OPTIONS-"): + dfn, ok = ParseDiskFileNum(filename[len("OPTIONS-"):]) + if !ok { + break + } + return FileTypeOptions, dfn, ok + case strings.HasPrefix(filename, "CURRENT.") && strings.HasSuffix(filename, ".dbtmp"): + s := strings.TrimSuffix(filename[len("CURRENT."):], ".dbtmp") + dfn, ok = ParseDiskFileNum(s) + if !ok { + break + } + return FileTypeOldTemp, dfn, ok + case strings.HasPrefix(filename, "temporary.") && strings.HasSuffix(filename, ".dbtmp"): + s := strings.TrimSuffix(filename[len("temporary."):], ".dbtmp") + dfn, ok = ParseDiskFileNum(s) + if !ok { + break + } + return FileTypeTemp, dfn, ok + default: + i := strings.IndexByte(filename, '.') + if i < 0 { + break + } + dfn, ok = ParseDiskFileNum(filename[:i]) + if !ok { + break + } + switch filename[i+1:] { + case "sst": + return FileTypeTable, dfn, true + case "blob": + return FileTypeBlob, dfn, true + } + } + return 0, dfn, false +} + +// ParseDiskFileNum parses the provided string as a disk file number. +func ParseDiskFileNum(s string) (dfn DiskFileNum, ok bool) { + u, err := strconv.ParseUint(s, 10, 64) + if err != nil { + return dfn, false + } + return DiskFileNum(u), true +} + +// A Fataler fatals a process with a message when called. +type Fataler interface { + Fatalf(format string, args ...interface{}) +} + +// MustExist checks if err is an error indicating a file does not exist. +// If it is, it lists the containing directory's files to annotate the error +// with counts of the various types of files and invokes the provided fataler. +// See cockroachdb/cockroach#56490. 
+func MustExist(fs vfs.FS, filename string, fataler Fataler, err error) { + if err == nil || !oserror.IsNotExist(err) { + return + } + err = AddDetailsToNotExistError(fs, filename, err) + fataler.Fatalf("%+v", err) +} + +// AddDetailsToNotExistError annotates an unexpected not-exist error with +// information about the directory contents. +func AddDetailsToNotExistError(fs vfs.FS, filename string, err error) error { + ls, lsErr := fs.List(fs.PathDir(filename)) + if lsErr != nil { + // TODO(jackson): if oserror.IsNotExist(lsErr), the data directory + // doesn't exist anymore. Another process likely deleted it before + // killing the process. We want to fatal the process, but without + // triggering error reporting like Sentry. + return errors.WithDetailf(err, "list err: %+v", lsErr) + } + var total, unknown, tables, logs, manifests int + total = len(ls) + for _, f := range ls { + // The file format of log files is an implementation detail of the wal/ + // package that the internal/base package is not privy to. We can't call + // into the wal package because that would introduce a cyclical + // dependency. For our purposes, an exact count isn't important and we + // just count files with .log extensions. + if filepath.Ext(f) == ".log" { + logs++ + continue + } + typ, _, ok := ParseFilename(fs, f) + if !ok { + unknown++ + continue + } + switch typ { + case FileTypeTable: + tables++ + case FileTypeManifest: + manifests++ + } + } + + return errors.WithDetailf(err, "filename: %s; directory contains %d files, %d unknown, %d tables, %d logs, %d manifests", + filename, total, unknown, tables, logs, manifests) +} + +// FileInfo provides some rudimentary information about a file. 
+type FileInfo struct { + FileNum DiskFileNum + FileSize uint64 +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/base/internal.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/internal.go similarity index 59% rename from vendor/github.com/cockroachdb/pebble/internal/base/internal.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/base/internal.go index 259ef98..c337682 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/base/internal.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/internal.go @@ -2,26 +2,70 @@ // of this source code is governed by a BSD-style license that can be found in // the LICENSE file. -package base // import "github.com/cockroachdb/pebble/internal/base" +package base // import "github.com/cockroachdb/pebble/v2/internal/base" import ( + "cmp" "encoding/binary" "fmt" "strconv" "strings" + "sync/atomic" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/invariants" "github.com/cockroachdb/redact" ) +// SeqNum is a sequence number defining precedence among identical keys. A key +// with a higher sequence number takes precedence over a key with an equal user +// key of a lower sequence number. Sequence numbers are stored durably within +// the internal key "trailer" as a 7-byte (uint56) uint, and the maximum +// sequence number is 2^56-1. As keys are committed to the database, they're +// assigned increasing sequence numbers. Readers use sequence numbers to read a +// consistent database state, ignoring keys with sequence numbers larger than +// the readers' "visible sequence number." +// +// The database maintains an invariant that no two point keys with equal user +// keys may have equal sequence numbers. Keys with differing user keys may have +// equal sequence numbers. A point key and a range deletion or range key that +// include that point key can have equal sequence numbers - in that case, the +// range key does not apply to the point key. 
A key's sequence number may be +// changed to zero during compactions when it can be proven that no identical +// keys with lower sequence numbers exist. +type SeqNum uint64 + const ( // SeqNumZero is the zero sequence number, set by compactions if they can // guarantee there are no keys underneath an internal key. - SeqNumZero = uint64(0) + SeqNumZero SeqNum = 0 // SeqNumStart is the first sequence number assigned to a key. Sequence // numbers 1-9 are reserved for potential future use. - SeqNumStart = uint64(10) + SeqNumStart SeqNum = 10 + // SeqNumMax is the largest valid sequence number. + SeqNumMax SeqNum = 1<<56 - 1 + // SeqNumBatchBit is set on batch sequence numbers which prevents those + // entries from being excluded from iteration. + SeqNumBatchBit SeqNum = 1 << 55 ) +func (s SeqNum) String() string { + if s == SeqNumMax { + return "inf" + } + var batch string + if s&SeqNumBatchBit != 0 { + batch = "b" + s &^= SeqNumBatchBit + } + return fmt.Sprintf("%s%d", batch, s) +} + +// SafeFormat implements redact.SafeFormatter. +func (s SeqNum) SafeFormat(w redact.SafePrinter, _ rune) { + w.Print(redact.SafeString(s.String())) +} + // InternalKeyKind enumerates the kind of key: a deletion tombstone, a set // value, a merged value, etc. type InternalKeyKind uint8 @@ -67,7 +111,7 @@ const ( // InternalKeyKindSetWithDelete keys are SET keys that have met with a // DELETE or SINGLEDEL key in a prior compaction. This key kind is // specific to Pebble. See - // https://github.com/cockroachdb/pebble/issues/1255. + // https://github.com/cockroachdb/pebble/v2/issues/1255. InternalKeyKindSetWithDelete InternalKeyKind = 18 // InternalKeyKindRangeKeyDelete removes all range keys within a key range. 
@@ -79,10 +123,13 @@ const ( InternalKeyKindRangeKeyUnset InternalKeyKind = 20 InternalKeyKindRangeKeySet InternalKeyKind = 21 + InternalKeyKindRangeKeyMin InternalKeyKind = InternalKeyKindRangeKeyDelete + InternalKeyKindRangeKeyMax InternalKeyKind = InternalKeyKindRangeKeySet + // InternalKeyKindIngestSST is used to distinguish a batch that corresponds to // the WAL entry for ingested sstables that are added to the flushable - // queue. This InternalKeyKind cannot appear, amongst other key kinds in a - // batch, or in an sstable. + // queue. This InternalKeyKind cannot appear amongst other key kinds in a + // batch (with the exception of alongside InternalKeyKindExcise), or in an sstable. InternalKeyKindIngestSST InternalKeyKind = 22 // InternalKeyKindDeleteSized keys behave identically to @@ -92,6 +139,14 @@ const ( // heuristics, but is not required to be accurate for correctness. InternalKeyKindDeleteSized InternalKeyKind = 23 + // InternalKeyKindExcise is used to persist the Excise part of an IngestAndExcise + // to a WAL. An Excise is similar to a RangeDel+RangeKeyDel combined, in that it + // deletes all point and range keys in a given key range while also immediately + // truncating sstables to exclude this key span. This InternalKeyKind cannot + // appear amongst other key kinds in a batch (with the exception of alongside + // InternalKeyKindIngestSST), or in an sstable. + InternalKeyKindExcise InternalKeyKind = 24 + // This maximum value isn't part of the file format. Future extensions may // increase this value. // @@ -101,7 +156,13 @@ const ( // which sorts 'less than or equal to' any other valid internalKeyKind, when // searching for any kind of internal key formed by a certain user key and // seqNum. - InternalKeyKindMax InternalKeyKind = 23 + InternalKeyKindMax InternalKeyKind = 24 + + // InternalKeyKindMaxForSSTable is the largest valid key kind that can exist + // in an SSTable. 
This should usually equal InternalKeyKindMax, except + // if the current InternalKeyKindMax is a kind that is never added to an + // SSTable or memtable (eg. InternalKeyKindExcise). + InternalKeyKindMaxForSSTable InternalKeyKind = InternalKeyKindDeleteSized // Internal to the sstable format. Not exposed by any sstable iterator. // Declared here to prevent definition of valid key kinds that set this bit. @@ -110,30 +171,23 @@ const ( // InternalKeyZeroSeqnumMaxTrailer is the largest trailer with a // zero sequence number. - InternalKeyZeroSeqnumMaxTrailer = uint64(255) + InternalKeyZeroSeqnumMaxTrailer InternalKeyTrailer = 255 // A marker for an invalid key. InternalKeyKindInvalid InternalKeyKind = InternalKeyKindSSTableInternalObsoleteMask - // InternalKeySeqNumBatch is a bit that is set on batch sequence numbers - // which prevents those entries from being excluded from iteration. - InternalKeySeqNumBatch = uint64(1 << 55) - - // InternalKeySeqNumMax is the largest valid sequence number. - InternalKeySeqNumMax = uint64(1<<56 - 1) - // InternalKeyRangeDeleteSentinel is the marker for a range delete sentinel // key. This sequence number and kind are used for the upper stable boundary // when a range deletion tombstone is the largest key in an sstable. This is // necessary because sstable boundaries are inclusive, while the end key of a // range deletion tombstone is exclusive. - InternalKeyRangeDeleteSentinel = (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindRangeDelete) + InternalKeyRangeDeleteSentinel = (InternalKeyTrailer(SeqNumMax) << 8) | InternalKeyTrailer(InternalKeyKindRangeDelete) // InternalKeyBoundaryRangeKey is the marker for a range key boundary. This // sequence number and kind are used during interleaved range key and point // iteration to allow an iterator to stop at range key start keys where // there exists no point key. 
- InternalKeyBoundaryRangeKey = (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindRangeKeySet) + InternalKeyBoundaryRangeKey = (InternalKeyTrailer(SeqNumMax) << 8) | InternalKeyTrailer(InternalKeyKindRangeKeySet) ) // Assert InternalKeyKindSSTableInternalObsoleteBit > InternalKeyKindMax @@ -153,6 +207,7 @@ var internalKeyKindNames = []string{ InternalKeyKindRangeKeyDelete: "RANGEKEYDEL", InternalKeyKindIngestSST: "INGESTSST", InternalKeyKindDeleteSized: "DELSIZED", + InternalKeyKindExcise: "EXCISE", InternalKeyKindInvalid: "INVALID", } @@ -168,6 +223,36 @@ func (k InternalKeyKind) SafeFormat(w redact.SafePrinter, _ rune) { w.Print(redact.SafeString(k.String())) } +// InternalKeyTrailer encodes a SeqNum and an InternalKeyKind. +type InternalKeyTrailer uint64 + +// MakeTrailer constructs an internal key trailer from the specified sequence +// number and kind. +func MakeTrailer(seqNum SeqNum, kind InternalKeyKind) InternalKeyTrailer { + return (InternalKeyTrailer(seqNum) << 8) | InternalKeyTrailer(kind) +} + +// String imlements the fmt.Stringer interface. +func (t InternalKeyTrailer) String() string { + return fmt.Sprintf("%s,%s", SeqNum(t>>8), InternalKeyKind(t&0xff)) +} + +// SeqNum returns the sequence number component of the trailer. +func (t InternalKeyTrailer) SeqNum() SeqNum { + return SeqNum(t >> 8) +} + +// Kind returns the key kind component of the trailer. +func (t InternalKeyTrailer) Kind() InternalKeyKind { + return InternalKeyKind(t & 0xff) +} + +// IsExclusiveSentinel returns true if the trailer is a sentinel for an +// exclusive boundary. +func (t InternalKeyTrailer) IsExclusiveSentinel() bool { + return t.SeqNum() == SeqNumMax +} + // InternalKey is a key used for the in-memory and on-disk partial DBs that // make up a pebble DB. // @@ -177,37 +262,28 @@ func (k InternalKeyKind) SafeFormat(w redact.SafePrinter, _ rune) { // - 7 bytes for a uint56 sequence number, in little-endian format. 
type InternalKey struct { UserKey []byte - Trailer uint64 + Trailer InternalKeyTrailer } // InvalidInternalKey is an invalid internal key for which Valid() will return // false. -var InvalidInternalKey = MakeInternalKey(nil, 0, InternalKeyKindInvalid) +var InvalidInternalKey = MakeInternalKey(nil, SeqNumZero, InternalKeyKindInvalid) // MakeInternalKey constructs an internal key from a specified user key, // sequence number and kind. -func MakeInternalKey(userKey []byte, seqNum uint64, kind InternalKeyKind) InternalKey { +func MakeInternalKey(userKey []byte, seqNum SeqNum, kind InternalKeyKind) InternalKey { return InternalKey{ UserKey: userKey, - Trailer: (seqNum << 8) | uint64(kind), + Trailer: MakeTrailer(seqNum, kind), } } -// MakeTrailer constructs an internal key trailer from the specified sequence -// number and kind. -func MakeTrailer(seqNum uint64, kind InternalKeyKind) uint64 { - return (seqNum << 8) | uint64(kind) -} - // MakeSearchKey constructs an internal key that is appropriate for searching // for a the specified user key. The search key contain the maximal sequence // number and kind ensuring that it sorts before any other internal keys for // the same user key. func MakeSearchKey(userKey []byte) InternalKey { - return InternalKey{ - UserKey: userKey, - Trailer: (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindMax), - } + return MakeInternalKey(userKey, SeqNumMax, InternalKeyKindMax) } // MakeRangeDeleteSentinelKey constructs an internal key that is a range @@ -224,10 +300,7 @@ func MakeRangeDeleteSentinelKey(userKey []byte) InternalKey { // exclusive sentinel key, used as the upper boundary for an sstable // when a ranged key is the largest key in an sstable. 
func MakeExclusiveSentinelKey(kind InternalKeyKind, userKey []byte) InternalKey { - return InternalKey{ - UserKey: userKey, - Trailer: (InternalKeySeqNumMax << 8) | uint64(kind), - } + return MakeInternalKey(userKey, SeqNumMax, kind) } var kindsMap = map[string]InternalKeyKind{ @@ -245,27 +318,29 @@ var kindsMap = map[string]InternalKeyKind{ "RANGEKEYDEL": InternalKeyKindRangeKeyDelete, "INGESTSST": InternalKeyKindIngestSST, "DELSIZED": InternalKeyKindDeleteSized, + "EXCISE": InternalKeyKindExcise, } -// ParseInternalKey parses the string representation of an internal key. The -// format is ... If the seq-num starts with a "b" it -// is marked as a batch-seq-num (i.e. the InternalKeySeqNumBatch bit is set). -func ParseInternalKey(s string) InternalKey { - x := strings.Split(s, ".") - ukey := x[0] - kind, ok := kindsMap[x[1]] - if !ok { - panic(fmt.Sprintf("unknown kind: %q", x[1])) +// ParseSeqNum parses the string representation of a sequence number. +// "inf" is supported as the maximum sequence number (mainly used for exclusive +// end keys). +func ParseSeqNum(s string) SeqNum { + if s == "inf" { + return SeqNumMax + } + batch := s[0] == 'b' + if batch { + s = s[1:] } - j := 0 - if x[2][0] == 'b' { - j = 1 + n, err := strconv.ParseUint(s, 10, 64) + if err != nil { + panic(fmt.Sprintf("error parsing %q as seqnum: %s", s, err)) } - seqNum, _ := strconv.ParseUint(x[2][j:], 10, 64) - if x[2][0] == 'b' { - seqNum |= InternalKeySeqNumBatch + seqNum := SeqNum(n) + if batch { + seqNum |= SeqNumBatchBit } - return MakeInternalKey([]byte(ukey), seqNum, kind) + return seqNum } // ParseKind parses the string representation of an internal key kind. @@ -283,12 +358,12 @@ const InternalTrailerLen = 8 // DecodeInternalKey decodes an encoded internal key. See InternalKey.Encode(). 
func DecodeInternalKey(encodedKey []byte) InternalKey { n := len(encodedKey) - InternalTrailerLen - var trailer uint64 + var trailer InternalKeyTrailer if n >= 0 { - trailer = binary.LittleEndian.Uint64(encodedKey[n:]) + trailer = InternalKeyTrailer(binary.LittleEndian.Uint64(encodedKey[n:])) encodedKey = encodedKey[:n:n] } else { - trailer = uint64(InternalKeyKindInvalid) + trailer = InternalKeyTrailer(InternalKeyKindInvalid) encodedKey = nil } return InternalKey{ @@ -306,26 +381,21 @@ func InternalCompare(userCmp Compare, a, b InternalKey) int { if x := userCmp(a.UserKey, b.UserKey); x != 0 { return x } - if a.Trailer > b.Trailer { - return -1 - } - if a.Trailer < b.Trailer { - return 1 - } - return 0 + // Reverse order for trailer comparison. + return cmp.Compare(b.Trailer, a.Trailer) } // Encode encodes the receiver into the buffer. The buffer must be large enough // to hold the encoded data. See InternalKey.Size(). func (k InternalKey) Encode(buf []byte) { i := copy(buf, k.UserKey) - binary.LittleEndian.PutUint64(buf[i:], k.Trailer) + binary.LittleEndian.PutUint64(buf[i:], uint64(k.Trailer)) } // EncodeTrailer returns the trailer encoded to an 8-byte array. 
func (k InternalKey) EncodeTrailer() [8]byte { var buf [8]byte - binary.LittleEndian.PutUint64(buf[:], k.Trailer) + binary.LittleEndian.PutUint64(buf[:], uint64(k.Trailer)) return buf } @@ -336,6 +406,9 @@ func (k InternalKey) EncodeTrailer() [8]byte { func (k InternalKey) Separator( cmp Compare, sep Separator, buf []byte, other InternalKey, ) InternalKey { + if invariants.Enabled && (len(k.UserKey) == 0 || len(other.UserKey) == 0) { + panic(errors.AssertionFailedf("empty keys passed to Separator: %s, %s", k, other)) + } buf = sep(buf, k.UserKey, other.UserKey) if len(buf) <= len(k.UserKey) && cmp(k.UserKey, buf) < 0 { // The separator user key is physically shorter than k.UserKey (if it is @@ -344,7 +417,7 @@ func (k InternalKey) Separator( // any sequence number and kind here to create a valid separator key. We // use the max sequence number to match the behavior of LevelDB and // RocksDB. - return MakeInternalKey(buf, InternalKeySeqNumMax, InternalKeyKindSeparator) + return MakeInternalKey(buf, SeqNumMax, InternalKeyKindSeparator) } return k } @@ -354,14 +427,14 @@ func (k InternalKey) Separator( // InternalKey.UserKey, though it is valid to pass a nil. func (k InternalKey) Successor(cmp Compare, succ Successor, buf []byte) InternalKey { buf = succ(buf, k.UserKey) - if len(buf) <= len(k.UserKey) && cmp(k.UserKey, buf) < 0 { + if (len(k.UserKey) == 0 || len(buf) <= len(k.UserKey)) && cmp(k.UserKey, buf) < 0 { // The successor user key is physically shorter that k.UserKey (if it is // longer, we'll continue to use "k"), but logically after. Tack on the max // sequence number to the shortened user key. Note that we could tack on // any sequence number and kind here to create a valid separator key. We // use the max sequence number to match the behavior of LevelDB and // RocksDB. 
- return MakeInternalKey(buf, InternalKeySeqNumMax, InternalKeyKindSeparator) + return MakeInternalKey(buf, SeqNumMax, InternalKeyKindSeparator) } return k } @@ -372,29 +445,32 @@ func (k InternalKey) Size() int { } // SetSeqNum sets the sequence number component of the key. -func (k *InternalKey) SetSeqNum(seqNum uint64) { - k.Trailer = (seqNum << 8) | (k.Trailer & 0xff) +func (k *InternalKey) SetSeqNum(seqNum SeqNum) { + k.Trailer = (InternalKeyTrailer(seqNum) << 8) | (k.Trailer & 0xff) } // SeqNum returns the sequence number component of the key. -func (k InternalKey) SeqNum() uint64 { - return k.Trailer >> 8 +func (k InternalKey) SeqNum() SeqNum { + return SeqNum(k.Trailer >> 8) } -// SeqNumFromTrailer returns the sequence number component of a trailer. -func SeqNumFromTrailer(t uint64) uint64 { - return t >> 8 +// IsUpperBoundFor returns true if a range ending in k contains the userKey: +// either userKey < k.UserKey or they are equal and k is not an exclusive +// sentinel. +func (k InternalKey) IsUpperBoundFor(cmp Compare, userKey []byte) bool { + c := cmp(userKey, k.UserKey) + return c < 0 || (c == 0 && !k.IsExclusiveSentinel()) } // Visible returns true if the key is visible at the specified snapshot // sequence number. -func (k InternalKey) Visible(snapshot, batchSnapshot uint64) bool { +func (k InternalKey) Visible(snapshot, batchSnapshot SeqNum) bool { return Visible(k.SeqNum(), snapshot, batchSnapshot) } // Visible returns true if a key with the provided sequence number is visible at // the specified snapshot sequence numbers. -func Visible(seqNum uint64, snapshot, batchSnapshot uint64) bool { +func Visible(seqNum SeqNum, snapshot, batchSnapshot SeqNum) bool { // There are two snapshot sequence numbers, one for committed keys and one // for batch keys. If a seqNum is less than `snapshot`, then seqNum // corresponds to a committed key that is visible. 
If seqNum has its batch @@ -408,23 +484,18 @@ func Visible(seqNum uint64, snapshot, batchSnapshot uint64) bool { // larger snapshot. We dictate that the maximal sequence number is always // visible. return seqNum < snapshot || - ((seqNum&InternalKeySeqNumBatch) != 0 && seqNum < batchSnapshot) || - seqNum == InternalKeySeqNumMax + ((seqNum&SeqNumBatchBit) != 0 && seqNum < batchSnapshot) || + seqNum == SeqNumMax } // SetKind sets the kind component of the key. func (k *InternalKey) SetKind(kind InternalKeyKind) { - k.Trailer = (k.Trailer &^ 0xff) | uint64(kind) + k.Trailer = (k.Trailer &^ 0xff) | InternalKeyTrailer(kind) } // Kind returns the kind component of the key. func (k InternalKey) Kind() InternalKeyKind { - return TrailerKind(k.Trailer) -} - -// TrailerKind returns the key kind of the key trailer. -func TrailerKind(trailer uint64) InternalKeyKind { - return InternalKeyKind(trailer & 0xff) + return k.Trailer.Kind() } // Valid returns true if the key has a valid kind. @@ -452,7 +523,7 @@ func (k *InternalKey) CopyFrom(k2 InternalKey) { // String returns a string representation of the key. func (k InternalKey) String() string { - return fmt.Sprintf("%s#%d,%d", FormatBytes(k.UserKey), k.SeqNum(), k.Kind()) + return fmt.Sprintf("%s#%s,%s", FormatBytes(k.UserKey), k.SeqNum(), k.Kind()) } // Pretty returns a formatter for the key. @@ -464,11 +535,13 @@ func (k InternalKey) Pretty(f FormatKey) fmt.Formatter { // with the same user key if used as an end boundary. See the comment on // InternalKeyRangeDeletionSentinel. 
func (k InternalKey) IsExclusiveSentinel() bool { + if k.SeqNum() != SeqNumMax { + return false + } switch kind := k.Kind(); kind { - case InternalKeyKindRangeDelete: - return k.Trailer == InternalKeyRangeDeleteSentinel - case InternalKeyKindRangeKeyDelete, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeySet: - return (k.Trailer >> 8) == InternalKeySeqNumMax + case InternalKeyKindRangeDelete, InternalKeyKindRangeKeyDelete, + InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeySet: + return true default: return false } @@ -480,27 +553,131 @@ type prettyInternalKey struct { } func (k prettyInternalKey) Format(s fmt.State, c rune) { - if seqNum := k.SeqNum(); seqNum == InternalKeySeqNumMax { - fmt.Fprintf(s, "%s#inf,%s", k.formatKey(k.UserKey), k.Kind()) - } else { - fmt.Fprintf(s, "%s#%d,%s", k.formatKey(k.UserKey), k.SeqNum(), k.Kind()) - } + fmt.Fprintf(s, "%s#%s,%s", k.formatKey(k.UserKey), k.SeqNum(), k.Kind()) } -// ParsePrettyInternalKey parses the pretty string representation of an -// internal key. The format is #,. -func ParsePrettyInternalKey(s string) InternalKey { +// ParseInternalKey parses the string representation of an internal key. The +// format is #,. The older format +// .. is also supported (for now). +// +// If the seq-num starts with a "b" it is marked as a batch-seq-num (i.e. the +// SeqNumBatchBit bit is set). +func ParseInternalKey(s string) InternalKey { + if !strings.Contains(s, "#") { + // Parse the old format: .. + // TODO(radu): get rid of this. 
+ x := strings.Split(s, ".") + if len(x) != 3 { + panic(fmt.Sprintf("invalid internal key %q", s)) + } + ukey := x[0] + kind, ok := kindsMap[x[1]] + if !ok { + panic(fmt.Sprintf("unknown kind: %q", x[1])) + } + seqNum := ParseSeqNum(x[2]) + return MakeInternalKey([]byte(ukey), seqNum, kind) + } x := strings.FieldsFunc(s, func(c rune) bool { return c == '#' || c == ',' }) - ukey := x[0] + if len(x) != 3 { + panic(fmt.Sprintf("invalid key internal %q", s)) + } + userKey := []byte(x[0]) + seqNum := ParseSeqNum(x[1]) kind, ok := kindsMap[x[2]] if !ok { panic(fmt.Sprintf("unknown kind: %q", x[2])) } - var seqNum uint64 - if x[1] == "max" || x[1] == "inf" { - seqNum = InternalKeySeqNumMax - } else { - seqNum, _ = strconv.ParseUint(x[1], 10, 64) + return MakeInternalKey(userKey, seqNum, kind) +} + +// ParseInternalKeyRange parses a string of the form: +// +// [#,-#,] +func ParseInternalKeyRange(s string) (start, end InternalKey) { + s, ok1 := strings.CutPrefix(s, "[") + s, ok2 := strings.CutSuffix(s, "]") + x := strings.Split(s, "-") + if !ok1 || !ok2 || len(x) != 2 { + panic(fmt.Sprintf("invalid key range %q", s)) + } + return ParseInternalKey(x[0]), ParseInternalKey(x[1]) +} + +// MakeInternalKV constructs an InternalKV with the provided internal key and +// value. The value is encoded in-place. +func MakeInternalKV(k InternalKey, v []byte) InternalKV { + return InternalKV{ + K: k, + V: MakeInPlaceValue(v), } - return MakeInternalKey([]byte(ukey), seqNum, kind) +} + +// InternalKV represents a single internal key-value pair. +type InternalKV struct { + K InternalKey + V InternalValue +} + +// Kind returns the KV's internal key kind. +func (kv *InternalKV) Kind() InternalKeyKind { + return kv.K.Kind() +} + +// SeqNum returns the KV's internal key sequence number. +func (kv *InternalKV) SeqNum() SeqNum { + return kv.K.SeqNum() +} + +// InPlaceValue returns the KV's in-place value. 
+func (kv *InternalKV) InPlaceValue() []byte { + return kv.V.InPlaceValue() +} + +// LazyValue returns a LazyValue containing the KV's value. +func (kv *InternalKV) LazyValue() LazyValue { + return kv.V.LazyValue() +} + +// Value returns the KV's underlying value. +func (kv *InternalKV) Value(buf []byte) (val []byte, callerOwned bool, err error) { + return kv.V.Value(buf) +} + +// Visible returns true if the key is visible at the specified snapshot +// sequence number. +func (kv *InternalKV) Visible(snapshot, batchSnapshot SeqNum) bool { + return Visible(kv.K.SeqNum(), snapshot, batchSnapshot) +} + +// IsExclusiveSentinel returns whether this key excludes point keys +// with the same user key if used as an end boundary. See the comment on +// InternalKeyRangeDeletionSentinel. +func (kv *InternalKV) IsExclusiveSentinel() bool { + return kv.K.IsExclusiveSentinel() +} + +// AtomicSeqNum is an atomic SeqNum. +type AtomicSeqNum struct { + value atomic.Uint64 +} + +// Load atomically loads and returns the stored SeqNum. +func (asn *AtomicSeqNum) Load() SeqNum { + return SeqNum(asn.value.Load()) +} + +// Store atomically stores s. +func (asn *AtomicSeqNum) Store(s SeqNum) { + asn.value.Store(uint64(s)) +} + +// Add atomically adds delta to asn and returns the new value. +func (asn *AtomicSeqNum) Add(delta SeqNum) SeqNum { + return SeqNum(asn.value.Add(uint64(delta))) +} + +// CompareAndSwap executes the compare-and-swap operation. 
+func (asn *AtomicSeqNum) CompareAndSwap(old, new SeqNum) bool { + return asn.value.CompareAndSwap(uint64(old), uint64(new)) } diff --git a/vendor/github.com/cockroachdb/pebble/internal/base/iterator.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/iterator.go similarity index 89% rename from vendor/github.com/cockroachdb/pebble/internal/base/iterator.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/base/iterator.go index c27cccc..63d8b2f 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/base/iterator.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/iterator.go @@ -5,10 +5,12 @@ package base import ( + "context" "fmt" "time" - "github.com/cockroachdb/pebble/internal/humanize" + "github.com/cockroachdb/pebble/v2/internal/humanize" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" "github.com/cockroachdb/redact" ) @@ -101,7 +103,7 @@ type InternalIterator interface { // is pointing at a valid entry, and (nil, nilv) otherwise. Note that SeekGE // only checks the upper bound. It is up to the caller to ensure that key // is greater than or equal to the lower bound. - SeekGE(key []byte, flags SeekGEFlags) (*InternalKey, LazyValue) + SeekGE(key []byte, flags SeekGEFlags) *InternalKV // SeekPrefixGE moves the iterator to the first key/value pair whose key is // greater than or equal to the given key. Returns the key and value if the @@ -109,13 +111,15 @@ type InternalIterator interface { // SeekPrefixGE only checks the upper bound. It is up to the caller to ensure // that key is greater than or equal to the lower bound. // - // The prefix argument is used by some InternalIterator implementations (e.g. - // sstable.Reader) to avoid expensive operations. A user-defined Split - // function must be supplied to the Comparer for the DB. The supplied prefix - // will be the prefix of the given key returned by that Split function. 
If - // the iterator is able to determine that no key with the prefix exists, it - // can return (nil,nilv). Unlike SeekGE, this is not an indication that - // iteration is exhausted. + // The prefix argument is used by some InternalIterator implementations + // (e.g. sstable.Reader) to avoid expensive operations. This operation is + // only useful when a user-defined Split function is supplied to the + // Comparer for the DB. The supplied prefix will be the prefix of the given + // key returned by that Split function. If the iterator is able to determine + // that no key with the prefix exists, it can return (nil,nilv). Unlike + // SeekGE, this is not an indication that iteration is exhausted. The prefix + // byte slice is guaranteed to be stable until the next absolute positioning + // operation. // // Note that the iterator may return keys not matching the prefix. It is up // to the caller to check if the prefix matches. @@ -126,28 +130,28 @@ type InternalIterator interface { // not supporting reverse iteration in prefix iteration mode until a // different positioning routine (SeekGE, SeekLT, First or Last) switches the // iterator out of prefix iteration. - SeekPrefixGE(prefix, key []byte, flags SeekGEFlags) (*InternalKey, LazyValue) + SeekPrefixGE(prefix, key []byte, flags SeekGEFlags) *InternalKV // SeekLT moves the iterator to the last key/value pair whose key is less // than the given key. Returns the key and value if the iterator is pointing // at a valid entry, and (nil, nilv) otherwise. Note that SeekLT only checks // the lower bound. It is up to the caller to ensure that key is less than // the upper bound. - SeekLT(key []byte, flags SeekLTFlags) (*InternalKey, LazyValue) + SeekLT(key []byte, flags SeekLTFlags) *InternalKV - // First moves the iterator the the first key/value pair. Returns the key and + // First moves the iterator the first key/value pair. 
Returns the key and // value if the iterator is pointing at a valid entry, and (nil, nilv) // otherwise. Note that First only checks the upper bound. It is up to the // caller to ensure that First() is not called when there is a lower bound, // and instead call SeekGE(lower). - First() (*InternalKey, LazyValue) + First() *InternalKV - // Last moves the iterator the the last key/value pair. Returns the key and + // Last moves the iterator the last key/value pair. Returns the key and // value if the iterator is pointing at a valid entry, and (nil, nilv) // otherwise. Note that Last only checks the lower bound. It is up to the // caller to ensure that Last() is not called when there is an upper bound, // and instead call SeekLT(upper). - Last() (*InternalKey, LazyValue) + Last() *InternalKV // Next moves the iterator to the next key/value pair. Returns the key and // value if the iterator is pointing at a valid entry, and (nil, nilv) @@ -158,7 +162,7 @@ type InternalIterator interface { // key/value pair due to either a prior call to SeekLT or Prev which returned // (nil, nilv). It is not allowed to call Next when the previous call to SeekGE, // SeekPrefixGE or Next returned (nil, nilv). - Next() (*InternalKey, LazyValue) + Next() *InternalKV // NextPrefix moves the iterator to the next key/value pair with a different // prefix than the key at the current iterator position. Returns the key and @@ -174,7 +178,7 @@ type InternalIterator interface { // positioning operation or a call to a forward positioning method that // returned (nil, nilv). It is also not allowed to call NextPrefix when the // iterator is in prefix iteration mode. - NextPrefix(succKey []byte) (*InternalKey, LazyValue) + NextPrefix(succKey []byte) *InternalKV // Prev moves the iterator to the previous key/value pair. 
Returns the key // and value if the iterator is pointing at a valid entry, and (nil, nilv) @@ -185,7 +189,7 @@ type InternalIterator interface { // key/value pair due to either a prior call to SeekGE or Next which returned // (nil, nilv). It is not allowed to call Prev when the previous call to SeekLT // or Prev returned (nil, nilv). - Prev() (*InternalKey, LazyValue) + Prev() *InternalKV // Error returns any accumulated error. It may not include errors returned // to the client when calling LazyValue.Value(). @@ -193,8 +197,10 @@ type InternalIterator interface { // Close closes the iterator and returns any accumulated error. Exhausting // all the key/value pairs in a table is not considered to be an error. - // It is valid to call Close multiple times. Other methods should not be - // called after the iterator has been closed. + // + // Once Close is called, the iterator should not be used again. Specific + // implementations may support multiple calls to Close (but no other calls + // after the first Close). Close() error // SetBounds sets the lower and upper bounds for the iterator. Note that the @@ -207,7 +213,23 @@ type InternalIterator interface { // optimizations. SetBounds(lower, upper []byte) + // SetContext replaces the context provided at iterator creation, or the + // last one provided by SetContext. + SetContext(ctx context.Context) + fmt.Stringer + + IteratorDebug +} + +// TopLevelIterator extends InternalIterator to include an additional absolute +// positioning method, SeekPrefixGEStrict. +type TopLevelIterator interface { + InternalIterator + + // SeekPrefixGEStrict extends InternalIterator.SeekPrefixGE with a guarantee + // that the iterator only returns keys matching the prefix. + SeekPrefixGEStrict(prefix, key []byte, flags SeekGEFlags) *InternalKV } // SeekGEFlags holds flags that may configure the behavior of a forward seek. 
@@ -373,7 +395,7 @@ type InternalIteratorStats struct { PointCount uint64 // Points that were iterated over that were covered by range tombstones. It // can be useful for discovering instances of - // https://github.com/cockroachdb/pebble/issues/1070. + // https://github.com/cockroachdb/pebble/v2/issues/1070. PointsCoveredByRangeTombstones uint64 // Stats related to points in value blocks encountered during iteration. @@ -394,6 +416,9 @@ type InternalIteratorStats struct { // ValueBytesFetched is the total byte length of the values (in value // blocks) that were retrieved. ValueBytesFetched uint64 + + // TODO(jackson): Add stats for distinguishing between value-block + // values and blob values. } } @@ -442,3 +467,19 @@ func (s *InternalIteratorStats) SafeFormat(p redact.SafePrinter, verb rune) { humanize.Bytes.Uint64(s.SeparatedPointValue.ValueBytesFetched)) } } + +// IteratorDebug is an interface implemented by all internal iterators and +// fragment iterators. +type IteratorDebug interface { + // DebugTree prints the entire iterator stack, used for debugging. + // + // Each implementation should perform a single Child/Childf call on tp. + DebugTree(tp treeprinter.Node) +} + +// DebugTree returns the iterator tree as a multi-line string. +func DebugTree(iter IteratorDebug) string { + tp := treeprinter.New() + iter.DebugTree(tp) + return tp.String() +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/base/key_bounds.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/key_bounds.go new file mode 100644 index 0000000..9467300 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/key_bounds.go @@ -0,0 +1,247 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package base + +import ( + "fmt" + "slices" + + "github.com/cockroachdb/pebble/v2/internal/invariants" +) + +// KeyRange encodes a key range in user key space. A KeyRange's Start is +// inclusive while its End is exclusive. +// +// KeyRange is equivalent to UserKeyBounds with exclusive end. +type KeyRange struct { + Start, End []byte +} + +// Valid returns true if the KeyRange is defined. +func (k *KeyRange) Valid() bool { + return k.Start != nil && k.End != nil +} + +// Contains returns whether the specified key exists in the KeyRange. +func (k *KeyRange) Contains(cmp Compare, key InternalKey) bool { + v := cmp(key.UserKey, k.End) + return (v < 0 || (v == 0 && key.IsExclusiveSentinel())) && cmp(k.Start, key.UserKey) <= 0 +} + +// UserKeyBounds returns the KeyRange as UserKeyBounds. Also implements the internal `bounded` interface. +func (k KeyRange) UserKeyBounds() UserKeyBounds { + return UserKeyBoundsEndExclusive(k.Start, k.End) +} + +// OverlapsInternalKeyRange checks if the specified internal key range has an +// overlap with the KeyRange. Note that we aren't checking for full containment +// of smallest-largest within k, rather just that there's some intersection +// between the two ranges. +func (k *KeyRange) OverlapsInternalKeyRange(cmp Compare, smallest, largest InternalKey) bool { + ukb := k.UserKeyBounds() + b := UserKeyBoundsFromInternal(smallest, largest) + return ukb.Overlaps(cmp, &b) +} + +// OverlapsKeyRange checks if this span overlaps with the provided KeyRange. +// Note that we aren't checking for full containment of either span in the other, +// just that there's a key x that is in both key ranges. +func (k *KeyRange) OverlapsKeyRange(cmp Compare, span KeyRange) bool { + return cmp(k.Start, span.End) < 0 && cmp(k.End, span.Start) > 0 +} + +// BoundaryKind indicates if a boundary is exclusive or inclusive. +type BoundaryKind uint8 + +// The two possible values of BoundaryKind. 
+// +// Note that we prefer Exclusive to be the zero value, so that zero +// UserKeyBounds are not valid. +const ( + Exclusive BoundaryKind = iota + Inclusive +) + +// UserKeyBoundary represents the endpoint of a bound which can be exclusive or +// inclusive. +type UserKeyBoundary struct { + Key []byte + Kind BoundaryKind +} + +// UserKeyInclusive creates an inclusive user key boundary. +func UserKeyInclusive(userKey []byte) UserKeyBoundary { + return UserKeyBoundary{ + Key: userKey, + Kind: Inclusive, + } +} + +// UserKeyExclusive creates an exclusive user key boundary. +func UserKeyExclusive(userKey []byte) UserKeyBoundary { + return UserKeyBoundary{ + Key: userKey, + Kind: Exclusive, + } +} + +// UserKeyExclusiveIf creates a user key boundary which can be either inclusive +// or exclusive. +func UserKeyExclusiveIf(userKey []byte, exclusive bool) UserKeyBoundary { + kind := Inclusive + if exclusive { + kind = Exclusive + } + return UserKeyBoundary{ + Key: userKey, + Kind: kind, + } +} + +// IsUpperBoundFor returns true if the boundary is an upper bound for the key; +// i.e. the key is less than the boundary key OR they are equal and the boundary +// is inclusive. +func (eb UserKeyBoundary) IsUpperBoundFor(cmp Compare, userKey []byte) bool { + c := cmp(userKey, eb.Key) + return c < 0 || (c == 0 && eb.Kind == Inclusive) +} + +// IsUpperBoundForInternalKey returns true if boundary is an upper bound for the +// given internal key. +func (eb UserKeyBoundary) IsUpperBoundForInternalKey(cmp Compare, key InternalKey) bool { + c := cmp(key.UserKey, eb.Key) + return c < 0 || (c == 0 && (eb.Kind == Inclusive || key.IsExclusiveSentinel())) +} + +// CompareUpperBounds compares two UserKeyBoundaries as upper bounds (e.g. when +// they are used for UserKeyBounds.End). 
+func (eb UserKeyBoundary) CompareUpperBounds(cmp Compare, other UserKeyBoundary) int { + switch c := cmp(eb.Key, other.Key); { + case c != 0: + return c + case eb.Kind == other.Kind: + return 0 + case eb.Kind == Inclusive: + // eb is inclusive, other is exclusive. + return 1 + default: + // eb is exclusive, other is inclusive. + return -1 + } +} + +// UserKeyBounds is a user key interval with an inclusive start boundary and +// with an end boundary that can be either inclusive or exclusive. +type UserKeyBounds struct { + Start []byte + End UserKeyBoundary +} + +// UserKeyBoundsInclusive creates the bounds [start, end]. +func UserKeyBoundsInclusive(start []byte, end []byte) UserKeyBounds { + return UserKeyBounds{ + Start: start, + End: UserKeyInclusive(end), + } +} + +// UserKeyBoundsEndExclusive creates the bounds [start, end). +func UserKeyBoundsEndExclusive(start []byte, end []byte) UserKeyBounds { + return UserKeyBounds{ + Start: start, + End: UserKeyExclusive(end), + } +} + +// UserKeyBoundsEndExclusiveIf creates either [start, end] or [start, end) bounds. +func UserKeyBoundsEndExclusiveIf(start []byte, end []byte, exclusive bool) UserKeyBounds { + return UserKeyBounds{ + Start: start, + End: UserKeyExclusiveIf(end, exclusive), + } +} + +// UserKeyBoundsFromInternal creates the bounds +// [smallest.UserKey, largest.UserKey] or [smallest.UserKey, largest.UserKey) if +// largest is an exclusive sentinel. +// +// smallest must not be an exclusive sentinel. +func UserKeyBoundsFromInternal(smallest, largest InternalKey) UserKeyBounds { + if invariants.Enabled && smallest.IsExclusiveSentinel() { + panic("smallest key is exclusive sentinel") + } + return UserKeyBoundsEndExclusiveIf(smallest.UserKey, largest.UserKey, largest.IsExclusiveSentinel()) +} + +// Valid returns true if the bounds contain at least a user key. +func (b *UserKeyBounds) Valid(cmp Compare) bool { + return b.End.IsUpperBoundFor(cmp, b.Start) +} + +// Overlaps returns true if the bounds overlap. 
+func (b *UserKeyBounds) Overlaps(cmp Compare, other *UserKeyBounds) bool { + // There is no overlap iff one interval starts after the other ends. + return other.End.IsUpperBoundFor(cmp, b.Start) && b.End.IsUpperBoundFor(cmp, other.Start) +} + +// ContainsBounds returns true if b completely overlaps other. +func (b *UserKeyBounds) ContainsBounds(cmp Compare, other *UserKeyBounds) bool { + if cmp(b.Start, other.Start) > 0 { + return false + } + return other.End.CompareUpperBounds(cmp, b.End) <= 0 +} + +// ContainsUserKey returns true if the user key is within the bounds. +func (b *UserKeyBounds) ContainsUserKey(cmp Compare, userKey []byte) bool { + return cmp(b.Start, userKey) <= 0 && b.End.IsUpperBoundFor(cmp, userKey) +} + +// ContainsInternalKey returns true if the internal key is within the bounds. +func (b *UserKeyBounds) ContainsInternalKey(cmp Compare, key InternalKey) bool { + c := cmp(b.Start, key.UserKey) + return (c < 0 || (c == 0 && !key.IsExclusiveSentinel())) && + b.End.IsUpperBoundForInternalKey(cmp, key) +} + +// Clone returns a copy of the bounds. +func (b UserKeyBounds) Clone() UserKeyBounds { + return UserKeyBounds{ + Start: slices.Clone(b.Start), + End: UserKeyBoundary{Key: slices.Clone(b.End.Key), Kind: b.End.Kind}, + } +} + +func (b UserKeyBounds) String() string { + return b.Format(DefaultFormatter) +} + +// Format converts the bounds to a string of the form "[foo, bar]" or +// "[foo, bar)", using the given key formatter. +func (b UserKeyBounds) Format(fmtKey FormatKey) string { + endC := ']' + if b.End.Kind == Exclusive { + endC = ')' + } + return fmt.Sprintf("[%s, %s%c", fmtKey(b.Start), fmtKey(b.End.Key), endC) +} + +// Union returns bounds that encompass both the receiver and the provided +// bounds. +// +// If the receiver has nil bounds, the other bounds are returned. 
+func (b *UserKeyBounds) Union(cmp Compare, other UserKeyBounds) UserKeyBounds { + if b.Start == nil && b.End.Key == nil { + return other + } + union := *b + if cmp(union.Start, other.Start) > 0 { + union.Start = other.Start + } + if union.End.CompareUpperBounds(cmp, other.End) < 0 { + union.End = other.End + } + return union +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/base/lazy_value.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/lazy_value.go similarity index 85% rename from vendor/github.com/cockroachdb/pebble/internal/base/lazy_value.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/base/lazy_value.go index cc6d56d..d784d54 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/base/lazy_value.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/lazy_value.go @@ -4,7 +4,7 @@ package base -import "github.com/cockroachdb/pebble/internal/invariants" +import "context" // A value can have user-defined attributes that are a function of the value // byte slice. For now, we only support "short attributes", which can be @@ -40,7 +40,7 @@ type ShortAttributeExtractor func( // AttributeAndLen represents the pair of value length and the short // attribute. type AttributeAndLen struct { - ValueLen int32 + ValueLen uint32 ShortAttribute ShortAttribute } @@ -157,23 +157,21 @@ type LazyValue struct { // LazyFetcher supports fetching a lazy value. // -// Fetcher and Attribute are to be initialized at creation time. The fields -// are arranged to reduce the sizeof this struct. +// The fields are to be initialized at creation time. type LazyFetcher struct { // Fetcher, given a handle, returns the value. Fetcher ValueFetcher - err error - value []byte // Attribute includes the short attribute and value length. - Attribute AttributeAndLen - fetched bool - callerOwned bool + Attribute AttributeAndLen + // BlobFileID identifies the blob file containing the value. It is only + // populated if the value is stored in a blob file. 
+ BlobFileID BlobFileID } // ValueFetcher is an interface for fetching a value. type ValueFetcher interface { - // Fetch returns the value, given the handle. It is acceptable to call the - // ValueFetcher.Fetch as long as the DB is open. However, one should assume + // FetchHandle returns the value, given the handle. It is acceptable to call the + // ValueFetcher.FetchHandle as long as the DB is open. However, one should assume // there is a fast-path when the iterator tree has not moved off the sstable // iterator that initially provided this LazyValue. Hence, to utilize this // fast-path the caller should try to decide whether it needs the value or @@ -183,45 +181,19 @@ type ValueFetcher interface { // If the fetcher attempted to use buf *and* len(buf) was insufficient, it // will allocate a new slice for the value. In either case it will set // callerOwned to true. - Fetch( - handle []byte, valLen int32, buf []byte) (val []byte, callerOwned bool, err error) + FetchHandle( + ctx context.Context, handle []byte, blobFileID BlobFileID, valLen uint32, buf []byte, + ) (val []byte, callerOwned bool, err error) } // Value returns the underlying value. func (lv *LazyValue) Value(buf []byte) (val []byte, callerOwned bool, err error) { - if lv.Fetcher == nil { - return lv.ValueOrHandle, false, nil - } - // Do the rest of the work in a separate method to attempt mid-stack - // inlining of Value(). Unfortunately, this still does not inline since the - // cost of 85 exceeds the budget of 80. - // - // TODO(sumeer): Packing the return values into a struct{[]byte error bool} - // causes it to be below the budget. Consider this if we need to recover - // more performance. I suspect that inlining this only matters in - // micro-benchmarks, and in actual use cases in CockroachDB it will not - // matter because there is substantial work done with a fetched value. 
- return lv.fetchValue(buf) -} - -// INVARIANT: lv.Fetcher != nil -func (lv *LazyValue) fetchValue(buf []byte) (val []byte, callerOwned bool, err error) { f := lv.Fetcher - if !f.fetched { - f.fetched = true - f.value, f.callerOwned, f.err = f.Fetcher.Fetch( - lv.ValueOrHandle, lv.Fetcher.Attribute.ValueLen, buf) - } - return f.value, f.callerOwned, f.err -} - -// InPlaceValue returns the value under the assumption that it is in-place. -// This is for Pebble-internal code. -func (lv *LazyValue) InPlaceValue() []byte { - if invariants.Enabled && lv.Fetcher != nil { - panic("value must be in-place") + if f == nil { + return lv.ValueOrHandle, false, nil } - return lv.ValueOrHandle + return f.Fetcher.FetchHandle(context.TODO(), + lv.ValueOrHandle, f.BlobFileID, f.Attribute.ValueLen, buf) } // Len returns the length of the value. @@ -265,8 +237,9 @@ func (lv *LazyValue) Clone(buf []byte, fetcher *LazyFetcher) (LazyValue, []byte) var lvCopy LazyValue if lv.Fetcher != nil { *fetcher = LazyFetcher{ - Fetcher: lv.Fetcher.Fetcher, - Attribute: lv.Fetcher.Attribute, + Fetcher: lv.Fetcher.Fetcher, + Attribute: lv.Fetcher.Attribute, + BlobFileID: lv.Fetcher.BlobFileID, // Not copying anything that has been extracted. } lvCopy.Fetcher = fetcher @@ -281,7 +254,20 @@ func (lv *LazyValue) Clone(buf []byte, fetcher *LazyFetcher) (LazyValue, []byte) return lvCopy, buf } -// MakeInPlaceValue constructs an in-place value. -func MakeInPlaceValue(val []byte) LazyValue { - return LazyValue{ValueOrHandle: val} +// NoBlobFetches is a ValueFetcher that returns an error. It's intended to be +// used in situations where sstables should not encode a blob value, or the +// caller should not fetch the handle's value. +var NoBlobFetches = errValueFetcher{} + +// errValueFetcher is a ValueFetcher that returns an error. +type errValueFetcher struct{} + +var _ ValueFetcher = errValueFetcher{} + +// FetchHandle implements base.ValueFetcher. 
+func (e errValueFetcher) FetchHandle( + _ context.Context, _ []byte, blobFileID BlobFileID, valLen uint32, _ []byte, +) (val []byte, callerOwned bool, err error) { + err = AssertionFailedf("unexpected blob value: %d-byte from %s", valLen, blobFileID) + return nil, false, err } diff --git a/vendor/github.com/cockroachdb/pebble/internal/base/logger.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/logger.go similarity index 79% rename from vendor/github.com/cockroachdb/pebble/internal/base/logger.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/base/logger.go index e4a2f44..a0308e9 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/base/logger.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/logger.go @@ -10,15 +10,15 @@ import ( "fmt" "log" "os" - "runtime" "sync" - "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/invariants" ) // Logger defines an interface for writing log messages. type Logger interface { Infof(format string, args ...interface{}) + Errorf(format string, args ...interface{}) Fatalf(format string, args ...interface{}) } type defaultLogger struct{} @@ -33,6 +33,11 @@ func (defaultLogger) Infof(format string, args ...interface{}) { _ = log.Output(2, fmt.Sprintf(format, args...)) } +// Errorf implements the Logger.Errorf interface. +func (defaultLogger) Errorf(format string, args ...interface{}) { + _ = log.Output(2, fmt.Sprintf(format, args...)) +} + // Fatalf implements the Logger.Fatalf interface. func (defaultLogger) Fatalf(format string, args ...interface{}) { _ = log.Output(2, fmt.Sprintf(format, args...)) @@ -75,21 +80,27 @@ func (b *InMemLogger) Infof(format string, args ...interface{}) { } } +// Errorf is part of the Logger interface. +func (b *InMemLogger) Errorf(format string, args ...interface{}) { + b.Infof(format, args...) +} + // Fatalf is part of the Logger interface. 
func (b *InMemLogger) Fatalf(format string, args ...interface{}) { - b.Infof(format, args...) - runtime.Goexit() + b.Infof("FATAL: "+format, args...) } // LoggerAndTracer defines an interface for logging and tracing. type LoggerAndTracer interface { Logger // Eventf formats and emits a tracing log, if tracing is enabled in the - // current context. + // current context. It can also emit to a regular log, if expensive + // logging is enabled. Eventf(ctx context.Context, format string, args ...interface{}) - // IsTracingEnabled returns true if tracing is enabled. It can be used as an - // optimization to avoid calling Eventf (which will be a noop when tracing - // is not enabled) to avoid the overhead of boxing the args. + // IsTracingEnabled returns true if tracing is enabled for this context, + // or expensive logging is enabled. It can be used as an optimization to + // avoid calling Eventf (which will be a noop when tracing or expensive + // logging is not enabled) to avoid the overhead of boxing the args. IsTracingEnabled(ctx context.Context) bool } @@ -125,6 +136,9 @@ var _ LoggerAndTracer = NoopLoggerAndTracer{} // Infof implements LoggerAndTracer. func (l NoopLoggerAndTracer) Infof(format string, args ...interface{}) {} +// Errorf implements LoggerAndTracer. +func (l NoopLoggerAndTracer) Errorf(format string, args ...interface{}) {} + // Fatalf implements LoggerAndTracer. 
func (l NoopLoggerAndTracer) Fatalf(format string, args ...interface{}) {} diff --git a/vendor/github.com/cockroachdb/pebble/internal/base/merger.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/merger.go similarity index 98% rename from vendor/github.com/cockroachdb/pebble/internal/base/merger.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/base/merger.go index 757d150..c483109 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/base/merger.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/merger.go @@ -7,7 +7,7 @@ package base import "io" // Merge creates a ValueMerger for the specified key initialized with the value -// of one merge operand. +// of one merge operand. The caller retains ownership of key and value. type Merge func(key, value []byte) (ValueMerger, error) // ValueMerger receives merge operands one by one. The operand received is either diff --git a/vendor/github.com/cockroachdb/pebble/internal/base/metrics.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/metrics.go similarity index 100% rename from vendor/github.com/cockroachdb/pebble/internal/base/metrics.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/base/metrics.go diff --git a/vendor/github.com/cockroachdb/pebble/internal/base/options.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/options.go similarity index 73% rename from vendor/github.com/cockroachdb/pebble/internal/base/options.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/base/options.go index 316717e..0877a14 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/base/options.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/options.go @@ -6,9 +6,10 @@ package base // SSTable block defaults. 
const ( - DefaultBlockRestartInterval = 16 - DefaultBlockSize = 4096 - DefaultBlockSizeThreshold = 90 + DefaultBlockRestartInterval = 16 + DefaultBlockSize = 4096 + DefaultBlockSizeThreshold = 90 + SizeClassAwareBlockSizeThreshold = 60 ) // FilterType is the level at which to apply a filter: block or table. @@ -64,6 +65,15 @@ type FilterPolicy interface { NewWriter(ftype FilterType) FilterWriter } +// NoFilterPolicy implements the "none" filter policy. +var NoFilterPolicy FilterPolicy = noFilter{} + +type noFilter struct{} + +func (noFilter) Name() string { return "none" } +func (noFilter) MayContain(ftype FilterType, filter, key []byte) bool { return true } +func (noFilter) NewWriter(ftype FilterType) FilterWriter { panic("not implemented") } + // BlockPropertyFilter is used in an Iterator to filter sstables and blocks // within the sstable. It should not maintain any per-sstable state, and must // be thread-safe. @@ -73,4 +83,9 @@ type BlockPropertyFilter interface { // Intersects returns true if the set represented by prop intersects with // the set in the filter. Intersects(prop []byte) (bool, error) + // SyntheticSuffixIntersects runs Intersects, but only after using the passed in + // suffix arg to modify a decoded copy of the passed in prop. This method only + // needs to be implemented for filters which that will be used with suffix + // replacement. + SyntheticSuffixIntersects(prop []byte, suffix []byte) (bool, error) } diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/base/test_utils.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/test_utils.go new file mode 100644 index 0000000..f6cab13 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/test_utils.go @@ -0,0 +1,270 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package base + +import ( + "context" + "fmt" + "io" + "strconv" + "strings" + + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +// NewDeletableSumValueMerger return a ValueMerger which computes the sum of its +// arguments, but transforms a zero sum into a non-existent entry. +func NewDeletableSumValueMerger(key, value []byte) (ValueMerger, error) { + m := &deletableSumValueMerger{} + return m, m.MergeNewer(value) +} + +type deletableSumValueMerger struct { + sum int64 +} + +func (m *deletableSumValueMerger) parseAndCalculate(value []byte) error { + v, err := strconv.ParseInt(string(value), 10, 64) + if err == nil { + m.sum += v + } + return err +} + +func (m *deletableSumValueMerger) MergeNewer(value []byte) error { + return m.parseAndCalculate(value) +} + +func (m *deletableSumValueMerger) MergeOlder(value []byte) error { + return m.parseAndCalculate(value) +} + +func (m *deletableSumValueMerger) Finish(includesBase bool) ([]byte, io.Closer, error) { + if m.sum == 0 { + return nil, nil, nil + } + return []byte(strconv.FormatInt(m.sum, 10)), nil, nil +} + +func (m *deletableSumValueMerger) DeletableFinish( + includesBase bool, +) ([]byte, bool, io.Closer, error) { + value, closer, err := m.Finish(includesBase) + return value, len(value) == 0, closer, err +} + +// FakeKVs constructs InternalKVs from the given key strings, in the format +// "key:seq-num". The values are empty. +func FakeKVs(keys ...string) []InternalKV { + kvs := make([]InternalKV, len(keys)) + for i, k := range keys { + kvs[i] = InternalKV{K: fakeIkey(k)} + } + return kvs +} + +func fakeIkey(s string) InternalKey { + j := strings.Index(s, ":") + seqNum, err := strconv.Atoi(s[j+1:]) + if err != nil { + panic(err) + } + return MakeInternalKey([]byte(s[:j]), SeqNum(seqNum), InternalKeyKindSet) +} + +// NewFakeIter returns an iterator over the given KVs. 
+func NewFakeIter(kvs []InternalKV) *FakeIter { + return &FakeIter{ + kvs: kvs, + index: 0, + valid: len(kvs) > 0, + } +} + +// FakeIter is an iterator over a fixed set of KVs. +type FakeIter struct { + lower []byte + upper []byte + kvs []InternalKV + index int + valid bool + closeErr error +} + +// FakeIter implements the InternalIterator interface. +var _ InternalIterator = (*FakeIter)(nil) + +// SetCloseErr causes future calls to Error() and Close() to return this error. +func (f *FakeIter) SetCloseErr(closeErr error) { + f.closeErr = closeErr +} + +func (f *FakeIter) String() string { + return "fake" +} + +// SeekGE is part of the InternalIterator interface. +func (f *FakeIter) SeekGE(key []byte, flags SeekGEFlags) *InternalKV { + f.valid = false + for f.index = 0; f.index < len(f.kvs); f.index++ { + if DefaultComparer.Compare(key, f.key().UserKey) <= 0 { + if f.upper != nil && DefaultComparer.Compare(f.upper, f.key().UserKey) <= 0 { + return nil + } + f.valid = true + return f.KV() + } + } + return nil +} + +// SeekPrefixGE is part of the InternalIterator interface. +func (f *FakeIter) SeekPrefixGE(prefix, key []byte, flags SeekGEFlags) *InternalKV { + return f.SeekGE(key, flags) +} + +// SeekLT is part of the InternalIterator interface. +func (f *FakeIter) SeekLT(key []byte, flags SeekLTFlags) *InternalKV { + f.valid = false + for f.index = len(f.kvs) - 1; f.index >= 0; f.index-- { + if DefaultComparer.Compare(key, f.key().UserKey) > 0 { + if f.lower != nil && DefaultComparer.Compare(f.lower, f.key().UserKey) > 0 { + return nil + } + f.valid = true + return f.KV() + } + } + return nil +} + +// First is part of the InternalIterator interface. +func (f *FakeIter) First() *InternalKV { + f.valid = false + f.index = -1 + if kv := f.Next(); kv == nil { + return nil + } + if f.upper != nil && DefaultComparer.Compare(f.upper, f.key().UserKey) <= 0 { + return nil + } + f.valid = true + return f.KV() +} + +// Last is part of the InternalIterator interface. 
+func (f *FakeIter) Last() *InternalKV { + f.valid = false + f.index = len(f.kvs) + if kv := f.Prev(); kv == nil { + return nil + } + if f.lower != nil && DefaultComparer.Compare(f.lower, f.key().UserKey) > 0 { + return nil + } + f.valid = true + return f.KV() +} + +// Next is part of the InternalIterator interface. +func (f *FakeIter) Next() *InternalKV { + f.valid = false + if f.index == len(f.kvs) { + return nil + } + f.index++ + if f.index == len(f.kvs) { + return nil + } + if f.upper != nil && DefaultComparer.Compare(f.upper, f.key().UserKey) <= 0 { + return nil + } + f.valid = true + return f.KV() +} + +// Prev is part of the InternalIterator interface. +func (f *FakeIter) Prev() *InternalKV { + f.valid = false + if f.index < 0 { + return nil + } + f.index-- + if f.index < 0 { + return nil + } + if f.lower != nil && DefaultComparer.Compare(f.lower, f.key().UserKey) > 0 { + return nil + } + f.valid = true + return f.KV() +} + +// NextPrefix is part of the InternalIterator interface. +func (f *FakeIter) NextPrefix(succKey []byte) *InternalKV { + return f.SeekGE(succKey, SeekGEFlagsNone) +} + +// key returns the current Key the iterator is positioned at regardless of the +// value of f.valid. +func (f *FakeIter) key() *InternalKey { + return &f.kvs[f.index].K +} + +// KV is part of the InternalIterator interface. +func (f *FakeIter) KV() *InternalKV { + if f.valid { + return &f.kvs[f.index] + } + // It is invalid to call Key() when Valid() returns false. Rather than + // returning nil here which would technically be more correct, return a + // non-nil key which is the behavior of some InternalIterator + // implementations. This provides better testing of users of + // InternalIterators. + if f.index < 0 { + return &f.kvs[0] + } + return &f.kvs[len(f.kvs)-1] +} + +// Valid is part of the InternalIterator interface. +func (f *FakeIter) Valid() bool { + return f.index >= 0 && f.index < len(f.kvs) && f.valid +} + +// Error is part of the InternalIterator interface. 
+func (f *FakeIter) Error() error { + return f.closeErr +} + +// Close is part of the InternalIterator interface. +func (f *FakeIter) Close() error { + return f.closeErr +} + +// SetBounds is part of the InternalIterator interface. +func (f *FakeIter) SetBounds(lower, upper []byte) { + f.lower = lower + f.upper = upper +} + +// SetContext is part of the InternalIterator interface. +func (f *FakeIter) SetContext(_ context.Context) {} + +// DebugTree is part of the InternalIterator interface. +func (f *FakeIter) DebugTree(tp treeprinter.Node) { + tp.Childf("%T(%p)", f, f) +} + +// ParseUserKeyBounds parses UserKeyBounds from a string representation of the +// form "[foo, bar]" or "[foo, bar)". +func ParseUserKeyBounds(s string) UserKeyBounds { + first, last, s := s[0], s[len(s)-1], s[1:len(s)-1] + start, end, ok := strings.Cut(s, ", ") + if !ok || first != '[' || (last != ']' && last != ')') { + panic(fmt.Sprintf("invalid bounds %q", s)) + } + return UserKeyBoundsEndExclusiveIf([]byte(start), []byte(end), last == ')') +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/base/value.go b/vendor/github.com/cockroachdb/pebble/v2/internal/base/value.go new file mode 100644 index 0000000..ec83403 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/base/value.go @@ -0,0 +1,90 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package base + +import "github.com/cockroachdb/pebble/v2/internal/invariants" + +// An InternalValue represents a value. The value may be in-memory, immediately +// accessible, or it may be stored out-of-band and need to be fetched when +// required. +// +// InternalValue is distinct from LazyValue. The LazyValue type is used within +// Pebble's public interface, while InternalValue is an intermediary +// representation of a value used only internally within Pebble. 
+type InternalValue struct { + lazyValue LazyValue +} + +// MakeLazyValue constructs an InternalValue from a LazyValue. +func MakeLazyValue(v LazyValue) InternalValue { + return InternalValue{lazyValue: v} +} + +// MakeInPlaceValue constructs an in-place value. +func MakeInPlaceValue(val []byte) InternalValue { + return InternalValue{lazyValue: LazyValue{ValueOrHandle: val}} +} + +// IsBlobValueHandle returns true iff the value is a blob value handle, pointing +// to a value stored externally in a blob file. +func (v *InternalValue) IsBlobValueHandle() bool { + f := v.lazyValue.Fetcher + return f != nil && f.BlobFileID > 0 +} + +// IsInPlaceValue returns true iff the value was stored in-place and does not +// need to be fetched externally. +func (v *InternalValue) IsInPlaceValue() bool { + return v.lazyValue.Fetcher == nil +} + +// InPlaceValue returns the value under the assumption that it is in-place. +// This is for Pebble-internal code. +func (v *InternalValue) InPlaceValue() []byte { + if invariants.Enabled && v.lazyValue.Fetcher != nil { + panic("value must be in-place") + } + return v.lazyValue.ValueOrHandle +} + +// LazyValue returns the InternalValue as a LazyValue. +func (v *InternalValue) LazyValue() LazyValue { + return v.lazyValue +} + +// Len returns the length of the value. This is the length of the logical value +// (i.e., the length of the byte slice returned by .Value()) +func (v *InternalValue) Len() int { + return v.lazyValue.Len() +} + +// InternalLen returns the length of the value, if the value is in-place, or the +// length of the handle describing the location of the value if the value is +// stored out-of-band. +func (v *InternalValue) InternalLen() int { + return len(v.lazyValue.ValueOrHandle) +} + +// ValueOrHandle returns the value or handle that is stored inlined. If the +// value is stored out-of-band, the returned slice contains a binary-encoded +// value handle. 
+func (v *InternalValue) ValueOrHandle() []byte { + return v.lazyValue.ValueOrHandle +} + +// Value returns the KV's underlying value. +func (v *InternalValue) Value(buf []byte) (val []byte, callerOwned bool, err error) { + return v.lazyValue.Value(buf) +} + +// Clone creates a stable copy of the value, by appending bytes to buf. The +// fetcher parameter must be non-nil and may be over-written and used inside the +// returned InternalValue -- this is needed to avoid an allocation. +// +// See LazyValue.Clone for more details. +func (v *InternalValue) Clone(buf []byte, fetcher *LazyFetcher) (InternalValue, []byte) { + lv, buf := v.lazyValue.Clone(buf, fetcher) + return InternalValue{lazyValue: lv}, buf +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/batchskl/README.md b/vendor/github.com/cockroachdb/pebble/v2/internal/batchskl/README.md similarity index 100% rename from vendor/github.com/cockroachdb/pebble/internal/batchskl/README.md rename to vendor/github.com/cockroachdb/pebble/v2/internal/batchskl/README.md diff --git a/vendor/github.com/cockroachdb/pebble/internal/batchskl/iterator.go b/vendor/github.com/cockroachdb/pebble/v2/internal/batchskl/iterator.go similarity index 79% rename from vendor/github.com/cockroachdb/pebble/internal/batchskl/iterator.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/batchskl/iterator.go index 5917ed1..67989c1 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/batchskl/iterator.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/batchskl/iterator.go @@ -17,7 +17,7 @@ package batchskl -import "github.com/cockroachdb/pebble/internal/base" +import "github.com/cockroachdb/pebble/v2/internal/base" type splice struct { prev uint32 @@ -33,12 +33,25 @@ type Iterator struct { key base.InternalKey lower []byte upper []byte + // {lower,upper}Node are lazily populated with the offset of an arbitrary + // node that is beyond the lower and upper bound respectively. 
Once + // populated, [lower|upper]Node may be used to detect when iteration has + // reached a bound without performing a key comparison. This may be + // beneficial when performing repeated SeekGEs with TrySeekUsingNext and an + // upper bound set. Once the upper bound has been met, no additional key + // comparisons are necessary. + // + // Note that {lower,upper}Node may be zero if the iterator has not yet + // encountered a node beyond the respective bound. No valid node may ever + // have a zero offset because the skiplist head sentinel node is always + // allocated first, ensuring all other nodes have non-zero offsets. + lowerNode uint32 + upperNode uint32 } // Close resets the iterator. func (it *Iterator) Close() error { - it.list = nil - it.nd = 0 + *it = Iterator{} return nil } @@ -49,7 +62,7 @@ func (it *Iterator) Close() error { // bound. func (it *Iterator) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKey { if flags.TrySeekUsingNext() { - if it.nd == it.list.tail { + if it.nd == it.list.tail || it.nd == it.upperNode { // Iterator is done. return nil } @@ -73,12 +86,12 @@ func (it *Iterator) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKey } _, it.nd = it.seekForBaseSplice(key, it.list.abbreviatedKey(key)) - if it.nd == it.list.tail { + if it.nd == it.list.tail || it.nd == it.upperNode { return nil } nodeKey := it.list.getKey(it.nd) if it.upper != nil && it.list.cmp(it.upper, nodeKey.UserKey) <= 0 { - it.nd = it.list.tail + it.upperNode = it.nd return nil } it.key = nodeKey @@ -91,12 +104,12 @@ func (it *Iterator) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKey // caller to ensure that key is less than the upper bound. 
func (it *Iterator) SeekLT(key []byte) *base.InternalKey { it.nd, _ = it.seekForBaseSplice(key, it.list.abbreviatedKey(key)) - if it.nd == it.list.head { + if it.nd == it.list.head || it.nd == it.lowerNode { return nil } nodeKey := it.list.getKey(it.nd) if it.lower != nil && it.list.cmp(it.lower, nodeKey.UserKey) > 0 { - it.nd = it.list.head + it.lowerNode = it.nd return nil } it.key = nodeKey @@ -109,12 +122,12 @@ func (it *Iterator) SeekLT(key []byte) *base.InternalKey { // the lower bound (e.g. via a call to SeekGE(lower)). func (it *Iterator) First() *base.InternalKey { it.nd = it.list.getNext(it.list.head, 0) - if it.nd == it.list.tail { + if it.nd == it.list.tail || it.nd == it.upperNode { return nil } nodeKey := it.list.getKey(it.nd) if it.upper != nil && it.list.cmp(it.upper, nodeKey.UserKey) <= 0 { - it.nd = it.list.tail + it.upperNode = it.nd return nil } it.key = nodeKey @@ -127,12 +140,12 @@ func (it *Iterator) First() *base.InternalKey { // bound (e.g. via a call to SeekLT(upper)). func (it *Iterator) Last() *base.InternalKey { it.nd = it.list.getPrev(it.list.tail, 0) - if it.nd == it.list.head { + if it.nd == it.list.head || it.nd == it.lowerNode { return nil } nodeKey := it.list.getKey(it.nd) if it.lower != nil && it.list.cmp(it.lower, nodeKey.UserKey) > 0 { - it.nd = it.list.head + it.lowerNode = it.nd return nil } it.key = nodeKey @@ -143,12 +156,12 @@ func (it *Iterator) Last() *base.InternalKey { // Valid() will be false after this call. func (it *Iterator) Next() *base.InternalKey { it.nd = it.list.getNext(it.nd, 0) - if it.nd == it.list.tail { + if it.nd == it.list.tail || it.nd == it.upperNode { return nil } nodeKey := it.list.getKey(it.nd) if it.upper != nil && it.list.cmp(it.upper, nodeKey.UserKey) <= 0 { - it.nd = it.list.tail + it.upperNode = it.nd return nil } it.key = nodeKey @@ -159,23 +172,18 @@ func (it *Iterator) Next() *base.InternalKey { // Valid() will be false after this call. 
func (it *Iterator) Prev() *base.InternalKey { it.nd = it.list.getPrev(it.nd, 0) - if it.nd == it.list.head { + if it.nd == it.list.head || it.nd == it.lowerNode { return nil } nodeKey := it.list.getKey(it.nd) if it.lower != nil && it.list.cmp(it.lower, nodeKey.UserKey) > 0 { - it.nd = it.list.head + it.lowerNode = it.nd return nil } it.key = nodeKey return &it.key } -// Key returns the key at the current position. -func (it *Iterator) Key() *base.InternalKey { - return &it.key -} - // KeyInfo returns the offset of the start of the record, the start of the key, // and the end of the key. func (it *Iterator) KeyInfo() (offset, keyStart, keyEnd uint32) { @@ -183,21 +191,6 @@ func (it *Iterator) KeyInfo() (offset, keyStart, keyEnd uint32) { return n.offset, n.keyStart, n.keyEnd } -// Head true iff the iterator is positioned at the sentinel head node. -func (it *Iterator) Head() bool { - return it.nd == it.list.head -} - -// Tail true iff the iterator is positioned at the sentinel tail node. -func (it *Iterator) Tail() bool { - return it.nd == it.list.tail -} - -// Valid returns nil iff the iterator is positioned at a valid node. 
-func (it *Iterator) Valid() bool { - return it.list != nil && it.nd != it.list.head && it.nd != it.list.tail -} - func (it *Iterator) String() string { return "batch" } @@ -208,6 +201,8 @@ func (it *Iterator) String() string { func (it *Iterator) SetBounds(lower, upper []byte) { it.lower = lower it.upper = upper + it.lowerNode = 0 + it.upperNode = 0 } func (it *Iterator) seekForBaseSplice(key []byte, abbreviatedKey uint64) (prev, next uint32) { diff --git a/vendor/github.com/cockroachdb/pebble/internal/batchskl/skl.go b/vendor/github.com/cockroachdb/pebble/v2/internal/batchskl/skl.go similarity index 97% rename from vendor/github.com/cockroachdb/pebble/internal/batchskl/skl.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/batchskl/skl.go index f56d95c..eb80299 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/batchskl/skl.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/batchskl/skl.go @@ -51,20 +51,19 @@ Key differences: - Node storage grows to an arbitrary size. 
*/ -package batchskl // import "github.com/cockroachdb/pebble/internal/batchskl" +package batchskl // import "github.com/cockroachdb/pebble/v2/internal/batchskl" import ( "bytes" "encoding/binary" "fmt" "math" - "time" + "math/rand/v2" "unsafe" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/constants" - "golang.org/x/exp/rand" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/constants" ) const ( @@ -125,7 +124,7 @@ type Skiplist struct { head uint32 tail uint32 height uint32 // Current height: 1 <= height <= maxHeight - rand rand.PCGSource + rand rand.PCG } var ( @@ -173,7 +172,7 @@ func (s *Skiplist) Init(storage *[]byte, cmp base.Compare, abbreviatedKey base.A nodes: s.nodes[:0], height: 1, } - s.rand.Seed(uint64(time.Now().UnixNano())) + s.rand.Seed(0, rand.Uint64()) const initBufSize = 256 if cap(s.nodes) < initBufSize { @@ -415,7 +414,7 @@ func (s *Skiplist) getKey(nd uint32) base.InternalKey { n := s.node(nd) kind := base.InternalKeyKind((*s.storage)[n.offset]) key := (*s.storage)[n.keyStart:n.keyEnd] - return base.MakeInternalKey(key, uint64(n.offset)|base.InternalKeySeqNumBatch, kind) + return base.MakeInternalKey(key, base.SeqNum(n.offset)|base.SeqNumBatchBit, kind) } func (s *Skiplist) getNext(nd, h uint32) uint32 { diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/binfmt/binfmt.go b/vendor/github.com/cockroachdb/pebble/v2/internal/binfmt/binfmt.go new file mode 100644 index 0000000..5b84b86 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/binfmt/binfmt.go @@ -0,0 +1,300 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package binfmt exposes utilities for formatting binary data with descriptive +// comments. 
+package binfmt + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "strconv" + "strings" + "unsafe" + + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +// New constructs a new binary formatter. +func New(data []byte) *Formatter { + offsetWidth := strconv.Itoa(max(int(math.Log10(float64(len(data)-1)))+1, 1)) + return &Formatter{ + data: data, + lineWidth: 40, + offsetFormatStr: "%0" + offsetWidth + "d-%0" + offsetWidth + "d: ", + } +} + +// Formatter is a utility for formatting binary data with descriptive comments. +type Formatter struct { + buf bytes.Buffer + lines [][2]string // (binary data, comment) tuples + data []byte + off int + anchorOff int + + // config + lineWidth int + linePrefix string + offsetFormatStr string +} + +// SetLinePrefix sets a prefix for each line of formatted output. +func (f *Formatter) SetLinePrefix(prefix string) { + f.linePrefix = prefix +} + +// SetAnchorOffset sets the reference point for relative offset calculations to +// the current offset. Future calls to RelativeOffset() will return an offset +// relative to the current offset. +func (f *Formatter) SetAnchorOffset() { + f.anchorOff = f.off +} + +// RelativeOffset retrieves the current offset relative to the offset at the +// last time SetAnchorOffset was called. If SetAnchorOffset was never called, +// RelativeOffset is equivalent to Offset. +func (f *Formatter) RelativeOffset() int { + return f.off - f.anchorOff +} + +// RelativeData returns the subslice of the original data slice beginning at the +// offset at which SetAnchorOffset was last called. If SetAnchorOffset was never +// called, RelativeData is equivalent to Data. +func (f *Formatter) RelativeData() []byte { + return f.data[f.anchorOff:] +} + +// LineWidth sets the Formatter's maximum line width for binary data. +func (f *Formatter) LineWidth(width int) *Formatter { + f.lineWidth = width + return f +} + +// More returns true if there is more data in the byte slice that can be formatted. 
+func (f *Formatter) More() bool { + return f.off < len(f.data) +} + +// Remaining returns the number of unformatted bytes remaining in the byte slice. +func (f *Formatter) Remaining() int { + return len(f.data) - f.off +} + +// Offset returns the current offset within the original data slice. +func (f *Formatter) Offset() int { + return f.off +} + +// PeekUint reads a little-endian unsigned integer of the specified width at the +// current offset. +func (f *Formatter) PeekUint(w int) uint64 { + switch w { + case 1: + return uint64(f.data[f.off]) + case 2: + return uint64(binary.LittleEndian.Uint16(f.data[f.off:])) + case 4: + return uint64(binary.LittleEndian.Uint32(f.data[f.off:])) + case 8: + return binary.LittleEndian.Uint64(f.data[f.off:]) + default: + panic("unsupported width") + } +} + +// Byte formats a single byte in binary format, displaying each bit as a zero or +// one. +func (f *Formatter) Byte(format string, args ...interface{}) int { + f.printOffsets(1) + f.printf("b %08b", f.data[f.off]) + f.off++ + f.newline(f.buf.String(), fmt.Sprintf(format, args...)) + return 1 +} + +// HexBytesln formats the next n bytes in hexadecimal format, appending the +// formatted comment string to each line and ending on a newline. +func (f *Formatter) HexBytesln(n int, format string, args ...interface{}) int { + commentLine := strings.TrimSpace(fmt.Sprintf(format, args...)) + printLine := func() { + bytesInLine := min(f.lineWidth/2, n) + if f.buf.Len() == 0 { + f.printOffsets(bytesInLine) + } + f.printf("x %0"+strconv.Itoa(bytesInLine*2)+"x", f.data[f.off:f.off+bytesInLine]) + f.newline(f.buf.String(), commentLine) + f.off += bytesInLine + n -= bytesInLine + } + printLine() + commentLine = "(continued...)" + for n > 0 { + printLine() + } + return n +} + +// HexTextln formats the next n bytes in hexadecimal format, appending a comment +// to each line showing the ASCII equivalent characters for each byte for bytes +// that are human-readable. 
+func (f *Formatter) HexTextln(n int) int { + printLine := func() { + bytesInLine := min(f.lineWidth/2, n) + if f.buf.Len() == 0 { + f.printOffsets(bytesInLine) + } + f.printf("x %0"+strconv.Itoa(bytesInLine*2)+"x", f.data[f.off:f.off+bytesInLine]) + commentLine := asciiChars(f.data[f.off : f.off+bytesInLine]) + f.newline(f.buf.String(), commentLine) + f.off += bytesInLine + n -= bytesInLine + } + printLine() + for n > 0 { + printLine() + } + return n +} + +// Uvarint decodes the bytes at the current offset as a uvarint, formatting them +// in hexadecimal and prefixing the comment with the encoded decimal value. +func (f *Formatter) Uvarint(format string, args ...interface{}) { + comment := fmt.Sprintf(format, args...) + v, n := binary.Uvarint(f.data[f.off:]) + f.HexBytesln(n, "uvarint(%d): %s", v, comment) +} + +// Line prepares a single line of formatted output that will consume n bytes, +// but formatting those n bytes in multiple ways. The line will be prefixed with +// the offsets for the line's entire data. +func (f *Formatter) Line(n int) Line { + f.printOffsets(n) + return Line{f: f, n: n, i: 0} +} + +// String returns the current formatted output. +func (f *Formatter) String() string { + f.buf.Reset() + // Identify the max width of the binary data so that we can add padding to + // align comments on the right. + binaryLineWidth := 0 + for _, lineData := range f.lines { + binaryLineWidth = max(binaryLineWidth, len(lineData[0])) + } + for _, lineData := range f.lines { + fmt.Fprint(&f.buf, f.linePrefix) + fmt.Fprint(&f.buf, lineData[0]) + if len(lineData[1]) > 0 { + if len(lineData[0]) == 0 { + // There's no binary data on this line, just a comment. Print + // the comment left-aligned. + fmt.Fprint(&f.buf, "# ") + } else { + // Align the comment to the right of the binary data. 
+ fmt.Fprint(&f.buf, strings.Repeat(" ", binaryLineWidth-len(lineData[0]))) + fmt.Fprint(&f.buf, " # ") + } + fmt.Fprint(&f.buf, lineData[1]) + } + fmt.Fprintln(&f.buf) + } + return f.buf.String() +} + +// ToTreePrinter formats the current output and creates a treeprinter child node +// for each line. The current output is reset; the position within the binary +// buffer is not. +func (f *Formatter) ToTreePrinter(tp treeprinter.Node) { + for _, l := range strings.Split(strings.TrimRight(f.String(), "\n"), "\n") { + tp.Child(l) + } + f.buf.Reset() + f.lines = f.lines[:0] +} + +// Pointer returns a pointer into the original data slice at the specified +// offset. +func (f *Formatter) Pointer(off int) unsafe.Pointer { + return unsafe.Pointer(&f.data[f.off+off]) +} + +// Data returns the original data slice. Offset may be used to retrieve the +// current offset within the slice. +func (f *Formatter) Data() []byte { + return f.data +} + +func (f *Formatter) newline(binaryData, comment string) { + f.lines = append(f.lines, [2]string{binaryData, comment}) + f.buf.Reset() +} + +func (f *Formatter) printOffsets(n int) { + f.printf(f.offsetFormatStr, f.off, f.off+n) +} + +func (f *Formatter) printf(format string, args ...interface{}) { + fmt.Fprintf(&f.buf, format, args...) +} + +// Line is a pending line of formatted binary output. +type Line struct { + f *Formatter + n int + i int +} + +// Append appends the provided string to the current line. +func (l Line) Append(s string) Line { + fmt.Fprint(&l.f.buf, s) + return l +} + +// Binary formats the next n bytes in binary format, displaying each bit as +// a zero or one. +func (l Line) Binary(n int) Line { + if n+l.i > l.n { + panic("binary data exceeds consumed line length") + } + for i := 0; i < n; i++ { + l.f.printf("%08b", l.f.data[l.f.off+l.i]) + l.i++ + } + return l +} + +// HexBytes formats the next n bytes in hexadecimal format. 
+func (l Line) HexBytes(n int) Line { + if n+l.i > l.n { + panic("binary data exceeds consumed line length") + } + l.f.printf("%0"+strconv.Itoa(n*2)+"x", l.f.data[l.f.off+l.i:l.f.off+l.i+n]) + l.i += n + return l +} + +// Done finishes the line, appending the provided comment if any. +func (l Line) Done(format string, args ...interface{}) int { + if l.n != l.i { + panic("unconsumed data in line") + } + l.f.newline(l.f.buf.String(), fmt.Sprintf(format, args...)) + l.f.off += l.n + return l.n +} + +func asciiChars(b []byte) string { + s := make([]byte, len(b)) + for i := range b { + if b[i] >= 32 && b[i] <= 126 { + s[i] = b[i] + } else { + s[i] = '.' + } + } + return string(s) +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/binfmt/hexdump.go b/vendor/github.com/cockroachdb/pebble/v2/internal/binfmt/hexdump.go new file mode 100644 index 0000000..4586fa0 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/binfmt/hexdump.go @@ -0,0 +1,53 @@ +package binfmt + +import ( + "bytes" + "fmt" + "io" + "strconv" +) + +// HexDump returns a string representation of the data in a hex dump format. +// The width is the number of bytes per line. +func HexDump(data []byte, width int, includeOffsets bool) string { + var buf bytes.Buffer + FHexDump(&buf, data, width, includeOffsets) + return buf.String() +} + +// FHexDump writes a hex dump of the data to w. 
+func FHexDump(w io.Writer, data []byte, width int, includeOffsets bool) { + offsetFormatWidth := len(fmt.Sprintf("%x", max(1, len(data)-1))) + offsetFormatStr := "%0" + strconv.Itoa(offsetFormatWidth) + "x" + for i := 0; i < len(data); i += width { + if includeOffsets { + fmt.Fprintf(w, offsetFormatStr+": ", i) + } + for j := 0; j < width; j++ { + if j%4 == 0 { + fmt.Fprint(w, " ") + } + if i+j >= len(data) { + fmt.Fprintf(w, " ") + } else { + fmt.Fprintf(w, "%02x", data[i+j]) + } + } + + fmt.Fprint(w, " | ") + for j := 0; j < width; j++ { + if i+j >= len(data) { + break + } + if j%4 == 0 { + fmt.Fprint(w, " ") + } + if data[i+j] < 32 || data[i+j] > 126 { + fmt.Fprint(w, ".") + } else { + fmt.Fprintf(w, "%c", data[i+j]) + } + } + fmt.Fprintln(w) + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/bitflip/bitflip.go b/vendor/github.com/cockroachdb/pebble/v2/internal/bitflip/bitflip.go new file mode 100644 index 0000000..144317d --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/bitflip/bitflip.go @@ -0,0 +1,31 @@ +package bitflip + +// CheckSliceForBitFlip flips bits in data to see if it matches the expected checksum. +// Returns the index and bit if successful. +func CheckSliceForBitFlip( + data []byte, computeChecksum func([]byte) uint32, expectedChecksum uint32, +) (found bool, indexFound int, bitFound int) { + // TODO(edward) This checking process likely can be made faster. 
+ iterationLimit := 40 * (1 << 10) // 40KB + for i := 0; i < min(len(data), iterationLimit); i++ { + foundFlip, bit := checkByteForFlip(data, i, computeChecksum, expectedChecksum) + if foundFlip { + return true, i, bit + } + } + return false, 0, 0 +} + +func checkByteForFlip( + data []byte, i int, computeChecksum func([]byte) uint32, expectedChecksum uint32, +) (found bool, bit int) { + for bit := 0; bit < 8; bit++ { + data[i] ^= (1 << bit) + var computedChecksum = computeChecksum(data) + data[i] ^= (1 << bit) + if computedChecksum == expectedChecksum { + return true, bit + } + } + return false, 0 +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/blobtest/handles.go b/vendor/github.com/cockroachdb/pebble/v2/internal/blobtest/handles.go new file mode 100644 index 0000000..ae7a27e --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/blobtest/handles.go @@ -0,0 +1,304 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package blobtest contains helpers for interacting with value separation and +// blob files in tests. +package blobtest + +import ( + "cmp" + "context" + "math/rand/v2" + "slices" + "strings" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/strparse" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/sstable/blob" +) + +// Values is a helper for using blob handles in tests. It supports parsing a +// human-readable string describing a blob handle, synthesizing unspecified +// fields, and tracking the blob handle to support future fetches. +type Values struct { + References References + + mostRecentBlobFileID base.BlobFileID + mostRecentHandles map[base.BlobFileID]blob.Handle + // trackedHandles maps from a blob handle to its value. 
The value may be nil + // if the value was not specified (in which case Fetch will + // deterministically derive a random value from the handle itself.) + trackedHandles map[blob.Handle]string +} + +// FetchHandle returns the value corresponding to the given handle. +func (bv *Values) FetchHandle( + ctx context.Context, handleSuffix []byte, blobFileID base.BlobFileID, valLen uint32, _ []byte, +) (val []byte, callerOwned bool, err error) { + if bv.trackedHandles == nil { + return nil, false, errors.New("no tracked handles") + } + + decodedHandleSuffix := blob.DecodeHandleSuffix(handleSuffix) + decodedHandle := blob.Handle{ + BlobFileID: blobFileID, + ValueLen: valLen, + BlockID: decodedHandleSuffix.BlockID, + ValueID: decodedHandleSuffix.ValueID, + } + + value, ok := bv.trackedHandles[decodedHandle] + if !ok { + return nil, false, errors.Newf("unknown handle %s", decodedHandle) + } + + // If there was not an explicitly specified value, generate a random one + // deterministically from the file number, block number and offset in block. + if len(value) == 0 { + return deriveValueFromHandle(decodedHandle), false, nil + } + return []byte(value), false, nil +} + +func deriveValueFromHandle(handle blob.Handle) []byte { + rng := rand.New(rand.NewPCG((uint64(handle.BlobFileID)<<32)|uint64(handle.BlockID), uint64(handle.ValueID))) + b := make([]byte, handle.ValueLen) + for i := range b { + b[i] = 'a' + byte(rng.IntN(26)) + } + return b +} + +// ParseInternalValue parses a debug blob handle from the string, returning the +// handle as an InternalValue and recording the handle's corresponding value. +func (bv *Values) ParseInternalValue(input string) (base.InternalValue, error) { + h, _, err := bv.Parse(input) + if err != nil { + return base.InternalValue{}, err + } + + // Encode the handle suffix to be the 'ValueOrHandle' of the InternalValue. 
+ handleSuffix := blob.HandleSuffix{ + BlockID: h.BlockID, + ValueID: h.ValueID, + } + handleSuffixBytes := make([]byte, blob.MaxInlineHandleLength) + i := handleSuffix.Encode(handleSuffixBytes) + + return base.MakeLazyValue(base.LazyValue{ + ValueOrHandle: handleSuffixBytes[:i], + Fetcher: &base.LazyFetcher{ + Fetcher: bv, + Attribute: base.AttributeAndLen{ + ValueLen: h.ValueLen, + // TODO(jackson): Support user-specified short attributes. + ShortAttribute: base.ShortAttribute(h.ValueLen & 0x07), + }, + BlobFileID: h.BlobFileID, + }, + }), nil +} + +// IsBlobHandle returns true if the input string looks like it's a debug blob +// handle. +func IsBlobHandle(input string) bool { + return strings.HasPrefix(input, "blob{") +} + +// Parse parses a debug blob handle from the string, returning the handle and +// recording the handle's corresponding value. +func (bv *Values) Parse(input string) (h blob.Handle, remaining string, err error) { + if bv.trackedHandles == nil { + bv.trackedHandles = make(map[blob.Handle]string) + bv.mostRecentHandles = make(map[base.BlobFileID]blob.Handle) + } + + defer func() { + if r := recover(); r != nil { + h, err = blob.Handle{}, errFromPanic(r) + } + }() + const debugParserSeparators = `(){};=` + p := strparse.MakeParser(debugParserSeparators, input) + p.Expect("blob") + p.Expect("{") + var value string + var fileNumSet, blockIDSet, valueLenSet, valueIDSet bool + for done := false; !done; { + if p.Done() { + return blob.Handle{}, "", errors.New("unexpected end of input") + } + switch x := p.Next(); x { + case "}": + done = true + case "fileNum": + p.Expect("=") + h.BlobFileID = base.BlobFileID(p.Uint64()) + fileNumSet = true + case "blockID": + p.Expect("=") + h.BlockID = blob.BlockID(p.Uint32()) + blockIDSet = true + case "valueID": + p.Expect("=") + h.ValueID = blob.BlockValueID(p.Uint32()) + valueIDSet = true + case "valueLen": + p.Expect("=") + h.ValueLen = p.Uint32() + valueLenSet = true + case "value": + p.Expect("=") + value = 
p.Next() + if valueLenSet && h.ValueLen != uint32(len(value)) { + return blob.Handle{}, "", errors.Newf("valueLen mismatch: %d != %d", h.ValueLen, len(value)) + } + default: + return blob.Handle{}, "", errors.Newf("unknown field: %q", x) + } + } + + if !fileNumSet { + h.BlobFileID = bv.mostRecentBlobFileID + } + if !blockIDSet { + h.BlockID = bv.mostRecentHandles[h.BlobFileID].BlockID + } + if !valueIDSet { + if recentHandle, ok := bv.mostRecentHandles[h.BlobFileID]; ok { + h.ValueID = recentHandle.ValueID + 1 + } else { + h.ValueID = 0 + } + } + if !valueLenSet { + if len(value) > 0 { + h.ValueLen = uint32(len(value)) + } else { + h.ValueLen = 12 + } + } + bv.mostRecentBlobFileID = h.BlobFileID + bv.mostRecentHandles[h.BlobFileID] = h + bv.trackedHandles[h] = value + return h, p.Remaining(), nil +} + +// ParseInlineHandle parses a debug blob handle from the string. It maps the +// file number to a reference index using the provided *BlobReferences, +// returning an inline handle. +// +// It's intended for tests that must manually construct inline blob references. +func (bv *Values) ParseInlineHandle( + input string, +) (h blob.InlineHandle, remaining string, err error) { + fullHandle, remaining, err := bv.Parse(input) + if err != nil { + return blob.InlineHandle{}, "", err + } + h = blob.InlineHandle{ + InlineHandlePreface: blob.InlineHandlePreface{ + ReferenceID: bv.References.MapToReferenceID(fullHandle.BlobFileID), + ValueLen: fullHandle.ValueLen, + }, + HandleSuffix: blob.HandleSuffix{ + BlockID: fullHandle.BlockID, + ValueID: fullHandle.ValueID, + }, + } + return h, remaining, nil +} + +// WriteFiles writes all the blob files referenced by Values, using +// newBlobObject to construct new objects. +// +// Return the FileWriterStats for the written blob files. 
+func (bv *Values) WriteFiles( + newBlobObject func(fileNum base.DiskFileNum) (objstorage.Writable, error), + writerOpts blob.FileWriterOptions, +) (map[base.DiskFileNum]blob.FileWriterStats, error) { + // Organize the handles by file number. + files := make(map[base.DiskFileNum][]blob.Handle) + for handle := range bv.trackedHandles { + diskFileNum := base.DiskFileNum(handle.BlobFileID) + files[diskFileNum] = append(files[diskFileNum], handle) + } + + stats := make(map[base.DiskFileNum]blob.FileWriterStats) + for fileNum, handles := range files { + slices.SortFunc(handles, func(a, b blob.Handle) int { + if v := cmp.Compare(a.BlockID, b.BlockID); v != 0 { + return v + } + return cmp.Compare(a.ValueID, b.ValueID) + }) + writable, err := newBlobObject(fileNum) + if err != nil { + return nil, err + } + writer := blob.NewFileWriter(fileNum, writable, writerOpts) + prevID := -1 + for i, handle := range handles { + if i > 0 && handles[i-1].BlockID != handle.BlockID { + writer.FlushForTesting() + prevID = -1 + } + // The user of a blobtest.Values may specify a value ID for a handle. If + // there's a gap in the value IDs, we need to fill in the missing values + // with synthesized values. + prevID++ + for prevID < int(handle.ValueID) { + writer.AddValue(deriveValueFromHandle(blob.Handle{ + BlobFileID: base.BlobFileID(fileNum), + BlockID: handle.BlockID, + ValueID: blob.BlockValueID(prevID), + ValueLen: 12, + })) + prevID++ + } + + if value, ok := bv.trackedHandles[handle]; ok { + writer.AddValue([]byte(value)) + } else { + writer.AddValue(deriveValueFromHandle(handle)) + } + } + fileStats, err := writer.Close() + if err != nil { + return nil, err + } + stats[fileNum] = fileStats + } + return stats, nil +} + +// errFromPanic can be used in a recover block to convert panics into errors. 
+func errFromPanic(r any) error { + if err, ok := r.(error); ok { + return err + } + return errors.Errorf("%v", r) +} + +// References is a helper for tests that manually construct inline blob +// references. It tracks the set of file numbers used within a sstable, and maps +// each file number to a reference index (encoded within the +// blob.InlineHandlePreface). +type References struct { + fileIDs []base.BlobFileID +} + +// MapToReferenceID maps the given file number to a reference ID. +func (b *References) MapToReferenceID(fileID base.BlobFileID) blob.ReferenceID { + for i, fn := range b.fileIDs { + if fn == fileID { + return blob.ReferenceID(i) + } + } + i := uint32(len(b.fileIDs)) + b.fileIDs = append(b.fileIDs, fileID) + return blob.ReferenceID(i) +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/cgo_off.go b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/cgo_off.go new file mode 100644 index 0000000..4924eec --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/cgo_off.go @@ -0,0 +1,10 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build !cgo + +package buildtags + +// Cgo is true if we were built with the "cgo" build tag. +const Cgo = false diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/cgo_on.go b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/cgo_on.go new file mode 100644 index 0000000..fc2da79 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/cgo_on.go @@ -0,0 +1,10 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build cgo + +package buildtags + +// Cgo is true if we were built with the "cgo" build tag. 
+const Cgo = true diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/invariants_off.go b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/invariants_off.go new file mode 100644 index 0000000..2e3d26e --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/invariants_off.go @@ -0,0 +1,11 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build !invariants + +package buildtags + +// Invariants indicates if the invariants tag is used. +// See invariants.Enabled. +const Invariants = false diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/invariants_on.go b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/invariants_on.go new file mode 100644 index 0000000..5880ab5 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/invariants_on.go @@ -0,0 +1,11 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build invariants + +package buildtags + +// Invariants indicates if the invariants tag is used. +// See invariants.Enabled. +const Invariants = true diff --git a/vendor/github.com/cockroachdb/pebble/internal/invariants/race_off.go b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/race_off.go similarity index 60% rename from vendor/github.com/cockroachdb/pebble/internal/invariants/race_off.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/race_off.go index b2b8c5e..7354895 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/invariants/race_off.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/race_off.go @@ -3,9 +3,8 @@ // the LICENSE file. 
//go:build !race -// +build !race -package invariants +package buildtags -// RaceEnabled is true if we were built with the "race" build tag. -const RaceEnabled = false +// Race is true if we were built with the "race" build tag. +const Race = false diff --git a/vendor/github.com/cockroachdb/pebble/internal/invariants/race_on.go b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/race_on.go similarity index 60% rename from vendor/github.com/cockroachdb/pebble/internal/invariants/race_on.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/race_on.go index 46613f7..fead86a 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/invariants/race_on.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/race_on.go @@ -3,9 +3,8 @@ // the LICENSE file. //go:build race -// +build race -package invariants +package buildtags -// RaceEnabled is true if we were built with the "race" build tag. -const RaceEnabled = true +// Race is true if we were built with the "race" build tag. +const Race = true diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/slow_build_off.go b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/slow_build_off.go new file mode 100644 index 0000000..eb067b6 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/slow_build_off.go @@ -0,0 +1,13 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build !race && !slowbuild + +package buildtags + +// SlowBuild is true if this is an instrumented testing build that is likely +// to be significantly slower (like race or address sanitizer builds). +// +// Slow builds are either race builds or those built with a `slowbuild` tag. 
+const SlowBuild = false diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/slow_build_on.go b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/slow_build_on.go new file mode 100644 index 0000000..2325722 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/slow_build_on.go @@ -0,0 +1,13 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build race || slowbuild + +package buildtags + +// SlowBuild is true if this is an instrumented testing build that is likely +// to be significantly slower (like race or address sanitizer builds). +// +// Slow builds are either race builds or those built with a `slowbuild` tag. +const SlowBuild = true diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/tracing_off.go b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/tracing_off.go new file mode 100644 index 0000000..d1f48e0 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/tracing_off.go @@ -0,0 +1,12 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build !tracing + +package buildtags + +// Tracing indicates if the tracing tag is used. +// +// This tag enables low-level tracing code in the block cache. +const Tracing = false diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/tracing_on.go b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/tracing_on.go new file mode 100644 index 0000000..defbd5f --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/buildtags/tracing_on.go @@ -0,0 +1,12 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build tracing + +package buildtags + +// Tracing indicates if the tracing tag is used. +// +// This tag enables low-level tracing code in the block cache. +const Tracing = true diff --git a/vendor/github.com/cockroachdb/pebble/internal/bytealloc/bytealloc.go b/vendor/github.com/cockroachdb/pebble/v2/internal/bytealloc/bytealloc.go similarity index 97% rename from vendor/github.com/cockroachdb/pebble/internal/bytealloc/bytealloc.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/bytealloc/bytealloc.go index b905270..52f6c7c 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/bytealloc/bytealloc.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/bytealloc/bytealloc.go @@ -14,7 +14,7 @@ package bytealloc -import "github.com/cockroachdb/pebble/internal/rawalloc" +import "github.com/cockroachdb/pebble/v2/internal/rawalloc" // An A provides chunk allocation of []byte, amortizing the overhead of each // allocation. Because the underlying storage for the slices is shared, they diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/LICENSE b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/LICENSE similarity index 100% rename from vendor/github.com/cockroachdb/pebble/internal/cache/LICENSE rename to vendor/github.com/cockroachdb/pebble/v2/internal/cache/LICENSE diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/cache/block_map.go b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/block_map.go new file mode 100644 index 0000000..da0e1f3 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/block_map.go @@ -0,0 +1,86 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package cache + +import ( + "fmt" + "os" + "unsafe" + + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/manual" + "github.com/cockroachdb/swiss" +) + +func fibonacciHash(k *key, seed uintptr) uintptr { + // See https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ + const m = 11400714819323198485 + h := uint64(seed) + h ^= uint64(k.id) * m + h ^= uint64(k.fileNum) * m + h ^= k.offset * m + return uintptr(h) +} + +type blockMapAllocator struct{} + +func (blockMapAllocator) Alloc(n int) []swiss.Group[key, *entry] { + size := uintptr(n) * unsafe.Sizeof(swiss.Group[key, *entry]{}) + buf := manual.New(manual.BlockCacheMap, size) + return unsafe.Slice((*swiss.Group[key, *entry])(buf.Data()), n) +} + +func (blockMapAllocator) Free(v []swiss.Group[key, *entry]) { + size := uintptr(len(v)) * unsafe.Sizeof(swiss.Group[key, *entry]{}) + buf := manual.MakeBufUnsafe(unsafe.Pointer(unsafe.SliceData(v)), size) + manual.Free(manual.BlockCacheMap, buf) +} + +var blockMapOptions = []swiss.Option[key, *entry]{ + swiss.WithHash[key, *entry](fibonacciHash), + swiss.WithMaxBucketCapacity[key, *entry](1 << 16), + swiss.WithAllocator[key, *entry](blockMapAllocator{}), +} + +type blockMap struct { + swiss.Map[key, *entry] + closed bool +} + +func newBlockMap(initialCapacity int) *blockMap { + m := &blockMap{} + m.Init(initialCapacity) + + // Note: this is a no-op if invariants are disabled or race is enabled. + invariants.SetFinalizer(m, func(obj interface{}) { + m := obj.(*blockMap) + if !m.closed { + fmt.Fprintf(os.Stderr, "%p: block-map not closed\n", m) + os.Exit(1) + } + }) + return m +} + +func (m *blockMap) Init(initialCapacity int) { + m.Map.Init(initialCapacity, blockMapOptions...) 
+} + +func (m *blockMap) Close() { + m.Map.Close() + m.closed = true +} + +func (m *blockMap) findByValue(v *entry) bool { + var found bool + m.Map.All(func(_ key, e *entry) bool { + if v == e { + found = true + return false + } + return true + }) + return found +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/cache/cache.go b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/cache.go new file mode 100644 index 0000000..85a3722 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/cache.go @@ -0,0 +1,336 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package cache + +import ( + "context" + "fmt" + "os" + "runtime" + "runtime/debug" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" +) + +// Metrics holds metrics for the cache. +type Metrics struct { + // The number of bytes inuse by the cache. + Size int64 + // The count of objects (blocks or tables) in the cache. + Count int64 + // The number of cache hits. + Hits int64 + // The number of cache misses. + Misses int64 +} + +// Cache implements Pebble's sharded block cache. The Clock-PRO algorithm is +// used for page replacement +// (http://static.usenix.org/event/usenix05/tech/general/full_papers/jiang/jiang_html/html.html). In +// order to provide better concurrency, 4 x NumCPUs shards are created, with +// each shard being given 1/n of the target cache size. The Clock-PRO algorithm +// is run independently on each shard. +// +// Blocks are keyed by an (handleID, fileNum, offset) triple. The handleID is a +// namespace for file numbers and allows a single Cache to be shared between +// multiple Pebble instances (via separate Handles). 
The fileNum and offset +// refer to an sstable file number and the offset of the block within the file. +// Because sstables are immutable and file numbers are never reused, +// (fileNum,offset) are unique for the lifetime of a Pebble instance. +// +// In addition to maintaining a map from (fileNum,offset) to data, each shard +// maintains a map of the cached blocks for a particular fileNum. This allows +// efficient eviction of all blocks for a file (is used when an sstable is +// deleted from disk). +// +// # Memory Management +// +// A normal implementation of the block cache would result in GC having to read +// through all the structures and keep track of the liveness of many objects. +// This was found to cause significant overhead in CRDB when compared to the +// earlier use of RocksDB. +// +// In order to reduce pressure on the Go GC, manual memory management is +// performed for the data stored in the cache. Manual memory management is +// performed by calling into C.{malloc,free} to allocate memory; this memory is +// outside the purview of the GC. Cache.Values are reference counted and the +// memory backing a manual value is freed when the reference count drops to 0. +// +// Manual memory management brings the possibility of memory leaks. It is +// imperative that every Handle returned by Cache.{Get,Set} is eventually +// released. The "invariants" build tag enables a leak detection facility that +// places a GC finalizer on cache.Value. When the cache.Value finalizer is run, +// if the underlying buffer is still present a leak has occurred. The "tracing" +// build tag enables tracing of cache.Value reference count manipulation and +// eases finding where a leak has occurred. These two facilities are usually +// used in combination by specifying `-tags invariants,tracing`. Note that +// "tracing" produces a significant slowdown, while "invariants" does not. 
+type Cache struct {
+	refs    atomic.Int64
+	maxSize int64
+	idAlloc atomic.Uint64
+	shards  []shard
+
+	// Traces recorded by Cache.trace. Used for debugging.
+	tr struct {
+		sync.Mutex
+		msgs []string
+	}
+	stack string
+}
+
+// New creates a new cache of the specified size. Memory for the cache is
+// allocated on demand, not during initialization. The cache is created with a
+// reference count of 1. Each DB it is associated with adds a reference, so the
+// creator of the cache should usually release their reference after the DB is
+// created.
+//
+//	c := cache.New(...)
+//	defer c.Unref()
+//	d, err := pebble.Open(pebble.Options{Cache: c})
+func New(size int64) *Cache {
+	// How many cache shards should we create?
+	//
+	// Note that the probability two processors will try to access the same
+	// shard at the same time increases superlinearly with the number of
+	// processors (consider the birthday problem where each CPU is a person,
+	// and each shard is a possible birthday).
+	//
+	// We could consider growing the number of shards superlinearly, but
+	// increasing the shard count may reduce the effectiveness of the caching
+	// algorithm if frequently-accessed blocks are insufficiently distributed
+	// across shards.
+	//
+	// Experimentally, we've observed contention contributing to tail latencies
+	// at 2 shards per processor. For now we use 4 shards per processor,
+	// recognizing this may not be the final word.
+	m := 4 * runtime.GOMAXPROCS(0)
+
+	// In tests we can use large CPU machines with small cache sizes and have
+	// many caches in existence at a time. If sharding into m shards would
+	// produce too small shards, constrain the number of shards to 4.
+	const minimumShardSize = 4 << 20 // 4 MiB
+	if m > 4 && int(size)/m < minimumShardSize {
+		m = 4
+	}
+	return NewWithShards(size, m)
+}
+
+// NewWithShards creates a new cache with the specified size and number of
+// shards.
+func NewWithShards(size int64, shards int) *Cache { + c := &Cache{ + maxSize: size, + shards: make([]shard, shards), + stack: string(debug.Stack()), + } + c.refs.Store(1) + c.trace("alloc", c.refs.Load()) + for i := range c.shards { + c.shards[i].init(size / int64(len(c.shards))) + } + + // Note: this is a no-op if invariants are disabled or race is enabled. + invariants.SetFinalizer(c, func(c *Cache) { + if v := c.refs.Load(); v != 0 { + c.tr.Lock() + fmt.Fprintf(os.Stderr, + "pebble: cache (%p) has non-zero reference count: %d\n\n%s\n\n", c, v, c.stack) + if len(c.tr.msgs) > 0 { + fmt.Fprintf(os.Stderr, "%s\n", strings.Join(c.tr.msgs, "\n")) + } + c.tr.Unlock() + os.Exit(1) + } + }) + return c +} + +// Ref adds a reference to the cache. The cache only remains valid as long a +// reference is maintained to it. +func (c *Cache) Ref() { + v := c.refs.Add(1) + if v <= 1 { + panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) + } + c.trace("ref", v) +} + +// Unref releases a reference on the cache. +func (c *Cache) Unref() { + v := c.refs.Add(-1) + c.trace("unref", v) + switch { + case v < 0: + panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) + case v == 0: + for i := range c.shards { + c.shards[i].Free() + } + } +} + +func (c *Cache) NewHandle() *Handle { + c.Ref() + id := handleID(c.idAlloc.Add(1)) + return &Handle{ + cache: c, + id: id, + } +} + +// Reserve N bytes in the cache. This effectively shrinks the size of the cache +// by N bytes, without actually consuming any memory. The returned closure +// should be invoked to release the reservation. +func (c *Cache) Reserve(n int) func() { + // Round-up the per-shard reservation. Most reservations should be large, so + // this probably doesn't matter in practice. 
+ shardN := (n + len(c.shards) - 1) / len(c.shards) + for i := range c.shards { + c.shards[i].Reserve(shardN) + } + return func() { + if shardN == -1 { + panic("pebble: cache reservation already released") + } + for i := range c.shards { + c.shards[i].Reserve(-shardN) + } + shardN = -1 + } +} + +// Metrics returns the metrics for the cache. +func (c *Cache) Metrics() Metrics { + var m Metrics + for i := range c.shards { + s := &c.shards[i] + s.mu.RLock() + m.Count += int64(s.blocks.Len()) + m.Size += s.sizeHot + s.sizeCold + s.mu.RUnlock() + m.Hits += s.hits.Load() + m.Misses += s.misses.Load() + } + return m +} + +// MaxSize returns the max size of the cache. +func (c *Cache) MaxSize() int64 { + return c.maxSize +} + +// Size returns the current space used by the cache. +func (c *Cache) Size() int64 { + var size int64 + for i := range c.shards { + size += c.shards[i].Size() + } + return size +} + +func (c *Cache) getShard(k key) *shard { + idx := k.shardIdx(len(c.shards)) + return &c.shards[idx] +} + +// Handle is the interface through which a store uses the cache. Each store uses +// a separate "handle". A handle corresponds to a separate "namespace" inside +// the cache; a handle cannot see another handle's blocks. +type Handle struct { + cache *Cache + id handleID +} + +// handleID is an ID associated with a Handle; it is unique in the context of a +// Cache instance and serves as a namespace for file numbers, allowing a single +// Cache to be shared among multiple Pebble instances. +type handleID uint64 + +// Cache returns the Cache instance associated with the handle. +func (c *Handle) Cache() *Cache { + return c.cache +} + +// Get retrieves the cache value for the specified file and offset, returning +// nil if no value is present. 
+func (c *Handle) Get(fileNum base.DiskFileNum, offset uint64) *Value { + k := makeKey(c.id, fileNum, offset) + cv, re := c.cache.getShard(k).getWithMaybeReadEntry(k, false /* desireReadEntry */) + if invariants.Enabled && re != nil { + panic("readEntry should be nil") + } + return cv +} + +// GetWithReadHandle retrieves the cache value for the specified handleID, fileNum +// and offset. If found, a valid Handle is returned (with cacheHit set to +// true), else a valid ReadHandle is returned. +// +// See the ReadHandle declaration for the contract the caller must satisfy +// when getting a valid ReadHandle. +// +// This method can block before returning since multiple concurrent gets for +// the same cache value will take turns getting a ReadHandle, which represents +// permission to do the read. This blocking respects context cancellation, in +// which case an error is returned (and not a valid ReadHandle). +// +// When blocking, the errorDuration return value can be non-zero and is +// populated with the total duration that other readers that observed an error +// (see ReadHandle.SetReadError) spent in doing the read. This duration can be +// greater than the time spent blocked in this method, since some of these +// errors could have occurred prior to this call. But it serves as a rough +// indicator of whether turn taking could have caused higher latency due to +// context cancellation of other readers. +// +// While waiting, someone else may successfully read the value, which results +// in a valid Handle being returned. This is a case where cacheHit=false. 
+func (c *Handle) GetWithReadHandle( + ctx context.Context, fileNum base.DiskFileNum, offset uint64, +) (cv *Value, rh ReadHandle, errorDuration time.Duration, cacheHit bool, err error) { + k := makeKey(c.id, fileNum, offset) + cv, re := c.cache.getShard(k).getWithMaybeReadEntry(k, true /* desireReadEntry */) + if cv != nil { + return cv, ReadHandle{}, 0, true, nil + } + cv, errorDuration, err = re.waitForReadPermissionOrHandle(ctx) + if err != nil || cv != nil { + re.unrefAndTryRemoveFromMap() + return cv, ReadHandle{}, errorDuration, false, err + } + return nil, ReadHandle{entry: re}, errorDuration, false, nil +} + +// Set sets the cache value for the specified file and offset, overwriting an +// existing value if present. The value must have been allocated by Cache.Alloc. +// +// The cache takes a reference on the Value and holds it until it gets evicted. +func (c *Handle) Set(fileNum base.DiskFileNum, offset uint64, value *Value) { + k := makeKey(c.id, fileNum, offset) + c.cache.getShard(k).set(k, value) +} + +// Delete deletes the cached value for the specified file and offset. +func (c *Handle) Delete(fileNum base.DiskFileNum, offset uint64) { + k := makeKey(c.id, fileNum, offset) + c.cache.getShard(k).delete(k) +} + +// EvictFile evicts all cache values for the specified file. 
+func (c *Handle) EvictFile(fileNum base.DiskFileNum) { + for i := range c.cache.shards { + c.cache.shards[i].evictFile(c.id, fileNum) + } +} + +func (c *Handle) Close() { + c.cache.Unref() + *c = Handle{} +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/clockpro.go b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/clockpro.go similarity index 53% rename from vendor/github.com/cockroachdb/pebble/internal/cache/clockpro.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/cache/clockpro.go index cdae6a9..722f3bf 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/cache/clockpro.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/clockpro.go @@ -15,30 +15,54 @@ // The original paper: http://static.usenix.org/event/usenix05/tech/general/full_papers/jiang/jiang_html/html.html // // It is MIT licensed, like the original. -package cache // import "github.com/cockroachdb/pebble/internal/cache" +package cache // import "github.com/cockroachdb/pebble/v2/internal/cache" import ( "fmt" "os" "runtime" "runtime/debug" - "strings" "sync" "sync/atomic" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" ) -type fileKey struct { +// key is associated with a specific block. +type key struct { // id is the namespace for fileNums. - id uint64 + id handleID fileNum base.DiskFileNum + offset uint64 } -type key struct { - fileKey - offset uint64 +func makeKey(id handleID, fileNum base.DiskFileNum, offset uint64) key { + return key{ + id: id, + fileNum: fileNum, + offset: offset, + } +} + +// shardIdx determines the shard index for the given key. +func (k *key) shardIdx(numShards int) int { + if k.id == 0 { + panic("pebble: 0 cache handleID is invalid") + } + // Same as fibonacciHash() but without the cast to uintptr. 
+ const m = 11400714819323198485 + h := uint64(k.id) * m + h ^= uint64(k.fileNum) * m + h ^= k.offset * m + + // We need a 32-bit value below; we use the upper bits as per + // https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ + h >>= 32 + + // This is a better alternative to (h % numShards); see + // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ + return int(h * uint64(numShards) >> 32) } // file returns the "file key" for the receiver. This is the key used for the @@ -52,28 +76,6 @@ func (k key) String() string { return fmt.Sprintf("%d/%d/%d", k.id, k.fileNum, k.offset) } -// Handle provides a strong reference to a value in the cache. The reference -// does not pin the value in the cache, but it does prevent the underlying byte -// slice from being reused. -type Handle struct { - value *Value -} - -// Get returns the value stored in handle. -func (h Handle) Get() []byte { - if h.value != nil { - // NB: We don't increment shard.hits in this code path because we only want - // to record a hit when the handle is retrieved from the cache. - return h.value.buf - } - return nil -} - -// Release releases the reference to the cache entry. -func (h Handle) Release() { - h.value.release() -} - type shard struct { hits atomic.Int64 misses atomic.Int64 @@ -83,8 +85,8 @@ type shard struct { reservedSize int64 maxSize int64 coldTarget int64 - blocks robinHoodMap // fileNum+offset -> block - files robinHoodMap // fileNum -> list of blocks + blocks blockMap // fileNum+offset -> block + files blockMap // fileNum -> list of blocks // The blocks and files maps store values in manually managed memory that is // invisible to the Go GC. This is fine for Value and entry objects that are @@ -109,27 +111,54 @@ type shard struct { countHot int64 countCold int64 countTest int64 + + // Some fields in readShard are protected by mu. 
See comments in declaration + // of readShard. + readShard readShard +} + +func (c *shard) init(maxSize int64) { + *c = shard{ + maxSize: maxSize, + coldTarget: maxSize, + } + if entriesGoAllocated { + c.entries = make(map[*entry]struct{}) + } + c.blocks.Init(16) + c.files.Init(16) + c.readShard.Init(c) } -func (c *shard) Get(id uint64, fileNum base.DiskFileNum, offset uint64) Handle { +// getWithMaybeReadEntry is the internal helper for implementing +// Cache.{Get,GetWithReadHandle}. When desireReadEntry is true, and the block +// is not in the cache (nil Value), a non-nil readEntry is returned (in which +// case the caller is responsible to dereference the entry, via one of +// unrefAndTryRemoveFromMap(), setReadValue(), setReadError()). +func (c *shard) getWithMaybeReadEntry(k key, desireReadEntry bool) (*Value, *readEntry) { c.mu.RLock() var value *Value - if e := c.blocks.Get(key{fileKey{id, fileNum}, offset}); e != nil { + if e, _ := c.blocks.Get(k); e != nil { value = e.acquireValue() - if value != nil { + // Note: we Load first to avoid an atomic XCHG when not necessary. + if value != nil && !e.referenced.Load() { e.referenced.Store(true) } } + var re *readEntry + if value == nil && desireReadEntry { + re = c.readShard.acquireReadEntry(k) + } c.mu.RUnlock() if value == nil { c.misses.Add(1) - return Handle{} + } else { + c.hits.Add(1) } - c.hits.Add(1) - return Handle{value: value} + return value, re } -func (c *shard) Set(id uint64, fileNum base.DiskFileNum, offset uint64, value *Value) Handle { +func (c *shard) set(k key, value *Value) { if n := value.refs(); n != 1 { panic(fmt.Sprintf("pebble: Value has already been added to the cache: refs=%d", n)) } @@ -137,13 +166,12 @@ func (c *shard) Set(id uint64, fileNum base.DiskFileNum, offset uint64, value *V c.mu.Lock() defer c.mu.Unlock() - k := key{fileKey{id, fileNum}, offset} - e := c.blocks.Get(k) + e, _ := c.blocks.Get(k) switch { case e == nil: // no cache entry? 
add it - e = newEntry(c, k, int64(len(value.buf))) + e = newEntry(k, int64(len(value.buf))) e.setValue(value) if c.metaAdd(k, e) { value.ref.trace("add-cold") @@ -155,7 +183,7 @@ func (c *shard) Set(id uint64, fileNum base.DiskFileNum, offset uint64, value *V e = nil } - case e.peekValue() != nil: + case e.val != nil: // cache entry was a hot or cold page e.setValue(value) e.referenced.Store(true) @@ -165,6 +193,11 @@ func (c *shard) Set(id uint64, fileNum base.DiskFileNum, offset uint64, value *V value.ref.trace("add-hot") c.sizeHot += delta } else { + // TODO(sumeer): unclear why we don't set e.ptype to etHot on this path. + // In the default case below, where the state is etTest we set it to + // etHot. But etTest is "colder" than etCold, since the only transition + // into etTest is etCold => etTest, so since etTest transitions to + // etHot, then etCold should also transition. value.ref.trace("add-cold") c.sizeCold += delta } @@ -174,7 +207,10 @@ func (c *shard) Set(id uint64, fileNum base.DiskFileNum, offset uint64, value *V // cache entry was a test page c.sizeTest -= e.size c.countTest-- - c.metaDel(e).release() + v := c.metaDel(e) + if invariants.Enabled && v != nil { + panic("value should be nil") + } c.metaCheck(e) e.size = int64(len(value.buf)) @@ -198,10 +234,6 @@ func (c *shard) Set(id uint64, fileNum base.DiskFileNum, offset uint64, value *V } c.checkConsistency() - - // Values are initialized with a reference count of 1. That reference count - // is being transferred to the returned Handle. - return Handle{value: value} } func (c *shard) checkConsistency() { @@ -220,12 +252,11 @@ func (c *shard) checkConsistency() { } // Delete deletes the cached value for the specified file and offset. -func (c *shard) Delete(id uint64, fileNum base.DiskFileNum, offset uint64) { +func (c *shard) delete(k key) { // The common case is there is nothing to delete, so do a quick check with // shared lock. 
- k := key{fileKey{id, fileNum}, offset} c.mu.RLock() - exists := c.blocks.Get(k) != nil + _, exists := c.blocks.Get(k) c.mu.RUnlock() if !exists { return @@ -236,7 +267,7 @@ func (c *shard) Delete(id uint64, fileNum base.DiskFileNum, offset uint64) { c.mu.Lock() defer c.mu.Unlock() - e := c.blocks.Get(k) + e, _ := c.blocks.Get(k) if e == nil { return } @@ -245,12 +276,12 @@ func (c *shard) Delete(id uint64, fileNum base.DiskFileNum, offset uint64) { }() // Now that the mutex has been dropped, release the reference which will // potentially free the memory associated with the previous cached value. - deletedValue.release() + deletedValue.Release() } // EvictFile evicts all of the cache values for the specified file. -func (c *shard) EvictFile(id uint64, fileNum base.DiskFileNum) { - fkey := key{fileKey{id, fileNum}, 0} +func (c *shard) evictFile(id handleID, fileNum base.DiskFileNum) { + fkey := makeKey(id, fileNum, 0) for c.evictFileRun(fkey) { // Sched switch to give another goroutine an opportunity to acquire the // shard mutex. @@ -276,11 +307,11 @@ func (c *shard) evictFileRun(fkey key) (moreRemaining bool) { defer func() { c.mu.Unlock() for _, v := range obsoleteValues { - v.release() + v.Release() } }() - blocks := c.files.Get(fkey) + blocks, _ := c.files.Get(fkey) if blocks == nil { // No blocks for this file. return false @@ -310,12 +341,12 @@ func (c *shard) Free() { // metaCheck call when the "invariants" build tag is specified. 
for c.handHot != nil { e := c.handHot - c.metaDel(c.handHot).release() + c.metaDel(c.handHot).Release() e.free() } - c.blocks.free() - c.files.free() + c.blocks.Close() + c.files.Close() } func (c *shard) Reserve(n int) { @@ -385,7 +416,7 @@ func (c *shard) metaAdd(key key, e *entry) bool { } fkey := key.file() - if fileBlocks := c.files.Get(fkey); fileBlocks == nil { + if fileBlocks, _ := c.files.Get(fkey); fileBlocks == nil { c.files.Put(fkey, e) } else { fileBlocks.linkFile(e) @@ -397,7 +428,7 @@ func (c *shard) metaAdd(key key, e *entry) bool { // the files map, and ensures that hand{Hot,Cold,Test} are not pointing at the // entry. Returns the deleted value that must be released, if any. func (c *shard) metaDel(e *entry) (deletedValue *Value) { - if value := e.peekValue(); value != nil { + if value := e.val; value != nil { value.ref.trace("metaDel") } // Remove the pointer to the value. @@ -439,19 +470,19 @@ func (c *shard) metaDel(e *entry) (deletedValue *Value) { // Check that the specified entry is not referenced by the cache. 
func (c *shard) metaCheck(e *entry) { - if invariants.Enabled { + if invariants.Enabled && invariants.Sometimes(1) { if _, ok := c.entries[e]; ok { fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in entries map\n%s", e, e.key, debug.Stack()) os.Exit(1) } - if c.blocks.findByValue(e) != nil { - fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in blocks map\n%s\n%s", + if c.blocks.findByValue(e) { + fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in blocks map\n%#v\n%s", e, e.key, &c.blocks, debug.Stack()) os.Exit(1) } - if c.files.findByValue(e) != nil { - fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in files map\n%s\n%s", + if c.files.findByValue(e) { + fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in files map\n%#v\n%s", e, e.key, &c.files, debug.Stack()) os.Exit(1) } @@ -604,306 +635,10 @@ func (c *shard) runHandTest() { if c.coldTarget < 0 { c.coldTarget = 0 } - c.metaDel(e).release() + c.metaDel(e).Release() c.metaCheck(e) e.free() } c.handTest = c.handTest.next() } - -// Metrics holds metrics for the cache. -type Metrics struct { - // The number of bytes inuse by the cache. - Size int64 - // The count of objects (blocks or tables) in the cache. - Count int64 - // The number of cache hits. - Hits int64 - // The number of cache misses. - Misses int64 -} - -// Cache implements Pebble's sharded block cache. The Clock-PRO algorithm is -// used for page replacement -// (http://static.usenix.org/event/usenix05/tech/general/full_papers/jiang/jiang_html/html.html). In -// order to provide better concurrency, 4 x NumCPUs shards are created, with -// each shard being given 1/n of the target cache size. The Clock-PRO algorithm -// is run independently on each shard. -// -// Blocks are keyed by an (id, fileNum, offset) triple. The ID is a namespace -// for file numbers and allows a single Cache to be shared between multiple -// Pebble instances. The fileNum and offset refer to an sstable file number and -// the offset of the block within the file. 
Because sstables are immutable and -// file numbers are never reused, (fileNum,offset) are unique for the lifetime -// of a Pebble instance. -// -// In addition to maintaining a map from (fileNum,offset) to data, each shard -// maintains a map of the cached blocks for a particular fileNum. This allows -// efficient eviction of all of the blocks for a file which is used when an -// sstable is deleted from disk. -// -// # Memory Management -// -// In order to reduce pressure on the Go GC, manual memory management is -// performed for the data stored in the cache. Manual memory management is -// performed by calling into C.{malloc,free} to allocate memory. Cache.Values -// are reference counted and the memory backing a manual value is freed when -// the reference count drops to 0. -// -// Manual memory management brings the possibility of memory leaks. It is -// imperative that every Handle returned by Cache.{Get,Set} is eventually -// released. The "invariants" build tag enables a leak detection facility that -// places a GC finalizer on cache.Value. When the cache.Value finalizer is run, -// if the underlying buffer is still present a leak has occurred. The "tracing" -// build tag enables tracing of cache.Value reference count manipulation and -// eases finding where a leak has occurred. These two facilities are usually -// used in combination by specifying `-tags invariants,tracing`. Note that -// "tracing" produces a significant slowdown, while "invariants" does not. -type Cache struct { - refs atomic.Int64 - maxSize int64 - idAlloc atomic.Uint64 - shards []shard - - // Traces recorded by Cache.trace. Used for debugging. - tr struct { - sync.Mutex - msgs []string - } -} - -// New creates a new cache of the specified size. Memory for the cache is -// allocated on demand, not during initialization. The cache is created with a -// reference count of 1. 
Each DB it is associated with adds a reference, so the -// creator of the cache should usually release their reference after the DB is -// created. -// -// c := cache.New(...) -// defer c.Unref() -// d, err := pebble.Open(pebble.Options{Cache: c}) -func New(size int64) *Cache { - // How many cache shards should we create? - // - // Note that the probability two processors will try to access the same - // shard at the same time increases superlinearly with the number of - // processors (Eg, consider the brithday problem where each CPU is a person, - // and each shard is a possible birthday). - // - // We could consider growing the number of shards superlinearly, but - // increasing the shard count may reduce the effectiveness of the caching - // algorithm if frequently-accessed blocks are insufficiently distributed - // across shards. If a shard's size is smaller than a single frequently - // scanned sstable, then the shard will be unable to hold the entire - // frequently-scanned table in memory despite other shards still holding - // infrequently accessed blocks. - // - // Experimentally, we've observed contention contributing to tail latencies - // at 2 shards per processor. For now we use 4 shards per processor, - // recognizing this may not be final word. - m := 4 * runtime.GOMAXPROCS(0) - - // In tests we can use large CPU machines with small cache sizes and have - // many caches in existence at a time. If sharding into m shards would - // produce too small shards, constrain the number of shards to 4. 
- const minimumShardSize = 4 << 20 // 4 MiB - if m > 4 && int(size)/m < minimumShardSize { - m = 4 - } - return newShards(size, m) -} - -func newShards(size int64, shards int) *Cache { - c := &Cache{ - maxSize: size, - shards: make([]shard, shards), - } - c.refs.Store(1) - c.idAlloc.Store(1) - c.trace("alloc", c.refs.Load()) - for i := range c.shards { - c.shards[i] = shard{ - maxSize: size / int64(len(c.shards)), - coldTarget: size / int64(len(c.shards)), - } - if entriesGoAllocated { - c.shards[i].entries = make(map[*entry]struct{}) - } - c.shards[i].blocks.init(16) - c.shards[i].files.init(16) - } - - // Note: this is a no-op if invariants are disabled or race is enabled. - invariants.SetFinalizer(c, func(obj interface{}) { - c := obj.(*Cache) - if v := c.refs.Load(); v != 0 { - c.tr.Lock() - fmt.Fprintf(os.Stderr, - "pebble: cache (%p) has non-zero reference count: %d\n", c, v) - if len(c.tr.msgs) > 0 { - fmt.Fprintf(os.Stderr, "%s\n", strings.Join(c.tr.msgs, "\n")) - } - c.tr.Unlock() - os.Exit(1) - } - }) - return c -} - -func (c *Cache) getShard(id uint64, fileNum base.DiskFileNum, offset uint64) *shard { - if id == 0 { - panic("pebble: 0 cache ID is invalid") - } - - // Inlined version of fnv.New64 + Write. - const offset64 = 14695981039346656037 - const prime64 = 1099511628211 - - h := uint64(offset64) - for i := 0; i < 8; i++ { - h *= prime64 - h ^= uint64(id & 0xff) - id >>= 8 - } - fileNumVal := uint64(fileNum.FileNum()) - for i := 0; i < 8; i++ { - h *= prime64 - h ^= uint64(fileNumVal) & 0xff - fileNumVal >>= 8 - } - for i := 0; i < 8; i++ { - h *= prime64 - h ^= uint64(offset & 0xff) - offset >>= 8 - } - - return &c.shards[h%uint64(len(c.shards))] -} - -// Ref adds a reference to the cache. The cache only remains valid as long a -// reference is maintained to it. 
-func (c *Cache) Ref() { - v := c.refs.Add(1) - if v <= 1 { - panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) - } - c.trace("ref", v) -} - -// Unref releases a reference on the cache. -func (c *Cache) Unref() { - v := c.refs.Add(-1) - c.trace("unref", v) - switch { - case v < 0: - panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v)) - case v == 0: - for i := range c.shards { - c.shards[i].Free() - } - } -} - -// Get retrieves the cache value for the specified file and offset, returning -// nil if no value is present. -func (c *Cache) Get(id uint64, fileNum base.DiskFileNum, offset uint64) Handle { - return c.getShard(id, fileNum, offset).Get(id, fileNum, offset) -} - -// Set sets the cache value for the specified file and offset, overwriting an -// existing value if present. A Handle is returned which provides faster -// retrieval of the cached value than Get (lock-free and avoidance of the map -// lookup). The value must have been allocated by Cache.Alloc. -func (c *Cache) Set(id uint64, fileNum base.DiskFileNum, offset uint64, value *Value) Handle { - return c.getShard(id, fileNum, offset).Set(id, fileNum, offset, value) -} - -// Delete deletes the cached value for the specified file and offset. -func (c *Cache) Delete(id uint64, fileNum base.DiskFileNum, offset uint64) { - c.getShard(id, fileNum, offset).Delete(id, fileNum, offset) -} - -// EvictFile evicts all of the cache values for the specified file. -func (c *Cache) EvictFile(id uint64, fileNum base.DiskFileNum) { - if id == 0 { - panic("pebble: 0 cache ID is invalid") - } - for i := range c.shards { - c.shards[i].EvictFile(id, fileNum) - } -} - -// MaxSize returns the max size of the cache. -func (c *Cache) MaxSize() int64 { - return c.maxSize -} - -// Size returns the current space used by the cache. 
-func (c *Cache) Size() int64 { - var size int64 - for i := range c.shards { - size += c.shards[i].Size() - } - return size -} - -// Alloc allocates a byte slice of the specified size, possibly reusing -// previously allocated but unused memory. The memory backing the value is -// manually managed. The caller MUST either add the value to the cache (via -// Cache.Set), or release the value (via Cache.Free). Failure to do so will -// result in a memory leak. -func Alloc(n int) *Value { - return newValue(n) -} - -// Free frees the specified value. The buffer associated with the value will -// possibly be reused, making it invalid to use the buffer after calling -// Free. Do not call Free on a value that has been added to the cache. -func Free(v *Value) { - if n := v.refs(); n > 1 { - panic(fmt.Sprintf("pebble: Value has been added to the cache: refs=%d", n)) - } - v.release() -} - -// Reserve N bytes in the cache. This effectively shrinks the size of the cache -// by N bytes, without actually consuming any memory. The returned closure -// should be invoked to release the reservation. -func (c *Cache) Reserve(n int) func() { - // Round-up the per-shard reservation. Most reservations should be large, so - // this probably doesn't matter in practice. - shardN := (n + len(c.shards) - 1) / len(c.shards) - for i := range c.shards { - c.shards[i].Reserve(shardN) - } - return func() { - if shardN == -1 { - panic("pebble: cache reservation already released") - } - for i := range c.shards { - c.shards[i].Reserve(-shardN) - } - shardN = -1 - } -} - -// Metrics returns the metrics for the cache. -func (c *Cache) Metrics() Metrics { - var m Metrics - for i := range c.shards { - s := &c.shards[i] - s.mu.RLock() - m.Count += int64(s.blocks.Count()) - m.Size += s.sizeHot + s.sizeCold - s.mu.RUnlock() - m.Hits += s.hits.Load() - m.Misses += s.misses.Load() - } - return m -} - -// NewID returns a new ID to be used as a namespace for cached file -// blocks. 
-func (c *Cache) NewID() uint64 { - return c.idAlloc.Add(1) -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/clockpro_normal.go b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/clockpro_normal.go similarity index 93% rename from vendor/github.com/cockroachdb/pebble/internal/cache/clockpro_normal.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/cache/clockpro_normal.go index ae49938..c8b250c 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/cache/clockpro_normal.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/clockpro_normal.go @@ -3,7 +3,6 @@ // the LICENSE file. //go:build !tracing -// +build !tracing package cache diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/clockpro_tracing.go b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/clockpro_tracing.go similarity index 95% rename from vendor/github.com/cockroachdb/pebble/internal/cache/clockpro_tracing.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/cache/clockpro_tracing.go index d14c1cd..ccc592b 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/cache/clockpro_tracing.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/clockpro_tracing.go @@ -3,7 +3,6 @@ // the LICENSE file. //go:build tracing -// +build tracing package cache diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/cache/entry.go b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/entry.go new file mode 100644 index 0000000..f1099b0 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/entry.go @@ -0,0 +1,268 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package cache + +import ( + "fmt" + "os" + "runtime" + "sync" + "sync/atomic" + "unsafe" + + "github.com/cockroachdb/pebble/v2/internal/buildtags" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/manual" +) + +type entryType int8 + +const ( + etTest entryType = iota + etCold + etHot +) + +func (p entryType) String() string { + switch p { + case etTest: + return "test" + case etCold: + return "cold" + case etHot: + return "hot" + } + return "unknown" +} + +// entry holds the metadata for a cache entry. The memory for an entry is +// allocated from manually managed memory. +// +// Using manual memory management for entries may seem to be a violation of +// the Cgo pointer rules: +// +// https://golang.org/cmd/cgo/#hdr-Passing_pointers +// +// Specifically, Go pointers should not be stored in C allocated memory. The +// reason for this rule is that the Go GC will not look at C allocated memory +// to find pointers to Go objects. If the only reference to a Go object is +// stored in C allocated memory, the object will be reclaimed. The entry +// contains various pointers to other entries. This does not violate the Go +// pointer rules because either all entries are manually allocated or none +// are. Also, even if we had a mix of C and Go allocated memory, which would +// violate the rule, we would not have this reclamation problem since the +// lifetime of the entry is managed by the shard containing it, and not +// reliant on the entry pointers. +type entry struct { + key key + // The value associated with the entry. The entry holds a reference on the + // value which is maintained by entry.setValue(). + val *Value + blockLink struct { + next *entry + prev *entry + } + fileLink struct { + next *entry + prev *entry + } + size int64 + ptype entryType + // referenced is atomically set to indicate that this entry has been accessed + // since the last time one of the clock hands swept it. 
+ referenced atomic.Bool +} + +func newEntry(key key, size int64) *entry { + e := entryAllocNew() + *e = entry{ + key: key, + size: size, + ptype: etCold, + } + e.blockLink.next = e + e.blockLink.prev = e + e.fileLink.next = e + e.fileLink.prev = e + return e +} + +func (e *entry) free() { + e.setValue(nil) + entryAllocFree(e) +} + +func (e *entry) next() *entry { + if e == nil { + return nil + } + return e.blockLink.next +} + +func (e *entry) prev() *entry { + if e == nil { + return nil + } + return e.blockLink.prev +} + +func (e *entry) link(s *entry) { + s.blockLink.prev = e.blockLink.prev + s.blockLink.prev.blockLink.next = s + s.blockLink.next = e + s.blockLink.next.blockLink.prev = s +} + +func (e *entry) unlink() *entry { + next := e.blockLink.next + e.blockLink.prev.blockLink.next = e.blockLink.next + e.blockLink.next.blockLink.prev = e.blockLink.prev + e.blockLink.prev = e + e.blockLink.next = e + return next +} + +func (e *entry) linkFile(s *entry) { + s.fileLink.prev = e.fileLink.prev + s.fileLink.prev.fileLink.next = s + s.fileLink.next = e + s.fileLink.next.fileLink.prev = s +} + +func (e *entry) unlinkFile() *entry { + next := e.fileLink.next + e.fileLink.prev.fileLink.next = e.fileLink.next + e.fileLink.next.fileLink.prev = e.fileLink.prev + e.fileLink.prev = e + e.fileLink.next = e + return next +} + +func (e *entry) setValue(v *Value) { + if v != nil { + v.acquire() + } + old := e.val + e.val = v + old.Release() +} + +func (e *entry) acquireValue() *Value { + v := e.val + if v != nil { + v.acquire() + } + return v +} + +// The entries are normally allocated using the manual package. We use a +// sync.Pool with each item in the pool holding multiple entries that can be +// reused. +// +// We cannot use manual memory when the Value is allocated using the Go +// allocator: in this case, we use the Go allocator because we need the entry +// pointers to the Values to be discoverable by the GC. 
+// +// We also use the Go allocator in race mode because the normal path relies on a +// finalizer (and historically there have been some finalizer-related bugs in +// the race detector, in go1.15 and earlier). +const entriesGoAllocated = valueEntryGoAllocated || buildtags.Race + +const entrySize = unsafe.Sizeof(entry{}) + +func entryAllocNew() *entry { + if invariants.UseFinalizers { + // We want to allocate each entry independently to check that it has been + // properly cleaned up. + e := &entry{} + invariants.SetFinalizer(e, func(obj interface{}) { + e := obj.(*entry) + if *e != (entry{}) { + fmt.Fprintf(os.Stderr, "%p: entry was not freed", e) + os.Exit(1) + } + }) + return e + } + a := entryAllocPool.Get().(*entryAllocCache) + e := a.alloc() + entryAllocPool.Put(a) + return e +} + +func entryAllocFree(e *entry) { + if invariants.UseFinalizers { + *e = entry{} + return + } + a := entryAllocPool.Get().(*entryAllocCache) + *e = entry{} + a.free(e) + entryAllocPool.Put(a) +} + +var entryAllocPool = sync.Pool{ + New: func() interface{} { + return newEntryAllocCache() + }, +} + +// entryAllocCacheLimit is the maximum number of entries that are cached inside +// a pooled object. +const entryAllocCacheLimit = 128 + +type entryAllocCache struct { + entries []*entry +} + +func newEntryAllocCache() *entryAllocCache { + c := &entryAllocCache{} + if !entriesGoAllocated { + // Note the use of a "real" finalizer here (as opposed to a build tag-gated + // no-op finalizer). Without the finalizer, objects released from the pool + // and subsequently GC'd by the Go runtime would fail to have their manually + // allocated memory freed, which results in a memory leak. 
+ // lint:ignore SetFinalizer + runtime.SetFinalizer(c, freeEntryAllocCache) + } + return c +} + +func freeEntryAllocCache(obj interface{}) { + c := obj.(*entryAllocCache) + for i, e := range c.entries { + c.dealloc(e) + c.entries[i] = nil + } +} + +func (c *entryAllocCache) alloc() *entry { + n := len(c.entries) + if n == 0 { + if entriesGoAllocated { + return &entry{} + } + b := manual.New(manual.BlockCacheEntry, entrySize) + return (*entry)(b.Data()) + } + e := c.entries[n-1] + c.entries = c.entries[:n-1] + return e +} + +func (c *entryAllocCache) dealloc(e *entry) { + if !entriesGoAllocated { + buf := manual.MakeBufUnsafe(unsafe.Pointer(e), entrySize) + manual.Free(manual.BlockCacheEntry, buf) + } +} + +func (c *entryAllocCache) free(e *entry) { + if len(c.entries) == entryAllocCacheLimit { + c.dealloc(e) + return + } + c.entries = append(c.entries, e) +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/cache/read_shard.go b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/read_shard.go new file mode 100644 index 0000000..d79f4e5 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/read_shard.go @@ -0,0 +1,378 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package cache + +import ( + "context" + "sync" + "time" + + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/swiss" +) + +// readShard coordinates the read of a block that will be put in the cache. It +// ensures only one goroutine is reading a block, and other callers block +// until that goroutine is done (with success or failure). In the case of +// success, the other goroutines will use the value that was read, even if it +// is too large to be placed in the cache, or got evicted from the cache +// before they got scheduled. 
In the case of a failure (read error or context +// cancellation), one of the waiters will be given a turn to do the read. +// +// This turn-taking ensures that a large number of concurrent attempts to read +// the same block that is not in the cache does not result in the same number +// of reads from the filesystem (or remote storage). We have seen large spikes +// in memory usage (and CPU usage for memory allocation/deallocation) without +// this turn-taking. +// +// It introduces a small risk related to context cancellation -- if many +// readers assigned a turn exceed their deadline while doing the read and +// report an error, a reader with a longer deadline can unnecessarily wait. We +// accept this risk for now since the primary production use in CockroachDB is +// filesystem reads, where context cancellation is not respected. We do +// introduce an error duration metric emitted in traces that can be used to +// quantify such wasteful waiting. Note that this same risk extends to waiting +// on the Options.LoadBlockSema, so the error duration metric includes the +// case of an error when waiting on the semaphore (as a side effect of that +// waiting happening in the caller, sstable.Reader). +// +// Design choices and motivation: +// +// - At its core, readShard is a map with synchronization. For the same reason +// the cache is sharded (for higher concurrency by sharding the mutex), it +// is beneficial to shard synchronization on readShard. By making readShard +// a member of shard, this sharding is trivially accomplished. readShard has +// its own mutex (separate from shard.mu), in order to avoid write-locking +// shard.mu when we start a read. +// +// - readShard is integrated with the corresponding cache shard; this allows +// the cache interaction to be hidden behind readEntry.SetReadValue. 
One +// disadvantage of this tightly integrated design is that it does not +// encompass readers that will put the read value into a block.BufferPool -- +// we don't worry about those since block.BufferPool is only used for +// compactions and there is at most one compaction reader of a block. There +// is the possibility that the compaction reader and a user-facing iterator +// reader will do duplicate reads, but we accept that deficiency. +// +// - readMap is separate from shard.blocks map: One could have a design which +// extends the cache entry and unifies the two maps. However, we never want +// to evict a readEntry while there are readers waiting for the block read +// (including the case where the corresponding file is being removed from +// shard.files). Also, the number of stable cache entries is huge and +// therefore is manually allocated, while the number of readEntries is small +// (so manual allocation isn't necessary). For these reasons we maintain a +// separate map. This separation also results in more modular code, instead +// of piling more stuff into shard. +type readShard struct { + // shard is only used for locking, and calling shard.Set. + shard *shard + // Protected by shard.mu. + mu struct { + sync.Mutex + readMap swiss.Map[key, *readEntry] + } +} + +func (rs *readShard) Init(shard *shard) *readShard { + *rs = readShard{ + shard: shard, + } + // Choice of 16 is arbitrary. + rs.mu.readMap.Init(16) + return rs +} + +// acquireReadEntry acquires a *readEntry for (id, fileNum, offset), creating +// one if necessary. +func (rs *readShard) acquireReadEntry(k key) *readEntry { + rs.mu.Lock() + defer rs.mu.Unlock() + + if e, ok := rs.mu.readMap.Get(k); ok { + // An entry we found in the map while holding the mutex must have a non-zero + // reference count. 
+ if e.refCount < 1 { + panic("invalid reference count") + } + e.refCount++ + return e + } + + e := newReadEntry(rs, k) + rs.mu.readMap.Put(k, e) + return e +} + +func (rs *readShard) lenForTesting() int { + rs.mu.Lock() + defer rs.mu.Unlock() + return rs.mu.readMap.Len() +} + +// readEntry is used to coordinate between concurrent attempted readers of the +// same block. +type readEntry struct { + readShard *readShard + key key + mu struct { + sync.RWMutex + // v, when non-nil, has a ref from readEntry, which is unreffed when + // readEntry is deleted from the readMap. + v *Value + // isReading and ch together capture the state of whether someone has been + // granted a turn to read, and of readers waiting for that read to finish. + // ch is lazily allocated since most readEntries will not see concurrent + // readers. This lazy allocation results in one transition of ch from nil + // to non-nil, so waiters can read this non-nil ch and block on reading + // from it without holding mu. + // + // ch is written to, to signal one waiter to start doing the read. ch is + // closed when the value is successfully read and has been stored in v, so + // that all waiters wake up and read v. ch is a buffered channel with a + // capacity of 1. + // + // State transitions when trying to wait for turn: + // Case !isReading: + // set isReading=true; Drain the ch if non-nil and non-empty; proceed + // with turn to do the read. + // Case isReading: + // allocate ch if nil; wait on ch + // Finished reading successfully: + // set isReading=false; if ch is non-nil, close ch. + // Finished reading with failure: + // set isReading=false; if ch is non-nil, write to ch. + // + // INVARIANT: + // isReading => ch is nil or ch is empty. + isReading bool + ch chan struct{} + // Total duration of reads and semaphore waiting that resulted in error. + errorDuration time.Duration + readStart time.Time + } + // Count of ReadHandles that refer to this readEntry. Protected by readShard.mu. 
+	refCount int32
+}
+
+var readEntryPool = sync.Pool{
+	New: func() interface{} {
+		return &readEntry{}
+	},
+}
+
+func newReadEntry(rs *readShard, k key) *readEntry {
+	e := readEntryPool.Get().(*readEntry)
+	*e = readEntry{
+		readShard: rs,
+		key:       k,
+		refCount:  1,
+	}
+	return e
+}
+
+// waitForReadPermissionOrHandle returns either an already read value (in
+// Handle), an error (if the context was cancelled), or neither, which is a
+// directive to the caller to do the read. In this last case the caller must
+// call either setReadValue or setReadError.
+//
+// In all cases, errorDuration is populated with the total duration that
+// readers that observed an error (setReadError) spent in doing the read. This
+// duration can be greater than the time spent in waitForReadPermissionOrHandle,
+// since some of these errors could have occurred prior to this call. But it
+// serves as a rough indicator of whether turn taking could have caused higher
+// latency due to context cancellation.
+func (e *readEntry) waitForReadPermissionOrHandle(
+	ctx context.Context,
+) (cv *Value, errorDuration time.Duration, err error) {
+	constructValueLocked := func() *Value {
+		if e.mu.v == nil {
+			panic("value is nil")
+		}
+		e.mu.v.acquire()
+		return e.mu.v
+	}
+	becomeReaderLocked := func() {
+		if e.mu.v != nil {
+			panic("value is non-nil")
+		}
+		if e.mu.isReading {
+			panic("isReading is already true")
+		}
+		e.mu.isReading = true
+		if e.mu.ch != nil {
+			// Drain the channel, so that no one else mistakenly believes they
+			// should read.
+			select {
+			case <-e.mu.ch:
+			default:
+			}
+		}
+		e.mu.readStart = time.Now()
+	}
+
+	for {
+		e.mu.Lock()
+		if e.mu.v != nil {
+			// Value has already been read.
+			cv = constructValueLocked()
+			errorDuration = e.mu.errorDuration
+			e.mu.Unlock()
+			return cv, errorDuration, nil
+		}
+		// Not already read. Wait for turn to do the read or for someone else to do
+		// the read.
+		if !e.mu.isReading {
+			// Have permission to do the read.
+ becomeReaderLocked() + errorDuration = e.mu.errorDuration + e.mu.Unlock() + return nil, errorDuration, nil + } + if e.mu.ch == nil { + // Rare case when multiple readers are concurrently trying to read. If + // this turns out to be common enough we could use a sync.Pool. + e.mu.ch = make(chan struct{}, 1) + } + ch := e.mu.ch + e.mu.Unlock() + select { + case <-ctx.Done(): + e.mu.RLock() + errorDuration = e.mu.errorDuration + e.mu.RUnlock() + return nil, errorDuration, ctx.Err() + case _, ok := <-ch: + if !ok { + // Channel closed, so value was read. + e.mu.RLock() + if e.mu.v == nil { + panic("value is nil") + } + h := constructValueLocked() + errorDuration = e.mu.errorDuration + e.mu.RUnlock() + return h, errorDuration, nil + } + // Else, probably granted permission to do the read. NB: since isReading + // is false, someone else can slip through before this thread acquires + // e.mu, and take the turn. So try to actually get the turn by trying + // again in the loop. + } + } +} + +// unrefAndTryRemoveFromMap reduces the reference count of e and removes e.key +// => e from the readMap if necessary. +func (e *readEntry) unrefAndTryRemoveFromMap() { + rs := e.readShard + rs.mu.Lock() + e.refCount-- + if e.refCount > 0 { + // Entry still in use. + rs.mu.Unlock() + return + } + if e.refCount < 0 { + panic("invalid reference count") + } + // The refcount is now 0; remove from the map. + if invariants.Enabled { + if e2, ok := rs.mu.readMap.Get(e.key); !ok || e2 != e { + panic("entry not in readMap") + } + } + rs.mu.readMap.Delete(e.key) + rs.mu.Unlock() + + // Free s.e. + e.mu.v.Release() + *e = readEntry{} + readEntryPool.Put(e) +} + +func (e *readEntry) setReadValue(v *Value) { + // Add to the cache before taking another ref for readEntry, since the cache + // expects ref=1 when it is called. + // + // TODO(sumeer): if e.refCount > 1, we should consider overriding to ensure + // that it is added as etHot. 
The common case will be e.refCount = 1, and we + // don't want to acquire e.mu twice, so one way to do this would be relax + // the invariant in shard.Set that requires Value.refs() == 1. Then we can + // do the work under e.mu before calling shard.Set. + e.readShard.shard.set(e.key, v) + e.mu.Lock() + // Acquire a ref for readEntry, since we are going to remember it in e.mu.v. + v.acquire() + if e.mu.v != nil { + panic("value already set") + } + e.mu.v = v + if !e.mu.isReading { + panic("isReading is false") + } + e.mu.isReading = false + if e.mu.ch != nil { + // Inform all waiters so they can use e.mu.v. Not all readers have called + // readEntry.waitForReadPermissionOrHandle, and those will also use + // e.mu.v. + close(e.mu.ch) + } + e.mu.Unlock() + e.unrefAndTryRemoveFromMap() +} + +func (e *readEntry) setReadError(err error) { + e.mu.Lock() + if !e.mu.isReading { + panic("isReading is false") + } + e.mu.isReading = false + if e.mu.ch != nil { + select { + case e.mu.ch <- struct{}{}: + default: + panic("channel is not empty") + } + } + e.mu.errorDuration += time.Since(e.mu.readStart) + e.mu.Unlock() + e.unrefAndTryRemoveFromMap() +} + +// ReadHandle represents a contract with a caller that had a miss when doing a +// cache lookup, and wants to do a read and insert the read block into the +// cache. The contract applies when ReadHandle.Valid returns true, in which +// case the caller has been assigned the turn to do the read (and others are +// potentially waiting for it). +// +// Contract: +// +// The caller must immediately start doing a read, or can first wait on a +// shared resource that would also block a different reader if it was assigned +// the turn instead (specifically, this refers to Options.LoadBlockSema). +// After the read, it must either call SetReadValue or SetReadError depending +// on whether the read succeeded or failed. +type ReadHandle struct { + entry *readEntry +} + +// Valid returns true for a valid ReadHandle. 
+func (rh ReadHandle) Valid() bool { + return rh.entry != nil +} + +// SetReadValue provides the Value that the caller has read and sets it in the +// block cache. +// +// The cache takes a reference on the Value and holds it until it is evicted and +// no longer needed by other readers. +func (rh ReadHandle) SetReadValue(v *Value) { + rh.entry.setReadValue(v) +} + +// SetReadError specifies that the caller has encountered a read error. +func (rh ReadHandle) SetReadError(err error) { + rh.entry.setReadError(err) +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/refcnt_normal.go b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/refcnt_normal.go similarity index 92% rename from vendor/github.com/cockroachdb/pebble/internal/cache/refcnt_normal.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/cache/refcnt_normal.go index 9ab3348..a159267 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/cache/refcnt_normal.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/refcnt_normal.go @@ -3,7 +3,6 @@ // the LICENSE file. //go:build !tracing -// +build !tracing package cache @@ -37,6 +36,8 @@ func (v *refcnt) acquire() { } } +// release decrements the reference count and returns true when the reference +// count becomes 0. func (v *refcnt) release() bool { switch v := v.val.Add(-1); { case v < 0: diff --git a/vendor/github.com/cockroachdb/pebble/internal/cache/refcnt_tracing.go b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/refcnt_tracing.go similarity index 98% rename from vendor/github.com/cockroachdb/pebble/internal/cache/refcnt_tracing.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/cache/refcnt_tracing.go index 1d5e6c0..151c68d 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/cache/refcnt_tracing.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/refcnt_tracing.go @@ -3,7 +3,6 @@ // the LICENSE file. 
//go:build tracing -// +build tracing package cache diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/cache/value.go b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/value.go new file mode 100644 index 0000000..0b8999f --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/cache/value.go @@ -0,0 +1,143 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package cache + +import ( + "fmt" + "os" + "unsafe" + + "github.com/cockroachdb/pebble/v2/internal/buildtags" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/manual" +) + +// ValueMetadataSize denotes the number of bytes of metadata allocated for a +// cache entry. Note that builds with cgo disabled allocate no metadata, and +// 32-bit builds allocate less for a cache.Value. However, we keep the value +// constant to reduce friction for writing tests. +const ValueMetadataSize = 32 + +// Assert that the size of a Value{} is less than or equal to the +// ValueMetadataSize. +var _ uint = ValueMetadataSize - uint(unsafe.Sizeof(Value{})) + +// Value holds a reference counted immutable value. +type Value struct { + // buf is part of the slice allocated using the manual package. + buf []byte + // Reference count for the value. The value is freed when the reference count + // drops to zero. + ref refcnt +} + +// The Value struct is normally allocated together with the buffer using manual +// memory. +// +// If cgo is not available, the Value must be a normal Go object to keep +// the buffer reference visible to the GC. We also use the Go allocator if we +// want to add finalizer assertions. +const valueEntryGoAllocated = !buildtags.Cgo || invariants.UseFinalizers + +// Alloc allocates a byte slice of the specified size, possibly reusing +// previously allocated but unused memory. 
The memory backing the value is +// manually managed. The caller MUST either add the value to the cache (via +// Cache.Set), or release the value (via Cache.Free). Failure to do so will +// result in a memory leak. +func Alloc(n int) *Value { + if n == 0 { + return nil + } + + if valueEntryGoAllocated { + // Note: if cgo is not enabled, manual.New will do a regular Go allocation. + b := manual.New(manual.BlockCacheData, uintptr(n)) + v := &Value{buf: b.Slice()} + v.ref.init(1) + // Note: this is a no-op if invariants and tracing are disabled or race is + // enabled. + invariants.SetFinalizer(v, func(obj interface{}) { + v := obj.(*Value) + if v.buf != nil { + fmt.Fprintf(os.Stderr, "%p: cache value was not freed: refs=%d\n%s", + v, v.refs(), v.ref.traces()) + os.Exit(1) + } + }) + return v + } + // When we're not performing leak detection, the lifetime of the returned + // Value is exactly the lifetime of the backing buffer and we can manually + // allocate both. + b := manual.New(manual.BlockCacheData, ValueMetadataSize+uintptr(n)) + v := (*Value)(b.Data()) + v.buf = b.Slice()[ValueMetadataSize:] + v.ref.init(1) + return v +} + +func (v *Value) free() { + if invariants.Enabled { + // Poison the contents to help catch use-after-free bugs. + for i := range v.buf { + v.buf[i] = 0xff + } + } + if valueEntryGoAllocated { + buf := manual.MakeBufUnsafe(unsafe.Pointer(unsafe.SliceData(v.buf)), uintptr(cap(v.buf))) + manual.Free(manual.BlockCacheData, buf) + v.buf = nil + return + } + n := ValueMetadataSize + uintptr(cap(v.buf)) + buf := manual.MakeBufUnsafe(unsafe.Pointer(v), n) + v.buf = nil + manual.Free(manual.BlockCacheData, buf) +} + +// RawBuffer returns the buffer associated with the value. The contents of the buffer +// should not be changed once the value has been added to the cache. Instead, a +// new Value should be created and added to the cache to replace the existing +// value. 
+func (v *Value) RawBuffer() []byte { + if v == nil { + return nil + } + return v.buf +} + +// Truncate the buffer to the specified length. The buffer length should not be +// changed once the value has been added to the cache as there may be +// concurrent readers of the Value. Instead, a new Value should be created and +// added to the cache to replace the existing value. +func (v *Value) Truncate(n int) { + v.buf = v.buf[:n] +} + +func (v *Value) refs() int32 { + return v.ref.refs() +} + +func (v *Value) acquire() { + v.ref.acquire() +} + +// Release a ref count on the buffer. It is a no-op to call Release on a nil +// Value. +func (v *Value) Release() { + if v != nil && v.ref.release() { + v.free() + } +} + +// Free frees the specified value. The buffer associated with the value will +// possibly be reused, making it invalid to use the buffer after calling +// Free. Do not call Free on a value that has been added to the cache. +func Free(v *Value) { + if n := v.refs(); n > 1 { + panic(fmt.Sprintf("pebble: Value has been added to the cache: refs=%d", n)) + } + v.Release() +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/compact/iterator.go b/vendor/github.com/cockroachdb/pebble/v2/internal/compact/iterator.go new file mode 100644 index 0000000..19d1f93 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/compact/iterator.go @@ -0,0 +1,1472 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package compact + +import ( + "encoding/binary" + "io" + "strconv" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invalidating" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/rangekey" + "github.com/cockroachdb/redact" +) + +// Iter provides a forward-only iterator that encapsulates the logic for +// collapsing entries during compaction. It wraps an internal iterator and +// collapses entries that are no longer necessary because they are shadowed by +// newer entries. The simplest example of this is when the internal iterator +// contains two keys: a.PUT.2 and a.PUT.1. Instead of returning both entries, +// compact.Iter collapses the second entry because it is no longer necessary. +// The high-level structure for compact.Iter is to iterate over its internal +// iterator and output 1 entry for every user-key. There are four complications +// to this story. +// +// 1. Eliding Deletion Tombstones +// +// Consider the entries a.DEL.2 and a.PUT.1. These entries collapse to +// a.DEL.2. Do we have to output the entry a.DEL.2? Only if a.DEL.2 possibly +// shadows an entry at a lower level. If we're compacting to the base-level in +// the LSM tree then a.DEL.2 is definitely not shadowing an entry at a lower +// level and can be elided. +// +// We can do slightly better than only eliding deletion tombstones at the base +// level by observing that we can elide a deletion tombstone if there are no +// sstables that contain the entry's key. This check is performed by +// elideTombstone. +// +// 2. Merges +// +// The MERGE operation merges the value for an entry with the existing value +// for an entry. The logical value of an entry can be composed of a series of +// merge operations. 
When compact.Iter sees a MERGE, it scans forward in its +// internal iterator collapsing MERGE operations for the same key until it +// encounters a SET or DELETE operation. For example, the keys a.MERGE.4, +// a.MERGE.3, a.MERGE.2 will be collapsed to a.MERGE.4 and the values will be +// merged using the specified Merger. +// +// An interesting case here occurs when MERGE is combined with SET. Consider +// the entries a.MERGE.3 and a.SET.2. The collapsed key will be a.SET.3. The +// reason that the kind is changed to SET is because the SET operation acts as +// a barrier preventing further merging. This can be seen better in the +// scenario a.MERGE.3, a.SET.2, a.MERGE.1. The entry a.MERGE.1 may be at lower +// (older) level and not involved in the compaction. If the compaction of +// a.MERGE.3 and a.SET.2 produced a.MERGE.3, a subsequent compaction with +// a.MERGE.1 would merge the values together incorrectly. +// +// 3. Snapshots +// +// Snapshots are lightweight point-in-time views of the DB state. At its core, +// a snapshot is a sequence number along with a guarantee from Pebble that it +// will maintain the view of the database at that sequence number. Part of this +// guarantee is relatively straightforward to achieve. When reading from the +// database Pebble will ignore sequence numbers that are larger than the +// snapshot sequence number. The primary complexity with snapshots occurs +// during compaction: the collapsing of entries that are shadowed by newer +// entries is at odds with the guarantee that Pebble will maintain the view of +// the database at the snapshot sequence number. Rather than collapsing entries +// up to the next user key, compact.Iter can only collapse entries up to the +// next snapshot boundary. That is, every snapshot boundary potentially causes +// another entry for the same user-key to be emitted. 
Another way to view this +// is that snapshots define stripes and entries are collapsed within stripes, +// but not across stripes. Consider the following scenario: +// +// a.PUT.9 +// a.DEL.8 +// a.PUT.7 +// a.DEL.6 +// a.PUT.5 +// +// In the absence of snapshots these entries would be collapsed to +// a.PUT.9. What if there is a snapshot at sequence number 7? The entries can +// be divided into two stripes and collapsed within the stripes: +// +// a.PUT.9 a.PUT.9 +// a.DEL.8 ---> +// a.PUT.7 +// -- -- +// a.DEL.6 ---> a.DEL.6 +// a.PUT.5 +// +// All of the rules described earlier still apply, but they are confined to +// operate within a snapshot stripe. Snapshots only affect compaction when the +// snapshot sequence number lies within the range of sequence numbers being +// compacted. In the above example, a snapshot at sequence number 10 or at +// sequence number 5 would not have any effect. +// +// 4. Range Deletions +// +// Range deletions provide the ability to delete all of the keys (and values) +// in a contiguous range. Range deletions are stored indexed by their start +// key. The end key of the range is stored in the value. In order to support +// lookup of the range deletions which overlap with a particular key, the range +// deletion tombstones need to be fragmented whenever they overlap. This +// fragmentation is performed by keyspan.Fragmenter. The fragments are then +// subject to the rules for snapshots. For example, consider the two range +// tombstones [a,e)#1 and [c,g)#2: +// +// 2: c-------g +// 1: a-------e +// +// These tombstones will be fragmented into: +// +// 2: c---e---g +// 1: a---c---e +// +// Do we output the fragment [c,e)#1? Since it is covered by [c-e]#2 the answer +// depends on whether it is in a new snapshot stripe. +// +// In addition to the fragmentation of range tombstones, compaction also needs +// to take the range tombstones into consideration when outputting normal +// keys. 
Just as with point deletions, a range deletion covering an entry can
+// cause the entry to be elided.
+//
+// A note on the stability of keys and values.
+//
+// The stability guarantees of keys and values returned by the iterator tree
+// that backs a compact.Iter are nuanced and care must be taken when
+// referencing any returned items.
+//
+// Keys and values returned by exported functions (i.e. First, Next, etc.) have
+// lifetimes that fall into two categories:
+//
+// Lifetime valid for duration of compaction. Range deletion keys and values are
+// stable for the duration of the compaction, due to the way in which a
+// compact.Iter is typically constructed (i.e. via (*compaction).newInputIter,
+// which wraps the iterator over the range deletion block in a noCloseIter,
+// preventing the release of the backing memory until the compaction is
+// finished).
+//
+// Lifetime limited to duration of sstable block liveness. Point keys (SET, DEL,
+// etc.) and values must be cloned / copied following the return from the
+// exported function, and before a subsequent call to Next advances the iterator
+// and mutates the contents of the returned key and value.
+type Iter struct {
+	cmp            base.Compare
+	cmpRangeSuffix base.CompareRangeSuffixes
+
+	cfg IterConfig
+
+	// rangeDelInterleaving is an interleaving iterator for range deletions, that
+	// interleaves range tombstones among the point keys.
+	rangeDelInterleaving keyspan.InterleavingIter
+	// rangeKeyInterleaving is the interleaving iter for range keys.
+	rangeKeyInterleaving keyspan.InterleavingIter
+
+	// iter is the iterator which interleaves points with RANGEDELs and range
+	// keys.
+	iter base.InternalIterator
+
+	delElider         pointTombstoneElider
+	rangeDelCompactor RangeDelSpanCompactor
+	rangeKeyCompactor RangeKeySpanCompactor
+	err               error
+	// kv is the next key-value pair that will be yielded to the iterator
+	// consumer. All public methods will return &kv or nil.
+ // + // The memory of kv.K.UserKey is always backed by `keyBuf`. The memory of + // kv.V may be backed by `valueBuf`, or may point directly into the block + // buffer of an input ssblock. + // + // When stepping the inner internal iterator, the contents of iterKV are + // moved into kv. + // + // The value of kv.K is also used internally within the compaction iterator + // when moving to the next key so it can determine whether the user key has + // changed from the previous key. + kv base.InternalKV + // keyTrailer is updated when `i.kv` is updated and holds the key's original + // trailer (eg, before any sequence-number zeroing or changes to key kind). + keyTrailer base.InternalKeyTrailer + valueCloser io.Closer + // Temporary buffer used for storing the previous user key in order to + // determine when iteration has advanced to a new user key and thus a new + // snapshot stripe. + keyBuf []byte + // Temporary buffer used for storing the previous value, which may be an + // unsafe, i.iter-owned slice that could be altered when the iterator is + // advanced. + valueBuf []byte + // valueFetcher is used by saveValue when Cloning InternalValues. + valueFetcher base.LazyFetcher + iterKV *base.InternalKV + iterStripeChange stripeChangeType + // skip indicates whether the remaining entries in the current snapshot + // stripe should be skipped or processed. `skip` has no effect when `pos == + // iterPosNext`. + skip bool + // pos indicates the iterator position at the top of Next(): + // - iterPosCurForward: the iterator is at the last key returned. + // - iterPosNext: the iterator has already been advanced to the next + // candidate key. For example, this happens when processing merge operands, + // where we advance the iterator all the way into the next stripe or next + // user key to ensure we've seen all mergeable operands. 
+ pos iterPos + // snapshotPinned indicates whether the last point key returned by the + // compaction iterator was only returned because an open snapshot prevents + // its elision. This field only applies to point keys, and not to range + // deletions or range keys. + snapshotPinned bool + // forceObsoleteDueToRangeDel is set to true in a subset of the cases that + // snapshotPinned is true. This value is true when the point is obsolete due + // to a RANGEDEL but could not be deleted due to a snapshot. + // + // NB: it may seem that the additional cases that snapshotPinned captures + // are harmless in that they can also be used to mark a point as obsolete + // (it is merely a duplication of some logic that happens in + // Writer.AddWithForceObsolete), but that is not quite accurate as of this + // writing -- snapshotPinned originated in stats collection and for a + // sequence MERGE, SET, where the MERGE cannot merge with the (older) SET + // due to a snapshot, the snapshotPinned value for the SET is true. + // + // TODO(sumeer,jackson): improve the logic of snapshotPinned and reconsider + // whether we need forceObsoleteDueToRangeDel. + forceObsoleteDueToRangeDel bool + // The index of the snapshot for the current key within the snapshots slice. + curSnapshotIdx int + curSnapshotSeqNum base.SeqNum + // frontiers holds a heap of user keys that affect compaction behavior when + // they're exceeded. Before a new key is returned, the compaction iterator + // advances the frontier, notifying any code that subscribed to be notified + // when a key was reached. The primary use today is within the + // implementation of compactionOutputSplitters in compaction.go. Many of + // these splitters wait for the compaction iterator to call Advance(k) when + // it's returning a new key. 
If the key that they're waiting for is
+	// surpassed, these splitters update internal state recording that they
+	// should request a compaction split next time they're asked in
+	// [shouldSplitBefore].
+	frontiers Frontiers
+
+	// lastRangeDelSpan stores the last, not compacted tombstone span. It is used
+	// to elide points or mark them as snapshot-pinned.
+	lastRangeDelSpan keyspan.Span
+	// lastRangeDelSpanFrontier is the frontier used to clear out lastRangeDelSpan
+	// when we move beyond its end key.
+	lastRangeDelSpanFrontier frontier
+
+	// span stores the last, compacted tombstone or range key span. It is provided
+	// to the caller via Span().
+	span keyspan.Span
+
+	stats IterStats
+}
+
+// IterConfig contains the parameters necessary to create a compaction iterator.
+type IterConfig struct {
+	Comparer *base.Comparer
+	Merge    base.Merge
+
+	// The snapshot sequence numbers that need to be maintained. These sequence
+	// numbers define the snapshot stripes.
+	Snapshots Snapshots
+
+	TombstoneElision TombstoneElision
+	RangeKeyElision  TombstoneElision
+
+	// IsBottommostDataLayer indicates that the compaction inputs form the
+	// bottommost layer of data for the compaction's key range. This allows the
+	// sequence number of KVs in the bottom snapshot stripe to be simplified to
+	// 0 (which improves compression and enables an optimization during forward
+	// iteration). This can be enabled if there are no tables overlapping the
+	// output at lower levels (than the output) in the LSM.
+	//
+	// This field may be false even when nothing is overlapping in lower levels.
+	// At the time of writing, flushes always set this to false (because flushes
+	// almost never form the bottommost layer of data).
+	IsBottommostDataLayer bool
+
+	// IneffectualSingleDeleteCallback is called if a SINGLEDEL is being elided
+	// without deleting a point set/merge. False positives are rare but possible
+	// (because of delete-only compactions).
+ IneffectualSingleDeleteCallback func(userKey []byte) + + // NondeterministicSingleDeleteCallback is called in compactions/flushes if any + // single delete has consumed a Set/Merge, and there is another immediately older + // Set/SetWithDelete/Merge. False positives are rare but possible (because of + // delete-only compactions). + NondeterministicSingleDeleteCallback func(userKey []byte) + + // MissizedDeleteCallback is called in compactions/flushes when a DELSIZED + // tombstone is found that did not accurately record the size of the value it + // deleted. This can lead to incorrect behavior in compactions. + // + // For the second case, elidedSize and expectedSize will be set to the actual + // size of the elided key and the expected size that was recorded in the + // tombstone. For the first case (when a key doesn't exist), these will be 0. + MissizedDeleteCallback func(userKey []byte, elidedSize, expectedSize uint64) +} + +func (c *IterConfig) ensureDefaults() { + if c.IneffectualSingleDeleteCallback == nil { + c.IneffectualSingleDeleteCallback = func(userKey []byte) {} + } + if c.NondeterministicSingleDeleteCallback == nil { + c.NondeterministicSingleDeleteCallback = func(userKey []byte) {} + } + if c.MissizedDeleteCallback == nil { + c.MissizedDeleteCallback = func(userKey []byte, _, _ uint64) {} + } +} + +// IterStats are statistics produced by the compaction iterator. +type IterStats struct { + // Count of DELSIZED keys that were missized. + CountMissizedDels uint64 +} + +type iterPos int8 + +const ( + iterPosCurForward iterPos = 0 + iterPosNext iterPos = 1 +) + +// NewIter creates a new compaction iterator. See the comment for Iter for a +// detailed description. +// rangeDelIter and rangeKeyIter can be nil. 
+func NewIter(
+	cfg IterConfig,
+	pointIter base.InternalIterator,
+	rangeDelIter, rangeKeyIter keyspan.FragmentIterator,
+) *Iter {
+	cfg.ensureDefaults()
+	i := &Iter{
+		cmp:            cfg.Comparer.Compare,
+		cmpRangeSuffix: cfg.Comparer.CompareRangeSuffixes,
+		cfg:            cfg,
+		// We don't want a nil keyBuf because if the first key we encounter is
+		// empty, it would become nil.
+		keyBuf: make([]byte, 8),
+	}
+
+	// Build the input iterator stack: the range-del interleaving iterator (if
+	// any) wraps the point iterator, and the range-key interleaving iterator
+	// (if any) wraps that, so range keys are interleaved outermost.
+	iter := pointIter
+	if rangeDelIter != nil {
+		i.rangeDelInterleaving.Init(cfg.Comparer, iter, rangeDelIter, keyspan.InterleavingIterOpts{})
+		iter = &i.rangeDelInterleaving
+	}
+	if rangeKeyIter != nil {
+		i.rangeKeyInterleaving.Init(cfg.Comparer, iter, rangeKeyIter, keyspan.InterleavingIterOpts{})
+		iter = &i.rangeKeyInterleaving
+	}
+	i.iter = invalidating.MaybeWrapIfInvariants(iter)
+
+	i.frontiers.Init(i.cmp)
+	i.delElider.Init(i.cmp, cfg.TombstoneElision)
+	i.rangeDelCompactor = MakeRangeDelSpanCompactor(i.cmp, i.cfg.Comparer.Equal, cfg.Snapshots, cfg.TombstoneElision)
+	i.rangeKeyCompactor = MakeRangeKeySpanCompactor(i.cmp, i.cmpRangeSuffix, cfg.Snapshots, cfg.RangeKeyElision)
+	i.lastRangeDelSpanFrontier.Init(&i.frontiers, nil, i.lastRangeDelSpanFrontierReached)
+	return i
+}
+
+// Frontiers returns the frontiers for the compaction iterator.
+func (i *Iter) Frontiers() *Frontiers {
+	return &i.frontiers
+}
+
+// SnapshotPinned returns whether the last point key returned by the compaction
+// iterator was only returned because an open snapshot prevents its elision.
+// This field only applies to point keys, and not to range deletions or range
+// keys.
+func (i *Iter) SnapshotPinned() bool {
+	return i.snapshotPinned
+}
+
+// ForceObsoleteDueToRangeDel returns true in a subset of the cases when
+// SnapshotPinned returns true. This value is true when the point is obsolete
+// due to a RANGEDEL but could not be deleted due to a snapshot.
+func (i *Iter) ForceObsoleteDueToRangeDel() bool {
+	return i.forceObsoleteDueToRangeDel
+}
+
+// Stats returns the compaction iterator stats.
+func (i *Iter) Stats() IterStats {
+	return i.stats
+}
+
+// First has the same semantics as InternalIterator.First.
+func (i *Iter) First() *base.InternalKV {
+	if i.err != nil {
+		return nil
+	}
+	i.iterKV = i.iter.First()
+	if i.iterKV != nil {
+		i.curSnapshotIdx, i.curSnapshotSeqNum = i.cfg.Snapshots.IndexAndSeqNum(i.iterKV.SeqNum())
+	}
+	// Pretend we already advanced past a previously-returned key so that Next
+	// does not advance again before examining the first key.
+	i.pos = iterPosNext
+	i.iterStripeChange = newStripeNewKey
+	return i.Next()
+}
+
+// Next has the same semantics as InternalIterator.Next. Note that when Next
+// returns a RANGEDEL or a range key, the caller can use Span() to get the
+// corresponding span.
+func (i *Iter) Next() *base.InternalKV {
+	if i.err != nil {
+		return nil
+	}
+
+	// Close the closer for the current value if one was open.
+	if i.closeValueCloser() != nil {
+		return nil
+	}
+
+	// Prior to this call to `Next()` we are in one of three situations with
+	// respect to `iterKey` and related state:
+	//
+	// - `!skip && pos == iterPosNext`: `iterKey` is already at the next key.
+	// - `!skip && pos == iterPosCurForward`: We are at the key that has been returned.
+	//   To move forward we advance by one key, even if that lands us in the same
+	//   snapshot stripe.
+	// - `skip && pos == iterPosCurForward`: We are at the key that has been returned.
+	//   To move forward we skip skippable entries in the stripe.
+	if i.pos == iterPosCurForward {
+		if i.skip {
+			i.skipInStripe()
+		} else {
+			i.nextInStripe()
+		}
+	} else if i.skip {
+		panic(errors.AssertionFailedf("compaction iterator has skip=true, but iterator is at iterPosNext"))
+	}
+
+	// From here on, i.iterKV (when non-nil) is the candidate key under
+	// consideration for this call.
+	i.pos = iterPosCurForward
+
+	for i.iterKV != nil {
+		i.frontiers.Advance(i.iterKV.K.UserKey)
+
+		// If we entered a new snapshot stripe with the same key, any key we
+		// return on this iteration is only returned because the open snapshot
+		// prevented it from being elided or merged with the key returned for
+		// the previous stripe. Mark it as pinned so that the compaction loop
+		// can correctly populate output tables' pinned statistics. We might
+		// also set snapshotPinned=true down below if we observe that the key is
+		// deleted by a range deletion in a higher stripe or that this key is a
+		// tombstone that could be elided if only it were in the last snapshot
+		// stripe.
+		i.snapshotPinned = i.iterStripeChange == newStripeSameKey
+
+		if i.iterKV.Kind() == base.InternalKeyKindRangeDelete || rangekey.IsRangeKey(i.iterKV.Kind()) {
+			// Return the span so the compaction can use it for file truncation and add
+			// it to the relevant fragmenter. In the case of range deletions, we do not
+			// set `skip` to true before returning as there may be any number of point
+			// keys with the same user key and sequence numbers ≥ the range deletion's
+			// sequence number. Such point keys must be visible (i.e., not skipped
+			// over) since we promise point keys are not deleted by range tombstones at
+			// the same sequence number (or higher).
+			//
+			// Note that `skip` must already be false here, because range keys and range
+			// deletions are interleaved at the maximal sequence numbers and neither will
+			// set `skip`=true.
+			if i.skip {
+				panic(errors.AssertionFailedf("pebble: compaction iterator: skip unexpectedly true"))
+			}
+
+			if i.iterKV.Kind() == base.InternalKeyKindRangeDelete {
+				span := i.rangeDelInterleaving.Span()
+				i.setLastRangeDelSpan(span)
+				i.rangeDelCompactor.Compact(span, &i.span)
+				if i.span.Empty() {
+					// The range del span was elided entirely; don't return this key to the caller.
+					i.saveKey()
+					i.nextInStripe()
+					continue
+				}
+			} else {
+				i.rangeKeyCompactor.Compact(i.rangeKeyInterleaving.Span(), &i.span)
+				if i.span.Empty() {
+					// The range key span was elided entirely; don't return this key to the caller.
+					i.saveKey()
+					i.nextInStripe()
+					continue
+				}
+			}
+
+			// NOTE: there is a subtle invariant violation here in that calling
+			// saveKey and returning a reference to the temporary slice violates
+			// the stability guarantee for range deletion keys. A potential
+			// mediation could return the original iterKey and iterValue
+			// directly, as the backing memory is guaranteed to be stable until
+			// the compaction completes. The violation here is only minor in
+			// that the caller immediately clones the range deletion InternalKey
+			// when passing the key to the deletion fragmenter (see the
+			// call-site in compaction.go).
+			// TODO(travers): address this violation by removing the call to
+			// saveKey and instead return the original iterKey and iterValue.
+			// This goes against the comment on i.kv in the struct, and
+			// therefore warrants some investigation.
+			i.saveKey()
+			i.kv.V = i.iterKV.V
+			if invariants.Enabled && !i.kv.V.IsInPlaceValue() {
+				panic(errors.AssertionFailedf("pebble: span key's value is not in-place"))
+			}
+			// TODO(jackson): Handle tracking pinned statistics for range keys
+			// and range deletions. This would require updating
+			// emitRangeDelChunk and rangeKeyCompactionTransform to update
+			// statistics when they apply their own snapshot striping logic.
+			i.snapshotPinned = false
+			return &i.kv
+		}
+
+		// Check if the last tombstone covers the key.
+		// TODO(sumeer): we could avoid calling tombstoneCovers if
+		// i.iterStripeChange == sameStripeSameKey since that check has already been
+		// done in nextInStripeHelper. However, we also need to handle the case of
+		// CoversInvisibly below.
+		switch i.tombstoneCovers(i.iterKV.K, i.curSnapshotSeqNum) {
+		case coversVisibly:
+			// A pending range deletion deletes this key. Skip it.
+			i.saveKey()
+			i.skipInStripe()
+			continue
+
+		case coversInvisibly:
+			// i.iterKV would be deleted by a range deletion if there weren't any open
+			// snapshots. Mark it as pinned.
+			//
+			// NB: there are multiple places in this file where we check for a
+			// covering tombstone and this is the only one where we are writing to
+			// i.snapshotPinned. Those other cases occur in mergeNext where the caller
+			// is deciding whether the value should be merged or not, and the key is
+			// in the same snapshot stripe. Hence, snapshotPinned is by definition
+			// false in those cases.
+			i.snapshotPinned = true
+			i.forceObsoleteDueToRangeDel = true
+
+		default:
+			i.forceObsoleteDueToRangeDel = false
+		}
+
+		switch i.iterKV.Kind() {
+		case base.InternalKeyKindDelete, base.InternalKeyKindSingleDelete, base.InternalKeyKindDeleteSized:
+			if i.delElider.ShouldElide(i.iterKV.K.UserKey) {
+				if i.curSnapshotIdx == 0 {
+					// If we're at the last snapshot stripe and the tombstone
+					// can be elided skip skippable keys in the same stripe.
+					i.saveKey()
+					if i.kv.K.Kind() == base.InternalKeyKindSingleDelete {
+						i.skipDueToSingleDeleteElision()
+					} else {
+						i.skipInStripe()
+						if !i.skip && i.iterStripeChange != newStripeNewKey {
+							panic(errors.AssertionFailedf("pebble: skipInStripe in last stripe disabled skip without advancing to new key"))
+						}
+					}
+					if i.iterStripeChange == newStripeSameKey {
+						panic(errors.AssertionFailedf("pebble: skipInStripe in last stripe found a new stripe within the same key"))
+					}
+					continue
+				} else {
+					// We're not at the last snapshot stripe, so the tombstone
+					// can NOT yet be elided. Mark it as pinned, so that it's
+					// included in table statistics appropriately.
+					i.snapshotPinned = true
+				}
+			}
+
+			switch i.iterKV.Kind() {
+			case base.InternalKeyKindDelete:
+				i.saveKey()
+				i.kv.V = base.InternalValue{} // DELs are value-less.
+				i.skip = true
+				return &i.kv
+
+			case base.InternalKeyKindDeleteSized:
+				// We may skip subsequent keys because of this tombstone. Scan
+				// ahead to see just how much data this tombstone drops and if
+				// the tombstone's value should be updated accordingly.
+				return i.deleteSizedNext()
+
+			case base.InternalKeyKindSingleDelete:
+				if i.singleDeleteNext() {
+					return &i.kv
+				} else if i.err != nil {
+					return nil
+				}
+				continue
+
+			default:
+				panic(errors.AssertionFailedf(
+					"unexpected kind %s", redact.SafeString(i.iterKV.Kind().String())))
+			}
+
+		case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete:
+			// The key we emit for this entry is a function of the current key
+			// kind, and whether this entry is followed by a DEL/SINGLEDEL
+			// entry. setNext() does the work to move the iterator forward,
+			// preserving the original value, and potentially mutating the key
+			// kind.
+			i.setNext()
+			if i.err != nil {
+				return nil
+			}
+			return &i.kv
+
+		case base.InternalKeyKindMerge:
+			// Record the snapshot index before mergeNext as merging
+			// advances the iterator, adjusting curSnapshotIdx.
+			origSnapshotIdx := i.curSnapshotIdx
+			var valueMerger base.ValueMerger
+			// MERGE values are always stored in-place.
+			valueMerger, i.err = i.cfg.Merge(i.iterKV.K.UserKey, i.iterKV.InPlaceValue())
+			if i.err == nil {
+				i.mergeNext(valueMerger)
+			}
+			var needDelete bool
+			if i.err == nil {
+				// If this is the oldest version of this key (the bottommost
+				// snapshot stripe), we can transform the sequence number to
+				// zero. This can improve compression and enables an
+				// optimization during forward iteration to skip some key
+				// comparisons. Additionally, we can transform the key kind to
+				// SET so that iteration and future compactions do not need to
+				// invoke the user's Merge operator.
+				if i.isBottommostSnapshotStripe(origSnapshotIdx) {
+					i.kv.K.SetSeqNum(base.SeqNumZero)
+					// During the merge (see mergeNext), we may have already
+					// transformed the key kind to SET or SETWITHDEL, in which
+					// case we want to preserve the existing key kind.
+					if i.kv.K.Kind() == base.InternalKeyKindMerge {
+						i.kv.K.SetKind(base.InternalKeyKindSet)
+					}
+				}
+
+				// includesBase is true when we've merged the oldest operand in
+				// the LSM.
+				var includesBase bool
+				switch i.kv.K.Kind() {
+				case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete:
+					includesBase = true
+				case base.InternalKeyKindMerge:
+				default:
+					panic(errors.AssertionFailedf(
+						"unexpected kind %s", redact.SafeString(i.kv.K.Kind().String())))
+				}
+				i.kv.V, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, includesBase)
+			}
+			if i.err == nil {
+				if needDelete {
+					if i.closeValueCloser() != nil {
+						return nil
+					}
+					continue
+				}
+				return &i.kv
+			}
+			if i.err != nil {
+				// TODO(sumeer): why is MarkCorruptionError only being called for
+				// MERGE?
+				i.err = base.MarkCorruptionError(i.err)
+			}
+			return nil
+
+		default:
+			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind()))
+			return nil
+		}
+	}
+
+	return nil
+}
+
+// Span returns the range deletion or range key span corresponding to the
+// current key. Can only be called right after a Next() call that returned a
+// RANGEDEL or a range key. The keys in the span should not be retained or
+// modified.
+func (i *Iter) Span() *keyspan.Span {
+	return &i.span
+}
+
+// closeValueCloser closes any open value closer, recording a close failure in
+// i.err. Returns the close error, if any.
+func (i *Iter) closeValueCloser() error {
+	if i.valueCloser == nil {
+		return nil
+	}
+
+	i.err = i.valueCloser.Close()
+	i.valueCloser = nil
+	return i.err
+}
+
+// skipInStripe skips over skippable keys in the same stripe and user key. It
+// may set i.err, in which case i.iterKV will be nil.
+func (i *Iter) skipInStripe() {
+	i.skip = true
+	// TODO(sumeer): we can avoid the overhead of calling i.rangeDelFrag.Covers,
+	// in this case of nextInStripe, since we are skipping all of them anyway.
+	for i.nextInStripe() == sameStripe {
+		if i.err != nil {
+			panic(i.err)
+		}
+	}
+	// We landed outside the original stripe, so reset skip.
+	i.skip = false
+}
+
+// iterNext advances the underlying iterator, recording any iterator error in
+// i.err. Returns false when iteration is exhausted or errored.
+func (i *Iter) iterNext() bool {
+	i.iterKV = i.iter.Next()
+	if i.iterKV == nil {
+		i.err = i.iter.Error()
+	}
+	return i.iterKV != nil
+}
+
+// stripeChangeType indicates how the snapshot stripe changed relative to the
+// previous key. If the snapshot stripe changed, it also indicates whether the
+// new stripe was entered because the iterator progressed onto an entirely new
+// key or entered a new stripe within the same key.
+type stripeChangeType int
+
+const (
+	newStripeNewKey stripeChangeType = iota
+	newStripeSameKey
+	sameStripe
+)
+
+// nextInStripe advances the iterator and returns one of the above const ints
+// indicating how its state changed.
+//
+// All sameStripe keys that are covered by a RANGEDEL will be skipped and not
+// returned.
+//
+// Calls to nextInStripe must be preceded by a call to saveKey to retain a
+// temporary reference to the original key, so that forward iteration can
+// proceed with a reference to the original key. Care should be taken to avoid
+// overwriting or mutating the saved key or value before they have been returned
+// to the caller of the exported function (i.e. the caller of Next, First, etc.)
+//
+// nextInStripe may set i.err, in which case the return value will be
+// newStripeNewKey, and i.iterKV will be nil.
+func (i *Iter) nextInStripe() stripeChangeType {
+	i.iterStripeChange = i.nextInStripeHelper()
+	return i.iterStripeChange
+}
+
+// nextInStripeHelper is an internal helper for nextInStripe; callers should use
+// nextInStripe and not call nextInStripeHelper.
+func (i *Iter) nextInStripeHelper() stripeChangeType {
+	origSnapshotIdx := i.curSnapshotIdx
+	for {
+		if !i.iterNext() {
+			return newStripeNewKey
+		}
+		kv := i.iterKV
+
+		// Is this a new key? There are two cases:
+		//
+		// 1. The new key has a different user key.
+		// 2. The previous key was an interleaved range deletion or range key
+		//    boundary. These keys are interleaved in the same input iterator
+		//    stream as point keys, but they do not obey the ordinary sequence
+		//    number ordering within a user key. If the previous key was one
+		//    of these keys, we consider the new key a `newStripeNewKey` to
+		//    reflect that it's the beginning of a new stream of point keys.
+		if i.kv.K.IsExclusiveSentinel() || !i.cfg.Comparer.Equal(i.kv.K.UserKey, kv.K.UserKey) {
+			i.curSnapshotIdx, i.curSnapshotSeqNum = i.cfg.Snapshots.IndexAndSeqNum(kv.SeqNum())
+			return newStripeNewKey
+		}
+
+		// If i.kv and kv have the same user key, then
+		// 1. i.kv must not have had a zero sequence number (or it would've been
+		//    the last key with its user key).
+		// 2. i.kv must have a strictly larger sequence number
+		// There's an exception in that either key may be a range delete. Range
+		// deletes may share a sequence number with a point key if the keys were
+		// ingested together. Range keys may also share the sequence number if they
+		// were ingested, but range keys are interleaved into the compaction
+		// iterator's input iterator at the maximal sequence number so their
+		// original sequence number will not be observed here.
+		if prevSeqNum := i.keyTrailer.SeqNum(); (prevSeqNum == 0 || prevSeqNum <= kv.SeqNum()) &&
+			i.kv.K.Kind() != base.InternalKeyKindRangeDelete && kv.Kind() != base.InternalKeyKindRangeDelete {
+			prevKey := i.kv.K
+			prevKey.Trailer = i.keyTrailer
+			panic(errors.AssertionFailedf("pebble: invariant violation: %s and %s out of order", prevKey, kv.K))
+		}
+
+		i.curSnapshotIdx, i.curSnapshotSeqNum = i.cfg.Snapshots.IndexAndSeqNum(kv.SeqNum())
+		switch kv.Kind() {
+		case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete,
+			base.InternalKeyKindRangeDelete:
+			// Range tombstones and range keys are interleaved at the max
+			// sequence number for a given user key, and the first key after one
+			// is always considered a newStripeNewKey, so we should never reach
+			// this.
+			panic("unreachable")
+		case base.InternalKeyKindDelete, base.InternalKeyKindSet, base.InternalKeyKindMerge, base.InternalKeyKindSingleDelete,
+			base.InternalKeyKindSetWithDelete, base.InternalKeyKindDeleteSized:
+			// Fall through
+		default:
+			kind := i.iterKV.Kind()
+			i.iterKV = nil
+			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(kind))
+			return newStripeNewKey
+		}
+		if i.curSnapshotIdx == origSnapshotIdx {
+			// Same snapshot.
+			if i.tombstoneCovers(i.iterKV.K, i.curSnapshotSeqNum) == coversVisibly {
+				continue
+			}
+			return sameStripe
+		}
+		return newStripeSameKey
+	}
+}
+
+// setNext processes a SET or SETWITHDEL, scanning ahead within the stripe to
+// decide whether the emitted key should be upgraded to SETWITHDEL.
+func (i *Iter) setNext() {
+	// Save the current key.
+	i.saveKey()
+	i.kv.V = i.iterKV.V
+
+	// If this is the oldest version of this key (the bottommost snapshot
+	// stripe), we can transform the sequence number to zero. This can improve
+	// compression and enables an optimization during forward iteration to skip
+	// some key comparisons.
+	if i.isBottommostSnapshotStripe(i.curSnapshotIdx) {
+		i.kv.K.SetSeqNum(base.SeqNumZero)
+	}
+
+	// If this key is already a SETWITHDEL we can early return and skip the remaining
+	// records in the stripe:
+	if i.iterKV.Kind() == base.InternalKeyKindSetWithDelete {
+		i.skip = true
+		return
+	}
+
+	// We need to iterate forward. Save the current value so we don't lose it.
+	i.saveValue()
+
+	// Else, we continue to loop through entries in the stripe looking for a
+	// DEL. Note that we may stop *before* encountering a DEL, if one exists.
+	//
+	// NB: nextInStripe will skip sameStripe keys that are visibly covered by a
+	// RANGEDEL. This can include DELs -- this is fine since such DELs don't
+	// need to be combined with SET to make SETWITHDEL.
+	for {
+		switch i.nextInStripe() {
+		case newStripeNewKey, newStripeSameKey:
+			i.pos = iterPosNext
+			return
+		case sameStripe:
+			// We're still in the same stripe. If this is a
+			// DEL/SINGLEDEL/DELSIZED, we stop looking and emit a SETWITHDEL.
+			// Subsequent keys are eligible for skipping.
+			switch i.iterKV.Kind() {
+			case base.InternalKeyKindDelete, base.InternalKeyKindSingleDelete, base.InternalKeyKindDeleteSized:
+				i.kv.K.SetKind(base.InternalKeyKindSetWithDelete)
+				i.skip = true
+				return
+			case base.InternalKeyKindSet, base.InternalKeyKindMerge, base.InternalKeyKindSetWithDelete:
+				// Do nothing
+			default:
+				i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind()))
+			}
+		default:
+			panic("pebble: unexpected stripeChangeType: " + strconv.Itoa(int(i.iterStripeChange)))
+		}
+	}
+}
+
+// mergeNext scans forward within the stripe, folding older operands into
+// valueMerger until a non-mergeable key or stripe boundary is reached.
+func (i *Iter) mergeNext(valueMerger base.ValueMerger) {
+	// Save the current key.
+	i.saveKey()
+
+	// Loop looking for older values in the current snapshot stripe and merge
+	// them.
+	for {
+		if i.nextInStripe() != sameStripe {
+			i.pos = iterPosNext
+			return
+		}
+		if i.err != nil {
+			panic(i.err)
+		}
+		// NB: MERGE#10+RANGEDEL#9 stays a MERGE, since nextInStripe skips
+		// sameStripe keys that are visibly covered by a RANGEDEL. There may be
+		// MERGE#7 that is invisibly covered and will be preserved, but there is
+		// no risk that MERGE#10 and MERGE#7 will get merged in the future as
+		// the RANGEDEL still exists and will be used in user-facing reads that
+		// see MERGE#10, and will also eventually cause MERGE#7 to be deleted in
+		// a compaction.
+		key := i.iterKV
+		switch key.Kind() {
+		case base.InternalKeyKindDelete, base.InternalKeyKindSingleDelete, base.InternalKeyKindDeleteSized:
+			// We've hit a deletion tombstone. Return everything up to this point and
+			// then skip entries until the next snapshot stripe. We change the kind
+			// of the result key to a Set so that it shadows keys in lower
+			// levels. That is, MERGE+DEL -> SETWITHDEL.
+			//
+			// We do the same for SingleDelete since SingleDelete is only
+			// permitted (with deterministic behavior) for keys that have been
+			// set once since the last SingleDelete/Delete, so everything
+			// older is acceptable to shadow. Note that this is slightly
+			// different from singleDeleteNext() which implements stricter
+			// semantics in terms of applying the SingleDelete to the single
+			// next Set. But those stricter semantics are not observable to
+			// the end-user since Iterator interprets SingleDelete as Delete.
+			// We could do something more complicated here and consume only a
+			// single Set, and then merge in any following Sets, but that is
+			// complicated wrt code and unnecessary given the narrow permitted
+			// use of SingleDelete.
+			i.kv.K.SetKind(base.InternalKeyKindSetWithDelete)
+			i.skip = true
+			return
+
+		case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete:
+			// We've hit a Set or SetWithDel value. Merge with the existing
+			// value and return. We change the kind of the resulting key to a
+			// Set so that it shadows keys in lower levels. That is:
+			// MERGE + (SET*) -> SET.
+			//
+			// Because we must merge the value, we must retrieve it regardless
+			// of whether the value is a blob reference.
+			var v []byte
+			var callerOwned bool
+			v, callerOwned, i.err = i.iterKV.Value(i.valueBuf[:0])
+			if i.err != nil {
+				return
+			}
+			if callerOwned && cap(v) > cap(i.valueBuf) {
+				i.valueBuf = v
+			}
+			i.err = valueMerger.MergeOlder(v)
+			if i.err != nil {
+				return
+			}
+			i.kv.K.SetKind(base.InternalKeyKindSet)
+			i.skip = true
+			return
+
+		case base.InternalKeyKindMerge:
+			// We've hit another Merge value. Merge with the existing value and
+			// continue looping.
+			//
+			// MERGE values are always stored in-place.
+			i.err = valueMerger.MergeOlder(i.iterKV.InPlaceValue())
+			if i.err != nil {
+				return
+			}
+
+		default:
+			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind()))
+			return
+		}
+	}
+}
+
+// singleDeleteNext processes a SingleDelete point tombstone. A SingleDelete, or
+// SINGLEDEL, is unique in that it deletes exactly 1 internal key. It's a
+// performance optimization when the client knows a user key has not been
+// overwritten, allowing the elision of the tombstone earlier, avoiding write
+// amplification.
+//
+// singleDeleteNext returns a boolean indicating whether or not the caller
+// should yield the SingleDelete key to the consumer of the Iter. If
+// singleDeleteNext returns false, the caller may consume/elide the
+// SingleDelete.
+func (i *Iter) singleDeleteNext() bool {
+	// Save the current key.
+	i.saveKey()
+	if invariants.Enabled && (!i.iterKV.V.IsInPlaceValue() || i.iterKV.V.Len() != 0) {
+		panic(errors.AssertionFailedf("pebble: single delete value is not in-place or is non-empty"))
+	}
+	i.kv.V = base.InternalValue{} // SINGLEDELs are value-less.
+
+	// Loop until finds a key to be passed to the next level.
+	for {
+		// If we find a key that can't be skipped, return true so that the
+		// caller yields the SingleDelete to the caller.
+		if i.nextInStripe() != sameStripe {
+			// This defers additional error checking regarding single delete
+			// invariants to the compaction where the keys with the same user key as
+			// the single delete are in the same stripe.
+			i.pos = iterPosNext
+			return i.err == nil
+		}
+		if i.err != nil {
+			panic(i.err)
+		}
+		// INVARIANT: sameStripe.
+		key := i.iterKV
+		kind := key.Kind()
+		switch kind {
+		case base.InternalKeyKindDelete, base.InternalKeyKindSetWithDelete, base.InternalKeyKindDeleteSized:
+			if kind == base.InternalKeyKindDelete || kind == base.InternalKeyKindDeleteSized {
+				i.cfg.IneffectualSingleDeleteCallback(i.kv.K.UserKey)
+			}
+			// We've hit a Delete, DeleteSized, SetWithDelete, transform
+			// the SingleDelete into a full Delete.
+			i.kv.K.SetKind(base.InternalKeyKindDelete)
+			i.skip = true
+			return true
+
+		case base.InternalKeyKindSet, base.InternalKeyKindMerge:
+			// This SingleDelete deletes the Set/Merge, and we can now elide the
+			// SingleDel as well. We advance past the Set and return false to
+			// indicate to the main compaction loop that we should NOT yield the
+			// current SingleDel key to the compaction loop.
+			//
+			// NB: singleDeleteNext was called with i.pos == iterPosCurForward, and
+			// after the call to nextInStripe, we are still at iterPosCurForward,
+			// since we are at the key after the Set/Merge that was single deleted.
+			change := i.nextInStripe()
+			switch change {
+			case sameStripe, newStripeSameKey:
+				// On the same user key.
+				nextKind := i.iterKV.Kind()
+				switch nextKind {
+				case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete, base.InternalKeyKindMerge:
+					// sameStripe keys returned by nextInStripe() are already
+					// known to not be covered by a RANGEDEL, so it is an invariant
+					// violation. The rare case is newStripeSameKey, where it is a
+					// violation if not covered by a RANGEDEL.
+					if change == sameStripe ||
+						i.tombstoneCovers(i.iterKV.K, i.curSnapshotSeqNum) == noCover {
+						i.cfg.NondeterministicSingleDeleteCallback(i.kv.K.UserKey)
+					}
+				case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized, base.InternalKeyKindSingleDelete:
+				default:
+					panic(errors.AssertionFailedf(
+						"unexpected internal key kind: %d", errors.Safe(i.iterKV.Kind())))
+				}
+			case newStripeNewKey:
+			default:
+				panic("unreachable")
+			}
+			return false
+
+		case base.InternalKeyKindSingleDelete:
+			// Two single deletes met in a compaction. The first single delete is
+			// ineffectual.
+			i.cfg.IneffectualSingleDeleteCallback(i.kv.K.UserKey)
+			// Continue to apply the second single delete.
+			continue
+
+		default:
+			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind()))
+			return false
+		}
+	}
+}
+
+// skipDueToSingleDeleteElision is called when the SingleDelete is being
+// elided because it is in the final snapshot stripe and there are no keys
+// with the same user key in lower levels in the LSM (below the files in this
+// compaction).
+//
+// TODO(sumeer): the only difference between singleDeleteNext and
+// skipDueToSingleDeleteElision is the fact that the caller knows it will be
+// eliding the single delete in the latter case. There are some similar things
+// happening in both implementations. My first attempt at combining them into
+// a single method was hard to comprehend. Try again.
+func (i *Iter) skipDueToSingleDeleteElision() {
+	for {
+		stripeChange := i.nextInStripe()
+		if i.err != nil {
+			panic(i.err)
+		}
+		switch stripeChange {
+		case newStripeNewKey:
+			// The single delete is only now being elided, meaning it did not elide
+			// any keys earlier in its descent down the LSM. We stepped onto a new
+			// user key, meaning that even now at its moment of elision, it still
+			// hasn't elided any other keys. The single delete was ineffectual (a
+			// no-op).
+			i.cfg.IneffectualSingleDeleteCallback(i.kv.K.UserKey)
+			i.skip = false
+			return
+		case newStripeSameKey:
+			// This should be impossible. If we're eliding a single delete, we
+			// determined that the tombstone is in the final snapshot stripe, but we
+			// stepped into a new stripe of the same key.
+			panic(errors.AssertionFailedf("eliding single delete followed by same key in new stripe"))
+		case sameStripe:
+			kind := i.iterKV.Kind()
+			switch kind {
+			case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized, base.InternalKeyKindSingleDelete:
+				// Tombstone met tombstone: report the ineffectual single delete,
+				// then either consume the rest of the stripe (DEL/DELSIZED) or
+				// restart the loop with the newer SINGLEDEL.
+				i.cfg.IneffectualSingleDeleteCallback(i.kv.K.UserKey)
+				switch kind {
+				case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized:
+					i.skipInStripe()
+					return
+				case base.InternalKeyKindSingleDelete:
+					// Repeat the same with this SingleDelete. We don't want to simply
+					// call skipInStripe(), since it increases the strength of the
+					// SingleDel, which hides bugs in the use of single delete.
+					continue
+				default:
+					panic(errors.AssertionFailedf(
+						"unexpected internal key kind: %d", errors.Safe(i.iterKV.Kind())))
+				}
+			case base.InternalKeyKindSetWithDelete:
+				// The SingleDelete should behave like a Delete.
+				i.skipInStripe()
+				return
+			case base.InternalKeyKindSet, base.InternalKeyKindMerge:
+				// This SingleDelete deletes the Set/Merge, and we are eliding the
+				// SingleDel as well. Step to the next key (this is not deleted by the
+				// SingleDelete).
+				//
+				// NB: skipDueToSingleDeleteElision was called with i.pos ==
+				// iterPosCurForward, and after the call to nextInStripe, we are still
+				// at iterPosCurForward, since we are at the key after the Set/Merge
+				// that was single deleted.
+				change := i.nextInStripe()
+				if i.err != nil {
+					panic(i.err)
+				}
+				switch change {
+				case newStripeSameKey:
+					panic(errors.AssertionFailedf("eliding single delete followed by same key in new stripe"))
+				case newStripeNewKey:
+				case sameStripe:
+					// On the same key.
+					nextKind := i.iterKV.Kind()
+					switch nextKind {
+					case base.InternalKeyKindSet, base.InternalKeyKindSetWithDelete, base.InternalKeyKindMerge:
+						i.cfg.NondeterministicSingleDeleteCallback(i.kv.K.UserKey)
+					case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized, base.InternalKeyKindSingleDelete:
+					default:
+						panic(errors.AssertionFailedf(
+							"unexpected internal key kind: %d", errors.Safe(i.iterKV.Kind())))
+					}
+				default:
+					panic("unreachable")
+				}
+				// Whether in same stripe or new stripe, this key is not consumed by
+				// the SingleDelete.
+				i.skip = false
+				return
+			default:
+				panic(errors.AssertionFailedf(
+					"unexpected internal key kind: %d", errors.Safe(i.iterKV.Kind())))
+			}
+		default:
+			panic("unreachable")
+		}
+	}
+}
+
+// deleteSizedNext processes a DELSIZED point tombstone. Unlike ordinary DELs,
+// these tombstones carry a value that's a varint indicating the size of the
+// entry (len(key)+len(value)) that the tombstone is expected to delete.
+//
+// When a deleteSizedNext is encountered, we skip ahead to see which keys, if
+// any, are elided as a result of the tombstone.
+func (i *Iter) deleteSizedNext() *base.InternalKV {
+	i.saveKey()
+	i.skip = true
+
+	// The DELSIZED tombstone may have no value at all. This happens when the
+	// tombstone has already deleted the key that the user originally predicted.
+	// In this case, we still peek forward in case there's another DELSIZED key
+	// with a lower sequence number, in which case we'll adopt its value.
+	// If the DELSIZED does have a value, it must be in-place.
+	i.valueBuf = append(i.valueBuf[:0], i.iterKV.InPlaceValue()...)
+	i.kv.V = base.MakeInPlaceValue(i.valueBuf)
+
+	// Loop through all the keys within this stripe that are skippable.
+	i.pos = iterPosNext
+	for i.nextInStripe() == sameStripe {
+		if i.err != nil {
+			panic(i.err)
+		}
+		switch i.iterKV.Kind() {
+		case base.InternalKeyKindDelete, base.InternalKeyKindDeleteSized, base.InternalKeyKindSingleDelete:
+			// We encountered a tombstone (DEL, or DELSIZED) that's deleted by
+			// the original DELSIZED tombstone. This can happen in two cases:
+			//
+			// (1) These tombstones were intended to delete two distinct values,
+			//     and this DELSIZED has already dropped the relevant key. For
+			//     example:
+			//
+			//     a.DELSIZED.9   a.SET.7   a.DELSIZED.5   a.SET.4
+			//
+			//     If a.DELSIZED.9 has already deleted a.SET.7, its size has
+			//     already been zeroed out. In this case, we want to adopt the
+			//     value of the DELSIZED with the lower sequence number, in
+			//     case the a.SET.4 key has not yet been elided.
+			//
+			// (2) This DELSIZED was missized. The user thought they were
+			//     deleting a key with this user key, but this user key had
+			//     already been deleted.
+			//
+			// We can differentiate these two cases by examining the length of
+			// the DELSIZED's value. A DELSIZED's value holds the size of both
+			// the user key and value that it intends to delete. For any user
+			// key with a length > 0, a DELSIZED that has not deleted a key must
+			// have a value with a length > 0.
+			//
+			// We treat both cases the same functionally, adopting the identity
+			// of the lower-sequence numbered tombstone. However in the second
+			// case, we also increment the stat counting missized tombstones.
+			if i.kv.V.Len() > 0 {
+				// The original DELSIZED key was missized. The key that the user
+				// thought they were deleting does not exist.
+				i.stats.CountMissizedDels++
+				i.cfg.MissizedDeleteCallback(i.kv.K.UserKey, 0, 0)
+			}
+			// If the tombstone has a value, it must be in-place. To save it, we
+			// can just copy the in-place value directly.
+			i.valueBuf = append(i.valueBuf[:0], i.iterKV.InPlaceValue()...)
+			i.kv.V = base.MakeInPlaceValue(i.valueBuf)
+			if i.iterKV.Kind() != base.InternalKeyKindDeleteSized {
+				// Convert the DELSIZED to a DEL — the DEL/SINGLEDEL we're eliding
+				// may not have deleted the key(s) it was intended to yet. The
+				// ordinary DEL compaction heuristics are better suited at that,
+				// plus we don't want to count it as a missized DEL. We early
+				// exit in this case, after skipping the remainder of the
+				// snapshot stripe.
+				i.kv.K.SetKind(base.InternalKeyKindDelete)
+				// NB: We skipInStripe now, rather than returning leaving
+				// i.skip=true and returning early, because Next() requires
+				// that i.skip=true only if i.iterPos = iterPosCurForward.
+				//
+				// Ignore any error caused by skipInStripe since it does not affect
+				// the key/value being returned here, and the next call to Next() will
+				// expose it.
+				i.skipInStripe()
+				return &i.kv
+			}
+			// Continue, in case we uncover another DELSIZED or a key this
+			// DELSIZED deletes.
+
+		case base.InternalKeyKindSet, base.InternalKeyKindMerge, base.InternalKeyKindSetWithDelete:
+			// If the DELSIZED is value-less, it already deleted the key that it
+			// was intended to delete. This is possible with a sequence like:
+			//
+			//      DELSIZED.8     SET.7     SET.3
+			//
+			// The DELSIZED only describes the size of the SET.7, which in this
+			// case has already been elided. We don't count it as a missizing,
+			// instead converting the DELSIZED to a DEL. Skip the remainder of
+			// the snapshot stripe and return.
+ if i.kv.V.Len() == 0 { + i.kv.K.SetKind(base.InternalKeyKindDelete) + // NB: We skipInStripe now, rather than returning leaving + // i.skip=true and returning early, because Next() requires + // that i.skip=true only if i.iterPos = iterPosCurForward. + // + // Ignore any error caused by skipInStripe since it does not affect + // the key/value being returned here, and the next call to Next() will + // expose it. + i.skipInStripe() + return &i.kv + } + // The deleted key is not a DEL, DELSIZED, and the DELSIZED in i.kv + // has a positive size. Note that the tombstone's value must be + // in-place. + v := i.kv.V.InPlaceValue() + expectedSize, n := binary.Uvarint(v) + if n != len(v) { + i.err = base.CorruptionErrorf("DELSIZED holds invalid value: %x", errors.Safe(v)) + return nil + } + elidedSize := uint64(len(i.iterKV.K.UserKey)) + uint64(i.iterKV.V.Len()) + if elidedSize != expectedSize { + // The original DELSIZED key was missized. It's unclear what to + // do. The user-provided size was wrong, so it's unlikely to be + // accurate or meaningful. We could: + // + // 1. return the DELSIZED with the original user-provided size unmodified + // 2. return the DELZIZED with a zeroed size to reflect that a key was + // elided, even if it wasn't the anticipated size. + // 3. subtract the elided size from the estimate and re-encode. + // 4. convert the DELSIZED into a value-less DEL, so that + // ordinary DEL heuristics apply. + // + // We opt for (4) under the rationale that we can't rely on the + // user-provided size for accuracy, so ordinary DEL heuristics + // are safer. + i.stats.CountMissizedDels++ + i.cfg.MissizedDeleteCallback(i.kv.K.UserKey, elidedSize, expectedSize) + i.kv.K.SetKind(base.InternalKeyKindDelete) + i.kv.V = base.InternalValue{} + // NB: We skipInStripe now, rather than returning leaving + // i.skip=true and returning early, because Next() requires + // that i.skip=true only if i.iterPos = iterPosCurForward. 
+ // + // Ignore any error caused by skipInStripe since it does not affect + // the key/value being returned here, and the next call to Next() will + // expose it. + i.skipInStripe() + return &i.kv + } + // NB: We remove the value regardless of whether the key was sized + // appropriately. The size encoded is 'consumed' the first time it + // meets a key that it deletes. + i.kv.V = base.InternalValue{} + + default: + i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKV.Kind())) + return nil + } + } + + if i.iterStripeChange == sameStripe { + panic(errors.AssertionFailedf("unexpectedly found iter stripe change = %d", i.iterStripeChange)) + } + // We landed outside the original stripe. Reset skip. + i.skip = false + if i.err != nil { + return nil + } + return &i.kv +} + +// saveKey saves the key in iterKV to i.kv.K using i.keyBuf's memory. +func (i *Iter) saveKey() { + i.keyBuf = append(i.keyBuf[:0], i.iterKV.K.UserKey...) + i.kv.K = base.InternalKey{ + UserKey: i.keyBuf, + Trailer: i.iterKV.K.Trailer, + } + i.keyTrailer = i.kv.K.Trailer +} + +// saveValue saves the value in iterKV to i.kv. It must be called before +// stepping the iterator if the value needs to be retained. Unlike keys, values +// do not need to be copied in all code paths. For example, a SETWITHDEL key may +// be written to output sstables without needing to read ahead, copying the +// value directly from the existing input sstable block into the output block +// builder. +// +// If the value is in-place, this copies it into i.valueBuf. If the value is in +// a value block, it retrieves the value from the block (possibly storing the +// result into i.valueBuf). If the value is stored in an external blob file, the +// value is cloned (InternalValue.Clone) without retrieving it from the external +// file. 
+//
+// Note that because saveValue uses i.valueBuf and i.valueFetcher to avoid
+// allocations, values saved by saveValue are only valid until the next call to
+// saveValue.
+func (i *Iter) saveValue() {
+	// Clone blob value handles to defer the retrieval of the value.
+	if i.iterKV.V.IsBlobValueHandle() {
+		i.kv.V, i.valueBuf = i.iterKV.V.Clone(i.valueBuf, &i.valueFetcher)
+		return
+	}
+
+	v, callerOwned, err := i.iterKV.Value(i.valueBuf[:0])
+	if err != nil {
+		i.err = err
+		i.kv.V = base.InternalValue{}
+	} else if !callerOwned {
+		i.valueBuf = append(i.valueBuf[:0], v...)
+		i.kv.V = base.MakeInPlaceValue(i.valueBuf)
+	} else {
+		i.kv.V = base.MakeInPlaceValue(v)
+	}
+}
+
+// Error returns any error encountered.
+//
+// Note that Close will return the error as well.
+func (i *Iter) Error() error {
+	return i.err
+}
+
+// Close the iterator.
+func (i *Iter) Close() error {
+	err := i.iter.Close()
+	if i.err == nil {
+		i.err = err
+	}
+
+	// Close the closer for the current value if one was open.
+	if i.valueCloser != nil {
+		i.err = errors.CombineErrors(i.err, i.valueCloser.Close())
+		i.valueCloser = nil
+	}
+
+	return i.err
+}
+
+// cover is returned by tombstoneCovers and describes a span's relationship to
+// a key at a particular snapshot.
+type cover int8
+
+const (
+	// noCover indicates the tested key does not fall within the span's bounds,
+	// or the span contains no keys with sequence numbers higher than the key's.
+	noCover cover = iota
+
+	// coversInvisibly indicates the tested key does fall within the span's
+	// bounds and the span contains at least one key with a higher sequence
+	// number, but none visible at the provided snapshot.
+	coversInvisibly
+
+	// coversVisibly indicates the tested key does fall within the span's
+	// bounds, and the span contains at least one key with a sequence number
+	// higher than the key's sequence number that is visible at the provided
+	// snapshot.
+ coversVisibly +) + +// tombstoneCovers returns whether the key is covered by a tombstone and whether +// it is covered by a tombstone visible in the given snapshot. +// +// The key's UserKey must be greater or equal to the last span Start key passed +// to AddTombstoneSpan. The keys passed to tombstoneCovers calls must be +// ordered. +func (i *Iter) tombstoneCovers(key base.InternalKey, snapshot base.SeqNum) cover { + if i.lastRangeDelSpan.Empty() { + return noCover + } + if invariants.Enabled && (i.cmp(key.UserKey, i.lastRangeDelSpan.Start) < 0 || i.cmp(key.UserKey, i.lastRangeDelSpan.End) >= 0) { + panic(errors.AssertionFailedf("invalid key %q, last span %s", key, i.lastRangeDelSpan)) + } + // The Covers() check is very cheap, so we want to do that first. + switch { + case !i.lastRangeDelSpan.Covers(key.SeqNum()): + return noCover + case i.lastRangeDelSpan.CoversAt(snapshot, key.SeqNum()): + return coversVisibly + default: + return coversInvisibly + } +} + +func (i *Iter) setLastRangeDelSpan(span *keyspan.Span) { + if invariants.Enabled && !i.lastRangeDelSpan.Empty() { + panic("last range del span overwritten") + } + i.lastRangeDelSpan.CopyFrom(span) + i.lastRangeDelSpanFrontier.Update(i.lastRangeDelSpan.End) +} + +func (i *Iter) lastRangeDelSpanFrontierReached(key []byte) []byte { + i.lastRangeDelSpan.Reset() + return nil +} + +// isBottommostSnapshotStripe returns true if the compaction's inputs form the +// bottommost layer of the LSM for the compaction's key range and the provided +// snapshot stripe is the last stripe. +// +// When isBottommostSnapshotStripe returns true, it is guaranteed there does not +// exist any overlapping keys with lower sequence numbers than the keys in the +// provided snapshot stripe. However isBottommostSnapshotStripe is permitted to +// return false even when there is no overlapping data in lower levels (eg, +// flushes). 
+func (i *Iter) isBottommostSnapshotStripe(snapshotIdx int) bool { + // TODO(peter): This determination applies to the entire compaction. We + // could make the determination on a key by key basis, similar to what is + // done for elideTombstone. Need to add a benchmark for Iter to verify that + // isn't too expensive. + return i.cfg.IsBottommostDataLayer && snapshotIdx == 0 +} + +func finishValueMerger( + valueMerger base.ValueMerger, includesBase bool, +) (_ base.InternalValue, needDelete bool, closer io.Closer, err error) { + var value []byte + if valueMerger2, ok := valueMerger.(base.DeletableValueMerger); ok { + value, needDelete, closer, err = valueMerger2.DeletableFinish(includesBase) + } else { + value, closer, err = valueMerger.Finish(includesBase) + } + return base.MakeInPlaceValue(value), needDelete, closer, err +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/compact/run.go b/vendor/github.com/cockroachdb/pebble/v2/internal/compact/run.go new file mode 100644 index 0000000..d82ecad --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/compact/run.go @@ -0,0 +1,498 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package compact + +import ( + "sort" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/sstable" + "github.com/cockroachdb/pebble/v2/sstable/blob" +) + +// Result stores the result of a compaction - more specifically, the "data" part +// where we use the compaction iterator to write output tables. +type Result struct { + // Err is the result of the compaction. On success, Err is nil and Tables + // stores the output tables. 
On failure, Err is set and Tables stores the + // tables created so far (and which need to be cleaned up). + Err error + Tables []OutputTable + Blobs []OutputBlob + Stats Stats +} + +// WithError returns a modified Result which has the Err field set. +func (r Result) WithError(err error) Result { + return Result{ + Err: errors.CombineErrors(r.Err, err), + Tables: r.Tables, + Blobs: r.Blobs, + Stats: r.Stats, + } +} + +// OutputTable contains metadata about a table that was created during a compaction. +type OutputTable struct { + CreationTime time.Time + // ObjMeta is metadata for the object backing the table. + ObjMeta objstorage.ObjectMetadata + // WriterMeta is populated once the table is fully written. On compaction + // failure (see Result), WriterMeta might not be set. + WriterMeta sstable.WriterMetadata + // BlobReferences is the list of blob references for the table. + BlobReferences manifest.BlobReferences + // BlobReferenceDepth is the depth of the blob references for the table. + BlobReferenceDepth manifest.BlobReferenceDepth +} + +// OutputBlob contains metadata about a blob file that was created during a +// compaction. +type OutputBlob struct { + Stats blob.FileWriterStats + // ObjMeta is metadata for the object backing the blob file. + ObjMeta objstorage.ObjectMetadata + // Metadata is metadata for the blob file. + Metadata *manifest.PhysicalBlobFile +} + +// Stats describes stats collected during the compaction. +type Stats struct { + CumulativePinnedKeys uint64 + CumulativePinnedSize uint64 + // CumulativeWrittenSize is the total size of all data written to output + // objects. + CumulativeWrittenSize uint64 + // CumulativeBlobReferenceSize is the total size of all blob references + // written to output objects. + CumulativeBlobReferenceSize uint64 + // CumulativeBlobFileSize is the total size of all data written to blob + // output objects specifically. 
+ CumulativeBlobFileSize uint64 + CountMissizedDels uint64 +} + +// RunnerConfig contains the parameters needed for the Runner. +type RunnerConfig struct { + // CompactionBounds are the bounds containing all the input tables. All output + // tables must fall within these bounds as well. + CompactionBounds base.UserKeyBounds + + // L0SplitKeys is only set for flushes and it contains the flush split keys + // (see L0Sublevels.FlushSplitKeys). These are split points enforced for the + // output tables. + L0SplitKeys [][]byte + + // Grandparents are the tables in level+2 that overlap with the files being + // compacted. Used to determine output table boundaries. Do not assume that + // the actual files in the grandparent when this compaction finishes will be + // the same. + Grandparents manifest.LevelSlice + + // MaxGrandparentOverlapBytes is the maximum number of bytes of overlap + // allowed for a single output table with the tables in the grandparent level. + MaxGrandparentOverlapBytes uint64 + + // TargetOutputFileSize is the desired size of an individual table created + // during compaction. In practice, the sizes can vary between 50%-200% of this + // value. + TargetOutputFileSize uint64 + + // GrantHandle is used to perform accounting of resource consumption by the + // CompactionScheduler. + GrantHandle base.CompactionGrantHandle +} + +// ValueSeparation defines an interface for writing some values to separate blob +// files. +type ValueSeparation interface { + // EstimatedFileSize returns an estimate of the disk space consumed by the + // current, pending blob file if it were closed now. If no blob file has + // been created, it returns 0. + EstimatedFileSize() uint64 + // EstimatedReferenceSize returns an estimate of the disk space consumed by + // the current output sstable's blob references so far. + EstimatedReferenceSize() uint64 + // Add adds the provided key-value pair to the provided sstable writer, + // possibly separating the value into a blob file. 
+ Add(tw sstable.RawWriter, kv *base.InternalKV, forceObsolete bool) error + // FinishOutput is called when a compaction is finishing an output sstable. + // It returns the table's blob references, which will be added to the + // table's TableMetadata, and stats and metadata describing a newly + // constructed blob file if any. + FinishOutput() (ValueSeparationMetadata, error) +} + +// ValueSeparationMetadata describes metadata about a table's blob references, +// and optionally a newly constructed blob file. +type ValueSeparationMetadata struct { + BlobReferences manifest.BlobReferences + BlobReferenceSize uint64 + BlobReferenceDepth manifest.BlobReferenceDepth + + // The below fields are only populated if a new blob file was created. + BlobFileStats blob.FileWriterStats + BlobFileObject objstorage.ObjectMetadata + BlobFileMetadata *manifest.PhysicalBlobFile +} + +// Runner is a helper for running the "data" part of a compaction (where we use +// the compaction iterator to write output tables). +// +// Sample usage: +// +// r := NewRunner(cfg, iter) +// for r.MoreDataToWrite() { +// objMeta, tw := ... // Create object and table writer. +// r.WriteTable(objMeta, tw) +// } +// result := r.Finish() +type Runner struct { + cmp base.Compare + cfg RunnerConfig + iter *Iter + + tables []OutputTable + blobs []OutputBlob + // Stores any error encountered. + err error + // Last key/value returned by the compaction iterator. + kv *base.InternalKV + // Last RANGEDEL span (or portion of it) that was not yet written to a table. + lastRangeDelSpan keyspan.Span + // Last range key span (or portion of it) that was not yet written to a table. + lastRangeKeySpan keyspan.Span + stats Stats +} + +// NewRunner creates a new Runner. +func NewRunner(cfg RunnerConfig, iter *Iter) *Runner { + r := &Runner{ + cmp: iter.cmp, + cfg: cfg, + iter: iter, + } + r.kv = r.iter.First() + return r +} + +// MoreDataToWrite returns true if there is more data to be written. 
+func (r *Runner) MoreDataToWrite() bool { + if r.err != nil { + return false + } + return r.kv != nil || !r.lastRangeDelSpan.Empty() || !r.lastRangeKeySpan.Empty() +} + +// FirstKey returns the first key that will be written; this can be a point key +// or the beginning of a range del or range key span. +// +// FirstKey can only be called right after MoreDataToWrite() was called and +// returned true. +func (r *Runner) FirstKey() []byte { + firstKey := base.MinUserKey(r.cmp, spanStartOrNil(&r.lastRangeDelSpan), spanStartOrNil(&r.lastRangeKeySpan)) + // Note: if there was a r.lastRangeDelSpan or r.lastRangeKeySpan, it + // necessarily starts before the first point key. + if r.kv != nil && firstKey == nil { + firstKey = r.kv.K.UserKey + } + return firstKey +} + +// WriteTable writes a new output table. This table will be part of +// Result.Tables. Should only be called if MoreDataToWrite() returned true. +// +// limitKey (if non-empty) forces the sstable to be finished before reaching +// this key. +// +// WriteTable always closes the Writer. +func (r *Runner) WriteTable( + objMeta objstorage.ObjectMetadata, + tw sstable.RawWriter, + limitKey []byte, + valueSeparation ValueSeparation, +) { + if r.err != nil { + panic("error already encountered") + } + r.tables = append(r.tables, OutputTable{ + CreationTime: time.Now(), + ObjMeta: objMeta, + }) + splitKey, err := r.writeKeysToTable(tw, limitKey, valueSeparation) + + // Inform the value separation policy that the table is finished. 
+ valSepMeta, valSepErr := valueSeparation.FinishOutput() + if valSepErr != nil { + r.err = errors.CombineErrors(r.err, valSepErr) + } else { + r.tables[len(r.tables)-1].BlobReferences = valSepMeta.BlobReferences + r.tables[len(r.tables)-1].BlobReferenceDepth = valSepMeta.BlobReferenceDepth + if valSepMeta.BlobFileObject.DiskFileNum != 0 { + r.blobs = append(r.blobs, OutputBlob{ + Stats: valSepMeta.BlobFileStats, + ObjMeta: valSepMeta.BlobFileObject, + Metadata: valSepMeta.BlobFileMetadata, + }) + } + } + + err = errors.CombineErrors(err, tw.Close()) + if err != nil { + r.err = err + r.kv = nil + return + } + writerMeta, err := tw.Metadata() + if err != nil { + r.err = err + return + } + if err := r.validateWriterMeta(writerMeta, splitKey); err != nil { + r.err = err + return + } + r.tables[len(r.tables)-1].WriterMeta = *writerMeta + r.stats.CumulativeWrittenSize += writerMeta.Size + valSepMeta.BlobFileStats.FileLen + r.stats.CumulativeBlobReferenceSize += valSepMeta.BlobReferenceSize + r.stats.CumulativeBlobFileSize += valSepMeta.BlobFileStats.FileLen +} + +func (r *Runner) writeKeysToTable( + tw sstable.RawWriter, limitKey []byte, valueSeparation ValueSeparation, +) (splitKey []byte, _ error) { + const updateGrantHandleEveryNKeys = 128 + firstKey := r.FirstKey() + if firstKey == nil { + return nil, base.AssertionFailedf("no data to write") + } + limitKey = base.MinUserKey(r.cmp, limitKey, r.TableSplitLimit(firstKey)) + splitter := NewOutputSplitter( + r.cmp, firstKey, limitKey, + r.cfg.TargetOutputFileSize, r.cfg.Grandparents.Iter(), r.iter.Frontiers(), + ) + equalPrev := func(k []byte) bool { + return tw.ComparePrev(k) == 0 + } + var pinnedKeySize, pinnedValueSize, pinnedCount uint64 + var iteratedKeys uint64 + kv := r.kv + for ; kv != nil; kv = r.iter.Next() { + iteratedKeys++ + if iteratedKeys%updateGrantHandleEveryNKeys == 0 { + r.cfg.GrantHandle.CumulativeStats(base.CompactionGrantHandleStats{ + CumWriteBytes: r.stats.CumulativeWrittenSize + 
tw.EstimatedSize() +
+					valueSeparation.EstimatedFileSize(),
+			})
+			r.cfg.GrantHandle.MeasureCPU(base.CompactionGoroutinePrimary)
+		}
+		outputSize := tw.EstimatedSize()
+		outputSize += valueSeparation.EstimatedReferenceSize()
+		if splitter.ShouldSplitBefore(kv.K.UserKey, outputSize, equalPrev) {
+			break
+		}
+
+		switch kv.K.Kind() {
+		case base.InternalKeyKindRangeDelete:
+			// The previous span (if any) must end at or before this key, since the
+			// spans we receive are non-overlapping.
+			if err := tw.EncodeSpan(r.lastRangeDelSpan); err != nil {
+				return nil, err
+			}
+			r.lastRangeDelSpan.CopyFrom(r.iter.Span())
+			continue
+
+		case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete:
+			// The previous span (if any) must end at or before this key, since the
+			// spans we receive are non-overlapping.
+			if err := tw.EncodeSpan(r.lastRangeKeySpan); err != nil {
+				return nil, err
+			}
+			r.lastRangeKeySpan.CopyFrom(r.iter.Span())
+			continue
+		}
+
+		valueLen := kv.V.Len()
+		// Add the value to the sstable, possibly separating its value into a
+		// blob file. The ValueSeparation implementation is responsible for
+		// writing the KV to the sstable.
+		if err := valueSeparation.Add(tw, kv, r.iter.ForceObsoleteDueToRangeDel()); err != nil {
+			return nil, err
+		}
+		if r.iter.SnapshotPinned() {
+			// The kv pair we just added to the sstable was only surfaced by
+			// the compaction iterator because an open snapshot prevented
+			// its elision. Increment the stats.
+			pinnedCount++
+			pinnedKeySize += uint64(len(kv.K.UserKey)) + base.InternalTrailerLen
+			pinnedValueSize += uint64(valueLen)
+		}
+	}
+	r.kv = kv
+	splitKey = splitter.SplitKey()
+	if err := SplitAndEncodeSpan(r.cmp, &r.lastRangeDelSpan, splitKey, tw); err != nil {
+		return nil, err
+	}
+	if err := SplitAndEncodeSpan(r.cmp, &r.lastRangeKeySpan, splitKey, tw); err != nil {
+		return nil, err
+	}
+	// Set internal sstable properties.
+ tw.SetSnapshotPinnedProperties(pinnedCount, pinnedKeySize, pinnedValueSize) + r.stats.CumulativePinnedKeys += pinnedCount + r.stats.CumulativePinnedSize += pinnedKeySize + pinnedValueSize + + // TODO(jackson): CumulativeStats may block if the compaction scheduler + // wants to pace the compaction. We should thread through knowledge of + // whether or not this is the final sstable of the compaction, in which case + // all work has been completed and pacing would only needlessly delay the + // installation of the version edit. + r.cfg.GrantHandle.CumulativeStats(base.CompactionGrantHandleStats{ + CumWriteBytes: r.stats.CumulativeWrittenSize + + tw.EstimatedSize() + + valueSeparation.EstimatedFileSize(), + }) + r.cfg.GrantHandle.MeasureCPU(base.CompactionGoroutinePrimary) + return splitKey, nil +} + +// Finish closes the compaction iterator and returns the result of the +// compaction. +func (r *Runner) Finish() Result { + r.err = errors.CombineErrors(r.err, r.iter.Close()) + // The compaction iterator keeps track of a count of the number of DELSIZED + // keys that encoded an incorrect size. + r.stats.CountMissizedDels = r.iter.Stats().CountMissizedDels + return Result{ + Err: r.err, + Tables: r.tables, + Blobs: r.blobs, + Stats: r.stats, + } +} + +// TableSplitLimit returns a hard split limit for an output table that starts at +// startKey (which must be strictly greater than startKey), or nil if there is +// no limit. +func (r *Runner) TableSplitLimit(startKey []byte) []byte { + var limitKey []byte + + // Enforce the MaxGrandparentOverlapBytes limit: find the user key to which + // that table can extend without excessively overlapping the grandparent + // level. If no limit is needed considering the grandparent, limitKey stays + // nil. + // + // This is done in order to prevent a table at level N from overlapping too + // much data at level N+1. We want to avoid such large overlaps because they + // translate into large compactions. 
The current heuristic stops output of a
+	// table if the addition of another key would cause the table to overlap more
+	// than 10x the target file size at level N. See
+	// compaction.maxGrandparentOverlapBytes.
+	iter := r.cfg.Grandparents.Iter()
+	var overlappedBytes uint64
+	f := iter.SeekGE(r.cmp, startKey)
+	// Handle an overlapping table.
+	if f != nil && r.cmp(f.Smallest().UserKey, startKey) <= 0 {
+		overlappedBytes += f.Size
+		f = iter.Next()
+	}
+	for ; f != nil; f = iter.Next() {
+		overlappedBytes += f.Size
+		if overlappedBytes > r.cfg.MaxGrandparentOverlapBytes {
+			limitKey = f.Smallest().UserKey
+			break
+		}
+	}
+
+	if len(r.cfg.L0SplitKeys) != 0 {
+		// Find the first split key that is greater than startKey.
+		index := sort.Search(len(r.cfg.L0SplitKeys), func(i int) bool {
+			return r.cmp(r.cfg.L0SplitKeys[i], startKey) > 0
+		})
+		if index < len(r.cfg.L0SplitKeys) {
+			limitKey = base.MinUserKey(r.cmp, limitKey, r.cfg.L0SplitKeys[index])
+		}
+	}
+
+	return limitKey
+}
+
+// validateWriterMeta runs some sanity checks on the WriterMetadata on an output
+// table that was just finished. splitKey is the key where the table must have
+// ended (or nil).
+func (r *Runner) validateWriterMeta(meta *sstable.WriterMetadata, splitKey []byte) error { + if !meta.HasPointKeys && !meta.HasRangeDelKeys && !meta.HasRangeKeys { + return base.AssertionFailedf("output table has no keys") + } + + var err error + checkBounds := func(smallest, largest base.InternalKey, description string) { + bounds := base.UserKeyBoundsFromInternal(smallest, largest) + if !r.cfg.CompactionBounds.ContainsBounds(r.cmp, &bounds) { + err = errors.CombineErrors(err, base.AssertionFailedf( + "output table %s bounds %s extend beyond compaction bounds %s", + description, bounds, r.cfg.CompactionBounds, + )) + } + if splitKey != nil && bounds.End.IsUpperBoundFor(r.cmp, splitKey) { + err = errors.CombineErrors(err, base.AssertionFailedf( + "output table %s bounds %s extend beyond split key %s", + description, bounds, splitKey, + )) + } + } + + if meta.HasPointKeys { + checkBounds(meta.SmallestPoint, meta.LargestPoint, "point key") + } + if meta.HasRangeDelKeys { + checkBounds(meta.SmallestRangeDel, meta.LargestRangeDel, "range del") + } + if meta.HasRangeKeys { + checkBounds(meta.SmallestRangeKey, meta.LargestRangeKey, "range key") + } + return err +} + +func spanStartOrNil(s *keyspan.Span) []byte { + if s.Empty() { + return nil + } + return s.Start +} + +// NeverSeparateValues is a ValueSeparation implementation that never separates +// values into external blob files. It is the default value if no +// ValueSeparation implementation is explicitly provided. +type NeverSeparateValues struct{} + +// Assert that NeverSeparateValues implements the ValueSeparation interface. +var _ ValueSeparation = NeverSeparateValues{} + +// EstimatedFileSize implements the ValueSeparation interface. +func (NeverSeparateValues) EstimatedFileSize() uint64 { return 0 } + +// EstimatedReferenceSize implements the ValueSeparation interface. +func (NeverSeparateValues) EstimatedReferenceSize() uint64 { return 0 } + +// Add implements the ValueSeparation interface. 
+func (NeverSeparateValues) Add( + tw sstable.RawWriter, kv *base.InternalKV, forceObsolete bool, +) error { + v, _, err := kv.Value(nil) + if err != nil { + return err + } + return tw.Add(kv.K, v, forceObsolete) +} + +// FinishOutput implements the ValueSeparation interface. +func (NeverSeparateValues) FinishOutput() (ValueSeparationMetadata, error) { + return ValueSeparationMetadata{}, nil +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/compact/snapshots.go b/vendor/github.com/cockroachdb/pebble/v2/internal/compact/snapshots.go new file mode 100644 index 0000000..76d1b41 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/compact/snapshots.go @@ -0,0 +1,65 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package compact + +import ( + "sort" + + "github.com/cockroachdb/pebble/v2/internal/base" +) + +// Snapshots stores a list of snapshot sequence numbers, in ascending order. +// +// Snapshots are lightweight point-in-time views of the DB state. At its core, +// a snapshot is a sequence number along with a guarantee from Pebble that it +// will maintain the view of the database at that sequence number. Part of this +// guarantee is relatively straightforward to achieve. When reading from the +// database Pebble will ignore sequence numbers that are larger than the +// snapshot sequence number. The primary complexity with snapshots occurs +// during compaction: the collapsing of entries that are shadowed by newer +// entries is at odds with the guarantee that Pebble will maintain the view of +// the database at the snapshot sequence number. Rather than collapsing entries +// up to the next user key, compactionIter can only collapse entries up to the +// next snapshot boundary. That is, every snapshot boundary potentially causes +// another entry for the same user-key to be emitted. 
Another way to view this +// is that snapshots define stripes and entries are collapsed within stripes, +// but not across stripes. Consider the following scenario: +// +// a.PUT.9 +// a.DEL.8 +// a.PUT.7 +// a.DEL.6 +// a.PUT.5 +// +// In the absence of snapshots these entries would be collapsed to +// a.PUT.9. What if there is a snapshot at sequence number 7? The entries can +// be divided into two stripes and collapsed within the stripes: +// +// a.PUT.9 a.PUT.9 +// a.DEL.8 ---> +// a.PUT.7 +// -- -- +// a.DEL.6 ---> a.DEL.6 +// a.PUT.5 +type Snapshots []base.SeqNum + +// Index returns the index of the first snapshot sequence number which is >= seq +// or len(s) if there is no such sequence number. +func (s Snapshots) Index(seq base.SeqNum) int { + return sort.Search(len(s), func(i int) bool { + return s[i] > seq + }) +} + +// IndexAndSeqNum returns the index of the first snapshot sequence number which +// is >= seq and that sequence number, or len(s) and InternalKeySeqNumMax if +// there is no such sequence number. +func (s Snapshots) IndexAndSeqNum(seq base.SeqNum) (int, base.SeqNum) { + index := s.Index(seq) + if index == len(s) { + return index, base.SeqNumMax + } + return index, s[index] +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/compact/spans.go b/vendor/github.com/cockroachdb/pebble/v2/internal/compact/spans.go new file mode 100644 index 0000000..b86ca2c --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/compact/spans.go @@ -0,0 +1,217 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package compact + +import ( + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/rangekey" + "github.com/cockroachdb/pebble/v2/sstable" +) + +// RangeDelSpanCompactor coalesces RANGEDELs within snapshot stripes and elides +// RANGEDELs in the last stripe if possible. +type RangeDelSpanCompactor struct { + cmp base.Compare + equal base.Equal + snapshots Snapshots + elider rangeTombstoneElider +} + +// MakeRangeDelSpanCompactor creates a new compactor for RANGEDEL spans. +func MakeRangeDelSpanCompactor( + cmp base.Compare, equal base.Equal, snapshots Snapshots, elision TombstoneElision, +) RangeDelSpanCompactor { + c := RangeDelSpanCompactor{ + cmp: cmp, + equal: equal, + snapshots: snapshots, + } + c.elider.Init(cmp, elision) + return c +} + +// Compact compacts the given range del span and stores the results in the +// given output span, reusing its slices. +// +// Compaction of a span entails coalescing RANGEDELs keys within snapshot +// stripes, and eliding RANGEDELs in the last stripe if possible. +// +// It is possible for the output span to be empty after the call (if all +// RANGEDELs in the span are elided). +// +// The spans that are passed to Compact calls must be ordered and +// non-overlapping. +func (c *RangeDelSpanCompactor) Compact(span, output *keyspan.Span) { + if invariants.Enabled && span.KeysOrder != keyspan.ByTrailerDesc { + panic("pebble: span's keys unexpectedly not in trailer order") + } + output.Reset() + // Apply the snapshot stripe rules, keeping only the latest tombstone for + // each snapshot stripe. + currentIdx := -1 + for _, k := range span.Keys { + idx := c.snapshots.Index(k.SeqNum()) + if currentIdx == idx { + continue + } + if idx == 0 && c.elider.ShouldElide(span.Start, span.End) { + // This is the last snapshot stripe and the range tombstone + // can be elided. 
+ break + } + + output.Keys = append(output.Keys, k) + if idx == 0 { + // This is the last snapshot stripe. + break + } + currentIdx = idx + } + if len(output.Keys) > 0 { + output.Start = append(output.Start, span.Start...) + output.End = append(output.End, span.End...) + output.KeysOrder = span.KeysOrder + } +} + +// RangeKeySpanCompactor coalesces range keys within snapshot stripes and elides +// RangeKeyDelete and RangeKeyUnsets when possible. It is used as a container +// for at most one "compacted" span. +type RangeKeySpanCompactor struct { + cmp base.Compare + suffixCmp base.CompareRangeSuffixes + snapshots Snapshots + elider rangeTombstoneElider +} + +// MakeRangeKeySpanCompactor creates a new compactor for range key spans. +func MakeRangeKeySpanCompactor( + cmp base.Compare, + suffixCmp base.CompareRangeSuffixes, + snapshots Snapshots, + elision TombstoneElision, +) RangeKeySpanCompactor { + c := RangeKeySpanCompactor{ + cmp: cmp, + suffixCmp: suffixCmp, + snapshots: snapshots, + } + c.elider.Init(cmp, elision) + return c +} + +// Compact compacts the given range key span and stores the results in the +// given output span, reusing its slices. +// +// Compaction of a span entails coalescing range keys within snapshot +// stripes, and eliding RangeKeyUnset/RangeKeyDelete in the last stripe if +// possible. +// +// It is possible for the output span to be empty after the call (if all range +// keys in the span are elided). +// +// The spans that are passed to Compact calls must be ordered and +// non-overlapping. +func (c *RangeKeySpanCompactor) Compact(span, output *keyspan.Span) { + if invariants.Enabled && span.KeysOrder != keyspan.ByTrailerDesc { + panic("pebble: span's keys unexpectedly not in trailer order") + } + // snapshots are in ascending order, while s.keys are in descending seqnum + // order. Partition s.keys by snapshot stripes, and call rangekey.Coalesce + // on each partition. 
+ output.Reset() + x, y := len(c.snapshots)-1, 0 + usedLen := 0 + for x >= 0 { + start := y + for y < len(span.Keys) && !base.Visible(span.Keys[y].SeqNum(), c.snapshots[x], base.SeqNumMax) { + // Include y in current partition. + y++ + } + if y > start { + keysDst := output.Keys[usedLen:cap(output.Keys)] + rangekey.Coalesce(c.suffixCmp, span.Keys[start:y], &keysDst) + if y == len(span.Keys) { + // This is the last snapshot stripe. Unsets and deletes can be elided. + keysDst = c.elideInLastStripe(span.Start, span.End, keysDst) + } + usedLen += len(keysDst) + output.Keys = append(output.Keys, keysDst...) + } + x-- + } + if y < len(span.Keys) { + keysDst := output.Keys[usedLen:cap(output.Keys)] + rangekey.Coalesce(c.suffixCmp, span.Keys[y:], &keysDst) + keysDst = c.elideInLastStripe(span.Start, span.End, keysDst) + usedLen += len(keysDst) + output.Keys = append(output.Keys, keysDst...) + } + if len(output.Keys) > 0 { + output.Start = append(output.Start, span.Start...) + output.End = append(output.End, span.End...) + output.KeysOrder = span.KeysOrder + } +} + +func (c *RangeKeySpanCompactor) elideInLastStripe( + start, end []byte, keys []keyspan.Key, +) []keyspan.Key { + // Unsets and deletes in the last snapshot stripe can be elided. + k := 0 + for j := range keys { + if (keys[j].Kind() == base.InternalKeyKindRangeKeyUnset || keys[j].Kind() == base.InternalKeyKindRangeKeyDelete) && + c.elider.ShouldElide(start, end) { + continue + } + keys[k] = keys[j] + k++ + } + return keys[:k] +} + +// SplitAndEncodeSpan splits a span at upToKey and encodes the first part into +// the table writer, and updates the span to store the remaining part. +// +// If upToKey is nil or the span ends before upToKey, we encode the entire span +// and reset it to the empty span. +// +// Note that the span.Start slice will be reused (it will be replaced with a +// copy of upToKey, if appropriate). +// +// The span can contain either only RANGEDEL keys or only range keys. 
+func SplitAndEncodeSpan( + cmp base.Compare, span *keyspan.Span, upToKey []byte, tw sstable.RawWriter, +) error { + if span.Empty() { + return nil + } + + if upToKey == nil || cmp(span.End, upToKey) <= 0 { + if err := tw.EncodeSpan(*span); err != nil { + return err + } + span.Reset() + return nil + } + + if cmp(span.Start, upToKey) >= 0 { + // The span starts at/after upToKey; nothing to encode. + return nil + } + + // Split the span at upToKey and encode the first part. + if err := tw.EncodeSpan(keyspan.Span{ + Start: span.Start, + End: upToKey, + Keys: span.Keys, + }); err != nil { + return err + } + span.Start = append(span.Start[:0], upToKey...) + return nil +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/compact/splitting.go b/vendor/github.com/cockroachdb/pebble/v2/internal/compact/splitting.go new file mode 100644 index 0000000..1320800 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/compact/splitting.go @@ -0,0 +1,556 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package compact + +import ( + "bytes" + "fmt" + "slices" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/manifest" +) + +// ShouldSplit indicates whether a compaction should split between output files. +// See the OutputSplitter interface. +type ShouldSplit bool + +const ( + // NoSplit may be returned by an OutputSplitter to indicate that it does NOT + // recommend splitting compaction output sstables between the previous key + // and the next key. + NoSplit ShouldSplit = false + // SplitNow may be returned by an OutputSplitter to indicate that it does + // recommend splitting compaction output sstables between the previous key + // and the next key. 
+ SplitNow ShouldSplit = true +) + +// String implements the Stringer interface. +func (s ShouldSplit) String() string { + if s == NoSplit { + return "no-split" + } + return "split-now" +} + +// OutputSplitter is used to determine where to split output tables in a +// compaction. +// +// An OutputSplitter is initialized when we start an output file: +// +// s := NewOutputSplitter(...) +// for nextKey != nil && !s.ShouldSplitBefore(nextKey, ...) { +// ... +// } +// splitKey := s.SplitKey() +// +// OutputSplitter enforces a target file size. This splitter splits to a new +// output file when the estimated file size is 0.5x-2x the target file size. If +// there are overlapping grandparent files, this splitter will attempt to split +// at a grandparent boundary. For example, consider the example where a +// compaction wrote 'd' to the current output file, and the next key has a user +// key 'g': +// +// previous key next key +// | | +// | | +// +---------------|----+ +--|----------+ +// grandparents: | 000006 | | | | 000007 | +// +---------------|----+ +--|----------+ +// a b d e f g i +// +// Splitting the output file F before 'g' will ensure that the current output +// file F does not overlap the grandparent file 000007. Aligning sstable +// boundaries like this can significantly reduce write amplification, since a +// subsequent compaction of F into the grandparent level will avoid needlessly +// rewriting any keys within 000007 that do not overlap F's bounds. Consider the +// following compaction: +// +// +----------------------+ +// input | | +// level +----------------------+ +// \/ +// +---------------+ +---------------+ +// output |XXXXXXX| | | |XXXXXXXX| +// level +---------------+ +---------------+ +// +// The input-level file overlaps two files in the output level, but only +// partially. The beginning of the first output-level file and the end of the +// second output-level file will be rewritten verbatim. 
This write I/O is +// "wasted" in the sense that no merging is being performed. +// +// To prevent the above waste, this splitter attempts to split output files +// before the start key of grandparent files. It still strives to write output +// files of approximately the target file size, by constraining this splitting +// at grandparent points to apply only if the current output's file size is +// about the right order of magnitude. +// +// OutputSplitter guarantees that we never split user keys between files. +// +// The dominant cost of OutputSplitter is one key comparison per +// ShouldSplitBefore call. +type OutputSplitter struct { + cmp base.Compare + startKey []byte + limit []byte + targetFileSize uint64 + frontier frontier + + shouldSplitCalled bool + + nextBoundary splitterBoundary + // reachedBoundary is set when the frontier reaches a boundary and is cleared + // in the first ShouldSplitBefore call after that. + reachedBoundary splitterBoundary + + grandparentBoundariesObserved uint64 + grandparentLevel manifest.LevelIterator + + splitKey []byte +} + +type splitterBoundary struct { + key []byte + // isGrandparent is true if this boundary corresponds to a grandparent boundary. + // It is false when the boundary is unset or it is a limit boundary. + isGrandparent bool +} + +// NewOutputSplitter creates a new OutputSplitter. See OutputSplitter for more +// information. +// +// The limitKey must be either nil (no limit) or a key greater than startKey. +// +// NewOutputSplitter registers the splitter with the provided Frontiers. +// +// Note: it is allowed for the startKey to be behind the current frontier, as +// long as the key in the first ShouldSplitBefore call is at the frontier. 
+func NewOutputSplitter( + cmp base.Compare, + startKey []byte, + limit []byte, + targetFileSize uint64, + grandparentLevel manifest.LevelIterator, + frontiers *Frontiers, +) *OutputSplitter { + s := &OutputSplitter{ + cmp: cmp, + startKey: slices.Clone(startKey), + targetFileSize: targetFileSize, + grandparentLevel: grandparentLevel, + } + if len(limit) > 0 { + if invariants.Enabled && cmp(startKey, limit) >= 0 { + panic("limit <= startKey") + } + s.limit = slices.Clone(limit) + } + // Find the first grandparent that starts at or after startKey. + grandparent := s.grandparentLevel.SeekGE(cmp, startKey) + if grandparent != nil && cmp(grandparent.Smallest().UserKey, startKey) <= 0 { + grandparent = s.grandparentLevel.Next() + } + s.setNextBoundary(grandparent) + if invariants.Enabled && s.nextBoundary.key != nil && s.cmp(s.nextBoundary.key, startKey) <= 0 { + panic("first boundary is not after startKey") + } + // We start using the frontier after the first ShouldSplitBefore call. + s.frontier.Init(frontiers, nil, s.boundaryReached) + return s +} + +// boundaryReached is the callback registered with Frontiers; it runs whenever +// the frontier advances past the current boundary. +func (s *OutputSplitter) boundaryReached(key []byte) (nextBoundary []byte) { + // The passed key can be past the next boundary. + s.reachedBoundary = s.nextBoundary + if !s.nextBoundary.isGrandparent { + s.nextBoundary = splitterBoundary{} + return nil + } + s.grandparentBoundariesObserved++ + s.setNextBoundary(s.grandparentLevel.Next()) + // It is possible that the next boundary is already reached; in that case + // boundaryReached will just fire again immediately. 
+ return s.nextBoundary.key +} + +func (s *OutputSplitter) setNextBoundary(nextGrandparent *manifest.TableMetadata) { + if nextGrandparent != nil && (s.limit == nil || s.cmp(nextGrandparent.Smallest().UserKey, s.limit) < 0) { + s.nextBoundary = splitterBoundary{ + key: nextGrandparent.Smallest().UserKey, + isGrandparent: true, + } + } else { + s.nextBoundary = splitterBoundary{ + key: s.limit, + isGrandparent: false, + } + } +} + +// ShouldSplitBefore returns whether we should split the output before the next +// key. It is passed the current estimated file size and a function that can be +// used to retrieve the previous user key. +// +// The equalPrevFn function is used to guarantee no split user keys, without +// OutputSplitter copying each key internally. It is not performance sensitive, +// as it is only called once we decide to split. +// +// Once ShouldSplitBefore returns SplitNow, it must not be called again. +// SplitKey() can be used to retrieve the recommended split key. +// +// INVARIANT: nextUserKey must match the current frontier. +func (s *OutputSplitter) ShouldSplitBefore( + nextUserKey []byte, estimatedFileSize uint64, equalPrevFn func([]byte) bool, +) ShouldSplit { + if invariants.Enabled && s.splitKey != nil { + panic("ShouldSplitBefore called after it returned SplitNow") + } + if !s.shouldSplitCalled { + // The boundary could have been advanced to nextUserKey before the splitter + // was created. So one single time, we advance the boundary manually. + s.shouldSplitCalled = true + for s.nextBoundary.key != nil && s.cmp(s.nextBoundary.key, nextUserKey) <= 0 { + s.boundaryReached(nextUserKey) + } + s.frontier.Update(s.nextBoundary.key) + } + + if invariants.Enabled && s.nextBoundary.key != nil && s.cmp(s.nextBoundary.key, nextUserKey) <= 0 { + panic("boundary is behind the next key (or startKey was before the boundary)") + } + // Note: s.reachedBoundary can be empty. 
+ reachedBoundary := s.reachedBoundary + s.reachedBoundary = splitterBoundary{} + if invariants.Enabled && reachedBoundary.key != nil && s.cmp(reachedBoundary.key, nextUserKey) > 0 { + panic("reached boundary ahead of the next user key") + } + if reachedBoundary.key != nil && !reachedBoundary.isGrandparent { + // Limit was reached. + s.splitKey = s.limit + return SplitNow + } + + if s.shouldSplitBasedOnSize(estimatedFileSize, reachedBoundary.isGrandparent) == SplitNow { + // We want to split here based on size, but we cannot split between two keys + // with the same UserKey. + // + // If we are at a grandparent boundary, we know that this key cannot have the + // same UserKey as the previous key (otherwise, that key would have been the + // one hitting this boundary). + if reachedBoundary.isGrandparent { + s.splitKey = reachedBoundary.key + return SplitNow + } + + // When the target file size limit is very small (in tests), we could end up + // splitting at the first key, which is not allowed. + if s.cmp(nextUserKey, s.startKey) <= 0 { + return NoSplit + } + + // TODO(radu): it would make for a cleaner interface if we didn't rely on a + // equalPrevFn. We could make a copy of the key here and split at the next + // user key that is different; the main difficulty is that various tests + // expect 1 key per output table if the target file size is very small. + if !equalPrevFn(nextUserKey) { + s.splitKey = slices.Clone(nextUserKey) + return SplitNow + } + } + + return NoSplit +} + +// SplitKey returns the suggested split key - the first key at which the next +// output file should start. +// +// If ShouldSplitBefore never returned SplitNow, then SplitKey returns the limit +// passed to NewOutputSplitter (which can be nil). +// +// Otherwise, it returns a key <= the key passed to the last ShouldSplitBefore +// call and > the key passed to the previous call to ShouldSplitBefore (and > +// than the start key). This key is guaranteed to be larger than the start key. 
+func (s *OutputSplitter) SplitKey() []byte {
+	s.frontier.Update(nil)
+	if s.splitKey != nil {
+		if invariants.Enabled && s.cmp(s.splitKey, s.startKey) <= 0 {
+			panic(fmt.Sprintf("splitKey %q <= startKey %q", s.splitKey, s.startKey))
+		}
+		return s.splitKey
+	}
+	return s.limit
+}
+
+// shouldSplitBasedOnSize returns whether we should split based on the file size
+// and whether we are at a grandparent boundary.
+func (s *OutputSplitter) shouldSplitBasedOnSize(
+	estSize uint64, atGrandparentBoundary bool,
+) ShouldSplit {
+	switch {
+	case estSize < s.targetFileSize/2:
+		// The estimated file size is less than half the target file size. Don't
+		// split it, even if currently aligned with a grandparent file because
+		// it's too small.
+		return NoSplit
+	case estSize >= 2*s.targetFileSize:
+		// The estimated file size is double the target file size. Split it even
+		// if we were not aligned with a grandparent file boundary to avoid
+		// excessively exceeding the target file size.
+		return SplitNow
+	case !atGrandparentBoundary:
+		// Don't split if we're not at a grandparent, except if we've exhausted all
+		// the grandparents up to the limit. Then we may want to split purely based
+		// on file size.
+		if !s.nextBoundary.isGrandparent {
+			// There are no more grandparents. Optimize for the target file size
+			// and split as soon as we hit the target file size.
+			if estSize >= s.targetFileSize {
+				return SplitNow
+			}
+		}
+		return NoSplit
+	default:
+		// INVARIANT: atGrandparentBoundary
+		// INVARIANT: targetSize/2 < estSize < 2*targetSize
+		//
+		// The estimated file size is close enough to the target file size that
+		// we should consider splitting.
+		//
+		// Determine whether to split now based on how many grandparent
+		// boundaries we have already observed while building this output file.
+		// The intuition here is that if the grandparent level is dense in this
+		// part of the keyspace, we're likely to continue to have more
+		// opportunities to split this file aligned with a grandparent. If this
+		// is the first grandparent boundary observed, we split immediately
+		// (we're already at ≥50% the target file size). Otherwise, each
+		// overlapping grandparent we've observed increases the minimum file
+		// size by 5% of the target file size, up to at most 90% of the target
+		// file size.
+		//
+		// TODO(jackson): The particular thresholds are somewhat unprincipled.
+		// This is the same heuristic as RocksDB implements. Is there a more
+		// principled formulation that can further reduce w-amp, produce files
+		// closer to the target file size, or is more understandable?
+
+		// NB: Subtract 1 from `boundariesObserved` to account for the current
+		// boundary we're considering splitting at.
+		minimumPctOfTargetSize := 50 + 5*min(s.grandparentBoundariesObserved-1, 8)
+		if estSize < (minimumPctOfTargetSize*s.targetFileSize)/100 {
+			return NoSplit
+		}
+		return SplitNow
+	}
+}
+
+// A frontier is used to monitor a compaction's progression across the user
+// keyspace.
+//
+// A frontier holds a user key boundary that it's concerned with in its `key`
+// field. If/when the compaction iterator returns an InternalKey with a user key
+// _k_ such that k ≥ frontier.key, the compaction iterator invokes the
+// frontier's `reached` function, passing _k_ as its argument.
+//
+// The `reached` function returns a new value to use as the key. If `reached`
+// returns nil, the frontier is forgotten and its `reached` method will not be
+// invoked again, unless the user calls [Update] to set a new key.
+//
+// A frontier's key may be updated outside the context of a `reached`
+// invocation at any time, through its Update method.
+type frontier struct {
+	// container points to the containing *Frontiers that was passed to Init
+	// when the frontier was initialized.
+	container *Frontiers
+
+	// key holds the frontier's current key. If nil, this frontier is inactive
+	// and its reached func will not be invoked. The value of this key may only
+	// be updated by the `Frontiers` type, or the Update method.
+	key []byte
+
+	reached frontierReachedFn
+}
+
+// frontierReachedFn is invoked to inform a frontier that its key has been
+// reached. It's invoked with the user key that reached the limit. The `key`
+// argument is guaranteed to be ≥ the frontier's key.
+//
+// After frontierReachedFn is invoked, the frontier's key is updated to the
+// return value of frontierReachedFn. The frontier is permitted to update its
+// key to a user key ≤ the argument `key`.
+//
+// If a frontier is set to key k1, and reached(k2) is invoked (k2 ≥ k1), the
+// frontier will receive reached(k2) calls until it returns nil or a key k3 such
+// that k2 < k3. This property is useful for Frontiers that use
+// frontierReachedFn invocations to drive iteration through collections of keys
+// that may contain multiple keys that are both < k2 and ≥ k1.
+type frontierReachedFn func(currentFrontier []byte) (next []byte)
+
+// Init initializes the frontier with the provided key and reached callback.
+// The frontier is attached to the provided *Frontiers and the provided reached
+// func will be invoked when the *Frontiers is advanced to a key ≥ this
+// frontier's key.
+func (f *frontier) Init(frontiers *Frontiers, initialKey []byte, reached frontierReachedFn) {
+	*f = frontier{
+		container: frontiers,
+		key:       initialKey,
+		reached:   reached,
+	}
+	if initialKey != nil {
+		f.container.push(f)
+	}
+}
+
+// String implements fmt.Stringer.
+func (f *frontier) String() string {
+	return string(f.key)
+}
+
+// Update replaces the existing frontier's key with the provided key. The
+// frontier's reached func will be invoked when the new key is reached.
+func (f *frontier) Update(key []byte) { + c := f.container + prevKeyIsNil := f.key == nil + f.key = key + if prevKeyIsNil { + if key != nil { + c.push(f) + } + return + } + + // Find the frontier within the heap (it must exist within the heap because + // f.key was != nil). If the frontier key is now nil, remove it from the + // heap. Otherwise, fix up its position. + for i := 0; i < len(c.items); i++ { + if c.items[i] == f { + if key != nil { + c.fix(i) + } else { + n := c.len() - 1 + c.swap(i, n) + c.down(i, n) + c.items = c.items[:n] + } + return + } + } + panic("unreachable") +} + +// Frontiers is used to track progression of a task (eg, compaction) across the +// keyspace. Clients that want to be informed when the task advances to a key ≥ +// some frontier may register a frontier, providing a callback. The task calls +// `Advance(k)` with each user key encountered, which invokes the `reached` func +// on all tracked Frontiers with `key`s ≤ k. +// +// Internally, Frontiers is implemented as a simple heap. +type Frontiers struct { + cmp base.Compare + items []*frontier +} + +// Init initializes a Frontiers for use. +func (f *Frontiers) Init(cmp base.Compare) { + f.cmp = cmp +} + +// String implements fmt.Stringer. +func (f *Frontiers) String() string { + var buf bytes.Buffer + for i := 0; i < len(f.items); i++ { + if i > 0 { + fmt.Fprint(&buf, ", ") + } + fmt.Fprintf(&buf, "%s: %q", f.items[i], f.items[i].key) + } + return buf.String() +} + +// Advance notifies all member Frontiers with keys ≤ k. +func (f *Frontiers) Advance(k []byte) { + for len(f.items) > 0 && f.cmp(k, f.items[0].key) >= 0 { + // This frontier has been reached. Invoke the closure and update with + // the next frontier. + f.items[0].key = f.items[0].reached(k) + if f.items[0].key == nil { + // This was the final frontier that this user was concerned with. + // Remove it from the heap. + f.pop() + } else { + // Fix up the heap root. 
Note that if the key is still smaller than k, the + // callback will be invoked again in the same loop. + f.fix(0) + } + } +} + +func (f *Frontiers) len() int { + return len(f.items) +} + +func (f *Frontiers) less(i, j int) bool { + return f.cmp(f.items[i].key, f.items[j].key) < 0 +} + +func (f *Frontiers) swap(i, j int) { + f.items[i], f.items[j] = f.items[j], f.items[i] +} + +// fix, up and down are copied from the go stdlib. + +func (f *Frontiers) fix(i int) { + if !f.down(i, f.len()) { + f.up(i) + } +} + +func (f *Frontiers) push(ff *frontier) { + n := len(f.items) + f.items = append(f.items, ff) + f.up(n) +} + +func (f *Frontiers) pop() *frontier { + n := f.len() - 1 + f.swap(0, n) + f.down(0, n) + item := f.items[n] + f.items = f.items[:n] + return item +} + +func (f *Frontiers) up(j int) { + for { + i := (j - 1) / 2 // parent + if i == j || !f.less(j, i) { + break + } + f.swap(i, j) + j = i + } +} + +func (f *Frontiers) down(i0, n int) bool { + i := i0 + for { + j1 := 2*i + 1 + if j1 >= n || j1 < 0 { // j1 < 0 after int overflow + break + } + j := j1 // left child + if j2 := j1 + 1; j2 < n && f.less(j2, j1) { + j = j2 // = 2*i + 2 // right child + } + if !f.less(j, i) { + break + } + f.swap(i, j) + i = j + } + return i > i0 +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/compact/tombstone_elision.go b/vendor/github.com/cockroachdb/pebble/v2/internal/compact/tombstone_elision.go new file mode 100644 index 0000000..1d69e27 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/compact/tombstone_elision.go @@ -0,0 +1,194 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package compact + +import ( + "strings" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/manifest" +) + +// TombstoneElision is the information required to determine which tombstones +// (in the bottom snapshot stripe) can be elided. For example, when compacting +// into L6 (the lowest level), we can elide all tombstones (in the bottom +// snapshot stripe). +// +// TombstoneElision can indicate that no tombstones can be elided, or it can +// store a set of key ranges where only tombstones that do NOT overlap those key +// ranges can be elided. +// +// Note that the concept of "tombstone" applies to range keys as well: +// RangeKeyUnset and RangeKeyDelete are considered tombstones w.r.t other +// range keys and can use TombstoneElision. +type TombstoneElision struct { + mode tombstoneElisionMode + inUseRanges []base.UserKeyBounds +} + +type tombstoneElisionMode int8 + +const ( + elideNothing tombstoneElisionMode = iota + elideNotInUse +) + +// NoTombstoneElision is used when no tombstones can be elided (e.g. the entire +// compaction range is in use). +func NoTombstoneElision() TombstoneElision { + return TombstoneElision{mode: elideNothing} +} + +// ElideTombstonesOutsideOf is used when tombstones can be elided if they don't +// overlap with a set of "in use" key ranges. These ranges must be ordered and +// disjoint. +func ElideTombstonesOutsideOf(inUseRanges []base.UserKeyBounds) TombstoneElision { + return TombstoneElision{ + mode: elideNotInUse, + inUseRanges: inUseRanges, + } +} + +// ElidesNothing returns true if no tombstones will be elided. +func (e TombstoneElision) ElidesNothing() bool { + return e.mode == elideNothing +} + +// ElidesEverything returns true if all tombstones (in the bottom snapshot +// stripe) can be elided. 
+func (e TombstoneElision) ElidesEverything() bool { + return e.mode == elideNotInUse && len(e.inUseRanges) == 0 +} + +func (e TombstoneElision) String() string { + switch { + case e.ElidesNothing(): + return "elide nothing" + case e.ElidesEverything(): + return "elide everything" + default: + var b strings.Builder + for i, r := range e.inUseRanges { + if i > 0 { + b.WriteString(" ") + } + b.WriteString(r.String()) + } + return b.String() + } +} + +// pointTombstoneElider is used to check if point tombstones (i.e. DEL/SINGLEDELs) can +// be elided. +type pointTombstoneElider struct { + cmp base.Compare + elision TombstoneElision + // inUseIdx is an index into elision.inUseRanges; it points to the first + // range that ends after the last key passed to ShouldElide. + inUseIdx int +} + +func (te *pointTombstoneElider) Init(cmp base.Compare, elision TombstoneElision) { + *te = pointTombstoneElider{ + cmp: cmp, + elision: elision, + } +} + +// ShouldElide returns true if a point tombstone with the given key can be +// elided. The keys in multiple invocations to ShouldElide must be supplied in +// order. +func (te *pointTombstoneElider) ShouldElide(key []byte) bool { + if te.elision.ElidesNothing() { + return false + } + + inUseRanges := te.elision.inUseRanges + if invariants.Enabled && te.inUseIdx > 0 && inUseRanges[te.inUseIdx-1].End.IsUpperBoundFor(te.cmp, key) { + panic("ShouldElidePoint called with out-of-order key") + } + // Advance inUseIdx to the first in-use range that ends after key. + for te.inUseIdx < len(te.elision.inUseRanges) && !inUseRanges[te.inUseIdx].End.IsUpperBoundFor(te.cmp, key) { + te.inUseIdx++ + } + // We can elide the point tombstone if this range starts after the key. + return te.inUseIdx >= len(te.elision.inUseRanges) || te.cmp(inUseRanges[te.inUseIdx].Start, key) > 0 +} + +// rangeTombstoneElider is used to check if range tombstones can be elided. 
+// +// It can be used for RANGEDELs (in which case, the "in use" ranges reflect +// point keys); or for RANGEKEYUNSET, RANGEKEYDELETE, in which case the "in use" +// ranges reflect range keys. +type rangeTombstoneElider struct { + cmp base.Compare + elision TombstoneElision + // inUseIdx is an index into elision.inUseRanges; it points to the first + // range that ends after the last start key passed to ShouldElide. + inUseIdx int +} + +func (te *rangeTombstoneElider) Init(cmp base.Compare, elision TombstoneElision) { + *te = rangeTombstoneElider{ + cmp: cmp, + elision: elision, + } +} + +// ShouldElide returns true if the tombstone for the given end-exclusive range +// can be elided. The start keys in multiple invocations to ShouldElide must be +// supplied in order. +func (te *rangeTombstoneElider) ShouldElide(start, end []byte) bool { + if te.elision.ElidesNothing() { + return false + } + + inUseRanges := te.elision.inUseRanges + if invariants.Enabled && te.inUseIdx > 0 && inUseRanges[te.inUseIdx-1].End.IsUpperBoundFor(te.cmp, start) { + panic("ShouldElideRange called with out-of-order key") + } + // Advance inUseIdx to the first in-use range that ends after start. + for te.inUseIdx < len(te.elision.inUseRanges) && !inUseRanges[te.inUseIdx].End.IsUpperBoundFor(te.cmp, start) { + te.inUseIdx++ + } + // We can elide the range tombstone if this range starts after the tombstone ends. + return te.inUseIdx >= len(te.elision.inUseRanges) || te.cmp(inUseRanges[te.inUseIdx].Start, end) >= 0 +} + +// SetupTombstoneElision calculates the TombstoneElision policies for a +// compaction operating on the given version and output level. 
+func SetupTombstoneElision( + cmp base.Compare, + v *manifest.Version, + l0Organizer *manifest.L0Organizer, + outputLevel int, + compactionBounds base.UserKeyBounds, +) (dels, rangeKeys TombstoneElision) { + // We want to calculate the in-use key ranges from the levels below our output + // level, unless it is L0; L0 requires special treatment, since sstables + // within L0 may overlap. + startLevel := 0 + if outputLevel > 0 { + startLevel = outputLevel + 1 + } + // CalculateInuseKeyRanges will return a series of sorted spans. Overlapping + // or abutting spans have already been merged. + inUseKeyRanges := v.CalculateInuseKeyRanges( + l0Organizer, startLevel, manifest.NumLevels-1, compactionBounds.Start, compactionBounds.End.Key, + ) + // Check if there's a single in-use span that encompasses the entire key range + // of the compaction. This is an optimization to avoid key comparisons against + // the in-use ranges during the compaction when every key within the + // compaction overlaps with an in-use span. + if len(inUseKeyRanges) == 1 && inUseKeyRanges[0].ContainsBounds(cmp, &compactionBounds) { + dels = NoTombstoneElision() + } else { + dels = ElideTombstonesOutsideOf(inUseKeyRanges) + } + // TODO(radu): we should calculate in-use ranges separately for point keys and for range keys. + rangeKeys = dels + return dels, rangeKeys +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/compression/adaptive.go b/vendor/github.com/cockroachdb/pebble/v2/internal/compression/adaptive.go new file mode 100644 index 0000000..34ece33 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/compression/adaptive.go @@ -0,0 +1,104 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 

package compression

import (
	"math"
	"math/rand/v2"
	"sync"

	"github.com/cockroachdb/pebble/v2/internal/ewma"
)

// AdaptiveCompressor is a Compressor that automatically chooses between two
// algorithms: it uses a slower but better algorithm as long as it reduces the
// compressed size (compared to the faster algorithm) by a certain relative
// amount. The decision is probabilistic and based on sampling a subset of
// blocks.
type AdaptiveCompressor struct {
	fast Compressor
	slow Compressor

	reductionCutoff float64
	sampleEvery     int

	// estimator for the relative size reduction when choosing the slow algorithm.
	estimator ewma.Bytes
	rng       rand.PCG

	// buf holds the fast-path compression result when a block is sampled; it is
	// retained across calls to avoid reallocation.
	buf []byte
}

// AdaptiveCompressorParams contains the parameters for an adaptive compressor.
type AdaptiveCompressorParams struct {
	// Fast and Slow are the two compression settings the adaptive compressor
	// chooses between.
	Fast Setting
	Slow Setting
	// ReductionCutoff is the relative size reduction (when using the slow
	// algorithm vs the fast algorithm) below which we use the fast algorithm. For
	// example, if ReductionCutoff is 0.3 then we only use the slow algorithm if
	// it reduces the compressed size (compared to the fast algorithm) by at least
	// 30%.
	ReductionCutoff float64
	// SampleEvery defines the sampling frequency: the probability we sample a
	// block is 1.0/SampleEvery. Sampling means trying both algorithms and
	// recording the compression ratio.
	SampleEvery int
	// SampleHalfLife defines the half-life of the exponentially weighted moving
	// average. It should be a factor larger than the expected average block size.
	SampleHalfLife int64
	// SamplingSeed seeds the PCG generator that drives sampling decisions.
	SamplingSeed uint64
}

// NewAdaptiveCompressor returns a pooled AdaptiveCompressor configured with p.
// Close must be called when the compressor is no longer needed.
func NewAdaptiveCompressor(p AdaptiveCompressorParams) *AdaptiveCompressor {
	ac := adaptiveCompressorPool.Get().(*AdaptiveCompressor)
	ac.fast = GetCompressor(p.Fast)
	ac.slow = GetCompressor(p.Slow)
	ac.sampleEvery = p.SampleEvery
	ac.reductionCutoff = p.ReductionCutoff
	ac.estimator.Init(p.SampleHalfLife)
	ac.rng.Seed(p.SamplingSeed, p.SamplingSeed)
	return ac
}

var _ Compressor = (*AdaptiveCompressor)(nil)

var adaptiveCompressorPool = sync.Pool{
	New: func() any { return &AdaptiveCompressor{} },
}

// Compress implements Compressor. Unsampled blocks use whichever algorithm the
// current estimate favors; sampled blocks run both algorithms and fold the
// observed size reduction into the estimator.
func (ac *AdaptiveCompressor) Compress(dst, src []byte) ([]byte, Setting) {
	estimate := ac.estimator.Estimate()
	// TODO(radu): consider decreasing the sampling frequency if the estimate is
	// far from the cutoff.
	// Estimate() is NaN until the first sample, so the first block is always
	// sampled.
	sampleThisBlock := math.IsNaN(estimate) || ac.rng.Uint64()%uint64(ac.sampleEvery) == 0
	if !sampleThisBlock {
		ac.estimator.NoSample(int64(len(src)))
		if estimate < ac.reductionCutoff {
			return ac.fast.Compress(dst, src)
		} else {
			return ac.slow.Compress(dst, src)
		}
	}
	bufFast, fastSetting := ac.fast.Compress(ac.buf[:0], src)
	// Retain the (possibly grown) scratch buffer for future samples.
	ac.buf = bufFast[:0]
	dst, slowSetting := ac.slow.Compress(dst, src)
	reduction := 1 - float64(len(dst))/float64(len(bufFast))
	ac.estimator.SampledBlock(int64(len(src)), reduction)
	if reduction < ac.reductionCutoff {
		// The slow algorithm did not pay off for this block; return the fast
		// result instead.
		return append(dst[:0], bufFast...), fastSetting
	}
	return dst, slowSetting
}

// Close releases the underlying compressors and returns the
// AdaptiveCompressor to its pool.
func (ac *AdaptiveCompressor) Close() {
	ac.fast.Close()
	ac.slow.Close()
	if cap(ac.buf) > 256*1024 {
		ac.buf = nil // Release large buffers.
	}
	adaptiveCompressorPool.Put(ac)
}
diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/compression/compression.go b/vendor/github.com/cockroachdb/pebble/v2/internal/compression/compression.go
new file mode 100644
index 0000000..4d1a91d
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/internal/compression/compression.go
@@ -0,0 +1,139 @@
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package compression

import (
	"fmt"

	"github.com/minio/minlz"
)

// Algorithm identifies a compression algorithm. Some compression algorithms
// support multiple compression levels.
//
// Decompressing data requires only an Algorithm.
type Algorithm uint8

const (
	NoCompression Algorithm = iota
	SnappyAlgorithm
	Zstd
	MinLZ

	NumAlgorithms
)

// String implements fmt.Stringer, returning a human-readable name for the
// compression algorithm.
func (a Algorithm) String() string {
	switch a {
	case NoCompression:
		return "NoCompression"
	case SnappyAlgorithm:
		return "Snappy"
	case Zstd:
		return "ZSTD"
	case MinLZ:
		return "MinLZ"
	default:
		return fmt.Sprintf("unknown(%d)", a)
	}
}

// Setting contains the information needed to compress data. It includes an
// Algorithm and possibly a compression level.
type Setting struct {
	Algorithm Algorithm
	// Level depends on the algorithm. Some algorithms don't support a level (in
	// which case Level is 0).
	Level uint8
}

// String implements fmt.Stringer (e.g. "Snappy", "ZSTD3").
func (s Setting) String() string {
	if s.Level == 0 {
		return s.Algorithm.String()
	}
	return fmt.Sprintf("%s%d", s.Algorithm, s.Level)
}

// Setting presets.
var (
	None          = makePreset(NoCompression, 0)
	Snappy        = makePreset(SnappyAlgorithm, 0)
	MinLZFastest  = makePreset(MinLZ, minlz.LevelFastest)
	MinLZBalanced = makePreset(MinLZ, minlz.LevelBalanced)
	ZstdLevel1    = makePreset(Zstd, 1)
	ZstdLevel3    = makePreset(Zstd, 3)
	ZstdLevel5    = makePreset(Zstd, 5)
	ZstdLevel7    = makePreset(Zstd, 7)
)

// Compressor is an interface for compressing data. An instance is associated
// with a specific Setting.
type Compressor interface {
	// Compress a block, appending the compressed data to dst[:0].
	// Returns setting used.
	Compress(dst, src []byte) ([]byte, Setting)

	// Close must be called when the Compressor is no longer needed.
	// After Close is called, the Compressor must not be used again.
	Close()
}

// GetCompressor returns a Compressor for the given setting. Some
// implementations are pooled; the caller must call Close on the result when it
// is no longer needed.
func GetCompressor(s Setting) Compressor {
	switch s.Algorithm {
	case NoCompression:
		return noopCompressor{}
	case SnappyAlgorithm:
		return snappyCompressor{}
	case Zstd:
		return getZstdCompressor(int(s.Level))
	case MinLZ:
		return getMinlzCompressor(int(s.Level))
	default:
		panic("Invalid compression type.")
	}
}

// Decompressor is an interface for decompressing data. An instance is
// associated with a specific Algorithm.
type Decompressor interface {
	// DecompressInto decompresses compressed into buf. The buf slice must have the
	// exact size as the decompressed value. Callers may use DecompressedLen to
	// determine the correct size.
	DecompressInto(buf, compressed []byte) error

	// DecompressedLen returns the length of the provided block once decompressed,
	// allowing the caller to allocate a buffer exactly sized to the decompressed
	// payload.
	DecompressedLen(b []byte) (decompressedLen int, err error)

	// Close must be called when the Decompressor is no longer needed.
	// After Close is called, the Decompressor must not be used again.
	Close()
}

// GetDecompressor returns a Decompressor for the given algorithm. The caller
// must call Close on the result when it is no longer needed.
func GetDecompressor(a Algorithm) Decompressor {
	switch a {
	case NoCompression:
		return noopDecompressor{}
	case SnappyAlgorithm:
		return snappyDecompressor{}
	case Zstd:
		return getZstdDecompressor()
	case MinLZ:
		return minlzDecompressor{}
	default:
		panic("Invalid compression type.")
	}
}

// presets records every Setting created via makePreset.
var presets []Setting

// makePreset registers and returns a Setting preset.
func makePreset(algorithm Algorithm, level uint8) Setting {
	s := Setting{Algorithm: algorithm, Level: level}
	presets = append(presets, s)
	return s
}
diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/compression/minlz.go b/vendor/github.com/cockroachdb/pebble/v2/internal/compression/minlz.go
new file mode 100644
index 0000000..26f7685
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/internal/compression/minlz.go
@@ -0,0 +1,69 @@
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package compression

import (
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/v2/internal/base"
	"github.com/minio/minlz"
)

type minlzCompressor struct {
	level int
}

var _ Compressor = (*minlzCompressor)(nil)

func (c *minlzCompressor) Algorithm() Algorithm { return MinLZ }

// Compress implements Compressor. Note that the returned Setting can be Snappy
// for oversized blocks (see below).
func (c *minlzCompressor) Compress(dst, src []byte) ([]byte, Setting) {
	// MinLZ cannot encode blocks greater than 8MB. Fall back to Snappy in those
	// cases.
	if len(src) > minlz.MaxBlockSize {
		return (snappyCompressor{}).Compress(dst, src)
	}

	compressed, err := minlz.Encode(dst, src, c.level)
	if err != nil {
		panic(errors.Wrap(err, "minlz compression"))
	}
	return compressed, Setting{Algorithm: MinLZ, Level: uint8(c.level)}
}

func (c *minlzCompressor) Close() {}

// The compressor holds no per-use state beyond the level, so the two supported
// levels are served by shared singletons.
var minlzCompressorFastest = &minlzCompressor{level: minlz.LevelFastest}
var minlzCompressorBalanced = &minlzCompressor{level: minlz.LevelBalanced}

func getMinlzCompressor(level int) Compressor {
	switch level {
	case minlz.LevelFastest:
		return minlzCompressorFastest
	case minlz.LevelBalanced:
		return minlzCompressorBalanced
	default:
		panic(errors.AssertionFailedf("unexpected MinLZ level %d", level))
	}
}

type minlzDecompressor struct{}

var _ Decompressor = minlzDecompressor{}

// DecompressInto implements Decompressor; buf must have exactly the
// decompressed size.
func (minlzDecompressor) DecompressInto(buf, compressed []byte) error {
	result, err := minlz.Decode(buf, compressed)
	if len(result) != len(buf) || (len(result) > 0 && &result[0] != &buf[0]) {
		return base.CorruptionErrorf("pebble/table: decompressed into unexpected buffer: %p != %p",
			errors.Safe(result), errors.Safe(buf))
	}
	return err
}

func (minlzDecompressor) DecompressedLen(b []byte) (decompressedLen int, err error) {
	l, err := minlz.DecodedLen(b)
	return l, err
}

func (minlzDecompressor) Close() {}
diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/compression/noop.go b/vendor/github.com/cockroachdb/pebble/v2/internal/compression/noop.go
new file mode 100644
index 0000000..443cbfd
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/internal/compression/noop.go
@@ -0,0 +1,30 @@
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package compression

// noopCompressor implements Compressor by copying the input verbatim.
type noopCompressor struct{}

var _ Compressor = noopCompressor{}

func (noopCompressor) Compress(dst, src []byte) ([]byte, Setting) {
	return append(dst[:0], src...), None
}
func (noopCompressor) Close() {}

// noopDecompressor implements Decompressor for uncompressed blocks.
type noopDecompressor struct{}

var _ Decompressor = noopDecompressor{}

func (noopDecompressor) DecompressInto(dst, src []byte) error {
	// Per the Decompressor contract, dst is exactly the decompressed size,
	// which for no compression is len(src).
	dst = dst[:len(src)]
	copy(dst, src)
	return nil
}

func (noopDecompressor) DecompressedLen(b []byte) (decompressedLen int, err error) {
	return len(b), nil
}

func (noopDecompressor) Close() {}
diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/compression/snappy.go b/vendor/github.com/cockroachdb/pebble/v2/internal/compression/snappy.go
new file mode 100644
index 0000000..d567e6b
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/internal/compression/snappy.go
@@ -0,0 +1,46 @@
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package compression

import (
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/v2/internal/base"
	"github.com/golang/snappy"
)

// snappyCompressor implements Compressor using the pure-Go snappy package.
type snappyCompressor struct{}

var _ Compressor = snappyCompressor{}

func (snappyCompressor) Algorithm() Algorithm { return SnappyAlgorithm }

func (snappyCompressor) Compress(dst, src []byte) ([]byte, Setting) {
	// Expose the full capacity (with a full slice expression) so snappy.Encode
	// can reuse dst's backing array when it is large enough.
	dst = dst[:cap(dst):cap(dst)]
	return snappy.Encode(dst, src), Snappy
}

func (snappyCompressor) Close() {}

// snappyDecompressor implements Decompressor for snappy-encoded blocks.
type snappyDecompressor struct{}

var _ Decompressor = snappyDecompressor{}

func (snappyDecompressor) DecompressInto(buf, compressed []byte) error {
	result, err := snappy.Decode(buf, compressed)
	if err != nil {
		return err
	}
	// Verify the decode landed in buf with the expected length.
	if len(result) != len(buf) || (len(result) > 0 && &result[0] != &buf[0]) {
		return base.CorruptionErrorf("pebble: decompressed into unexpected buffer: %p != %p",
			errors.Safe(result), errors.Safe(buf))
	}
	return nil
}

func (snappyDecompressor) DecompressedLen(b []byte) (decompressedLen int, err error) {
	return snappy.DecodedLen(b)
}

func (snappyDecompressor) Close() {}
diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/compression/zstd_cgo.go b/vendor/github.com/cockroachdb/pebble/v2/internal/compression/zstd_cgo.go
new file mode 100644
index 0000000..6709da0
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/internal/compression/zstd_cgo.go
@@ -0,0 +1,125 @@
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

//go:build cgo && !pebblegozstd

package compression

import (
	"encoding/binary"
	"sync"

	"github.com/DataDog/zstd"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/v2/internal/base"
)

// zstdCompressor compresses using the cgo (DataDog) zstd bindings, reusing a
// zstd context across compressions.
type zstdCompressor struct {
	level int
	ctx   zstd.Ctx
}

var _ Compressor = (*zstdCompressor)(nil)

var zstdCompressorPool = sync.Pool{
	New: func() any {
		return &zstdCompressor{ctx: zstd.NewCtx()}
	},
}

// UseStandardZstdLib indicates whether the zstd implementation is a port of the
// official one in the facebook/zstd repository.
//
// This constant is only used in tests. Some tests rely on reproducibility of
// SST files, but a custom implementation of zstd will produce different
// compression result. So those tests have to be disabled in such cases.
//
// We cannot always use the official facebook/zstd implementation since it
// relies on CGo.
const UseStandardZstdLib = true

// Compress writes a uvarint prefix holding the uncompressed length, followed
// by the zstd-compressed payload, into compressedBuf.
func (z *zstdCompressor) Compress(compressedBuf []byte, b []byte) ([]byte, Setting) {
	// Make room for the maximum-width length prefix.
	if len(compressedBuf) < binary.MaxVarintLen64 {
		compressedBuf = append(compressedBuf, make([]byte, binary.MaxVarintLen64-len(compressedBuf))...)
	}

	// Get the bound and allocate the proper amount of memory instead of relying on
	// Datadog/zstd to do it for us. This allows us to avoid memcopying data around
	// for the varIntLen prefix.
	bound := zstd.CompressBound(len(b))
	if cap(compressedBuf) < binary.MaxVarintLen64+bound {
		compressedBuf = make([]byte, binary.MaxVarintLen64, binary.MaxVarintLen64+bound)
	}

	varIntLen := binary.PutUvarint(compressedBuf, uint64(len(b)))
	result, err := z.ctx.CompressLevel(compressedBuf[varIntLen:varIntLen+bound], b, z.level)
	if err != nil {
		panic("Error while compressing using Zstd.")
	}
	// Since the destination met CompressBound, the library must have compressed
	// in place.
	if &result[0] != &compressedBuf[varIntLen] {
		panic("Allocated a new buffer despite checking CompressBound.")
	}

	return compressedBuf[:varIntLen+len(result)], Setting{Algorithm: Zstd, Level: uint8(z.level)}
}

// Close returns the compressor (and its reusable context) to the pool.
func (z *zstdCompressor) Close() {
	zstdCompressorPool.Put(z)
}

func getZstdCompressor(level int) *zstdCompressor {
	z := zstdCompressorPool.Get().(*zstdCompressor)
	z.level = level
	return z
}

// zstdDecompressor decompresses using the cgo (DataDog) zstd bindings.
type zstdDecompressor struct {
	ctx zstd.Ctx
}

var _ Decompressor = (*zstdDecompressor)(nil)

// DecompressInto decompresses src with the Zstandard algorithm. The destination
// buffer must already be sufficiently sized, otherwise DecompressInto may error.
func (z *zstdDecompressor) DecompressInto(dst, src []byte) error {
	// The payload is prefixed with a varint encoding the length of
	// the decompressed block.
	_, prefixLen := binary.Uvarint(src)
	src = src[prefixLen:]
	if len(src) == 0 {
		return errors.Errorf("decodeZstd: empty src buffer")
	}
	if len(dst) == 0 {
		return errors.Errorf("decodeZstd: empty dst buffer")
	}
	_, err := z.ctx.DecompressInto(dst, src)
	if err != nil {
		return err
	}
	return nil
}

func (zstdDecompressor) DecompressedLen(b []byte) (decompressedLen int, err error) {
	// This will also be used by zlib, bzip2 and lz4 to retrieve the decodedLen
	// if we implement these algorithms in the future.
	decodedLenU64, varIntLen := binary.Uvarint(b)
	if varIntLen <= 0 {
		return 0, base.CorruptionErrorf("pebble: compression block has invalid length")
	}
	return int(decodedLenU64), nil
}

// Close returns the decompressor (and its reusable context) to the pool.
func (z *zstdDecompressor) Close() {
	zstdDecompressorPool.Put(z)
}

var zstdDecompressorPool = sync.Pool{
	New: func() any {
		return &zstdDecompressor{ctx: zstd.NewCtx()}
	},
}

func getZstdDecompressor() *zstdDecompressor {
	return zstdDecompressorPool.Get().(*zstdDecompressor)
}
diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/compression/zstd_nocgo.go b/vendor/github.com/cockroachdb/pebble/v2/internal/compression/zstd_nocgo.go
new file mode 100644
index 0000000..017eb31
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/internal/compression/zstd_nocgo.go
@@ -0,0 +1,104 @@
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

//go:build !cgo || pebblegozstd

package compression

import (
	"encoding/binary"
	"sync"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/v2/internal/base"
	"github.com/klauspost/compress/zstd"
)

// zstdCompressor compresses using the pure-Go (klauspost) zstd implementation.
type zstdCompressor struct {
	level   int
	encoder *zstd.Encoder
}

var _ Compressor = (*zstdCompressor)(nil)

var zstdCompressorPool = sync.Pool{
	New: func() any { return &zstdCompressor{} },
}

// UseStandardZstdLib indicates whether the zstd implementation is a port of the
// official one in the facebook/zstd repository.
//
// This constant is only used in tests. Some tests rely on reproducibility of
// SST files, but a custom implementation of zstd will produce different
// compression result. So those tests have to be disabled in such cases.
//
// We cannot always use the official facebook/zstd implementation since it
// relies on CGo.
+const UseStandardZstdLib = false + +func (z *zstdCompressor) Compress(compressedBuf, b []byte) ([]byte, Setting) { + if len(compressedBuf) < binary.MaxVarintLen64 { + compressedBuf = append(compressedBuf, make([]byte, binary.MaxVarintLen64-len(compressedBuf))...) + } + varIntLen := binary.PutUvarint(compressedBuf, uint64(len(b))) + res := z.encoder.EncodeAll(b, compressedBuf[:varIntLen]) + return res, Setting{Algorithm: Zstd, Level: uint8(z.level)} +} + +func (z *zstdCompressor) Close() { + if err := z.encoder.Close(); err != nil { + panic(err) + } + z.encoder = nil + zstdCompressorPool.Put(z) +} + +func getZstdCompressor(level int) *zstdCompressor { + encoder, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.EncoderLevelFromZstd(level))) + if err != nil { + panic(err) + } + z := zstdCompressorPool.Get().(*zstdCompressor) + z.level = level + z.encoder = encoder + return z +} + +type zstdDecompressor struct{} + +var _ Decompressor = zstdDecompressor{} + +func (zstdDecompressor) DecompressInto(dst, src []byte) error { + // The payload is prefixed with a varint encoding the length of + // the decompressed block. + _, prefixLen := binary.Uvarint(src) + src = src[prefixLen:] + decoder, _ := zstd.NewReader(nil) + defer decoder.Close() + result, err := decoder.DecodeAll(src, dst[:0]) + if err != nil { + return err + } + if len(result) != len(dst) || (len(result) > 0 && &result[0] != &dst[0]) { + return base.CorruptionErrorf("pebble/table: decompressed into unexpected buffer: %p != %p", + errors.Safe(result), errors.Safe(dst)) + } + return nil +} + +func (zstdDecompressor) DecompressedLen(b []byte) (decompressedLen int, err error) { + // This will also be used by zlib, bzip2 and lz4 to retrieve the decodedLen + // if we implement these algorithms in the future. 
	decodedLenU64, varIntLen := binary.Uvarint(b)
	if varIntLen <= 0 {
		return 0, base.CorruptionErrorf("pebble: compression block has invalid length")
	}
	return int(decodedLenU64), nil
}

func (zstdDecompressor) Close() {}

// getZstdDecompressor returns the (stateless) pure-Go decompressor.
func getZstdDecompressor() zstdDecompressor {
	return zstdDecompressor{}
}
diff --git a/vendor/github.com/cockroachdb/pebble/internal/constants/constants.go b/vendor/github.com/cockroachdb/pebble/v2/internal/constants/constants.go
similarity index 100%
rename from vendor/github.com/cockroachdb/pebble/internal/constants/constants.go
rename to vendor/github.com/cockroachdb/pebble/v2/internal/constants/constants.go
diff --git a/vendor/github.com/cockroachdb/pebble/internal/crc/crc.go b/vendor/github.com/cockroachdb/pebble/v2/internal/crc/crc.go
similarity index 95%
rename from vendor/github.com/cockroachdb/pebble/internal/crc/crc.go
rename to vendor/github.com/cockroachdb/pebble/v2/internal/crc/crc.go
index 4021a2e..faf2e4c 100644
--- a/vendor/github.com/cockroachdb/pebble/internal/crc/crc.go
+++ b/vendor/github.com/cockroachdb/pebble/v2/internal/crc/crc.go
@@ -14,7 +14,7 @@
 //	var u uint32 = crc.New(data).Value()
 //
 // In pebble, the uint32 value is then stored in little-endian format.
-package crc // import "github.com/cockroachdb/pebble/internal/crc"
+package crc // import "github.com/cockroachdb/pebble/v2/internal/crc"
 
 import "hash/crc32"
 
diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/dsl/dsl.go b/vendor/github.com/cockroachdb/pebble/v2/internal/dsl/dsl.go
new file mode 100644
index 0000000..ef546fd
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/internal/dsl/dsl.go
@@ -0,0 +1,160 @@
// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

// Package dsl provides facilities for parsing lisp-like domain-specific
// languages (DSL).
package dsl

import (
	"fmt"
	"go/scanner"
	"go/token"
	"strconv"
	"strings"

	"github.com/cockroachdb/errors"
)

// NewParser constructs a new Parser of a lisp-like DSL.
func NewParser[T any]() *Parser[T] {
	p := new(Parser[T])
	p.constants = make(map[string]func() T)
	p.funcs = make(map[string]func(*Parser[T], *Scanner) T)
	return p
}

// NewPredicateParser constructs a new Parser of a Lisp-like DSL, where the
// resulting type implements Predicate[E]. NewPredicateParser predefines a few
// useful functions: Not, And, Or, OnIndex.
func NewPredicateParser[E any]() *Parser[Predicate[E]] {
	p := NewParser[Predicate[E]]()
	p.DefineFunc("Not", parseNot[E])
	p.DefineFunc("And", parseAnd[E])
	p.DefineFunc("Or", parseOr[E])
	p.DefineFunc("OnIndex", parseOnIndex[E])
	return p
}

// A Parser holds the rules and logic for parsing a DSL.
type Parser[T any] struct {
	// constants maps an identifier to a constructor for its AST value.
	constants map[string]func() T
	// funcs maps an identifier to the routine that parses its invocation.
	funcs map[string]func(*Parser[T], *Scanner) T
}

// DefineConstant adds a new constant to the Parser's supported DSL. Whenever
// the provided identifier is used within a constant context, the provided
// closure is invoked to instantiate an appropriate AST value.
func (p *Parser[T]) DefineConstant(identifier string, instantiate func() T) {
	p.constants[identifier] = instantiate
}

// DefineFunc adds a new func to the Parser's supported DSL. Whenever the
// provided identifier is used within a function invocation context, the
// provided closure is invoked to instantiate an appropriate AST value.
func (p *Parser[T]) DefineFunc(identifier string, parseFunc func(*Parser[T], *Scanner) T) {
	p.funcs[identifier] = parseFunc
}

// Parse parses the provided input string.
func (p *Parser[T]) Parse(d string) (ret T, err error) {
	// The parsing helpers panic with an error on malformed input; convert such
	// a panic back into a returned error here. Non-error panics are re-raised.
	defer func() {
		if r := recover(); r != nil {
			var ok bool
			err, ok = r.(error)
			if !ok {
				panic(r)
			}
		}
	}()

	fset := token.NewFileSet()
	file := fset.AddFile("", -1, len(d))
	var s Scanner
	s.Init(file, []byte(strings.TrimSpace(d)), nil /* no error handler */, 0)
	tok := s.Scan()
	ret = p.ParseFromPos(&s, tok)
	tok = s.Scan()
	// The Go scanner may insert an implicit semicolon at end of input.
	if tok.Kind == token.SEMICOLON {
		tok = s.Scan()
	}
	assertTok(tok, token.EOF)
	return ret, err
}

// ParseFromPos parses from the provided current position and associated
// scanner. If the parser fails to parse, it panics. This function is intended
// to be used when composing Parsers of various types.
func (p *Parser[T]) ParseFromPos(s *Scanner, tok Token) T {
	switch tok.Kind {
	case token.IDENT:
		// A constant without any parens, eg. `Reads`.
		// NOTE: this p shadows the receiver for the rest of this case.
		p, ok := p.constants[tok.Lit]
		if !ok {
			panic(errors.Errorf("dsl: unknown constant %q", tok.Lit))
		}
		return p()
	case token.LPAREN:
		// Otherwise it's an expression, eg: (OnIndex 1)
		tok = s.Consume(token.IDENT)
		fp, ok := p.funcs[tok.Lit]
		if !ok {
			panic(errors.Errorf("dsl: unknown func %q", tok.Lit))
		}
		return fp(p, s)
	default:
		panic(errors.Errorf("dsl: unexpected token %s; expected IDENT or LPAREN", tok.String()))
	}
}

// A Scanner holds the scanner's internal state while processing a given text.
type Scanner struct {
	scanner.Scanner
}

// Scan scans the next token and returns it.
func (s *Scanner) Scan() Token {
	pos, tok, lit := s.Scanner.Scan()
	return Token{pos, tok, lit}
}

// Consume scans the next token. If the token is not of the provided token, it
// panics. It returns the token itself.
func (s *Scanner) Consume(expect token.Token) Token {
	t := s.Scan()
	assertTok(t, expect)
	return t
}

// ConsumeString scans the next token. It panics if the next token is not a
// string, or if unable to unquote the string.
// It returns the unquoted string
// contents.
func (s *Scanner) ConsumeString() string {
	lit := s.Consume(token.STRING).Lit
	str, err := strconv.Unquote(lit)
	if err != nil {
		panic(errors.Newf("dsl: unquoting %q: %v", lit, err))
	}
	return str
}

// Token is a lexical token scanned from an input text.
type Token struct {
	pos  token.Pos
	Kind token.Token
	Lit  string
}

// String implements fmt.Stringer.
func (t *Token) String() string {
	if t.Lit != "" {
		return fmt.Sprintf("(%s, %q) at pos %v", t.Kind, t.Lit, t.pos)
	}
	return fmt.Sprintf("%s at pos %v", t.Kind, t.pos)
}

// assertTok panics (with an error that Parse recovers into its return value)
// if tok is not of the expected kind.
func assertTok(tok Token, expect token.Token) {
	if tok.Kind != expect {
		panic(errors.Errorf("dsl: unexpected token %s; expected %s", tok.String(), expect))
	}
}
diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/dsl/predicates.go b/vendor/github.com/cockroachdb/pebble/v2/internal/dsl/predicates.go
new file mode 100644
index 0000000..fff0fcd
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/internal/dsl/predicates.go
@@ -0,0 +1,136 @@
// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package dsl

import (
	"fmt"
	"go/token"
	"strconv"
	"strings"
	"sync/atomic"

	"github.com/cockroachdb/errors"
)

// Predicate encodes conditional logic that yields a boolean.
type Predicate[E any] interface {
	Evaluate(E) bool
	String() string
}

// Not returns a Predicate that negates the provided predicate.
func Not[E any](p Predicate[E]) Predicate[E] { return not[E]{Predicate: p} }

// And returns a Predicate that evaluates to true if all its operands evaluate
// to true.
func And[E any](preds ...Predicate[E]) Predicate[E] { return and[E](preds) }

// Or returns a Predicate that evaluates to true if any of its operands evaluate
// true.
func Or[E any](preds ...Predicate[E]) Predicate[E] { return or[E](preds) }

// OnIndex returns a Predicate that evaluates to true on its N-th call.
func OnIndex[E any](n int32) *Index[E] {
	p := new(Index[E])
	p.Int32.Store(n)
	return p
}

// Index is a Predicate that evaluates to true only on its N-th invocation.
type Index[E any] struct {
	atomic.Int32
}

// String implements fmt.Stringer.
func (p *Index[E]) String() string {
	return fmt.Sprintf("(OnIndex %d)", p.Int32.Load())
}

// Evaluate implements Predicate. The counter starts at n and is decremented on
// each call, reaching -1 exactly once: on the 0-indexed n-th invocation.
func (p *Index[E]) Evaluate(E) bool { return p.Int32.Add(-1) == -1 }

type not[E any] struct {
	Predicate[E]
}

func (p not[E]) String() string    { return fmt.Sprintf("(Not %s)", p.Predicate.String()) }
func (p not[E]) Evaluate(e E) bool { return !p.Predicate.Evaluate(e) }

type and[E any] []Predicate[E]

func (p and[E]) String() string {
	var sb strings.Builder
	sb.WriteString("(And")
	for i := 0; i < len(p); i++ {
		sb.WriteRune(' ')
		sb.WriteString(p[i].String())
	}
	sb.WriteRune(')')
	return sb.String()
}

// Evaluate implements Predicate. Note `ok && ...` short-circuits, so once an
// operand is false the remaining operands are not evaluated.
func (p and[E]) Evaluate(e E) bool {
	ok := true
	for i := range p {
		ok = ok && p[i].Evaluate(e)
	}
	return ok
}

type or[E any] []Predicate[E]

func (p or[E]) String() string {
	var sb strings.Builder
	sb.WriteString("(Or")
	for i := 0; i < len(p); i++ {
		sb.WriteRune(' ')
		sb.WriteString(p[i].String())
	}
	sb.WriteRune(')')
	return sb.String()
}

// Evaluate implements Predicate. Note `ok || ...` short-circuits, so once an
// operand is true the remaining operands are not evaluated.
func (p or[E]) Evaluate(e E) bool {
	ok := false
	for i := range p {
		ok = ok || p[i].Evaluate(e)
	}
	return ok
}

// parseNot parses the argument of a (Not ...) invocation.
func parseNot[E any](p *Parser[Predicate[E]], s *Scanner) Predicate[E] {
	preds := parseVariadicPredicate(p, s)
	if len(preds) != 1 {
		panic(errors.Newf("dsl: not accepts exactly 1 argument, given %d", len(preds)))
	}
	return not[E]{Predicate: preds[0]}
}

// parseAnd parses the arguments of an (And ...) invocation.
func parseAnd[E any](p *Parser[Predicate[E]], s *Scanner) Predicate[E] {
	return And[E](parseVariadicPredicate[E](p, s)...)
}

// parseOr parses the arguments of an (Or ...) invocation.
func parseOr[E any](p *Parser[Predicate[E]], s *Scanner) Predicate[E] {
	return Or[E](parseVariadicPredicate[E](p, s)...)
}

// parseOnIndex parses the single integer argument of an (OnIndex n)
// invocation, consuming the closing paren.
func parseOnIndex[E any](p *Parser[Predicate[E]], s *Scanner) Predicate[E] {
	i, err := strconv.ParseInt(s.Consume(token.INT).Lit, 10, 32)
	if err != nil {
		panic(err)
	}
	s.Consume(token.RPAREN)
	return OnIndex[E](int32(i))
}

// parseVariadicPredicate parses zero or more predicate expressions up to (and
// including) the closing right paren.
func parseVariadicPredicate[E any](p *Parser[Predicate[E]], s *Scanner) (ret []Predicate[E]) {
	tok := s.Scan()
	for tok.Kind == token.LPAREN || tok.Kind == token.IDENT {
		ret = append(ret, p.ParseFromPos(s, tok))
		tok = s.Scan()
	}
	assertTok(tok, token.RPAREN)
	return ret
}
diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/ewma/ewma_bytes.go b/vendor/github.com/cockroachdb/pebble/v2/internal/ewma/ewma_bytes.go
new file mode 100644
index 0000000..fd0280d
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/internal/ewma/ewma_bytes.go
@@ -0,0 +1,99 @@
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package ewma

import (
	"math"

	"github.com/cockroachdb/pebble/v2/internal/invariants"
)

// Bytes is an estimator for an arbitrary value that is sampled from byte
// blocks.
//
// Consider a stream of data which is divided into blocks of varying size. We
// want to estimate a value (like compression ratio) based on the values from
// recent blocks.
//
// Bytes implements a per-byte exponential moving average (EWMA) estimator: let
// pos_i and val_i be the position and value of each byte for which we have
// data; the estimate at position p is the weighted sum:
//
//	Sum_i val_i*(1-alpha)^(p-pos_i)
//	-------------------------------
//	Sum_i (1-alpha)^(p-pos_i)
type Bytes struct {
	// alpha is the per-byte decay rate, derived from the half-life in Init.
	alpha float64
	// sum is the weighted sum of sampled values (the numerator above).
	sum float64
	// totalWeight is the sum of the weights (the denominator above).
	totalWeight float64
	// gap accumulates unsampled bytes since the last sample; the decay they
	// imply is applied lazily on the next SampledBlock call.
	gap int64
}

// Init the estimator such that a block sampled halfLife bytes ago has half
// the weight compared to a block sampled now.
//
// Intuitively, half of the estimate comes from the values within the half-life
// window; and 75% of the estimate comes from values within 2x half-life.
func (b *Bytes) Init(halfLife int64) {
	*b = Bytes{}
	// Exact value is 1 - 2^(-1/H). The straightforward calculation suffers from
	// precision loss as H grows (we are subtracting two nearly equal numbers). We
	// use a numerically stable alternative:
	//	1 - 2^(-1/H) = 1 - e^(-ln(2)/H) = -expm1(-ln(2)/H)
	b.alpha = -math.Expm1(-math.Ln2 / float64(halfLife))
}

// Estimate returns the current estimate of the value, based on the recent
// SampledBlock() calls. Returns NaN if no blocks have been sampled yet.
func (b *Bytes) Estimate() float64 {
	// Before any samples this is 0/0, which yields NaN.
	return b.sum / b.totalWeight
}

// NoSample informs the estimator that a block of the given length was not
// sampled.
func (b *Bytes) NoSample(numBytes int64) {
	if numBytes < 0 {
		if invariants.Enabled {
			panic("invalid numBytes")
		}
		return
	}
	// It would be equivalent (but less efficient) to multiply both sum and
	// totalWeight by (1-alpha)^numBytes instead of keeping track of the gap.
	b.gap += numBytes
}

// SampledBlock informs the estimator that a block of the given length was
// sampled.
+func (b *Bytes) SampledBlock(numBytes int64, value float64) { + if numBytes < 1 { + if invariants.Enabled { + panic("invalid numBytes") + } + return + } + decay := b.decay(b.gap + numBytes) + b.sum *= decay + b.totalWeight *= decay + b.gap = 0 + + // The sum of weights for the new bytes is: + // + // 1 - (1 - alpha)^numBytes + // Sum (1 - alpha)^i = ------------------------ + // 0≤i s.capacity { + s.mu.coldTarget = s.capacity + } + + n.referenced.Store(false) + s.addNode(n, key, hot) + s.mu.sizeHot++ + } + + v := &value[V]{ + initialized: make(chan struct{}), + } + // One ref count for the shard, one for the caller. + v.refCount.Store(2) + n.value = v + s.misses.Add(1) + + s.mu.Unlock() + + vRef := ValueRef[K, V]{ + shard: s, + value: v, + } + + v.err = s.initValueFn(ctx, key, vRef) + if v.err != nil { + s.mu.Lock() + defer s.mu.Unlock() + // Lookup the node in the cache again as it might have already been + // removed. + if n := s.mu.nodes[key]; n != nil && n.value == v { + s.unlinkNode(n) + s.clearNode(n) + } + } + close(v.initialized) + return v +} + +func (s *shard[K, V]) addNode(n *node[K, V], key K, status nodeStatus) { + n.key = key + n.status = status + + s.evictNodes() + s.mu.nodes[n.key] = n + + n.links.next = n + n.links.prev = n + if s.mu.handHot == nil { + // First element. 
+ s.mu.handHot = n + s.mu.handCold = n + s.mu.handTest = n + } else { + s.mu.handHot.link(n) + } + + if s.mu.handCold == s.mu.handHot { + s.mu.handCold = s.mu.handCold.prev() + } +} + +func (s *shard[K, V]) evictNodes() { + for s.capacity <= s.mu.sizeHot+s.mu.sizeCold && s.mu.handCold != nil { + s.runHandCold() + } +} + +func (s *shard[K, V]) runHandCold() { + n := s.mu.handCold + if n.status == cold { + if n.referenced.Load() { + n.referenced.Store(false) + n.status = hot + s.mu.sizeCold-- + s.mu.sizeHot++ + } else { + s.clearNode(n) + n.status = test + s.mu.sizeCold-- + s.mu.sizeTest++ + for s.capacity < s.mu.sizeTest && s.mu.handTest != nil { + s.runHandTest() + } + } + } + + s.mu.handCold = s.mu.handCold.next() + + for s.capacity-s.mu.coldTarget <= s.mu.sizeHot && s.mu.handHot != nil { + s.runHandHot() + } +} + +func (s *shard[K, V]) runHandHot() { + if s.mu.handHot == s.mu.handTest && s.mu.handTest != nil { + s.runHandTest() + if s.mu.handHot == nil { + return + } + } + + n := s.mu.handHot + if n.status == hot { + if n.referenced.Load() { + n.referenced.Store(false) + } else { + n.status = cold + s.mu.sizeHot-- + s.mu.sizeCold++ + } + } + + s.mu.handHot = s.mu.handHot.next() +} + +func (s *shard[K, V]) runHandTest() { + if s.mu.sizeCold > 0 && s.mu.handTest == s.mu.handCold && s.mu.handCold != nil { + s.runHandCold() + if s.mu.handTest == nil { + return + } + } + + n := s.mu.handTest + if n.status == test { + s.mu.coldTarget-- + if s.mu.coldTarget < 0 { + s.mu.coldTarget = 0 + } + s.unlinkNode(n) + s.clearNode(n) + } + + s.mu.handTest = s.mu.handTest.next() +} + +// Evict any entry associated with the given key. If there is a corresponding +// value in the shard, it is released before the function returns. There must +// not be any outstanding references on the value. 
+func (s *shard[K, V]) Evict(key K) { + s.mu.Lock() + n := s.mu.nodes[key] + var v *value[V] + if n != nil { + // NB: This is equivalent to UnrefValue, but we perform the releaseValueFn() + // call synchronously below to free up any associated resources before + // returning. + s.unlinkNode(n) + v = n.value + } + s.mu.Unlock() + + if v != nil { + if v.refCount.Add(-1) != 0 { + panic("element has outstanding references") + } + <-v.initialized + if v.err == nil { + s.releaseValueFn(&v.v) + } + } +} + +// EvictAll evicts all entries in the shard with a key that satisfies the given +// predicate. Any corresponding values are released before the function returns. +// There must not be any outstanding references on the values, and no keys that +// satisfy the predicate should be inserted while the method is running. +// +// It should be used sparingly as it is an O(n) operation. +func (s *shard[K, V]) EvictAll(predicate func(K) bool) []K { + // Collect the keys which need to be evicted. + var keys []K + s.mu.RLock() + s.forAllNodesLocked(func(n *node[K, V]) { + if predicate(n.key) { + keys = append(keys, n.key) + } + }) + s.mu.RUnlock() + + for i := range keys { + s.Evict(keys[i]) + } + + if invariants.Enabled { + s.mu.RLock() + defer s.mu.RUnlock() + s.forAllNodesLocked(func(n *node[K, V]) { + if predicate(n.key) { + panic("evictable key added in shard") + } + }) + } + return keys +} + +func (s *shard[K, V]) forAllNodesLocked(f func(n *node[K, V])) { + if firstNode := s.mu.handHot; firstNode != nil { + for node := firstNode; ; { + f(node) + if node = node.next(); node == firstNode { + return + } + } + } +} + +// Close the shard, releasing all live values. There must not be any outstanding +// references on any of the values. 
+func (s *shard[K, V]) Close() { + s.mu.Lock() + defer s.mu.Unlock() + + for s.mu.handHot != nil { + n := s.mu.handHot + if v := n.value; v != nil { + if v.refCount.Add(-1) != 0 { + panic("element has outstanding references") + } + s.releasingCh <- v + } + s.unlinkNode(n) + } + + s.mu.nodes = nil + s.mu.handHot = nil + s.mu.handCold = nil + s.mu.handTest = nil + + close(s.releasingCh) + s.releaseLoopExit.Wait() +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/humanize/humanize.go b/vendor/github.com/cockroachdb/pebble/v2/internal/humanize/humanize.go similarity index 100% rename from vendor/github.com/cockroachdb/pebble/internal/humanize/humanize.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/humanize/humanize.go diff --git a/vendor/github.com/cockroachdb/pebble/internal/intern/intern.go b/vendor/github.com/cockroachdb/pebble/v2/internal/intern/intern.go similarity index 100% rename from vendor/github.com/cockroachdb/pebble/internal/intern/intern.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/intern/intern.go diff --git a/vendor/github.com/cockroachdb/pebble/internal/invalidating/iter.go b/vendor/github.com/cockroachdb/pebble/v2/internal/invalidating/iter.go similarity index 55% rename from vendor/github.com/cockroachdb/pebble/internal/invalidating/iter.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/invalidating/iter.go index 48909ec..1867241 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/invalidating/iter.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/invalidating/iter.go @@ -5,18 +5,19 @@ package invalidating import ( - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/fastrand" - "github.com/cockroachdb/pebble/internal/invariants" + "context" + "slices" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" ) // MaybeWrapIfInvariants 
wraps some iterators with an invalidating iterator. // MaybeWrapIfInvariants does nothing in non-invariant builds. func MaybeWrapIfInvariants(iter base.InternalIterator) base.InternalIterator { - if invariants.Enabled { - if fastrand.Uint32n(10) == 1 { - return NewIter(iter) - } + if invariants.Enabled && invariants.Sometimes(10) { + return NewIter(iter) } return iter } @@ -25,8 +26,7 @@ func MaybeWrapIfInvariants(iter base.InternalIterator) base.InternalIterator { // returned key/value to all 1s. type iter struct { iter base.InternalIterator - lastKey *base.InternalKey - lastValue base.LazyValue + lastKV *base.InternalKV ignoreKinds [base.InternalKeyKindMax + 1]bool err error } @@ -53,7 +53,7 @@ func IgnoreKinds(kinds ...base.InternalKeyKind) Option { // NewIter constructs a new invalidating iterator that wraps the provided // iterator, trashing buffers for previously returned keys. -func NewIter(originalIterator base.InternalIterator, opts ...Option) base.InternalIterator { +func NewIter(originalIterator base.InternalIterator, opts ...Option) base.TopLevelIterator { i := &iter{iter: originalIterator} for _, opt := range opts { opt.apply(i) @@ -61,84 +61,87 @@ func NewIter(originalIterator base.InternalIterator, opts ...Option) base.Intern return i } -func (i *iter) update( - key *base.InternalKey, value base.LazyValue, -) (*base.InternalKey, base.LazyValue) { +func (i *iter) update(kv *base.InternalKV) *base.InternalKV { i.trashLastKV() - if key == nil { - i.lastKey = nil - i.lastValue = base.LazyValue{} - return nil, base.LazyValue{} + if kv == nil { + i.lastKV = nil + return nil } - i.lastKey = &base.InternalKey{} - *i.lastKey = key.Clone() - i.lastValue = base.LazyValue{ - ValueOrHandle: append(make([]byte, 0, len(value.ValueOrHandle)), value.ValueOrHandle...), + lv := kv.LazyValue() + copiedLV := base.LazyValue{ + ValueOrHandle: slices.Clone(lv.ValueOrHandle), } - if value.Fetcher != nil { + if lv.Fetcher != nil { fetcher := new(base.LazyFetcher) - *fetcher = 
*value.Fetcher - i.lastValue.Fetcher = fetcher + *fetcher = *lv.Fetcher + copiedLV.Fetcher = fetcher + } + i.lastKV = &base.InternalKV{ + K: kv.K.Clone(), + V: base.MakeLazyValue(copiedLV), } - return i.lastKey, i.lastValue + return i.lastKV } func (i *iter) trashLastKV() { - if i.lastKey == nil { + if i.lastKV == nil { return } - if i.ignoreKinds[i.lastKey.Kind()] { + if i.ignoreKinds[i.lastKV.Kind()] { return } - if i.lastKey != nil { - for j := range i.lastKey.UserKey { - i.lastKey.UserKey[j] = 0xff + if i.lastKV != nil { + for j := range i.lastKV.K.UserKey { + i.lastKV.K.UserKey[j] = 0xff } - i.lastKey.Trailer = 0xffffffffffffffff + i.lastKV.K.Trailer = 0xffffffffffffffff } - for j := range i.lastValue.ValueOrHandle { - i.lastValue.ValueOrHandle[j] = 0xff + lv := i.lastKV.LazyValue() + for j := range lv.ValueOrHandle { + lv.ValueOrHandle[j] = 0xff } - if i.lastValue.Fetcher != nil { + if lv.Fetcher != nil { // Not all the LazyFetcher fields are visible, so we zero out the last // value's Fetcher struct entirely. 
- *i.lastValue.Fetcher = base.LazyFetcher{} + *lv.Fetcher = base.LazyFetcher{} } } -func (i *iter) SeekGE(key []byte, flags base.SeekGEFlags) (*base.InternalKey, base.LazyValue) { +func (i *iter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { return i.update(i.iter.SeekGE(key, flags)) } -func (i *iter) SeekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { +func (i *iter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV { + return i.update(i.iter.SeekPrefixGE(prefix, key, flags)) +} + +func (i *iter) SeekPrefixGEStrict(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV { return i.update(i.iter.SeekPrefixGE(prefix, key, flags)) } -func (i *iter) SeekLT(key []byte, flags base.SeekLTFlags) (*base.InternalKey, base.LazyValue) { +func (i *iter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { return i.update(i.iter.SeekLT(key, flags)) } -func (i *iter) First() (*base.InternalKey, base.LazyValue) { +func (i *iter) First() *base.InternalKV { return i.update(i.iter.First()) } -func (i *iter) Last() (*base.InternalKey, base.LazyValue) { +func (i *iter) Last() *base.InternalKV { return i.update(i.iter.Last()) } -func (i *iter) Next() (*base.InternalKey, base.LazyValue) { +func (i *iter) Next() *base.InternalKV { return i.update(i.iter.Next()) } -func (i *iter) Prev() (*base.InternalKey, base.LazyValue) { +func (i *iter) Prev() *base.InternalKV { return i.update(i.iter.Prev()) } -func (i *iter) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) { +func (i *iter) NextPrefix(succKey []byte) *base.InternalKV { return i.update(i.iter.NextPrefix(succKey)) } @@ -157,6 +160,18 @@ func (i *iter) SetBounds(lower, upper []byte) { i.iter.SetBounds(lower, upper) } +func (i *iter) SetContext(ctx context.Context) { + i.iter.SetContext(ctx) +} + +// DebugTree is part of the InternalIterator interface. 
+func (i *iter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", i, i) + if i.iter != nil { + i.iter.DebugTree(n) + } +} + func (i *iter) String() string { return i.iter.String() } diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/invariants/invariants.go b/vendor/github.com/cockroachdb/pebble/v2/internal/invariants/invariants.go new file mode 100644 index 0000000..39ef503 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/invariants/invariants.go @@ -0,0 +1,44 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package invariants + +import ( + "runtime" + + "github.com/cockroachdb/pebble/v2/internal/buildtags" +) + +// Enabled is true if we were built with the "invariants" or "race" build tags. +// +// Enabled should be used to gate invariant checks that may be expensive. It +// should not be used to unconditionally alter a code path significantly (e.g. +// wrapping an iterator - see #3678); Sometimes() should be used instead so that +// the production code path gets test coverage as well. +const Enabled = buildtags.Race || buildtags.Invariants + +// RaceEnabled is true if we were built with the "race" build tag. +const RaceEnabled = buildtags.Race + +// UseFinalizers is true if we want to use finalizers for assertions around +// object lifetime and cleanup. This happens when the invariants or tracing tags +// are set, but we exclude race builds because we historically ran into some +// finalizer-related race detector bugs. +const UseFinalizers = !buildtags.Race && (buildtags.Invariants || buildtags.Tracing) + +// SetFinalizer is a wrapper around runtime.SetFinalizer that is a no-op under +// race builds or if neither the invariants nor tracing build tags are +// specified. +// +// We exclude race builds because we historically ran into some race detector +// bugs related to finalizers. 
+// +// This function is a no-op if UseFinalizers is false and it should inline to +// nothing. However, note that it might not inline so in very hot paths it's +// best to check UseFinalizers first. +func SetFinalizer(obj, finalizer interface{}) { + if UseFinalizers { + runtime.SetFinalizer(obj, finalizer) + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/invariants/off.go b/vendor/github.com/cockroachdb/pebble/v2/internal/invariants/off.go new file mode 100644 index 0000000..dd77975 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/invariants/off.go @@ -0,0 +1,78 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build !invariants && !race + +package invariants + +// Sometimes returns true percent% of the time if invariants are Enabled (i.e. +// we were built with the "invariants" or "race" build tags). Otherwise, always +// returns false. +func Sometimes(percent int) bool { + return false +} + +// CloseChecker is used to check that objects are closed exactly once. It is +// empty and does nothing in non-invariant builds. +// +// Note that in non-invariant builds, the struct is zero-sized but it can still +// increase the size of a parent struct if it is the last field (because Go must +// allow getting a valid pointer address of the field). +type CloseChecker struct{} + +// Close panics if called twice on the same object (if we were built with the +// "invariants" or "race" build tags). +func (d *CloseChecker) Close() {} + +// AssertClosed panics in invariant builds if Close was not called. +func (d *CloseChecker) AssertClosed() {} + +// AssertNotClosed panics in invariant builds if Close was called. +func (d *CloseChecker) AssertNotClosed() {} + +// Value is a generic container for a value that should only exist in invariant +// builds. 
In non-invariant builds, storing a value is a no-op, retrieving a +// value returns the type parameter's zero value, and the Value struct takes up +// no space. +// +// Note that in non-invariant builds, the struct is zero-sized but it can still +// increase the size of a parent struct if it is the last field (because Go must +// allow getting a valid pointer address of the field). +type Value[V any] struct{} + +// Get the current value, or the zero value if invariants are disabled. +func (*Value[V]) Get() V { + var v V // zero value + return v +} + +// Set the value; no-op in non-invariant builds. +func (*Value[V]) Set(v V) {} + +// BufMangler is a utility that can be used to test that the caller doesn't use +type BufMangler struct{} + +// MaybeMangleLater returns either the given buffer or a copy of it which will +// be mangled the next time this function is called. +func (bm *BufMangler) MaybeMangleLater(buf []byte) []byte { + return buf +} + +// CheckBounds panics if the index is not in the range [0, n). No-op in +// non-invariant builds. +func CheckBounds[T Integer](i T, n T) {} + +// SafeSub returns a - b. If a < b, it panics in invariant builds and returns 0 +// in non-invariant builds. +func SafeSub[T Integer](a, b T) T { + if a < b { + return 0 + } + return a - b +} + +// Integer is a constraint that permits any integer type. +type Integer interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/invariants/on.go b/vendor/github.com/cockroachdb/pebble/v2/internal/invariants/on.go new file mode 100644 index 0000000..d894d2e --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/invariants/on.go @@ -0,0 +1,121 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +//go:build invariants || race + +package invariants + +import ( + "fmt" + "math/rand/v2" + "slices" +) + +// Sometimes returns true percent% of the time if invariants are Enabled (i.e. +// we were built with the "invariants" or "race" build tags). Otherwise, always +// returns false. +func Sometimes(percent int) bool { + return rand.Uint32N(100) < uint32(percent) +} + +// CloseChecker is used to check that objects are closed exactly once. It is +// empty and does nothing in non-invariant builds. +// +// Note that in non-invariant builds, the struct is zero-sized but it can still +// increase the size of a parent struct if it is the last field (because Go must +// allow getting a valid pointer address of the field). +type CloseChecker struct { + closed bool +} + +// Close panics if called twice on the same object (if we were built with the +// "invariants" or "race" build tags). +func (d *CloseChecker) Close() { + if d.closed { + // Note: to debug a double-close, you can add a stack field to CloseChecker + // and set it to string(debug.Stack()) in Close, then print that in this + // panic. + panic("double close") + } + d.closed = true +} + +// AssertClosed panics in invariant builds if Close was not called. +func (d *CloseChecker) AssertClosed() { + if !d.closed { + panic("not closed") + } +} + +// AssertNotClosed panics in invariant builds if Close was called. +func (d *CloseChecker) AssertNotClosed() { + if d.closed { + panic("closed") + } +} + +// Value is a generic container for a value that should only exist in invariant +// builds. In non-invariant builds, storing a value is a no-op, retrieving a +// value returns the type parameter's zero value, and the Value struct takes up +// no space. +// +// Note that in non-invariant builds, the struct is zero-sized but it can still +// increase the size of a parent struct if it is the last field (because Go must +// allow getting a valid pointer address of the field). 
+type Value[V any] struct { + v V +} + +// Get the current value, or the zero-value if invariants are disabled. +func (v *Value[V]) Get() V { + return v.v +} + +// BufMangler is a utility that can be used to test that the caller doesn't use +type BufMangler struct { + lastReturnedBuf []byte +} + +// MaybeMangleLater returns either the given buffer or a copy of it which will +// be mangled the next time this function is called. +func (bm *BufMangler) MaybeMangleLater(buf []byte) []byte { + if bm.lastReturnedBuf != nil { + for i := range bm.lastReturnedBuf { + bm.lastReturnedBuf[i] = 0xCC + } + bm.lastReturnedBuf = nil + } + if rand.Uint32N(2) == 0 { + bm.lastReturnedBuf = slices.Clone(buf) + return bm.lastReturnedBuf + } + return buf +} + +// Set the value; no-op in non-invariant builds. +func (v *Value[V]) Set(inner V) { + v.v = inner +} + +// CheckBounds panics if the index is not in the range [0, n). No-op in +// non-invariant builds. +func CheckBounds[T Integer](i T, n T) { + if i < 0 || i >= n { + panic(fmt.Sprintf("index %d out of bounds [0, %d)", i, n)) + } +} + +// SafeSub returns a - b. If a < b, it panics in invariant builds and returns 0 +// in non-invariant builds. +func SafeSub[T Integer](a, b T) T { + if a < b { + panic(fmt.Sprintf("underflow: %d - %d", a, b)) + } + return a - b +} + +// Integer is a constraint that permits any integer type. +type Integer interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/assert_iter.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/assert_iter.go new file mode 100644 index 0000000..6ea9029 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/assert_iter.go @@ -0,0 +1,190 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package keyspan + +import ( + "context" + "fmt" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +// Assert wraps an iterator which asserts that operations return sane results. +func Assert(iter FragmentIterator, cmp base.Compare) FragmentIterator { + return &assertIter{ + iter: iter, + cmp: cmp, + } +} + +// MaybeAssert potentially wraps an iterator with Assert and/or +// NewInvalidatingIter if we are in testing mode. +func MaybeAssert(iter FragmentIterator, cmp base.Compare) FragmentIterator { + if invariants.Enabled { + if invariants.Sometimes(60 /* percent */) { + iter = NewInvalidatingIter(iter) + } + if invariants.Sometimes(60 /* percent */) { + iter = Assert(iter, cmp) + } + } + return iter +} + +// AssertUserKeyBounds wraps an iterator and asserts that all spans are within +// the given bounds [lower, upper). +func AssertUserKeyBounds( + iter FragmentIterator, lower, upper []byte, cmp base.Compare, +) FragmentIterator { + return AssertBounds(iter, base.MakeSearchKey(lower), upper, cmp) +} + +// AssertBounds wraps an iterator and asserts that all spans are within the +// given bounds [lower.UserKey, upper), and that all keys in a span that starts +// exactly at lower.UserKey are >= lower. +// +// The asymmetry here is due to fragment spans having exclusive end user keys. +func AssertBounds( + iter FragmentIterator, lower base.InternalKey, upper []byte, cmp base.Compare, +) FragmentIterator { + i := &assertIter{ + iter: iter, + cmp: cmp, + } + i.checkBounds.enabled = true + i.checkBounds.lower = lower + i.checkBounds.upper = upper + return i +} + +// assertIter is a pass-through FragmentIterator wrapper which performs checks +// on what the wrapped iterator returns. +// +// It verifies that results for various operations are sane, and it optionally +// verifies that spans are within given bounds. 
+type assertIter struct { + iter FragmentIterator + cmp base.Compare + checkBounds struct { + enabled bool + lower base.InternalKey + upper []byte + } + lastSpanStart []byte + lastSpanEnd []byte +} + +var _ FragmentIterator = (*assertIter)(nil) + +func (i *assertIter) panicf(format string, args ...interface{}) { + str := fmt.Sprintf(format, args...) + panic(errors.AssertionFailedf("%s; wraps %T", str, i.iter)) +} + +func (i *assertIter) check(span *Span) { + i.lastSpanStart = i.lastSpanStart[:0] + i.lastSpanEnd = i.lastSpanEnd[:0] + if span == nil { + return + } + if i.checkBounds.enabled { + lower := i.checkBounds.lower + switch startCmp := i.cmp(span.Start, lower.UserKey); { + case startCmp < 0: + i.panicf("lower bound %q violated by span %s", lower.UserKey, span) + case startCmp == 0: + // Note: trailers are in descending order. + if len(span.Keys) > 0 && span.SmallestKey().Trailer > lower.Trailer { + i.panicf("lower bound %s violated by key %s", lower, span.SmallestKey()) + } + } + if i.cmp(span.End, i.checkBounds.upper) > 0 { + i.panicf("upper bound %q violated by span %s", i.checkBounds.upper, span) + } + } + // Save the span to check Next/Prev operations. + i.lastSpanStart = append(i.lastSpanStart, span.Start...) + i.lastSpanEnd = append(i.lastSpanEnd, span.End...) +} + +// SeekGE implements FragmentIterator. +func (i *assertIter) SeekGE(key []byte) (*Span, error) { + span, err := i.iter.SeekGE(key) + if span != nil && i.cmp(span.End, key) <= 0 { + i.panicf("incorrect SeekGE(%q) span %s", key, span) + } + i.check(span) + return span, err +} + +// SeekLT implements FragmentIterator. +func (i *assertIter) SeekLT(key []byte) (*Span, error) { + span, err := i.iter.SeekLT(key) + if span != nil && i.cmp(span.Start, key) >= 0 { + i.panicf("incorrect SeekLT(%q) span %s", key, span) + } + i.check(span) + return span, err +} + +// First implements FragmentIterator. 
+func (i *assertIter) First() (*Span, error) { + span, err := i.iter.First() + i.check(span) + return span, err +} + +// Last implements FragmentIterator. +func (i *assertIter) Last() (*Span, error) { + span, err := i.iter.Last() + i.check(span) + return span, err +} + +// Next implements FragmentIterator. +func (i *assertIter) Next() (*Span, error) { + span, err := i.iter.Next() + if span != nil && len(i.lastSpanEnd) > 0 && i.cmp(i.lastSpanEnd, span.Start) > 0 { + i.panicf("Next span %s not after last span end %q", span, i.lastSpanEnd) + } + i.check(span) + return span, err +} + +// Prev implements FragmentIterator. +func (i *assertIter) Prev() (*Span, error) { + span, err := i.iter.Prev() + if span != nil && len(i.lastSpanStart) > 0 && i.cmp(i.lastSpanStart, span.End) < 0 { + i.panicf("Prev span %s not before last span start %q", span, i.lastSpanStart) + } + i.check(span) + return span, err +} + +// SetContext is part of the FragmentIterator interface. +func (i *assertIter) SetContext(ctx context.Context) { + i.iter.SetContext(ctx) +} + +// Close implements FragmentIterator. +func (i *assertIter) Close() { + i.iter.Close() +} + +// WrapChildren implements FragmentIterator. +func (i *assertIter) WrapChildren(wrap WrapFn) { + i.iter = wrap(i.iter) +} + +// DebugTree is part of the FragmentIterator interface. 
+func (i *assertIter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", i, i) + if i.iter != nil { + i.iter.DebugTree(n) + } +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/bounded.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/bounded.go similarity index 79% rename from vendor/github.com/cockroachdb/pebble/internal/keyspan/bounded.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/bounded.go index 70dd395..03f6f3f 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/bounded.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/bounded.go @@ -4,7 +4,12 @@ package keyspan -import "github.com/cockroachdb/pebble/internal/base" +import ( + "context" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) // TODO(jackson): Consider removing this type and adding bounds enforcement // directly to the MergingIter. This type is probably too lightweight to warrant @@ -100,37 +105,37 @@ var _ FragmentIterator = (*BoundedIter)(nil) // respect the prefix bounds. // SeekGE implements FragmentIterator. -func (i *BoundedIter) SeekGE(key []byte) *Span { - s := i.iter.SeekGE(key) - s = i.checkPrefixSpanStart(s) - s = i.checkPrefixSpanEnd(s) - return i.checkForwardBound(s) +func (i *BoundedIter) SeekGE(key []byte) (*Span, error) { + s, err := i.iter.SeekGE(key) + s, err = i.checkPrefixSpanStart(s, err) + s, err = i.checkPrefixSpanEnd(s, err) + return i.checkForwardBound(s, err) } // SeekLT implements FragmentIterator. -func (i *BoundedIter) SeekLT(key []byte) *Span { - s := i.iter.SeekLT(key) - s = i.checkPrefixSpanStart(s) - s = i.checkPrefixSpanEnd(s) - return i.checkBackwardBound(s) +func (i *BoundedIter) SeekLT(key []byte) (*Span, error) { + s, err := i.iter.SeekLT(key) + s, err = i.checkPrefixSpanStart(s, err) + s, err = i.checkPrefixSpanEnd(s, err) + return i.checkBackwardBound(s, err) } // First implements FragmentIterator. 
-func (i *BoundedIter) First() *Span { - s := i.iter.First() - s = i.checkPrefixSpanStart(s) - return i.checkForwardBound(s) +func (i *BoundedIter) First() (*Span, error) { + s, err := i.iter.First() + s, err = i.checkPrefixSpanStart(s, err) + return i.checkForwardBound(s, err) } // Last implements FragmentIterator. -func (i *BoundedIter) Last() *Span { - s := i.iter.Last() - s = i.checkPrefixSpanEnd(s) - return i.checkBackwardBound(s) +func (i *BoundedIter) Last() (*Span, error) { + s, err := i.iter.Last() + s, err = i.checkPrefixSpanEnd(s, err) + return i.checkBackwardBound(s, err) } // Next implements FragmentIterator. -func (i *BoundedIter) Next() *Span { +func (i *BoundedIter) Next() (*Span, error) { switch i.pos { case posAtLowerLimit: // The BoundedIter had previously returned nil, because it knew from @@ -138,14 +143,14 @@ func (i *BoundedIter) Next() *Span { // need to return the current iter span and reset i.pos to reflect that // we're no longer positioned at the limit. i.pos = posAtIterSpan - return i.iterSpan + return i.iterSpan, nil case posAtIterSpan: // If the span at the underlying iterator position extends to or beyond the // upper bound, we can avoid advancing because the next span is necessarily // out of bounds. if i.iterSpan != nil && i.upper != nil && i.cmp(i.iterSpan.End, i.upper) >= 0 { i.pos = posAtUpperLimit - return nil + return nil, nil } // Similarly, if the span extends to the next prefix and we're in prefix // iteration mode, we can avoid advancing. @@ -153,31 +158,31 @@ func (i *BoundedIter) Next() *Span { ei := i.split(i.iterSpan.End) if i.cmp(i.iterSpan.End[:ei], *i.prefix) > 0 { i.pos = posAtUpperLimit - return nil + return nil, nil } } return i.checkForwardBound(i.checkPrefixSpanStart(i.iter.Next())) case posAtUpperLimit: // Already exhausted. - return nil + return nil, nil default: panic("unreachable") } } // Prev implements FragmentIterator. 
-func (i *BoundedIter) Prev() *Span { +func (i *BoundedIter) Prev() (*Span, error) { switch i.pos { case posAtLowerLimit: // Already exhausted. - return nil + return nil, nil case posAtIterSpan: // If the span at the underlying iterator position extends to or beyond // the lower bound, we can avoid advancing because the previous span is // necessarily out of bounds. if i.iterSpan != nil && i.lower != nil && i.cmp(i.iterSpan.Start, i.lower) <= 0 { i.pos = posAtLowerLimit - return nil + return nil, nil } // Similarly, if the span extends to or beyond the current prefix and // we're in prefix iteration mode, we can avoid advancing. @@ -185,7 +190,7 @@ func (i *BoundedIter) Prev() *Span { si := i.split(i.iterSpan.Start) if i.cmp(i.iterSpan.Start[:si], *i.prefix) < 0 { i.pos = posAtLowerLimit - return nil + return nil, nil } } return i.checkBackwardBound(i.checkPrefixSpanEnd(i.iter.Prev())) @@ -195,20 +200,20 @@ func (i *BoundedIter) Prev() *Span { // need to return the current iter span and reset i.pos to reflect that // we're no longer positioned at the limit. i.pos = posAtIterSpan - return i.iterSpan + return i.iterSpan, nil default: panic("unreachable") } } -// Error implements FragmentIterator. -func (i *BoundedIter) Error() error { - return i.iter.Error() +// Close implements FragmentIterator. +func (i *BoundedIter) Close() { + i.iter.Close() } -// Close implements FragmentIterator. -func (i *BoundedIter) Close() error { - return i.iter.Close() +// SetContext is part of the FragmentIterator interface. +func (i *BoundedIter) SetContext(ctx context.Context) { + i.iter.SetContext(ctx) } // SetBounds modifies the FragmentIterator's bounds. @@ -216,7 +221,7 @@ func (i *BoundedIter) SetBounds(lower, upper []byte) { i.lower, i.upper = lower, upper } -func (i *BoundedIter) checkPrefixSpanStart(span *Span) *Span { +func (i *BoundedIter) checkPrefixSpanStart(span *Span, err error) (*Span, error) { // Compare to the prefix's bounds, if in prefix iteration mode. 
if span != nil && i.hasPrefix != nil && *i.hasPrefix { si := i.split(span.Start) @@ -225,13 +230,13 @@ func (i *BoundedIter) checkPrefixSpanStart(span *Span) *Span { span = nil } } - return span + return span, err } // checkForwardBound enforces the upper bound, returning nil if the provided // span is wholly outside the upper bound. It also updates i.pos and i.iterSpan // to reflect the new iterator position. -func (i *BoundedIter) checkForwardBound(span *Span) *Span { +func (i *BoundedIter) checkForwardBound(span *Span, err error) (*Span, error) { // Compare to the upper bound. if span != nil && i.upper != nil && i.cmp(span.Start, i.upper) >= 0 { span = nil @@ -240,22 +245,22 @@ func (i *BoundedIter) checkForwardBound(span *Span) *Span { if i.pos != posAtIterSpan { i.pos = posAtIterSpan } - return span + return span, err } -func (i *BoundedIter) checkPrefixSpanEnd(span *Span) *Span { +func (i *BoundedIter) checkPrefixSpanEnd(span *Span, err error) (*Span, error) { // Compare to the prefix's bounds, if in prefix iteration mode. if span != nil && i.hasPrefix != nil && *i.hasPrefix && i.cmp(span.End, *i.prefix) <= 0 { // This span ends before the current prefix. span = nil } - return span + return span, err } // checkBackward enforces the lower bound, returning nil if the provided span is // wholly outside the lower bound. It also updates i.pos and i.iterSpan to // reflect the new iterator position. -func (i *BoundedIter) checkBackwardBound(span *Span) *Span { +func (i *BoundedIter) checkBackwardBound(span *Span, err error) (*Span, error) { // Compare to the lower bound. if span != nil && i.lower != nil && i.cmp(span.End, i.lower) <= 0 { span = nil @@ -264,5 +269,18 @@ func (i *BoundedIter) checkBackwardBound(span *Span) *Span { if i.pos != posAtIterSpan { i.pos = posAtIterSpan } - return span + return span, err +} + +// WrapChildren implements FragmentIterator. 
+func (i *BoundedIter) WrapChildren(wrap WrapFn) { + i.iter = wrap(i.iter) +} + +// DebugTree is part of the FragmentIterator interface. +func (i *BoundedIter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", i, i) + if i.iter != nil { + i.iter.DebugTree(n) + } } diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/defragment.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/defragment.go similarity index 77% rename from vendor/github.com/cockroachdb/pebble/internal/keyspan/defragment.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/defragment.go index d056ef0..4cac770 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/defragment.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/defragment.go @@ -6,16 +6,18 @@ package keyspan import ( "bytes" + "context" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/bytealloc" - "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/bytealloc" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" ) -// bufferReuseMaxCapacity is the maximum capacity of a DefragmentingIter buffer +// BufferReuseMaxCapacity is the maximum capacity of a DefragmentingIter buffer // that DefragmentingIter will reuse. Buffers larger than this will be // discarded and reallocated as necessary. -const bufferReuseMaxCapacity = 10 << 10 // 10 KB +const BufferReuseMaxCapacity = 10 << 10 // 10 KB // keysReuseMaxCapacity is the maximum capacity of a []keyspan.Key buffer that // DefragmentingIter will reuse. Buffers larger than this will be discarded and @@ -27,27 +29,29 @@ const keysReuseMaxCapacity = 100 type DefragmentMethod interface { // ShouldDefragment takes two abutting spans and returns whether the two // spans should be combined into a single, defragmented Span. 
- ShouldDefragment(equal base.Equal, left, right *Span) bool + ShouldDefragment(suffixCmp base.CompareRangeSuffixes, left, right *Span) bool } // The DefragmentMethodFunc type is an adapter to allow the use of ordinary // functions as DefragmentMethods. If f is a function with the appropriate // signature, DefragmentMethodFunc(f) is a DefragmentMethod that calls f. -type DefragmentMethodFunc func(equal base.Equal, left, right *Span) bool +type DefragmentMethodFunc func(suffixCmp base.CompareRangeSuffixes, left, right *Span) bool // ShouldDefragment calls f(equal, left, right). -func (f DefragmentMethodFunc) ShouldDefragment(equal base.Equal, left, right *Span) bool { - return f(equal, left, right) +func (f DefragmentMethodFunc) ShouldDefragment( + suffixCmp base.CompareRangeSuffixes, left, right *Span, +) bool { + return f(suffixCmp, left, right) } -// DefragmentInternal configures a DefragmentingIter to defragment spans -// only if they have identical keys. It requires spans' keys to be sorted in -// trailer descending order. +// DefragmentInternal configures a DefragmentingIter to defragment spans only if +// they have identical keys. It requires spans' keys to be sorted in trailer +// descending order. // // This defragmenting method is intended for use in compactions that may see // internal range keys fragments that may now be joined, because the state that // required their fragmentation has been dropped. 
-var DefragmentInternal DefragmentMethod = DefragmentMethodFunc(func(equal base.Equal, a, b *Span) bool { +var DefragmentInternal DefragmentMethod = DefragmentMethodFunc(func(suffixCmp base.CompareRangeSuffixes, a, b *Span) bool { if a.KeysOrder != ByTrailerDesc || b.KeysOrder != ByTrailerDesc { panic("pebble: span keys unexpectedly not in trailer descending order") } @@ -58,7 +62,7 @@ var DefragmentInternal DefragmentMethod = DefragmentMethodFunc(func(equal base.E if a.Keys[i].Trailer != b.Keys[i].Trailer { return false } - if !equal(a.Keys[i].Suffix, b.Keys[i].Suffix) { + if suffixCmp(a.Keys[i].Suffix, b.Keys[i].Suffix) != 0 { return false } if !bytes.Equal(a.Keys[i].Value, b.Keys[i].Value) { @@ -162,10 +166,10 @@ type DefragmentingBuffers struct { // PrepareForReuse discards any excessively large buffers. func (bufs *DefragmentingBuffers) PrepareForReuse() { - if cap(bufs.currBuf) > bufferReuseMaxCapacity { + if cap(bufs.currBuf) > BufferReuseMaxCapacity { bufs.currBuf = nil } - if cap(bufs.keyBuf) > bufferReuseMaxCapacity { + if cap(bufs.keyBuf) > BufferReuseMaxCapacity { bufs.keyBuf = nil } if cap(bufs.keysBuf) > keysReuseMaxCapacity { @@ -195,27 +199,31 @@ func (i *DefragmentingIter) Init( } } -// Error returns any accumulated error. -func (i *DefragmentingIter) Error() error { - return i.iter.Error() +// SetContext is part of the FragmentIterator interface. +func (i *DefragmentingIter) SetContext(ctx context.Context) { + i.iter.SetContext(ctx) } // Close closes the underlying iterators. -func (i *DefragmentingIter) Close() error { - return i.iter.Close() +func (i *DefragmentingIter) Close() { + i.iter.Close() } // SeekGE moves the iterator to the first span covering a key greater than or // equal to the given key. This is equivalent to seeking to the first span with // an end key greater than the given key. 
-func (i *DefragmentingIter) SeekGE(key []byte) *Span { - i.iterSpan = i.iter.SeekGE(key) - if i.iterSpan == nil { +func (i *DefragmentingIter) SeekGE(key []byte) (*Span, error) { + var err error + i.iterSpan, err = i.iter.SeekGE(key) + switch { + case err != nil: + return nil, err + case i.iterSpan == nil: i.iterPos = iterPosCurr - return nil - } else if i.iterSpan.Empty() { + return nil, nil + case i.iterSpan.Empty(): i.iterPos = iterPosCurr - return i.iterSpan + return i.iterSpan, nil } // If the span starts strictly after key, we know there mustn't be an // earlier span that ends at i.iterSpan.Start, otherwise i.iter would've @@ -227,18 +235,16 @@ func (i *DefragmentingIter) SeekGE(key []byte) *Span { // The span we landed on has a Start bound ≤ key. There may be additional // fragments before this span. Defragment backward to find the start of the // defragmented span. - i.defragmentBackward() - - // Defragmenting backward may have stopped because it encountered an error. - // If so, we must not continue so that i.iter.Error() (and thus i.Error()) - // yields the error. - if i.iterSpan == nil && i.iter.Error() != nil { - return nil + if _, err := i.defragmentBackward(); err != nil { + return nil, err } - if i.iterPos == iterPosPrev { // Next once back onto the span. - i.iterSpan = i.iter.Next() + var err error + i.iterSpan, err = i.iter.Next() + if err != nil { + return nil, err + } } // Defragment the full span from its start. return i.defragmentForward() @@ -247,14 +253,18 @@ func (i *DefragmentingIter) SeekGE(key []byte) *Span { // SeekLT moves the iterator to the last span covering a key less than the // given key. This is equivalent to seeking to the last span with a start // key less than the given key. 
-func (i *DefragmentingIter) SeekLT(key []byte) *Span { - i.iterSpan = i.iter.SeekLT(key) - if i.iterSpan == nil { +func (i *DefragmentingIter) SeekLT(key []byte) (*Span, error) { + var err error + i.iterSpan, err = i.iter.SeekLT(key) + switch { + case err != nil: + return nil, err + case i.iterSpan == nil: i.iterPos = iterPosCurr - return nil - } else if i.iterSpan.Empty() { + return nil, nil + case i.iterSpan.Empty(): i.iterPos = iterPosCurr - return i.iterSpan + return i.iterSpan, nil } // If the span ends strictly before key, we know there mustn't be a later // span that starts at i.iterSpan.End, otherwise i.iter would've returned @@ -266,45 +276,54 @@ func (i *DefragmentingIter) SeekLT(key []byte) *Span { // The span we landed on has a End bound ≥ key. There may be additional // fragments after this span. Defragment forward to find the end of the // defragmented span. - i.defragmentForward() - - // Defragmenting forward may have stopped because it encountered an error. - // If so, we must not continue so that i.iter.Error() (and thus i.Error()) - // yields the error. - if i.iterSpan == nil && i.iter.Error() != nil { - return nil + if _, err := i.defragmentForward(); err != nil { + return nil, err } if i.iterPos == iterPosNext { // Prev once back onto the span. - i.iterSpan = i.iter.Prev() + var err error + i.iterSpan, err = i.iter.Prev() + if err != nil { + return nil, err + } } // Defragment the full span from its end. return i.defragmentBackward() } // First seeks the iterator to the first span and returns it. 
-func (i *DefragmentingIter) First() *Span { - i.iterSpan = i.iter.First() - if i.iterSpan == nil { +func (i *DefragmentingIter) First() (*Span, error) { + var err error + i.iterSpan, err = i.iter.First() + switch { + case err != nil: + return nil, err + case i.iterSpan == nil: i.iterPos = iterPosCurr - return nil + return nil, nil + default: + return i.defragmentForward() } - return i.defragmentForward() } // Last seeks the iterator to the last span and returns it. -func (i *DefragmentingIter) Last() *Span { - i.iterSpan = i.iter.Last() - if i.iterSpan == nil { +func (i *DefragmentingIter) Last() (*Span, error) { + var err error + i.iterSpan, err = i.iter.Last() + switch { + case err != nil: + return nil, err + case i.iterSpan == nil: i.iterPos = iterPosCurr - return nil + return nil, nil + default: + return i.defragmentBackward() } - return i.defragmentBackward() } // Next advances to the next span and returns it. -func (i *DefragmentingIter) Next() *Span { +func (i *DefragmentingIter) Next() (*Span, error) { switch i.iterPos { case iterPosPrev: // Switching directions; The iterator is currently positioned over the @@ -319,18 +338,23 @@ func (i *DefragmentingIter) Next() *Span { // // Next once to move onto y, defragment forward to land on the first z // position. - i.iterSpan = i.iter.Next() - if invariants.Enabled && i.iterSpan == nil && i.iter.Error() == nil { + var err error + i.iterSpan, err = i.iter.Next() + if err != nil { + return nil, err + } else if i.iterSpan == nil { panic("pebble: invariant violation: no next span while switching directions") } // We're now positioned on the first span that was defragmented into the // current iterator position. Skip over the rest of the current iterator // position's constitutent fragments. In the above example, this would // land on the first 'z'. 
- i.defragmentForward() + if _, err = i.defragmentForward(); err != nil { + return nil, err + } if i.iterSpan == nil { i.iterPos = iterPosCurr - return nil + return nil, nil } // Now that we're positioned over the first of the next set of @@ -343,16 +367,18 @@ func (i *DefragmentingIter) Next() *Span { panic("pebble: invariant violation: iterPosCurr with valid iterSpan") } - i.iterSpan = i.iter.Next() + var err error + i.iterSpan, err = i.iter.Next() if i.iterSpan == nil { - return nil + // NB: err may be nil or non-nil. + return nil, err } return i.defragmentForward() case iterPosNext: // Already at the next span. if i.iterSpan == nil { i.iterPos = iterPosCurr - return nil + return nil, nil } return i.defragmentForward() default: @@ -361,13 +387,13 @@ func (i *DefragmentingIter) Next() *Span { } // Prev steps back to the previous span and returns it. -func (i *DefragmentingIter) Prev() *Span { +func (i *DefragmentingIter) Prev() (*Span, error) { switch i.iterPos { case iterPosPrev: // Already at the previous span. if i.iterSpan == nil { i.iterPos = iterPosCurr - return nil + return nil, nil } return i.defragmentBackward() case iterPosCurr: @@ -377,9 +403,11 @@ func (i *DefragmentingIter) Prev() *Span { panic("pebble: invariant violation: iterPosCurr with valid iterSpan") } - i.iterSpan = i.iter.Prev() + var err error + i.iterSpan, err = i.iter.Prev() if i.iterSpan == nil { - return nil + // NB: err may be nil or non-nil. + return nil, err } return i.defragmentBackward() case iterPosNext: @@ -395,21 +423,26 @@ func (i *DefragmentingIter) Prev() *Span { // // Prev once to move onto y, defragment backward to land on the last x // position. 
- i.iterSpan = i.iter.Prev() - if invariants.Enabled && i.iterSpan == nil && i.iter.Error() == nil { + var err error + i.iterSpan, err = i.iter.Prev() + if err != nil { + return nil, err + } else if i.iterSpan == nil { panic("pebble: invariant violation: no previous span while switching directions") } // We're now positioned on the last span that was defragmented into the // current iterator position. Skip over the rest of the current iterator // position's constitutent fragments. In the above example, this would // land on the last 'x'. - i.defragmentBackward() + if _, err = i.defragmentBackward(); err != nil { + return nil, err + } // Now that we're positioned over the last of the prev set of // fragments, defragment backward. if i.iterSpan == nil { i.iterPos = iterPosCurr - return nil + return nil, nil } return i.defragmentBackward() default: @@ -421,24 +454,25 @@ func (i *DefragmentingIter) Prev() *Span { // DefragmentMethod and ensures both spans are NOT empty; not defragmenting empty // spans is an optimization that lets us load fewer sstable blocks. func (i *DefragmentingIter) checkEqual(left, right *Span) bool { - return (!left.Empty() && !right.Empty()) && i.method.ShouldDefragment(i.equal, i.iterSpan, &i.curr) + return (!left.Empty() && !right.Empty()) && i.method.ShouldDefragment(i.comparer.CompareRangeSuffixes, i.iterSpan, &i.curr) } // defragmentForward defragments spans in the forward direction, starting from // i.iter's current position. The span at the current position must be non-nil, // but may be Empty(). -func (i *DefragmentingIter) defragmentForward() *Span { +func (i *DefragmentingIter) defragmentForward() (*Span, error) { if i.iterSpan.Empty() { // An empty span will never be equal to another span; see checkEqual for // why. To avoid loading non-empty range keys further ahead by calling Next, // return early. 
i.iterPos = iterPosCurr - return i.iterSpan + return i.iterSpan, nil } i.saveCurrent() + var err error i.iterPos = iterPosNext - i.iterSpan = i.iter.Next() + i.iterSpan, err = i.iter.Next() for i.iterSpan != nil { if !i.equal(i.curr.End, i.iterSpan.Start) { // Not a continuation. @@ -451,36 +485,32 @@ func (i *DefragmentingIter) defragmentForward() *Span { i.keyBuf = append(i.keyBuf[:0], i.iterSpan.End...) i.curr.End = i.keyBuf i.keysBuf = i.reduce(i.keysBuf, i.iterSpan.Keys) - i.iterSpan = i.iter.Next() + i.iterSpan, err = i.iter.Next() } // i.iterSpan == nil - // - // The inner iterator may return nil when it encounters an error. If there - // was an error, we don't know whether there is another span we should - // defragment or not. Return nil so that the caller knows they should check - // Error(). - if i.iter.Error() != nil { - return nil + if err != nil { + return nil, err } i.curr.Keys = i.keysBuf - return &i.curr + return &i.curr, nil } // defragmentBackward defragments spans in the backward direction, starting from // i.iter's current position. The span at the current position must be non-nil, // but may be Empty(). -func (i *DefragmentingIter) defragmentBackward() *Span { +func (i *DefragmentingIter) defragmentBackward() (*Span, error) { if i.iterSpan.Empty() { // An empty span will never be equal to another span; see checkEqual for // why. To avoid loading non-empty range keys further ahead by calling Next, // return early. i.iterPos = iterPosCurr - return i.iterSpan + return i.iterSpan, nil } i.saveCurrent() + var err error i.iterPos = iterPosPrev - i.iterSpan = i.iter.Prev() + i.iterSpan, err = i.iter.Prev() for i.iterSpan != nil { if !i.equal(i.curr.Start, i.iterSpan.End) { // Not a continuation. @@ -493,19 +523,14 @@ func (i *DefragmentingIter) defragmentBackward() *Span { i.keyBuf = append(i.keyBuf[:0], i.iterSpan.Start...) 
i.curr.Start = i.keyBuf i.keysBuf = i.reduce(i.keysBuf, i.iterSpan.Keys) - i.iterSpan = i.iter.Prev() + i.iterSpan, err = i.iter.Prev() } // i.iterSpan == nil - // - // The inner iterator may return nil when it encounters an error. If there - // was an error, we don't know whether there is another span we should - // defragment or not. Return nil so that the caller knows they should check - // Error(). - if i.iter.Error() != nil { - return nil + if err != nil { + return nil, err } i.curr.Keys = i.keysBuf - return &i.curr + return &i.curr, nil } func (i *DefragmentingIter) saveCurrent() { @@ -537,3 +562,16 @@ func (i *DefragmentingIter) saveBytes(b []byte) []byte { i.currBuf, b = i.currBuf.Copy(b) return b } + +// WrapChildren implements FragmentIterator. +func (i *DefragmentingIter) WrapChildren(wrap WrapFn) { + i.iter = wrap(i.iter) +} + +// DebugTree is part of the FragmentIterator interface. +func (i *DefragmentingIter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", i, i) + if i.iter != nil { + i.iter.DebugTree(n) + } +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/doc.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/doc.go similarity index 75% rename from vendor/github.com/cockroachdb/pebble/internal/keyspan/doc.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/doc.go index e05aad2..3805931 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/doc.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/doc.go @@ -1,4 +1,4 @@ -// Package keyspan provides facilities for sorting, fragmenting and +// Package keyspan provides general facilities for sorting, fragmenting and // iterating over spans of user keys. // // A Span represents a range of user key space with an inclusive start @@ -10,4 +10,7 @@ // are fragmented at overlapping key boundaries by the Fragmenter type. // This package's various iteration facilities require these // non-overlapping fragmented spans. 
+// +// Implementations that are specific to Pebble and use manifest types are +// in the keyspanimpl subpackage. package keyspan diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/filter.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/filter.go new file mode 100644 index 0000000..b1116f6 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/filter.go @@ -0,0 +1,150 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import ( + "context" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +// FilterFunc is a callback that allows filtering keys from a Span. The result +// is the set of keys that should be retained (using buf as a buffer). If the +// result has no keys, the span is skipped altogether. +type FilterFunc func(span *Span, buf []Key) []Key + +// filteringIter is a FragmentIterator that uses a FilterFunc to select which +// Spans from the input iterator are returned in the output. +// +// A note on Span lifetimes: as the FilterFunc reuses a Span with a mutable +// slice of Keys to reduce allocations, Spans returned by this iterator are only +// valid until the next relative or absolute positioning method is called. +type filteringIter struct { + iter FragmentIterator + filterFn FilterFunc + cmp base.Compare + + // span is a mutable Span passed to the filterFn. The filterFn is free to + // mutate this Span. The slice of Keys in the Span is reused with every call + // to the filterFn. + span Span +} + +var _ FragmentIterator = (*filteringIter)(nil) + +// Filter returns a new filteringIter that will filter the Spans from the +// provided child iterator using the provided FilterFunc. 
+func Filter(iter FragmentIterator, filter FilterFunc, cmp base.Compare) FragmentIterator { + return MaybeAssert(&filteringIter{iter: iter, filterFn: filter, cmp: cmp}, cmp) +} + +// SeekGE implements FragmentIterator. +func (i *filteringIter) SeekGE(key []byte) (*Span, error) { + s, err := i.iter.SeekGE(key) + if err != nil { + return nil, err + } + return i.filter(s, +1) +} + +// SeekLT implements FragmentIterator. +func (i *filteringIter) SeekLT(key []byte) (*Span, error) { + span, err := i.iter.SeekLT(key) + if err != nil { + return nil, err + } + return i.filter(span, -1) +} + +// First implements FragmentIterator. +func (i *filteringIter) First() (*Span, error) { + s, err := i.iter.First() + if err != nil { + return nil, err + } + return i.filter(s, +1) +} + +// Last implements FragmentIterator. +func (i *filteringIter) Last() (*Span, error) { + s, err := i.iter.Last() + if err != nil { + return nil, err + } + return i.filter(s, -1) +} + +// Next implements FragmentIterator. +func (i *filteringIter) Next() (*Span, error) { + s, err := i.iter.Next() + if err != nil { + return nil, err + } + return i.filter(s, +1) +} + +// Prev implements FragmentIterator. +func (i *filteringIter) Prev() (*Span, error) { + s, err := i.iter.Prev() + if err != nil { + return nil, err + } + return i.filter(s, -1) +} + +// SetContext is part of the FragmentIterator interface. +func (i *filteringIter) SetContext(ctx context.Context) { + i.iter.SetContext(ctx) +} + +// Close implements FragmentIterator. +func (i *filteringIter) Close() { + i.iter.Close() +} + +// filter uses the filterFn (if configured) to filter and possibly mutate the +// given Span. If the current Span is to be skipped, the iterator continues +// iterating in the given direction until it lands on a Span that should be +// returned, or the iterator becomes invalid. 
+func (i *filteringIter) filter(span *Span, dir int8) (*Span, error) { + if i.filterFn == nil { + return span, nil + } + var err error + for span != nil { + keys := i.filterFn(span, i.span.Keys[:0]) + if len(keys) > 0 { + i.span = Span{ + Start: span.Start, + End: span.End, + Keys: keys, + KeysOrder: span.KeysOrder, + } + return &i.span, nil + } + + if dir == +1 { + span, err = i.iter.Next() + } else { + span, err = i.iter.Prev() + } + } + // NB: err may be nil or non-nil. + return span, err +} + +// WrapChildren implements FragmentIterator. +func (i *filteringIter) WrapChildren(wrap WrapFn) { + i.iter = wrap(i.iter) +} + +// DebugTree is part of the FragmentIterator interface. +func (i *filteringIter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", i, i) + if i.iter != nil { + i.iter.DebugTree(n) + } +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/fragmenter.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/fragmenter.go similarity index 62% rename from vendor/github.com/cockroachdb/pebble/internal/keyspan/fragmenter.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/fragmenter.go index d4a410d..bd0f596 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/fragmenter.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/fragmenter.go @@ -6,60 +6,11 @@ package keyspan import ( "fmt" - "sort" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" ) -type spansByStartKey struct { - cmp base.Compare - buf []Span -} - -func (v *spansByStartKey) Len() int { return len(v.buf) } -func (v *spansByStartKey) Less(i, j int) bool { - return v.cmp(v.buf[i].Start, v.buf[j].Start) < 0 -} -func (v *spansByStartKey) Swap(i, j int) { - v.buf[i], v.buf[j] = v.buf[j], v.buf[i] -} - -type spansByEndKey struct { - cmp base.Compare - buf []Span -} 
- -func (v *spansByEndKey) Len() int { return len(v.buf) } -func (v *spansByEndKey) Less(i, j int) bool { - return v.cmp(v.buf[i].End, v.buf[j].End) < 0 -} -func (v *spansByEndKey) Swap(i, j int) { - v.buf[i], v.buf[j] = v.buf[j], v.buf[i] -} - -// keysBySeqNumKind sorts spans by the start key's sequence number in -// descending order. If two spans have equal sequence number, they're compared -// by key kind in descending order. This ordering matches the ordering of -// base.InternalCompare among keys with matching user keys. -type keysBySeqNumKind []Key - -func (v *keysBySeqNumKind) Len() int { return len(*v) } -func (v *keysBySeqNumKind) Less(i, j int) bool { return (*v)[i].Trailer > (*v)[j].Trailer } -func (v *keysBySeqNumKind) Swap(i, j int) { (*v)[i], (*v)[j] = (*v)[j], (*v)[i] } - -// Sort the spans by start key. This is the ordering required by the -// Fragmenter. Usually spans are naturally sorted by their start key, -// but that isn't true for range deletion tombstones in the legacy -// range-del-v1 block format. -func Sort(cmp base.Compare, spans []Span) { - sorter := spansByStartKey{ - cmp: cmp, - buf: spans, - } - sort.Sort(&sorter) -} - // Fragmenter fragments a set of spans such that overlapping spans are // split at their overlap points. The fragmented spans are output to the // supplied Output function. @@ -80,10 +31,8 @@ type Fragmenter struct { // specific key (e.g. TruncateAndFlushTo). It is cached in the Fragmenter to // allow reuse. doneBuf []Span - // sortBuf is used to sort fragments by end key when flushing. - sortBuf spansByEndKey // flushBuf is used to sort keys by (seqnum,kind) before emitting. - flushBuf keysBySeqNumKind + flushBuf []Key // flushedKey is the key that fragments have been flushed up to. Any // additional spans added to the fragmenter must have a start key >= // flushedKey. A nil value indicates flushedKey has not been set. 
@@ -184,7 +133,7 @@ func (f *Fragmenter) checkInvariants(buf []Span) { // stability, typically callers only need to perform a shallow clone of the Span // before Add-ing it to the fragmenter. // -// Add requires the provided span's keys are sorted in Trailer descending order. +// Add requires the provided span's keys are sorted in InternalKeyTrailer descending order. func (f *Fragmenter) Add(s Span) { if f.finished { panic("pebble: span fragmenter already finished") @@ -229,126 +178,11 @@ func (f *Fragmenter) Add(s Span) { f.pending = append(f.pending, s) } -// Cover is returned by Framenter.Covers and describes a span's relationship to -// a key at a particular snapshot. -type Cover int8 - -const ( - // NoCover indicates the tested key does not fall within the span's bounds, - // or the span contains no keys with sequence numbers higher than the key's. - NoCover Cover = iota - // CoversInvisibly indicates the tested key does fall within the span's - // bounds and the span contains at least one key with a higher sequence - // number, but none visible at the provided snapshot. - CoversInvisibly - // CoversVisibly indicates the tested key does fall within the span's - // bounds, and the span constains at least one key with a sequence number - // higher than the key's sequence number that is visible at the provided - // snapshot. - CoversVisibly -) - -// Covers returns an enum indicating whether the specified key is covered by one -// of the pending keys. The provided key must be consistent with the ordering of -// the spans. That is, it is invalid to specify a key here that is out of order -// with the span start keys passed to Add. 
-func (f *Fragmenter) Covers(key base.InternalKey, snapshot uint64) Cover { - if f.finished { - panic("pebble: span fragmenter already finished") - } - if len(f.pending) == 0 { - return NoCover - } - - if f.Cmp(f.pending[0].Start, key.UserKey) > 0 { - panic(fmt.Sprintf("pebble: keys must be in order: %s > %s", - f.Format(f.pending[0].Start), key.Pretty(f.Format))) - } - - cover := NoCover - seqNum := key.SeqNum() - for _, s := range f.pending { - if f.Cmp(key.UserKey, s.End) < 0 { - // NB: A range deletion tombstone does not delete a point operation - // at the same sequence number, and broadly a span is not considered - // to cover a point operation at the same sequence number. - - for i := range s.Keys { - if kseq := s.Keys[i].SeqNum(); kseq > seqNum { - // This key from the span has a higher sequence number than - // `key`. It covers `key`, although the span's key might not - // be visible if its snapshot is too high. - // - // Batch keys are always be visible. - if kseq < snapshot || kseq&base.InternalKeySeqNumBatch != 0 { - return CoversVisibly - } - // s.Keys[i] is not visible. - cover = CoversInvisibly - } - } - } - } - return cover -} - // Empty returns true if all fragments added so far have finished flushing. func (f *Fragmenter) Empty() bool { return f.finished || len(f.pending) == 0 } -// TruncateAndFlushTo flushes all of the fragments with a start key <= key, -// truncating spans to the specified end key. Used during compaction to force -// emitting of spans which straddle an sstable boundary. Consider -// the scenario: -// -// a---------k#10 -// f#8 -// f#7 -// -// Let's say the next user key after f is g. Calling TruncateAndFlushTo(g) will -// flush this span: -// -// a-------g#10 -// f#8 -// f#7 -// -// And leave this one in f.pending: -// -// g----k#10 -// -// WARNING: The fragmenter could hold on to the specified end key. Ensure it's -// a safe byte slice that could outlast the current sstable output, and one -// that will never be modified. 
-func (f *Fragmenter) TruncateAndFlushTo(key []byte) { - if f.finished { - panic("pebble: span fragmenter already finished") - } - if f.flushedKey != nil { - switch c := f.Cmp(key, f.flushedKey); { - case c < 0: - panic(fmt.Sprintf("pebble: start key (%s) < flushed key (%s)", - f.Format(key), f.Format(f.flushedKey))) - } - } - if invariants.RaceEnabled { - f.checkInvariants(f.pending) - defer func() { f.checkInvariants(f.pending) }() - } - if len(f.pending) > 0 { - // Since all of the pending spans have the same start key, we only need - // to compare against the first one. - switch c := f.Cmp(f.pending[0].Start, key); { - case c > 0: - panic(fmt.Sprintf("pebble: keys must be added in order: %s > %s", - f.Format(f.pending[0].Start), f.Format(key))) - case c == 0: - return - } - } - f.truncateAndFlush(key) -} - // Start returns the start key of the first span in the pending buffer, or nil // if there are no pending spans. The start key of all pending spans is the same // as that of the first one. @@ -359,6 +193,14 @@ func (f *Fragmenter) Start() []byte { return nil } +// Truncate truncates all pending spans up to key (exclusive), flushes them, and +// retains any spans that continue onward for future flushes. +func (f *Fragmenter) Truncate(key []byte) { + if len(f.pending) > 0 { + f.truncateAndFlush(key) + } +} + // Flushes all pending spans up to key (exclusive). // // WARNING: The specified key is stored without making a copy, so all callers @@ -420,9 +262,7 @@ func (f *Fragmenter) flush(buf []Span, lastKey []byte) { // Sort the spans by end key. This will allow us to walk over the spans and // easily determine the next split point (the smallest end-key). - f.sortBuf.cmp = f.Cmp - f.sortBuf.buf = buf - sort.Sort(&f.sortBuf) + SortSpansByEndKey(f.Cmp, buf) // Loop over the spans, splitting by end key. for len(buf) > 0 { @@ -439,7 +279,7 @@ func (f *Fragmenter) flush(buf []Span, lastKey []byte) { f.flushBuf = append(f.flushBuf, buf[i].Keys...) 
} - sort.Sort(&f.flushBuf) + SortKeysByTrailer(f.flushBuf) f.Emit(Span{ Start: buf[0].Start, @@ -451,7 +291,7 @@ func (f *Fragmenter) flush(buf []Span, lastKey []byte) { // indefinitely. // // Eventually, we should be able to replace the fragmenter with the - // keyspan.MergingIter which will perform just-in-time + // keyspanimpl.MergingIter which will perform just-in-time // fragmentation, and only guaranteeing the memory lifetime for the // current span. The MergingIter fragments while only needing to // access one Span per level. It only accesses the Span at the diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/get.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/get.go new file mode 100644 index 0000000..26f1e11 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/get.go @@ -0,0 +1,28 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import "github.com/cockroachdb/pebble/v2/internal/base" + +// Get returns the newest span that contains the target key. If no span contains +// the target key, an empty span is returned. The iterator must contain +// fragmented spans: no span may overlap another. +// +// If an error occurs while seeking iter, a nil span and non-nil error is +// returned. +func Get(cmp base.Compare, iter FragmentIterator, key []byte) (*Span, error) { + // NB: FragmentIterator.SeekGE moves the iterator to the first span covering + // a key greater than or equal to the given key. This is equivalent to + // seeking to the first span with an end key greater than the given key. 
+ iterSpan, err := iter.SeekGE(key) + switch { + case err != nil: + return nil, err + case iterSpan != nil && cmp(iterSpan.Start, key) > 0: + return nil, nil + default: + return iterSpan, nil + } +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/interleaving_iter.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/interleaving_iter.go similarity index 81% rename from vendor/github.com/cockroachdb/pebble/internal/keyspan/interleaving_iter.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/interleaving_iter.go index 6419821..f913f87 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/interleaving_iter.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/interleaving_iter.go @@ -5,11 +5,13 @@ package keyspan import ( + "context" "fmt" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" ) // A SpanMask may be used to configure an interleaving iterator to skip point @@ -86,6 +88,11 @@ type SpanMask interface { // InterleavedIter does not interleave synthetic markers for spans that do not // contain any keys. // +// When InterleavingIterOpts.InterleaveEndKeys is set, in addition to +// interleaving start keys, the interleaving iterator will interleave end +// boundary keys (also at the maximumal sequence number). At these end boundary +// positions, Span() will return the span to which the end boundary belongs. +// // # SpanMask // // InterelavingIter takes a SpanMask parameter that may be used to configure the @@ -97,10 +104,7 @@ type InterleavingIter struct { comparer *base.Comparer pointIter base.InternalIterator keyspanIter FragmentIterator - mask SpanMask - - // lower and upper hold the iteration bounds set through SetBounds. 
- lower, upper []byte + opts InterleavingIterOpts // keyBuf is used to copy SeekGE or SeekPrefixGE arguments when they're used // to truncate a span. The byte slices backing a SeekGE/SeekPrefixGE search // keys can come directly from the end user, so they're copied into keyBuf @@ -110,8 +114,7 @@ type InterleavingIter struct { // upper bound of the returned spans. SeekPrefixGE truncates the returned // spans to an upper bound of the seeked prefix's immediate successor. nextPrefixBuf []byte - pointKey *base.InternalKey - pointVal base.LazyValue + pointKV *base.InternalKV // err holds an iterator error from either pointIter or keyspanIter. It's // reset to nil on seeks. An overview of error-handling mechanics: // @@ -142,7 +145,7 @@ type InterleavingIter struct { span *Span // spanMarker holds the synthetic key that is returned when the iterator // passes over a key span's start bound. - spanMarker base.InternalKey + spanMarker base.InternalKV // truncated indicates whether or not the span at the current position // needed to be truncated. If it did, truncatedSpan holds the truncated // span that should be returned. @@ -193,6 +196,10 @@ var _ base.InternalIterator = &InterleavingIter{} type InterleavingIterOpts struct { Mask SpanMask LowerBound, UpperBound []byte + // InterleaveEndKeys configures the interleaving iterator to interleave the + // end keys of spans (in addition to the start keys, which are always + // interleaved). 
+ InterleaveEndKeys bool } // Init initializes the InterleavingIter to interleave point keys from pointIter @@ -206,14 +213,15 @@ func (i *InterleavingIter) Init( keyspanIter FragmentIterator, opts InterleavingIterOpts, ) { + keyspanIter = MaybeAssert(keyspanIter, comparer.Compare) + // To debug: + // keyspanIter = InjectLogging(keyspanIter, base.DefaultLogger) *i = InterleavingIter{ cmp: comparer.Compare, comparer: comparer, pointIter: pointIter, keyspanIter: keyspanIter, - mask: opts.Mask, - lower: opts.LowerBound, - upper: opts.UpperBound, + opts: opts, } } @@ -228,12 +236,12 @@ func (i *InterleavingIter) Init( // It allows for seeding the iterator with the current position of the point // iterator. func (i *InterleavingIter) InitSeekGE( - prefix, key []byte, pointKey *base.InternalKey, pointValue base.LazyValue, -) (*base.InternalKey, base.LazyValue) { + prefix, key []byte, pointKV *base.InternalKV, +) *base.InternalKV { i.dir = +1 i.clearMask() i.prefix = prefix - i.savePoint(pointKey, pointValue) + i.savePoint(pointKV) // NB: This keyspanSeekGE call will truncate the span to the seek key if // necessary. This truncation is important for cases where a switch to // combined iteration is made during a user-initiated SeekGE. @@ -252,15 +260,13 @@ func (i *InterleavingIter) InitSeekGE( // This method is used specifically for lazily constructing combined iterators. // It allows for seeding the iterator with the current position of the point // iterator. 
-func (i *InterleavingIter) InitSeekLT( - key []byte, pointKey *base.InternalKey, pointValue base.LazyValue, -) (*base.InternalKey, base.LazyValue) { +func (i *InterleavingIter) InitSeekLT(key []byte, pointKV *base.InternalKV) *base.InternalKV { i.dir = -1 i.clearMask() - i.savePoint(pointKey, pointValue) + i.savePoint(pointKV) i.keyspanSeekLT(key) i.computeLargestPos() - return i.yieldPosition(i.lower, i.prevPos) + return i.yieldPosition(i.opts.LowerBound, i.prevPos) } // SeekGE implements (base.InternalIterator).SeekGE. @@ -274,9 +280,7 @@ func (i *InterleavingIter) InitSeekLT( // NB: In accordance with the base.InternalIterator contract: // // i.lower ≤ key -func (i *InterleavingIter) SeekGE( - key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { +func (i *InterleavingIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { i.err = nil i.clearMask() i.disablePrefixMode() @@ -288,7 +292,7 @@ func (i *InterleavingIter) SeekGE( if i.span != nil && i.cmp(key, i.span.End) < 0 && i.cmp(key, i.span.Start) >= 0 { // We're seeking within the existing span's bounds. We still might need // truncate the span to the iterator's bounds. - i.saveSpanForward(i.span) + i.saveSpanForward(i.span, nil) i.savedKeyspan() } else { i.keyspanSeekGE(key, nil /* prefix */) @@ -312,7 +316,7 @@ func (i *InterleavingIter) SeekGE( // i.lower ≤ key func (i *InterleavingIter) SeekPrefixGE( prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { +) *base.InternalKV { i.err = nil i.clearMask() i.prefix = prefix @@ -345,7 +349,7 @@ func (i *InterleavingIter) SeekPrefixGE( if ei := i.comparer.Split(i.span.End); i.cmp(prefix, i.span.End[:ei]) < 0 { // We're seeking within the existing span's bounds. We still might need // truncate the span to the iterator's bounds. 
- i.saveSpanForward(i.span) + i.saveSpanForward(i.span, nil) i.savedKeyspan() seekKeyspanIter = false } @@ -360,9 +364,7 @@ func (i *InterleavingIter) SeekPrefixGE( } // SeekLT implements (base.InternalIterator).SeekLT. -func (i *InterleavingIter) SeekLT( - key []byte, flags base.SeekLTFlags, -) (*base.InternalKey, base.LazyValue) { +func (i *InterleavingIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { i.err = nil i.clearMask() i.disablePrefixMode() @@ -374,7 +376,7 @@ func (i *InterleavingIter) SeekLT( if i.span != nil && i.cmp(key, i.span.Start) > 0 && i.cmp(key, i.span.End) < 0 { // We're seeking within the existing span's bounds. We still might need // truncate the span to the iterator's bounds. - i.saveSpanBackward(i.span) + i.saveSpanBackward(i.span, nil) // The span's start key is still not guaranteed to be less than key, // because of the bounds enforcement. Consider the following example: // @@ -397,11 +399,11 @@ func (i *InterleavingIter) SeekLT( i.dir = -1 i.computeLargestPos() - return i.yieldPosition(i.lower, i.prevPos) + return i.yieldPosition(i.opts.LowerBound, i.prevPos) } // First implements (base.InternalIterator).First. -func (i *InterleavingIter) First() (*base.InternalKey, base.LazyValue) { +func (i *InterleavingIter) First() *base.InternalKV { i.err = nil i.clearMask() i.disablePrefixMode() @@ -410,11 +412,11 @@ func (i *InterleavingIter) First() (*base.InternalKey, base.LazyValue) { i.savedKeyspan() i.dir = +1 i.computeSmallestPos() - return i.yieldPosition(i.lower, i.nextPos) + return i.yieldPosition(i.opts.LowerBound, i.nextPos) } // Last implements (base.InternalIterator).Last. 
-func (i *InterleavingIter) Last() (*base.InternalKey, base.LazyValue) { +func (i *InterleavingIter) Last() *base.InternalKV { i.err = nil i.clearMask() i.disablePrefixMode() @@ -423,16 +425,16 @@ func (i *InterleavingIter) Last() (*base.InternalKey, base.LazyValue) { i.savedKeyspan() i.dir = -1 i.computeLargestPos() - return i.yieldPosition(i.lower, i.prevPos) + return i.yieldPosition(i.opts.LowerBound, i.prevPos) } // Next implements (base.InternalIterator).Next. -func (i *InterleavingIter) Next() (*base.InternalKey, base.LazyValue) { +func (i *InterleavingIter) Next() *base.InternalKV { if i.dir == -1 { // Switching directions. i.dir = +1 - if i.mask != nil { + if i.opts.Mask != nil { // Clear the mask while we reposition the point iterator. While // switching directions, we may move the point iterator outside of // i.span's bounds. @@ -464,32 +466,34 @@ func (i *InterleavingIter) Next() (*base.InternalKey, base.LazyValue) { // Since we're positioned on a Span, the pointIter is positioned // entirely behind the current iterator position. Reposition it // ahead of the current iterator position. - i.savePoint(i.pointIter.Next()) + i.switchPointIteratorIntoForward() case posKeyspanEnd: // Since we're positioned on a Span, the pointIter is positioned // entirely behind of the current iterator position. Reposition it // ahead the current iterator position. - i.savePoint(i.pointIter.Next()) + i.switchPointIteratorIntoForward() } // Fallthrough to calling i.nextPos. } i.nextPos() - return i.yieldPosition(i.lower, i.nextPos) + return i.yieldPosition(i.opts.LowerBound, i.nextPos) } // NextPrefix implements (base.InternalIterator).NextPrefix. -func (i *InterleavingIter) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) { +// +// Calling NextPrefix while positioned at a span boundary is prohibited. 
+func (i *InterleavingIter) NextPrefix(succKey []byte) *base.InternalKV { if i.dir == -1 { panic("pebble: cannot switch directions with NextPrefix") } switch i.pos { case posExhausted: - return nil, base.LazyValue{} + return nil case posPointKey: i.savePoint(i.pointIter.NextPrefix(succKey)) if i.withinSpan { - if i.pointKey == nil || i.cmp(i.span.End, i.pointKey.UserKey) <= 0 { + if i.pointKV == nil || i.cmp(i.span.End, i.pointKV.K.UserKey) <= 0 { i.pos = posKeyspanEnd } else { i.pos = posPointKey @@ -498,18 +502,18 @@ func (i *InterleavingIter) NextPrefix(succKey []byte) (*base.InternalKey, base.L i.computeSmallestPos() } case posKeyspanStart, posKeyspanEnd: - i.nextPos() + panic(errors.AssertionFailedf("NextPrefix called while positioned on a span boundary")) } - return i.yieldPosition(i.lower, i.nextPos) + return i.yieldPosition(i.opts.LowerBound, i.nextPos) } // Prev implements (base.InternalIterator).Prev. -func (i *InterleavingIter) Prev() (*base.InternalKey, base.LazyValue) { +func (i *InterleavingIter) Prev() *base.InternalKV { if i.dir == +1 { // Switching directions. i.dir = -1 - if i.mask != nil { + if i.opts.Mask != nil { // Clear the mask while we reposition the point iterator. While // switching directions, we may move the point iterator outside of // i.span's bounds. @@ -540,7 +544,7 @@ func (i *InterleavingIter) Prev() (*base.InternalKey, base.LazyValue) { // Since we're positioned on a Span, the pointIter is positioned // entirely ahead of the current iterator position. Reposition it // behind the current iterator position. - i.savePoint(i.pointIter.Prev()) + i.switchPointIteratorIntoReverse() // Without considering truncation of spans to seek keys, the keyspan // iterator is already in the right place. 
But consider span [a, z) // and this sequence of iterator calls: @@ -562,7 +566,7 @@ func (i *InterleavingIter) Prev() (*base.InternalKey, base.LazyValue) { // Since we're positioned on a Span, the pointIter is positioned // entirely ahead of the current iterator position. Reposition it // behind the current iterator position. - i.savePoint(i.pointIter.Prev()) + i.switchPointIteratorIntoReverse() } if i.spanMarkerTruncated { @@ -572,7 +576,7 @@ func (i *InterleavingIter) Prev() (*base.InternalKey, base.LazyValue) { // Fallthrough to calling i.prevPos. } i.prevPos() - return i.yieldPosition(i.lower, i.prevPos) + return i.yieldPosition(i.opts.LowerBound, i.prevPos) } // computeSmallestPos sets i.{pos,withinSpan} to: @@ -580,13 +584,13 @@ func (i *InterleavingIter) Prev() (*base.InternalKey, base.LazyValue) { // MIN(i.pointKey, i.span.Start) func (i *InterleavingIter) computeSmallestPos() { if i.err == nil { - if i.span != nil && (i.pointKey == nil || i.cmp(i.startKey(), i.pointKey.UserKey) <= 0) { + if i.span != nil && (i.pointKV == nil || i.cmp(i.startKey(), i.pointKV.K.UserKey) <= 0) { i.withinSpan = true i.pos = posKeyspanStart return } i.withinSpan = false - if i.pointKey != nil { + if i.pointKV != nil { i.pos = posPointKey return } @@ -599,13 +603,13 @@ func (i *InterleavingIter) computeSmallestPos() { // MAX(i.pointKey, i.span.End) func (i *InterleavingIter) computeLargestPos() { if i.err == nil { - if i.span != nil && (i.pointKey == nil || i.cmp(i.span.End, i.pointKey.UserKey) > 0) { + if i.span != nil && (i.pointKV == nil || i.cmp(i.span.End, i.pointKV.K.UserKey) > 0) { i.withinSpan = true i.pos = posKeyspanEnd return } i.withinSpan = false - if i.pointKey != nil { + if i.pointKV != nil { i.pos = posPointKey return } @@ -634,7 +638,7 @@ func (i *InterleavingIter) nextPos() { switch i.pos { case posExhausted: - i.savePoint(i.pointIter.Next()) + i.switchPointIteratorIntoForward() i.saveSpanForward(i.keyspanIter.Next()) i.savedKeyspan() i.computeSmallestPos() @@ 
-657,13 +661,13 @@ func (i *InterleavingIter) nextPos() { switch { case i.span == nil: panic("i.withinSpan=true and i.span=nil") - case i.pointKey == nil: + case i.pointKV == nil: // Since i.withinSpan=true, we step onto the end boundary of the // keyspan. i.pos = posKeyspanEnd default: - // i.withinSpan && i.pointKey != nil && i.span != nil - if i.cmp(i.span.End, i.pointKey.UserKey) <= 0 { + // i.withinSpan && i.pointKV != nil && i.span != nil + if i.cmp(i.span.End, i.pointKV.K.UserKey) <= 0 { i.pos = posKeyspanEnd } else { i.pos = posPointKey @@ -671,7 +675,7 @@ func (i *InterleavingIter) nextPos() { } case posKeyspanStart: // Either a point key or the span's end key comes next. - if i.pointKey != nil && i.cmp(i.pointKey.UserKey, i.span.End) < 0 { + if i.pointKV != nil && i.cmp(i.pointKV.K.UserKey, i.span.End) < 0 { i.pos = posPointKey } else { i.pos = posKeyspanEnd @@ -706,7 +710,7 @@ func (i *InterleavingIter) prevPos() { switch i.pos { case posExhausted: - i.savePoint(i.pointIter.Prev()) + i.switchPointIteratorIntoReverse() i.saveSpanBackward(i.keyspanIter.Prev()) i.savedKeyspan() i.computeLargestPos() @@ -726,11 +730,11 @@ func (i *InterleavingIter) prevPos() { switch { case i.span == nil: panic("withinSpan=true, but i.span == nil") - case i.pointKey == nil: - i.pos = posKeyspanEnd + case i.pointKV == nil: + i.pos = posKeyspanStart default: // i.withinSpan && i.pointKey != nil && i.span != nil - if i.cmp(i.span.Start, i.pointKey.UserKey) > 0 { + if i.cmp(i.span.Start, i.pointKV.K.UserKey) > 0 { i.pos = posKeyspanStart } else { i.pos = posPointKey @@ -742,7 +746,7 @@ func (i *InterleavingIter) prevPos() { i.computeLargestPos() case posKeyspanEnd: // Either a point key or the span's start key is previous. 
- if i.pointKey != nil && i.cmp(i.pointKey.UserKey, i.span.Start) >= 0 { + if i.pointKV != nil && i.cmp(i.pointKV.K.UserKey, i.span.Start) >= 0 { i.pos = posPointKey } else { i.pos = posKeyspanStart @@ -752,9 +756,7 @@ func (i *InterleavingIter) prevPos() { } } -func (i *InterleavingIter) yieldPosition( - lowerBound []byte, advance func(), -) (*base.InternalKey, base.LazyValue) { +func (i *InterleavingIter) yieldPosition(lowerBound []byte, advance func()) *base.InternalKV { // This loop returns the first visible position in the current iteration // direction. Some positions are not visible and skipped. For example, if // masking is enabled and the iterator is positioned over a masked point @@ -767,13 +769,13 @@ func (i *InterleavingIter) yieldPosition( case posExhausted: return i.yieldNil() case posPointKey: - if i.pointKey == nil { - panic("i.pointKey is nil") + if i.pointKV == nil { + panic("i.pointKV is nil") } - if i.mask != nil { + if i.opts.Mask != nil { i.maybeUpdateMask() - if i.withinSpan && i.mask.SkipPoint(i.pointKey.UserKey) { + if i.withinSpan && i.opts.Mask.SkipPoint(i.pointKV.K.UserKey) { // The span covers the point key. If a SkipPoint hook is // configured, ask it if we should skip this point key. if i.prefix != nil { @@ -802,16 +804,19 @@ func (i *InterleavingIter) yieldPosition( } return i.yieldPointKey() case posKeyspanEnd: - // Don't interleave end keys; just advance. - advance() - continue + if !i.opts.InterleaveEndKeys { + // Don't interleave end keys; just advance. + advance() + continue + } + return i.yieldSyntheticSpanEndMarker() case posKeyspanStart: // Don't interleave an empty span. 
if i.span.Empty() { advance() continue } - return i.yieldSyntheticSpanMarker(lowerBound) + return i.yieldSyntheticSpanStartMarker(lowerBound) default: panic(fmt.Sprintf("unexpected interleavePos=%d", i.pos)) } @@ -844,23 +849,40 @@ func (i *InterleavingIter) keyspanSeekLT(k []byte) { i.savedKeyspan() } -func (i *InterleavingIter) saveSpanForward(span *Span) { +// switchPointIteratorIntoReverse switches the direction of the point iterator +// into reverse, stepping to the previous point key. If the point iterator is +// exhausted in the forward direction and there's an upper bound present, it's +// re-seeked to ensure the iterator obeys the upper bound. +func (i *InterleavingIter) switchPointIteratorIntoReverse() { + if i.pointKV == nil && i.opts.UpperBound != nil { + i.savePoint(i.pointIter.SeekLT(i.opts.UpperBound, base.SeekLTFlagsNone)) + return + } + i.savePoint(i.pointIter.Prev()) +} + +// switchPointIteratorIntoForward switches the direction of the point iterator +// into the forward direction, stepping to the next point key. If the point +// iterator is exhausted in the reverse direction and there's a lower bound +// present, it's re-seeked to ensure the iterator obeys the lower bound. +func (i *InterleavingIter) switchPointIteratorIntoForward() { + if i.pointKV == nil && i.opts.LowerBound != nil { + i.savePoint(i.pointIter.SeekGE(i.opts.LowerBound, base.SeekGEFlagsNone)) + return + } + i.savePoint(i.pointIter.Next()) +} + +func (i *InterleavingIter) saveSpanForward(span *Span, err error) { i.span = span + i.err = firstError(i.err, err) i.truncated = false i.truncatedSpan = Span{} if i.span == nil { - i.err = firstError(i.err, i.keyspanIter.Error()) return } - if invariants.Enabled { - if err := i.keyspanIter.Error(); err != nil { - panic(errors.WithSecondaryError( - errors.AssertionFailedf("pebble: %T keyspan iterator returned non-nil span %s while iter has error", i.keyspanIter, i.span), - err)) - } - } // Check the upper bound if we have one. 
- if i.upper != nil && i.cmp(i.span.Start, i.upper) >= 0 { + if i.opts.UpperBound != nil && i.cmp(i.span.Start, i.opts.UpperBound) >= 0 { i.span = nil return } @@ -873,17 +895,17 @@ func (i *InterleavingIter) saveSpanForward(span *Span) { // NB: These truncations don't require setting `keyspanMarkerTruncated`: // That flag only applies to truncated span marker keys. - if i.lower != nil && i.cmp(i.span.Start, i.lower) < 0 { + if i.opts.LowerBound != nil && i.cmp(i.span.Start, i.opts.LowerBound) < 0 { i.truncated = true i.truncatedSpan = *i.span - i.truncatedSpan.Start = i.lower + i.truncatedSpan.Start = i.opts.LowerBound } - if i.upper != nil && i.cmp(i.upper, i.span.End) < 0 { + if i.opts.UpperBound != nil && i.cmp(i.opts.UpperBound, i.span.End) < 0 { if !i.truncated { i.truncated = true i.truncatedSpan = *i.span } - i.truncatedSpan.End = i.upper + i.truncatedSpan.End = i.opts.UpperBound } // If this is a part of a SeekPrefixGE call, we may also need to truncate to // the prefix's bounds. @@ -906,24 +928,17 @@ func (i *InterleavingIter) saveSpanForward(span *Span) { } } -func (i *InterleavingIter) saveSpanBackward(span *Span) { +func (i *InterleavingIter) saveSpanBackward(span *Span, err error) { i.span = span + i.err = firstError(i.err, err) i.truncated = false i.truncatedSpan = Span{} if i.span == nil { - i.err = firstError(i.err, i.keyspanIter.Error()) return } - if invariants.Enabled { - if err := i.keyspanIter.Error(); err != nil { - panic(errors.WithSecondaryError( - errors.AssertionFailedf("pebble: %T keyspan iterator returned non-nil span %s while iter has error", i.keyspanIter, i.span), - err)) - } - } // Check the lower bound if we have one. 
- if i.lower != nil && i.cmp(i.span.End, i.lower) <= 0 { + if i.opts.LowerBound != nil && i.cmp(i.span.End, i.opts.LowerBound) <= 0 { i.span = nil return } @@ -936,38 +951,36 @@ func (i *InterleavingIter) saveSpanBackward(span *Span) { // NB: These truncations don't require setting `keyspanMarkerTruncated`: // That flag only applies to truncated span marker keys. - if i.lower != nil && i.cmp(i.span.Start, i.lower) < 0 { + if i.opts.LowerBound != nil && i.cmp(i.span.Start, i.opts.LowerBound) < 0 { i.truncated = true i.truncatedSpan = *i.span - i.truncatedSpan.Start = i.lower + i.truncatedSpan.Start = i.opts.LowerBound } - if i.upper != nil && i.cmp(i.upper, i.span.End) < 0 { + if i.opts.UpperBound != nil && i.cmp(i.opts.UpperBound, i.span.End) < 0 { if !i.truncated { i.truncated = true i.truncatedSpan = *i.span } - i.truncatedSpan.End = i.upper + i.truncatedSpan.End = i.opts.UpperBound } if i.truncated && i.comparer.Equal(i.truncatedSpan.Start, i.truncatedSpan.End) { i.span = nil } } -func (i *InterleavingIter) yieldNil() (*base.InternalKey, base.LazyValue) { +func (i *InterleavingIter) yieldNil() *base.InternalKV { i.withinSpan = false i.clearMask() - return i.verify(nil, base.LazyValue{}) + return i.verify(nil) } -func (i *InterleavingIter) yieldPointKey() (*base.InternalKey, base.LazyValue) { - return i.verify(i.pointKey, i.pointVal) +func (i *InterleavingIter) yieldPointKey() *base.InternalKV { + return i.verify(i.pointKV) } -func (i *InterleavingIter) yieldSyntheticSpanMarker( - lowerBound []byte, -) (*base.InternalKey, base.LazyValue) { - i.spanMarker.UserKey = i.startKey() - i.spanMarker.Trailer = base.MakeTrailer(base.InternalKeySeqNumMax, i.span.Keys[0].Kind()) +func (i *InterleavingIter) yieldSyntheticSpanStartMarker(lowerBound []byte) *base.InternalKV { + i.spanMarker.K.UserKey = i.startKey() + i.spanMarker.K.Trailer = base.MakeTrailer(base.SeqNumMax, i.span.Keys[0].Kind()) // Truncate the key we return to our lower bound if we have one. 
Note that // we use the lowerBound function parameter, not i.lower. The lowerBound @@ -979,7 +992,7 @@ func (i *InterleavingIter) yieldSyntheticSpanMarker( // bound for truncating a span. The span a-z will be truncated to [k, // z). If i.upper == k, we'd mistakenly try to return a span [k, k), an // invariant violation. - if i.comparer.Equal(lowerBound, i.upper) { + if i.comparer.Equal(lowerBound, i.opts.UpperBound) { return i.yieldNil() } @@ -995,11 +1008,17 @@ func (i *InterleavingIter) yieldSyntheticSpanMarker( // reasoning around lifetimes, always copy the bound into keyBuf when // truncating. i.keyBuf = append(i.keyBuf[:0], lowerBound...) - i.spanMarker.UserKey = i.keyBuf + i.spanMarker.K.UserKey = i.keyBuf i.spanMarkerTruncated = true } i.maybeUpdateMask() - return i.verify(&i.spanMarker, base.LazyValue{}) + return i.verify(&i.spanMarker) +} + +func (i *InterleavingIter) yieldSyntheticSpanEndMarker() *base.InternalKV { + i.spanMarker.K.UserKey = i.endKey() + i.spanMarker.K.Trailer = base.MakeTrailer(base.SeqNumMax, i.span.Keys[0].Kind()) + return i.verify(&i.spanMarker) } func (i *InterleavingIter) disablePrefixMode() { @@ -1011,28 +1030,26 @@ func (i *InterleavingIter) disablePrefixMode() { } } -func (i *InterleavingIter) verify( - k *base.InternalKey, v base.LazyValue, -) (*base.InternalKey, base.LazyValue) { +func (i *InterleavingIter) verify(kv *base.InternalKV) *base.InternalKV { // Wrap the entire function body in the invariants build tag, so that // production builds elide this entire function. 
if invariants.Enabled { switch { case i.dir == -1 && i.spanMarkerTruncated: panic("pebble: invariant violation: truncated span key in reverse iteration") - case k != nil && i.lower != nil && i.cmp(k.UserKey, i.lower) < 0: + case kv != nil && i.opts.LowerBound != nil && !kv.K.IsExclusiveSentinel() && + i.cmp(kv.K.UserKey, i.opts.LowerBound) < 0: panic("pebble: invariant violation: key < lower bound") - case k != nil && i.upper != nil && i.cmp(k.UserKey, i.upper) >= 0: + case kv != nil && i.opts.UpperBound != nil && !kv.K.IsExclusiveSentinel() && + !base.UserKeyExclusive(i.opts.UpperBound).IsUpperBoundForInternalKey(i.comparer.Compare, kv.K): panic("pebble: invariant violation: key ≥ upper bound") - case i.err != nil && k != nil: + case i.err != nil && kv != nil: panic("pebble: invariant violation: accumulated error swallowed") case i.err == nil && i.pointIter.Error() != nil: panic("pebble: invariant violation: pointIter swallowed") - case i.err == nil && i.keyspanIter.Error() != nil: - panic("pebble: invariant violation: keyspanIter error swallowed") } } - return k, v + return kv } func (i *InterleavingIter) savedKeyspan() { @@ -1044,15 +1061,15 @@ func (i *InterleavingIter) savedKeyspan() { // hasn't been updated with the current keyspan yet. func (i *InterleavingIter) maybeUpdateMask() { switch { - case i.mask == nil, i.maskSpanChangedCalled: + case i.opts.Mask == nil, i.maskSpanChangedCalled: return case !i.withinSpan || i.span.Empty(): i.clearMask() case i.truncated: - i.mask.SpanChanged(&i.truncatedSpan) + i.opts.Mask.SpanChanged(&i.truncatedSpan) i.maskSpanChangedCalled = true default: - i.mask.SpanChanged(i.span) + i.opts.Mask.SpanChanged(i.span) i.maskSpanChangedCalled = true } } @@ -1060,9 +1077,9 @@ func (i *InterleavingIter) maybeUpdateMask() { // clearMask clears the current mask, if a mask is configured and no mask should // be active. 
func (i *InterleavingIter) clearMask() { - if i.mask != nil { + if i.opts.Mask != nil { i.maskSpanChangedCalled = false - i.mask.SpanChanged(nil) + i.opts.Mask.SpanChanged(nil) } } @@ -1073,15 +1090,22 @@ func (i *InterleavingIter) startKey() []byte { return i.span.Start } -func (i *InterleavingIter) savePoint(key *base.InternalKey, value base.LazyValue) { - i.pointKey, i.pointVal = key, value - if key == nil { +func (i *InterleavingIter) endKey() []byte { + if i.truncated { + return i.truncatedSpan.End + } + return i.span.End +} + +func (i *InterleavingIter) savePoint(kv *base.InternalKV) { + i.pointKV = kv + if kv == nil { i.err = firstError(i.err, i.pointIter.Error()) } if invariants.Enabled { - if err := i.pointIter.Error(); key != nil && err != nil { + if err := i.pointIter.Error(); kv != nil && err != nil { panic(errors.WithSecondaryError( - errors.AssertionFailedf("pebble: %T point iterator returned non-nil key %q while iter has error", i.pointIter, key), + base.AssertionFailedf("pebble: %T point iterator returned non-nil key %q while iter has error", i.pointIter, kv), err)) } } @@ -1094,9 +1118,13 @@ func (i *InterleavingIter) savePoint(key *base.InternalKey, value base.LazyValue // // Span will never return an invalid or empty span. func (i *InterleavingIter) Span() *Span { + if invariants.Enabled && i.pointIter == nil { + panic("Span() called after close") + } if !i.withinSpan || len(i.span.Keys) == 0 { return nil - } else if i.truncated { + } + if i.truncated { return &i.truncatedSpan } return i.span @@ -1104,18 +1132,34 @@ func (i *InterleavingIter) Span() *Span { // SetBounds implements (base.InternalIterator).SetBounds. func (i *InterleavingIter) SetBounds(lower, upper []byte) { - i.lower, i.upper = lower, upper + i.opts.LowerBound, i.opts.UpperBound = lower, upper i.pointIter.SetBounds(lower, upper) i.Invalidate() } +// SetContext implements (base.InternalIterator).SetContext. 
+func (i *InterleavingIter) SetContext(ctx context.Context) { + i.pointIter.SetContext(ctx) + i.keyspanIter.SetContext(ctx) +} + +// DebugTree is part of the InternalIterator interface. +func (i *InterleavingIter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", i, i) + if i.pointIter != nil { + i.pointIter.DebugTree(n) + } + if i.keyspanIter != nil { + i.keyspanIter.DebugTree(n) + } +} + // Invalidate invalidates the interleaving iterator's current position, clearing // its state. This prevents optimizations such as reusing the current span on // seek. func (i *InterleavingIter) Invalidate() { i.span = nil - i.pointKey = nil - i.pointVal = base.LazyValue{} + i.pointKV = nil } // Error implements (base.InternalIterator).Error. @@ -1125,9 +1169,11 @@ func (i *InterleavingIter) Error() error { // Close implements (base.InternalIterator).Close. func (i *InterleavingIter) Close() error { - perr := i.pointIter.Close() - rerr := i.keyspanIter.Close() - return firstError(perr, rerr) + err := i.pointIter.Close() + i.pointIter = nil + i.keyspanIter.Close() + i.keyspanIter = nil + return err } // String implements (base.InternalIterator).String. diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/iter.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/iter.go similarity index 69% rename from vendor/github.com/cockroachdb/pebble/internal/keyspan/iter.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/iter.go index 7f8ceb8..88e7c8b 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/iter.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/iter.go @@ -5,8 +5,10 @@ package keyspan import ( - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/manifest" + "context" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" ) // FragmentIterator defines an iterator interface over spans. 
The spans @@ -17,22 +19,30 @@ import ( // positioning method. Some implementations (eg, keyspan.Iter) may provide // longer lifetimes but implementations need only guarantee stability until the // next positioning method. +// +// If any positioning method fails to find a span, the iterator is left +// positioned at an exhausted position in the direction of iteration. For +// example, a caller than finds SeekGE(k)=nil may call Prev to move the iterator +// to the last span. +// +// If an error occurs during any positioning method, the method returns a nil +// span and a non-nil error. type FragmentIterator interface { // SeekGE moves the iterator to the first span covering a key greater than // or equal to the given key. This is equivalent to seeking to the first // span with an end key greater than the given key. - SeekGE(key []byte) *Span + SeekGE(key []byte) (*Span, error) // SeekLT moves the iterator to the last span covering a key less than the // given key. This is equivalent to seeking to the last span with a start // key less than the given key. - SeekLT(key []byte) *Span + SeekLT(key []byte) (*Span, error) // First moves the iterator to the first span. - First() *Span + First() (*Span, error) // Last moves the iterator to the last span. - Last() *Span + Last() (*Span, error) // Next moves the iterator to the next span. // @@ -40,7 +50,7 @@ type FragmentIterator interface { // key/value pair due to either a prior call to SeekLT or Prev which // returned an invalid span. It is not allowed to call Next when the // previous call to SeekGE, SeekPrefixGE or Next returned an invalid span. - Next() *Span + Next() (*Span, error) // Prev moves the iterator to the previous span. // @@ -48,23 +58,25 @@ type FragmentIterator interface { // key/value pair due to either a prior call to SeekGE or Next which // returned an invalid span. It is not allowed to call Prev when the // previous call to SeekLT or Prev returned an invalid span. 
- Prev() *Span + Prev() (*Span, error) - // Error returns any accumulated error. - // - // TODO(jackson): Lift errors into return values on the positioning methods. - Error() error - - // Close closes the iterator and returns any accumulated error. Exhausting - // the iterator is not considered to be an error. It is valid to call Close + // Close closes the iterator. It is not in general valid to call Close // multiple times. Other methods should not be called after the iterator has - // been closed. - Close() error -} + // been closed. Spans returned by a previous method should also not be used + // after the iterator has been closed. + Close() + + // WrapChildren wraps any child iterators using the given function. The + // function can call WrapChildren to recursively wrap an entire iterator + // stack. Used only for debug logging. + WrapChildren(wrap WrapFn) -// TableNewSpanIter creates a new iterator for range key spans for the given -// file. -type TableNewSpanIter func(file *manifest.FileMetadata, iterOptions SpanIterOptions) (FragmentIterator, error) + // SetContext replaces the context provided at iterator creation, or the last + // one provided by SetContext. + SetContext(ctx context.Context) + + base.IteratorDebug +} // SpanIterOptions is a subset of IterOptions that are necessary to instantiate // per-sstable span iterators. @@ -106,7 +118,7 @@ func (i *Iter) Init(cmp base.Compare, spans []Span) { } // SeekGE implements FragmentIterator.SeekGE. -func (i *Iter) SeekGE(key []byte) *Span { +func (i *Iter) SeekGE(key []byte) (*Span, error) { // NB: manually inlined sort.Search is ~5% faster. // // Define f(j) = false iff the span i.spans[j] is strictly before `key` @@ -129,13 +141,13 @@ func (i *Iter) SeekGE(key []byte) *Span { // i.index == upper, f(i.index-1) == false, and f(upper) (= f(i.index)) == // true => answer is i.index. 
if i.index >= len(i.spans) { - return nil + return nil, nil } - return &i.spans[i.index] + return &i.spans[i.index], nil } // SeekLT implements FragmentIterator.SeekLT. -func (i *Iter) SeekLT(key []byte) *Span { +func (i *Iter) SeekLT(key []byte) (*Span, error) { // NB: manually inlined sort.Search is ~5% faster. // // Define f(-1) == false and f(n) == true. @@ -158,63 +170,67 @@ func (i *Iter) SeekLT(key []byte) *Span { // the largest whose key is < the key sought. i.index-- if i.index < 0 { - return nil + return nil, nil } - return &i.spans[i.index] + return &i.spans[i.index], nil } // First implements FragmentIterator.First. -func (i *Iter) First() *Span { +func (i *Iter) First() (*Span, error) { if len(i.spans) == 0 { - return nil + return nil, nil } i.index = 0 - return &i.spans[i.index] + return &i.spans[i.index], nil } // Last implements FragmentIterator.Last. -func (i *Iter) Last() *Span { +func (i *Iter) Last() (*Span, error) { if len(i.spans) == 0 { - return nil + return nil, nil } i.index = len(i.spans) - 1 - return &i.spans[i.index] + return &i.spans[i.index], nil } // Next implements FragmentIterator.Next. -func (i *Iter) Next() *Span { +func (i *Iter) Next() (*Span, error) { if i.index >= len(i.spans) { - return nil + return nil, nil } i.index++ if i.index >= len(i.spans) { - return nil + return nil, nil } - return &i.spans[i.index] + return &i.spans[i.index], nil } // Prev implements FragmentIterator.Prev. -func (i *Iter) Prev() *Span { +func (i *Iter) Prev() (*Span, error) { if i.index < 0 { - return nil + return nil, nil } i.index-- if i.index < 0 { - return nil + return nil, nil } - return &i.spans[i.index] + return &i.spans[i.index], nil } -// Error implements FragmentIterator.Error. -func (i *Iter) Error() error { - return nil -} +// SetContext is part of the FragmentIterator interface. +func (i *Iter) SetContext(ctx context.Context) {} // Close implements FragmentIterator.Close. 
-func (i *Iter) Close() error { - return nil -} +func (i *Iter) Close() {} func (i *Iter) String() string { - return "fragmented-spans" + return "keyspan.Iter" +} + +// WrapChildren implements FragmentIterator. +func (i *Iter) WrapChildren(wrap WrapFn) {} + +// DebugTree is part of the FragmentIterator interface. +func (i *Iter) DebugTree(tp treeprinter.Node) { + tp.Childf("%T(%p)", i, i) } diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl/doc.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl/doc.go new file mode 100644 index 0000000..bbc96a7 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl/doc.go @@ -0,0 +1,3 @@ +// Package keyspanimpl contains Pebble-specific implementations of keyspan +// fragment iterators. +package keyspanimpl diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl/level_iter.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl/level_iter.go new file mode 100644 index 0000000..e7d86ff --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl/level_iter.go @@ -0,0 +1,504 @@ +// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspanimpl + +import ( + "context" + "fmt" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +// TableNewSpanIter creates a new iterator for range key spans for the given +// file. 
+type TableNewSpanIter func( + ctx context.Context, file *manifest.TableMetadata, iterOptions keyspan.SpanIterOptions, +) (keyspan.FragmentIterator, error) + +// LevelIter provides a merged view of spans from sstables in an L1+ level or an +// L0 sublevel. +// +// LevelIter takes advantage of level invariants to only have one sstable span +// block open at one time, opened using the newIter function passed in. +// +// A LevelIter is configured with a key type that is either KeyTypePoint +// (corresponding to range dels) or KeyTypeRange (corresponding to range keys). +// The key type decides which bounds we use for the files (and which files we +// filter out). +// +// LevelIter supports emitting "straddling spans": these are empty spans that +// cover the gaps between the keyspaces of adjacent files. This is an +// optimization to avoid unnecessarily loading files in cases where spans are +// very sparse (in the context of merging spans from multiple levels). We +// currently produce straddling spans only in range key mode. +// +// TODO(radu): investigate enabling straddling spans for rangedel mode. +type LevelIter struct { + cmp base.Compare + keyType manifest.KeyType + // The LSM level this LevelIter is initialized for. + level manifest.Layer + // newIter creates a range del iterator if keyType is KeyTypePoint or a range + // key iterator if keyType is KeyTypeRange. + newIter TableNewSpanIter + // ctx is passed to TableNewSpanIter. + ctx context.Context + + // The options that were passed in. + tableOpts keyspan.SpanIterOptions + + files manifest.LevelIterator + + // file always corresponds to the current position of LevelIter.files. + file *manifest.TableMetadata + pos levelIterPos + // fileIter is the iterator for LevelIter.file when pos is atFile; it is nil + // otherwise. + fileIter keyspan.FragmentIterator + + // lastIter retains the last opened iterator, in case that the next time we + // need an iterator it is for the same file. 
When fileIter is not nil, + // fileIter is the same with lastIter and file is the same with lastIterFile. + lastIter keyspan.FragmentIterator + lastIterFile *manifest.TableMetadata + + wrapFn keyspan.WrapFn + straddleSpan keyspan.Span + + // TODO(bilal): Add InternalIteratorStats. +} + +// LevelIter implements the keyspan.FragmentIterator interface. +var _ keyspan.FragmentIterator = (*LevelIter)(nil) + +// NewLevelIter returns a LevelIter. +// +// newIter must create a range del iterator for the given file if keyType is +// KeyTypePoint or a range key iterator if keyType is KeyTypeRange. +func NewLevelIter( + ctx context.Context, + opts keyspan.SpanIterOptions, + cmp base.Compare, + newIter TableNewSpanIter, + files manifest.LevelIterator, + level manifest.Layer, + keyType manifest.KeyType, +) *LevelIter { + l := &LevelIter{} + l.Init(ctx, opts, cmp, newIter, files, level, keyType) + return l +} + +// Init initializes a LevelIter. +// +// newIter must create a range del iterator for the given file if keyType is +// KeyTypePoint or a range key iterator if keyType is KeyTypeRange. +func (l *LevelIter) Init( + ctx context.Context, + opts keyspan.SpanIterOptions, + cmp base.Compare, + newIter TableNewSpanIter, + files manifest.LevelIterator, + level manifest.Layer, + keyType manifest.KeyType, +) { + if keyType != manifest.KeyTypePoint && keyType != manifest.KeyTypeRange { + panic("keyType must be point or range") + } + *l = LevelIter{ + cmp: cmp, + keyType: keyType, + level: level, + newIter: newIter, + ctx: ctx, + tableOpts: opts, + files: files.Filter(keyType), + } + l.setPosAfterFile(nil) +} + +// levelIterPos narrows down the position of the iterator in relation to the file: +// +// - atFile: the iterator is currently positioned inside LevelIter.file. +// +// - beforeFile: the iterator is currently positioned right before +// LevelIter.file. If .file is not the first file, this position corresponds +// to a straddle span. 
+// +// - afterFile: the iterator is currently positioned right after +// LevelIter.file. If .file is not the last file, this position corresponds +// to a straddle span. +// +// Example: +// +// beforeFile atFile afterFile +// | | | +// v v v +// ..--- .files.Prev() ------- .file ------- .files.Next() ---... +// +// Note that each straddle position can be represented in two different ways +// (either after one file, or before the other file). We use the one which makes +// it easier to keep l.file in sync with the l.files iterator (which depends on +// the iteration direction). +// +// When file is nil, it should be considered a sentinel either before or after +// all the files. When file is nil and pos is afterFile, we are positioned +// after the imaginary start sentinel, i.e. before the first file: +// +// afterFile +// | +// v +// .file=nil ------- .files.First() ---... +// +// When file is nil and pos is beforeFile, we are positioned after the +// imaginary end sentinel, i.e. after the last file: +// +// +// beforeFile +// | +// v +// ...--- .files.Last() ------- .file=nil +// +// Note that when straddle spans are not emitted, the position is always +// `atFile` unless the iterator is exhausted. +type levelIterPos uint8 + +const ( + atFile levelIterPos = iota + beforeFile + afterFile +) + +// SeekGE implements keyspan.FragmentIterator. +func (l *LevelIter) SeekGE(key []byte) (*keyspan.Span, error) { + file := l.files.SeekGE(l.cmp, key) + if file == nil { + l.setPosBeforeFile(nil) + return nil, nil + } + if l.straddleSpansEnabled() && l.cmp(key, file.RangeKeyBounds.SmallestUserKey()) < 0 { + // Peek at the previous file. + if prevFile := l.files.Prev(); prevFile != nil { + // We could unconditionally return an empty span between the seek + // key and f.RangeKeyBounds.Smallest(), however if this span is to + // the left of all range keys on this level, it could lead to + // inconsistent behaviour in relative positioning operations. 
+ // Consider this example, with a b-c range key: + // SeekGE(a) -> a-b:{} + // Next() -> b-c{(#5,RANGEKEYSET,@4,foo)} + // Prev() -> nil + // Iterators higher up in the iterator stack rely on this sort + // of relative positioning consistency. + // + // TODO(bilal): Investigate ways to be able to return straddle spans in + // cases similar to the above, while still retaining correctness. + // Return a straddling key instead of loading the file. + l.setPosAfterFile(prevFile) + return l.makeStraddleSpan(prevFile, file), nil + } + // Return the iterator to file. + l.files.Next() + } + + if err := l.setPosAtFile(file); err != nil { + return nil, err + } + if span, err := l.fileIter.SeekGE(key); span != nil || err != nil { + return span, err + } + return l.moveToNextFile() +} + +// SeekLT implements keyspan.FragmentIterator. +func (l *LevelIter) SeekLT(key []byte) (*keyspan.Span, error) { + file := l.files.SeekLT(l.cmp, key) + if file == nil { + l.setPosAfterFile(nil) + return nil, nil + } + if l.straddleSpansEnabled() && l.cmp(file.RangeKeyBounds.LargestUserKey(), key) < 0 { + // Peek at the next file. + if nextFile := l.files.Next(); nextFile != nil { + // We could unconditionally return an empty span between f.LargestRangeKey + // and the seek key, however if this span is to the right of all range keys + // on this level, it could lead to inconsistent behaviour in relative + // positioning operations. Consider this example, with a b-c range key: + // SeekLT(d) -> c-d:{} + // Prev() -> b-c{(#5,RANGEKEYSET,@4,foo)} + // Next() -> nil + // Iterators higher up in the iterator stack rely on this sort of relative + // positioning consistency. + // + // TODO(bilal): Investigate ways to be able to return straddle spans in + // cases similar to the above, while still retaining correctness. + // Return a straddling key instead of loading the file. + l.setPosBeforeFile(nextFile) + return l.makeStraddleSpan(file, nextFile), nil + } + // Return the iterator to file. 
+ l.files.Prev() + } + if err := l.setPosAtFile(file); err != nil { + return nil, err + } + if span, err := l.fileIter.SeekLT(key); span != nil || err != nil { + return span, err + } + return l.moveToPrevFile() +} + +// First implements keyspan.FragmentIterator. +func (l *LevelIter) First() (*keyspan.Span, error) { + file := l.files.First() + if file == nil { + l.setPosBeforeFile(nil) + return nil, nil + } + if err := l.setPosAtFile(file); err != nil { + return nil, err + } + if span, err := l.fileIter.First(); span != nil || err != nil { + return span, err + } + return l.moveToNextFile() +} + +// Last implements keyspan.FragmentIterator. +func (l *LevelIter) Last() (*keyspan.Span, error) { + file := l.files.Last() + if file == nil { + l.setPosAfterFile(nil) + return nil, nil + } + if err := l.setPosAtFile(file); err != nil { + return nil, err + } + if span, err := l.fileIter.Last(); span != nil || err != nil { + return span, err + } + return l.moveToPrevFile() +} + +// Next implements keyspan.FragmentIterator. +func (l *LevelIter) Next() (*keyspan.Span, error) { + if l.file == nil { + if l.pos == afterFile { + return l.First() + } + // Iterator is exhausted. + return nil, nil + } + switch l.pos { + case atFile: + if span, err := l.fileIter.Next(); span != nil || err != nil { + return span, err + } + case beforeFile: + // We were positioned on a straddle span before l.file; now we can advance to the file. + if err := l.setPosAtFile(l.file); err != nil { + return nil, err + } + if span, err := l.fileIter.First(); span != nil || err != nil { + return span, err + } + case afterFile: + // We were positioned on a straddle span after l.file. Move to the next file. + } + return l.moveToNextFile() +} + +// Prev implements keyspan.FragmentIterator. +func (l *LevelIter) Prev() (*keyspan.Span, error) { + if l.file == nil { + if l.pos == beforeFile { + return l.Last() + } + // Iterator is exhausted. 
+ return nil, nil + } + switch l.pos { + case atFile: + if span, err := l.fileIter.Prev(); span != nil || err != nil { + return span, err + } + case afterFile: + // We were positioned on a straddle span after l.file; now we can advance + // (backwards) to the file. + if err := l.setPosAtFile(l.file); err != nil { + return nil, err + } + if span, err := l.fileIter.Last(); span != nil || err != nil { + return span, err + } + case beforeFile: + // We were positioned on a straddle span before l.file. Move to the previous file. + } + return l.moveToPrevFile() +} + +func (l *LevelIter) moveToNextFile() (*keyspan.Span, error) { + if invariants.Enabled && l.pos == beforeFile { + panic("moveToNextFile with beforeFile pos") + } + for { + nextFile := l.files.Next() + if nextFile == nil { + l.setPosBeforeFile(nil) + return nil, nil + } + // Emit a straddle span, if necessary. + if l.pos == atFile && nextFile != nil && l.needStraddleSpan(l.file, nextFile) { + span := l.makeStraddleSpan(l.file, nextFile) + l.setPosBeforeFile(nextFile) + return span, nil + } + if err := l.setPosAtFile(nextFile); err != nil { + return nil, err + } + if span, err := l.fileIter.First(); span != nil || err != nil { + return span, err + } + // The file had no spans; continue. + } +} + +func (l *LevelIter) moveToPrevFile() (*keyspan.Span, error) { + if invariants.Enabled && l.pos == afterFile { + panic("eofBackward with afterFile pos") + } + for { + prevFile := l.files.Prev() + if prevFile == nil { + l.setPosAfterFile(nil) + return nil, nil + } + // Emit a straddle span, if necessary. + if l.pos == atFile && l.file != nil && l.needStraddleSpan(prevFile, l.file) { + span := l.makeStraddleSpan(prevFile, l.file) + l.setPosAfterFile(prevFile) + return span, nil + } + if err := l.setPosAtFile(prevFile); err != nil { + return nil, err + } + if span, err := l.fileIter.Last(); span != nil || err != nil { + return span, err + } + // The file had no spans; continue. 
+ } +} + +// SetContext is part of the FragmentIterator interface. +func (l *LevelIter) SetContext(ctx context.Context) { + l.ctx = ctx + if l.lastIter != nil { + l.lastIter.SetContext(ctx) + } +} + +// Close implements keyspan.FragmentIterator. +func (l *LevelIter) Close() { + l.file = nil + l.fileIter = nil + if l.lastIter != nil { + l.lastIter.Close() + l.lastIter = nil + l.lastIterFile = nil + } +} + +// String implements keyspan.FragmentIterator. +func (l *LevelIter) String() string { + if l.file != nil { + return fmt.Sprintf("%s: fileNum=%s", l.level, l.file.TableNum) + } + return fmt.Sprintf("%s: fileNum=", l.level) +} + +// WrapChildren implements FragmentIterator. +func (l *LevelIter) WrapChildren(wrap keyspan.WrapFn) { + if l.fileIter != nil { + l.fileIter = wrap(l.fileIter) + } + l.wrapFn = wrap +} + +// DebugTree is part of the FragmentIterator interface. +func (l *LevelIter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p) %s", l, l, l.level) + if l.fileIter != nil { + l.fileIter.DebugTree(n) + } +} + +func (l *LevelIter) setPosBeforeFile(f *manifest.TableMetadata) { + l.setPosInternal(f, beforeFile) +} + +func (l *LevelIter) setPosAfterFile(f *manifest.TableMetadata) { + l.setPosInternal(f, afterFile) +} + +// setPosAtFile sets the current position and opens an iterator for the file (if +// necessary). +func (l *LevelIter) setPosAtFile(f *manifest.TableMetadata) error { + l.setPosInternal(f, atFile) + // See if the last iterator was for the same file; if not, close it and open a + // new one. 
+ if l.lastIter == nil || l.lastIterFile != f { + if l.lastIter != nil { + l.lastIter.Close() + l.lastIter = nil + l.lastIterFile = nil + } + iter, err := l.newIter(l.ctx, l.file, l.tableOpts) + if err != nil { + return err + } + iter = keyspan.MaybeAssert(iter, l.cmp) + if l.wrapFn != nil { + iter = l.wrapFn(iter) + } + l.lastIter = iter + l.lastIterFile = f + } + l.fileIter = l.lastIter + return nil +} + +// setPos sets l.file and l.pos (and closes the iteris for the new file). +func (l *LevelIter) setPosInternal(f *manifest.TableMetadata, pos levelIterPos) { + l.file = f + l.fileIter = nil + l.pos = pos +} + +func (l *LevelIter) straddleSpansEnabled() bool { + return l.keyType == manifest.KeyTypeRange +} + +// needStraddleSpan returns true if straddle spans are enabled and there is a +// gap between the bounds of the files. file and nextFile are assumed to be +// consecutive files in the level, in the order they appear in the level. +func (l *LevelIter) needStraddleSpan(file, nextFile *manifest.TableMetadata) bool { + // We directly use range key bounds because that is the current condition for + // straddleSpansEnabled. + return l.straddleSpansEnabled() && l.cmp(file.RangeKeyBounds.LargestUserKey(), nextFile.RangeKeyBounds.SmallestUserKey()) < 0 +} + +// makeStraddleSpan returns a straddle span that covers the gap between file and +// nextFile. 
+func (l *LevelIter) makeStraddleSpan(file, nextFile *manifest.TableMetadata) *keyspan.Span { + l.straddleSpan = keyspan.Span{ + Start: file.RangeKeyBounds.LargestUserKey(), + End: nextFile.RangeKeyBounds.SmallestUserKey(), + Keys: nil, + } + return &l.straddleSpan +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/merging_iter.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl/merging_iter.go similarity index 86% rename from vendor/github.com/cockroachdb/pebble/internal/keyspan/merging_iter.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl/merging_iter.go index c73ba59..f6141d3 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/merging_iter.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl/merging_iter.go @@ -2,16 +2,20 @@ // of this source code is governed by a BSD-style license that can be found in // the LICENSE file. -package keyspan +package keyspanimpl import ( "bytes" + "cmp" + "context" "fmt" - "sort" + "slices" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" ) // TODO(jackson): Consider implementing an optimization to seek lower levels @@ -21,10 +25,6 @@ import ( // seeks would require introducing key comparisons to switchTo{Min,Max}Heap // where there currently are none. -// TODO(jackson): There are several opportunities to use base.Equal in the -// MergingIter implementation, but will require a bit of plumbing to thread the -// Equal function. 
- // MergingIter merges spans across levels of the LSM, exposing an iterator over // spans that yields sets of spans fragmented at unique user key boundaries. // @@ -168,6 +168,7 @@ import ( // accommodate this, find{Next,Prev}FragmentSet copy the initial boundary if the // subsequent Next/Prev would move to the next span. type MergingIter struct { + comparer *base.Comparer *MergingBuffers // start and end hold the bounds for the span currently under the // iterator position. @@ -179,12 +180,11 @@ type MergingIter struct { // transformer defines a transformation to be applied to a span before it's // yielded to the user. Transforming may filter individual keys contained // within the span. - transformer Transformer + transformer keyspan.Transformer // span holds the iterator's current span. This span is used as the // destination for transforms. Every tranformed span overwrites the // previous. - span Span - err error + span keyspan.Span dir int8 // alloc preallocates mergingIterLevel and mergingIterItems for use by the @@ -202,17 +202,18 @@ type MergingIter struct { // MergingBuffers holds buffers used while merging keyspans. type MergingBuffers struct { // keys holds all of the keys across all levels that overlap the key span - // [start, end), sorted by Trailer descending. This slice is reconstituted + // [start, end), sorted by InternalKeyTrailer descending. This slice is reconstituted // in synthesizeKeys from each mergingIterLevel's keys every time the // [start, end) bounds change. // // Each element points into a child iterator's memory, so the keys may not // be directly modified. - keys keysBySeqNumKind + keys []keyspan.Key // levels holds levels allocated by MergingIter.init. The MergingIter will // prefer use of its `manifest.NumLevels+3` array, so this slice will be // longer if set. levels []mergingIterLevel + wrapFn keyspan.WrapFn // heap holds a slice for the merging iterator heap allocated by // MergingIter.init. 
The MergingIter will prefer use of its // `manifest.NumLevels+3` items array, so this slice will be longer if set. @@ -223,70 +224,86 @@ type MergingBuffers struct { // PrepareForReuse discards any excessively large buffers. func (bufs *MergingBuffers) PrepareForReuse() { - if cap(bufs.buf) > bufferReuseMaxCapacity { + if cap(bufs.buf) > keyspan.BufferReuseMaxCapacity { bufs.buf = nil } } // MergingIter implements the FragmentIterator interface. -var _ FragmentIterator = (*MergingIter)(nil) +var _ keyspan.FragmentIterator = (*MergingIter)(nil) type mergingIterLevel struct { - iter FragmentIterator + iter keyspan.FragmentIterator // heapKey holds the current key at this level for use within the heap. heapKey boundKey } -func (l *mergingIterLevel) next() { +func (l *mergingIterLevel) next() error { if l.heapKey.kind == boundKindFragmentStart { l.heapKey = boundKey{ kind: boundKindFragmentEnd, key: l.heapKey.span.End, span: l.heapKey.span, } - return + return nil } - if s := l.iter.Next(); s == nil { + s, err := l.iter.Next() + switch { + case err != nil: + return err + case s == nil: l.heapKey = boundKey{kind: boundKindInvalid} - } else { + return nil + default: l.heapKey = boundKey{ kind: boundKindFragmentStart, key: s.Start, span: s, } + return nil } } -func (l *mergingIterLevel) prev() { +func (l *mergingIterLevel) prev() error { if l.heapKey.kind == boundKindFragmentEnd { l.heapKey = boundKey{ kind: boundKindFragmentStart, key: l.heapKey.span.Start, span: l.heapKey.span, } - return + return nil } - if s := l.iter.Prev(); s == nil { + s, err := l.iter.Prev() + switch { + case err != nil: + return err + case s == nil: l.heapKey = boundKey{kind: boundKindInvalid} - } else { + return nil + default: l.heapKey = boundKey{ kind: boundKindFragmentEnd, key: s.End, span: s, } + return nil } } // Init initializes the merging iterator with the provided fragment iterators. 
func (m *MergingIter) Init( - cmp base.Compare, transformer Transformer, bufs *MergingBuffers, iters ...FragmentIterator, + comparer *base.Comparer, + transformer keyspan.Transformer, + bufs *MergingBuffers, + iters ...keyspan.FragmentIterator, ) { *m = MergingIter{ + comparer: comparer, MergingBuffers: bufs, transformer: transformer, } - m.heap.cmp = cmp + m.heap.cmp = comparer.Compare levels, items := m.levels, m.heap.items // Invariant: cap(levels) >= cap(items) @@ -306,21 +323,25 @@ func (m *MergingIter) Init( } for i := range m.levels { m.levels[i] = mergingIterLevel{iter: iters[i]} + if m.wrapFn != nil { + m.levels[i].iter = m.wrapFn(m.levels[i].iter) + } } } // AddLevel adds a new level to the bottom of the merging iterator. AddLevel // must be called after Init and before any other method. -func (m *MergingIter) AddLevel(iter FragmentIterator) { +func (m *MergingIter) AddLevel(iter keyspan.FragmentIterator) { + if m.wrapFn != nil { + iter = m.wrapFn(iter) + } m.levels = append(m.levels, mergingIterLevel{iter: iter}) } // SeekGE moves the iterator to the first span covering a key greater than // or equal to the given key. This is equivalent to seeking to the first // span with an end key greater than the given key. -func (m *MergingIter) SeekGE(key []byte) *Span { - m.invalidate() // clear state about current position - +func (m *MergingIter) SeekGE(key []byte) (*keyspan.Span, error) { // SeekGE(k) seeks to the first span with an end key greater than the given // key. The merged span M that we're searching for might straddle the seek // `key`. In this case, the M.Start may be a key ≤ the seek key. @@ -366,16 +387,19 @@ func (m *MergingIter) SeekGE(key []byte) *Span { // root of the max heap is a preliminary value for `M.Start`. 
for i := range m.levels { l := &m.levels[i] - s := l.iter.SeekLT(key) - if s == nil { + s, err := l.iter.SeekLT(key) + switch { + case err != nil: + return nil, err + case s == nil: l.heapKey = boundKey{kind: boundKindInvalid} - } else if m.cmp(s.End, key) <= 0 { + case m.comparer.Compare(s.End, key) <= 0: l.heapKey = boundKey{ kind: boundKindFragmentEnd, key: s.End, span: s, } - } else { + default: // s.End > key && s.Start < key // We need to use this span's start bound, since that's the largest // bound ≤ key. @@ -387,13 +411,13 @@ func (m *MergingIter) SeekGE(key []byte) *Span { } } m.initMaxHeap() - if m.err != nil { - return nil - } else if len(m.heap.items) == 0 { + if len(m.heap.items) == 0 { // There are no spans covering any key < `key`. There is no span that // straddles the seek key. Reorient the heap into a min heap and return // the first span we find in the forward direction. - m.switchToMinHeap() + if err := m.switchToMinHeap(); err != nil { + return nil, err + } return m.findNextFragmentSet() } @@ -424,11 +448,10 @@ func (m *MergingIter) SeekGE(key []byte) *Span { // every level, and then establish a min heap. This allows us to obtain the // smallest boundary key > `key`, which will serve as our candidate end // bound. - m.switchToMinHeap() - if m.err != nil { - return nil + if err := m.switchToMinHeap(); err != nil { + return nil, err } else if len(m.heap.items) == 0 { - return nil + return nil, nil } // Check for the case 3 described above. It's possible that when we switch @@ -436,26 +459,25 @@ func (m *MergingIter) SeekGE(key []byte) *Span { // equal to the seek key `key`. In this case, we want this key to be our // start boundary. if m.heap.items[0].boundKey.kind == boundKindFragmentStart && - m.cmp(m.heap.items[0].boundKey.key, key) == 0 { + m.comparer.Equal(m.heap.items[0].boundKey.key, key) { // Call findNextFragmentSet, which will set m.start to the heap root and // proceed forward. 
return m.findNextFragmentSet() } m.end = m.heap.items[0].boundKey.key - if found, s := m.synthesizeKeys(+1); found && s != nil { - return s + if found, s, err := m.synthesizeKeys(+1); err != nil { + return nil, err + } else if found && s != nil { + return s, nil } return m.findNextFragmentSet() - } // SeekLT moves the iterator to the last span covering a key less than the // given key. This is equivalent to seeking to the last span with a start // key less than the given key. -func (m *MergingIter) SeekLT(key []byte) *Span { - m.invalidate() // clear state about current position - +func (m *MergingIter) SeekLT(key []byte) (*keyspan.Span, error) { // SeekLT(k) seeks to the last span with a start key less than the given // key. The merged span M that we're searching for might straddle the seek // `key`. In this case, the M.End may be a key ≥ the seek key. @@ -501,16 +523,19 @@ func (m *MergingIter) SeekLT(key []byte) *Span { // root of the min heap is a preliminary value for `M.End`. for i := range m.levels { l := &m.levels[i] - s := l.iter.SeekGE(key) - if s == nil { + s, err := l.iter.SeekGE(key) + switch { + case err != nil: + return nil, err + case s == nil: l.heapKey = boundKey{kind: boundKindInvalid} - } else if m.cmp(s.Start, key) >= 0 { + case m.comparer.Compare(s.Start, key) >= 0: l.heapKey = boundKey{ kind: boundKindFragmentStart, key: s.Start, span: s, } - } else { + default: // s.Start < key // We need to use this span's end bound, since that's the smallest // bound > key. @@ -522,13 +547,13 @@ func (m *MergingIter) SeekLT(key []byte) *Span { } } m.initMinHeap() - if m.err != nil { - return nil - } else if len(m.heap.items) == 0 { + if len(m.heap.items) == 0 { // There are no spans covering any key ≥ `key`. There is no span that // straddles the seek key. Reorient the heap into a max heap and return // the first span we find in the reverse direction. 
- m.switchToMaxHeap() + if err := m.switchToMaxHeap(); err != nil { + return nil, err + } return m.findPrevFragmentSet() } @@ -559,37 +584,41 @@ func (m *MergingIter) SeekLT(key []byte) *Span { // every level, and then establish a max heap. This allows us to obtain the // largest boundary key < `key`, which will serve as our candidate start // bound. - m.switchToMaxHeap() - if m.err != nil { - return nil + if err := m.switchToMaxHeap(); err != nil { + return nil, err } else if len(m.heap.items) == 0 { - return nil + return nil, nil } // Check for the case 3 described above. It's possible that when we switch // heap directions, we discover an end boundary of some child span that is // equal to the seek key `key`. In this case, we want this key to be our end // boundary. if m.heap.items[0].boundKey.kind == boundKindFragmentEnd && - m.cmp(m.heap.items[0].boundKey.key, key) == 0 { + m.comparer.Equal(m.heap.items[0].boundKey.key, key) { // Call findPrevFragmentSet, which will set m.end to the heap root and // proceed backwards. return m.findPrevFragmentSet() } m.start = m.heap.items[0].boundKey.key - if found, s := m.synthesizeKeys(-1); found && s != nil { - return s + if found, s, err := m.synthesizeKeys(-1); err != nil { + return nil, err + } else if found && s != nil { + return s, nil } return m.findPrevFragmentSet() } // First seeks the iterator to the first span. -func (m *MergingIter) First() *Span { - m.invalidate() // clear state about current position +func (m *MergingIter) First() (*keyspan.Span, error) { for i := range m.levels { - if s := m.levels[i].iter.First(); s == nil { + s, err := m.levels[i].iter.First() + switch { + case err != nil: + return nil, err + case s == nil: m.levels[i].heapKey = boundKey{kind: boundKindInvalid} - } else { + default: m.levels[i].heapKey = boundKey{ kind: boundKindFragmentStart, key: s.Start, @@ -602,12 +631,15 @@ func (m *MergingIter) First() *Span { } // Last seeks the iterator to the last span. 
-func (m *MergingIter) Last() *Span { - m.invalidate() // clear state about current position +func (m *MergingIter) Last() (*keyspan.Span, error) { for i := range m.levels { - if s := m.levels[i].iter.Last(); s == nil { + s, err := m.levels[i].iter.Last() + switch { + case err != nil: + return nil, err + case s == nil: m.levels[i].heapKey = boundKey{kind: boundKindInvalid} - } else { + default: m.levels[i].heapKey = boundKey{ kind: boundKindFragmentEnd, key: s.End, @@ -620,51 +652,45 @@ func (m *MergingIter) Last() *Span { } // Next advances the iterator to the next span. -func (m *MergingIter) Next() *Span { - if m.err != nil { - return nil - } +func (m *MergingIter) Next() (*keyspan.Span, error) { if m.dir == +1 && (m.end == nil || m.start == nil) { - return nil + return nil, nil } if m.dir != +1 { - m.switchToMinHeap() + if err := m.switchToMinHeap(); err != nil { + return nil, err + } } return m.findNextFragmentSet() } // Prev advances the iterator to the previous span. -func (m *MergingIter) Prev() *Span { - if m.err != nil { - return nil - } +func (m *MergingIter) Prev() (*keyspan.Span, error) { if m.dir == -1 && (m.end == nil || m.start == nil) { - return nil + return nil, nil } if m.dir != -1 { - m.switchToMaxHeap() + if err := m.switchToMaxHeap(); err != nil { + return nil, err + } } return m.findPrevFragmentSet() } -// Error returns any accumulated error. -func (m *MergingIter) Error() error { - if m.heap.len() == 0 || m.err != nil { - return m.err +// SetContext is part of the FragmentIterator interface. +func (m *MergingIter) SetContext(ctx context.Context) { + for i := range m.levels { + m.levels[i].iter.SetContext(ctx) } - return m.levels[m.heap.items[0].index].iter.Error() } // Close closes the iterator, releasing all acquired resources. 
-func (m *MergingIter) Close() error { +func (m *MergingIter) Close() { for i := range m.levels { - if err := m.levels[i].iter.Close(); err != nil && m.err == nil { - m.err = err - } + m.levels[i].iter.Close() } m.levels = nil m.heap.items = m.heap.items[:0] - return m.err } // String implements fmt.Stringer. @@ -692,17 +718,12 @@ func (m *MergingIter) initHeap() { index: i, boundKey: &l.heapKey, }) - } else { - m.err = firstError(m.err, l.iter.Error()) - if m.err != nil { - return - } } } m.heap.init() } -func (m *MergingIter) switchToMinHeap() { +func (m *MergingIter) switchToMinHeap() error { // switchToMinHeap reorients the heap for forward iteration, without moving // the current MergingIter position. @@ -741,19 +762,22 @@ func (m *MergingIter) switchToMinHeap() { if invariants.Enabled { for i := range m.levels { l := &m.levels[i] - if l.heapKey.kind != boundKindInvalid && m.cmp(l.heapKey.key, m.start) > 0 { + if l.heapKey.kind != boundKindInvalid && m.comparer.Compare(l.heapKey.key, m.start) > 0 { panic("pebble: invariant violation: max-heap key > m.start") } } } for i := range m.levels { - m.levels[i].next() + if err := m.levels[i].next(); err != nil { + return err + } } m.initMinHeap() + return nil } -func (m *MergingIter) switchToMaxHeap() { +func (m *MergingIter) switchToMaxHeap() error { // switchToMaxHeap reorients the heap for reverse iteration, without moving // the current MergingIter position. 
@@ -793,30 +817,29 @@ func (m *MergingIter) switchToMaxHeap() { if invariants.Enabled { for i := range m.levels { l := &m.levels[i] - if l.heapKey.kind != boundKindInvalid && m.cmp(l.heapKey.key, m.end) < 0 { + if l.heapKey.kind != boundKindInvalid && m.comparer.Compare(l.heapKey.key, m.end) < 0 { panic("pebble: invariant violation: min-heap key < m.end") } } } for i := range m.levels { - m.levels[i].prev() + if err := m.levels[i].prev(); err != nil { + return err + } } m.initMaxHeap() + return nil } -func (m *MergingIter) cmp(a, b []byte) int { - return m.heap.cmp(a, b) -} - -func (m *MergingIter) findNextFragmentSet() *Span { +func (m *MergingIter) findNextFragmentSet() (*keyspan.Span, error) { // Each iteration of this loop considers a new merged span between unique // user keys. An iteration may find that there exists no overlap for a given // span, (eg, if the spans [a,b), [d, e) exist within level iterators, the // below loop will still consider [b,d) before continuing to [d, e)). It // returns when it finds a span that is covered by at least one key. - for m.heap.len() > 0 && m.err == nil { + for m.heap.len() > 0 { // Initialize the next span's start bound. SeekGE and First prepare the // heap without advancing. Next leaves the heap in a state such that the // root is the smallest bound key equal to the returned span's end key, @@ -855,11 +878,15 @@ func (m *MergingIter) findNextFragmentSet() *Span { // L2: [c, e) // If we're positioned at L1's end(c) end boundary, we want to advance // to the first bound > c. 
- m.nextEntry() - for len(m.heap.items) > 0 && m.err == nil && m.cmp(m.heapRoot(), m.start) == 0 { - m.nextEntry() + if err := m.nextEntry(); err != nil { + return nil, err } - if len(m.heap.items) == 0 || m.err != nil { + for len(m.heap.items) > 0 && m.comparer.Equal(m.heapRoot(), m.start) { + if err := m.nextEntry(); err != nil { + return nil, err + } + } + if len(m.heap.items) == 0 { break } @@ -877,23 +904,25 @@ func (m *MergingIter) findNextFragmentSet() *Span { // we elide empty spans created by the mergingIter itself that don't overlap // with any child iterator returned spans (i.e. empty spans that bridge two // distinct child-iterator-defined spans). - if found, s := m.synthesizeKeys(+1); found && s != nil { - return s + if found, s, err := m.synthesizeKeys(+1); err != nil { + return nil, err + } else if found && s != nil { + return s, nil } } // Exhausted. m.clear() - return nil + return nil, nil } -func (m *MergingIter) findPrevFragmentSet() *Span { +func (m *MergingIter) findPrevFragmentSet() (*keyspan.Span, error) { // Each iteration of this loop considers a new merged span between unique // user keys. An iteration may find that there exists no overlap for a given // span, (eg, if the spans [a,b), [d, e) exist within level iterators, the // below loop will still consider [b,d) before continuing to [a, b)). It // returns when it finds a span that is covered by at least one key. - for m.heap.len() > 0 && m.err == nil { + for m.heap.len() > 0 { // Initialize the next span's end bound. SeekLT and Last prepare the // heap without advancing. Prev leaves the heap in a state such that the // root is the largest bound key equal to the returned span's start key, @@ -931,11 +960,15 @@ func (m *MergingIter) findPrevFragmentSet() *Span { // L2: [c, e) // If we're positioned at L1's start(c) start boundary, we want to prev // to move to the first bound < c. 
- m.prevEntry() - for len(m.heap.items) > 0 && m.err == nil && m.cmp(m.heapRoot(), m.end) == 0 { - m.prevEntry() + if err := m.prevEntry(); err != nil { + return nil, err + } + for len(m.heap.items) > 0 && m.comparer.Equal(m.heapRoot(), m.end) { + if err := m.prevEntry(); err != nil { + return nil, err + } } - if len(m.heap.items) == 0 || m.err != nil { + if len(m.heap.items) == 0 { break } @@ -953,13 +986,15 @@ func (m *MergingIter) findPrevFragmentSet() *Span { // we elide empty spans created by the mergingIter itself that don't overlap // with any child iterator returned spans (i.e. empty spans that bridge two // distinct child-iterator-defined spans). - if found, s := m.synthesizeKeys(-1); found && s != nil { - return s + if found, s, err := m.synthesizeKeys(-1); err != nil { + return nil, err + } else if found && s != nil { + return s, nil } } // Exhausted. m.clear() - return nil + return nil, nil } func (m *MergingIter) heapRoot() []byte { @@ -979,9 +1014,9 @@ func (m *MergingIter) heapRoot() []byte { // // The boolean return value, `found`, is true if the returned span overlaps // with a span returned by a child iterator. -func (m *MergingIter) synthesizeKeys(dir int8) (bool, *Span) { +func (m *MergingIter) synthesizeKeys(dir int8) (bool, *keyspan.Span, error) { if invariants.Enabled { - if m.cmp(m.start, m.end) >= 0 { + if m.comparer.Compare(m.start, m.end) >= 0 { panic(fmt.Sprintf("pebble: invariant violation: span start ≥ end: %s >= %s", m.start, m.end)) } } @@ -995,73 +1030,70 @@ func (m *MergingIter) synthesizeKeys(dir int8) (bool, *Span) { found = true } } + // Sort the keys by sequence number in descending order. + // // TODO(jackson): We should be able to remove this sort and instead // guarantee that we'll return keys in the order of the levels they're from. // With careful iterator construction, this would guarantee that they're // sorted by trailer descending for the range key iteration use case. 
- sort.Sort(&m.keys) + slices.SortFunc(m.keys, func(a, b keyspan.Key) int { + return cmp.Compare(b.Trailer, a.Trailer) + }) // Apply the configured transform. See VisibleTransform. - m.span = Span{ + m.span = keyspan.Span{ Start: m.start, End: m.end, Keys: m.keys, - KeysOrder: ByTrailerDesc, + KeysOrder: keyspan.ByTrailerDesc, } - // NB: m.heap.cmp is a base.Compare, whereas m.cmp is a method on - // MergingIter. - if err := m.transformer.Transform(m.heap.cmp, m.span, &m.span); err != nil { - m.err = err - return false, nil + if err := m.transformer.Transform(m.comparer.CompareRangeSuffixes, m.span, &m.span); err != nil { + return false, nil, err } - return found, &m.span -} - -func (m *MergingIter) invalidate() { - m.err = nil + return found, &m.span, nil } func (m *MergingIter) clear() { for fi := range m.keys { - m.keys[fi] = Key{} + m.keys[fi] = keyspan.Key{} } m.keys = m.keys[:0] } // nextEntry steps to the next entry. -func (m *MergingIter) nextEntry() { +func (m *MergingIter) nextEntry() error { l := &m.levels[m.heap.items[0].index] - l.next() + if err := l.next(); err != nil { + return err + } if !l.heapKey.valid() { // l.iter is exhausted. - m.err = l.iter.Error() - if m.err == nil { - m.heap.pop() - } - return + m.heap.pop() + return nil } if m.heap.len() > 1 { m.heap.fix(0) } + return nil } // prevEntry steps to the previous entry. -func (m *MergingIter) prevEntry() { +func (m *MergingIter) prevEntry() error { l := &m.levels[m.heap.items[0].index] - l.prev() + if err := l.prev(); err != nil { + return err + } if !l.heapKey.valid() { // l.iter is exhausted. - m.err = l.iter.Error() - if m.err == nil { - m.heap.pop() - } - return + m.heap.pop() + return nil } if m.heap.len() > 1 { m.heap.fix(0) } + return nil } // DebugString returns a string representing the current internal state of the @@ -1075,6 +1107,24 @@ func (m *MergingIter) DebugString() string { return buf.String() } +// WrapChildren implements FragmentIterator. 
+func (m *MergingIter) WrapChildren(wrap keyspan.WrapFn) { + for i := range m.levels { + m.levels[i].iter = wrap(m.levels[i].iter) + } + m.wrapFn = wrap +} + +// DebugTree is part of the FragmentIterator interface. +func (m *MergingIter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", m, m) + for i := range m.levels { + if iter := m.levels[i].iter; iter != nil { + m.levels[i].iter.DebugTree(n) + } + } +} + type mergingIterItem struct { // boundKey points to the corresponding mergingIterLevel's `iterKey`. *boundKey @@ -1183,7 +1233,7 @@ type boundKey struct { // // If kind is boundKindFragmentStart, then key is span.Start. If kind is // boundKindFragmentEnd, then key is span.End. - span *Span + span *keyspan.Span } func (k boundKey) valid() bool { diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/logging_iter.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/logging_iter.go new file mode 100644 index 0000000..a66bd6f --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/logging_iter.go @@ -0,0 +1,159 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import ( + "context" + "fmt" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +// WrapFn is the prototype for a function that wraps a FragmentIterator. +type WrapFn func(in FragmentIterator) FragmentIterator + +// InjectLogging wraps all iterators in a stack with logging iterators, +// producing log messages showing each operation and its result. +func InjectLogging(iter FragmentIterator, logger base.Logger) FragmentIterator { + // All iterators in the stack will use the same logging state. 
+ state := &loggingState{ + log: logger, + } + var wrap WrapFn + wrap = func(in FragmentIterator) FragmentIterator { + if in == nil { + return nil + } + // Recursively wrap all descendants. + in.WrapChildren(wrap) + return newLoggingIter(state, in) + } + return wrap(iter) +} + +func newLoggingIter(state *loggingState, iter FragmentIterator) FragmentIterator { + return &loggingIter{ + iter: iter, + state: state, + context: fmt.Sprintf("%T(%p):", iter, iter), + } +} + +// loggingIter is a pass-through FragmentIterator wrapper which performs checks +// on what the wrapped iterator returns. +type loggingIter struct { + iter FragmentIterator + state *loggingState + context string +} + +// loggingState is shared by all iterators in a stack. +type loggingState struct { + node treeprinter.Node + log base.Logger +} + +func (i *loggingIter) opStartf(format string, args ...any) func(results ...any) { + savedNode := i.state.node + + n := i.state.node + topLevelOp := false + if n == (treeprinter.Node{}) { + n = treeprinter.New() + topLevelOp = true + } + op := fmt.Sprintf(format, args...) + + child := n.Childf("%s %s", i.context, op) + i.state.node = child + + return func(results ...any) { + if len(results) > 0 { + child.Childf("%s", fmt.Sprint(results...)) + } + if topLevelOp { + for _, row := range n.FormattedRows() { + i.state.log.Infof("%s\n", row) + } + } + i.state.node = savedNode + } +} + +var _ FragmentIterator = (*loggingIter)(nil) + +// SeekGE implements FragmentIterator. +func (i *loggingIter) SeekGE(key []byte) (*Span, error) { + opEnd := i.opStartf("SeekGE(%q)", key) + span, err := i.iter.SeekGE(key) + opEnd(span, err) + return span, err +} + +// SeekLT implements FragmentIterator. +func (i *loggingIter) SeekLT(key []byte) (*Span, error) { + opEnd := i.opStartf("SeekLT(%q)", key) + span, err := i.iter.SeekLT(key) + opEnd(span, err) + return span, err +} + +// First implements FragmentIterator. 
+func (i *loggingIter) First() (*Span, error) { + opEnd := i.opStartf("First()") + span, err := i.iter.First() + opEnd(span, err) + return span, err +} + +// Last implements FragmentIterator. +func (i *loggingIter) Last() (*Span, error) { + opEnd := i.opStartf("Last()") + span, err := i.iter.Last() + opEnd(span, err) + return span, err +} + +// Next implements FragmentIterator. +func (i *loggingIter) Next() (*Span, error) { + opEnd := i.opStartf("Next()") + span, err := i.iter.Next() + opEnd(span, err) + return span, err +} + +// Prev implements FragmentIterator. +func (i *loggingIter) Prev() (*Span, error) { + opEnd := i.opStartf("Prev()") + span, err := i.iter.Prev() + opEnd(span, err) + return span, err +} + +// SetContext is part of the FragmentIterator interface. +func (i *loggingIter) SetContext(ctx context.Context) { + i.iter.SetContext(ctx) +} + +// Close implements FragmentIterator. +func (i *loggingIter) Close() { + opEnd := i.opStartf("Close()") + i.iter.Close() + opEnd() +} + +// WrapChildren implements FragmentIterator. +func (i *loggingIter) WrapChildren(wrap WrapFn) { + i.iter = wrap(i.iter) +} + +// DebugTree is part of the FragmentIterator interface. +func (i *loggingIter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", i, i) + if i.iter != nil { + i.iter.DebugTree(n) + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/seek.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/seek.go new file mode 100644 index 0000000..2eafc06 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/seek.go @@ -0,0 +1,24 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import "github.com/cockroachdb/pebble/v2/internal/base" + +// SeekLE seeks to the span that contains or is before the target key. 
If an +// error occurs while seeking iter, a nil span and non-nil error is returned. +func SeekLE(cmp base.Compare, iter FragmentIterator, key []byte) (*Span, error) { + // Seek to the smallest span that contains a key ≥ key. If some span + // contains the key `key`, SeekGE will return it. + iterSpan, err := iter.SeekGE(key) + if err != nil { + return nil, err + } + if iterSpan != nil && cmp(key, iterSpan.Start) >= 0 { + return iterSpan, nil + } + // No span covers exactly `key`. Step backwards to move onto the largest + // span < key. + return iter.Prev() +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/keyspan/span.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/span.go similarity index 69% rename from vendor/github.com/cockroachdb/pebble/internal/keyspan/span.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/span.go index 257b373..4db835e 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/keyspan/span.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/span.go @@ -2,17 +2,17 @@ // of this source code is governed by a BSD-style license that can be found in // the LICENSE file. -package keyspan // import "github.com/cockroachdb/pebble/internal/keyspan" +package keyspan // import "github.com/cockroachdb/pebble/v2/internal/keyspan" import ( "bytes" + "cmp" "fmt" - "sort" - "strconv" + "slices" "strings" "unicode" - "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/v2/internal/base" ) // Span represents a set of keys over a span of user key space. All of the keys @@ -26,6 +26,13 @@ import ( // Currently the only supported key kinds are: // // RANGEDEL, RANGEKEYSET, RANGEKEYUNSET, RANGEKEYDEL. +// +// Spans either have only RANGEDEL keys (range del spans), or a mix of +// RANGEKESET/RANGEKEYUNSET/RANGEKEYDEL keys (range key spans). +// +// Note that at the user level, range key span start and end keys never have +// suffixes. 
Internally, range key spans get fragmented along sstable +// boundaries; however, this is transparent to the user. type Span struct { // Start and End encode the user key range of all the contained items, with // an inclusive start key and exclusive end key. Both Start and End must be @@ -47,7 +54,7 @@ type Span struct { type KeysOrder int8 const ( - // ByTrailerDesc indicates a Span's keys are sorted by Trailer descending. + // ByTrailerDesc indicates a Span's keys are sorted by InternalKeyTrailer descending. // This is the default ordering, and the ordering used during physical // storage. ByTrailerDesc KeysOrder = iota @@ -61,7 +68,7 @@ const ( // is applied. type Key struct { // Trailer contains the key kind and sequence number. - Trailer uint64 + Trailer base.InternalKeyTrailer // Suffix holds an optional suffix associated with the key. This is only // non-nil for RANGEKEYSET and RANGEKEYUNSET keys. Suffix []byte @@ -72,17 +79,17 @@ type Key struct { } // SeqNum returns the sequence number component of the key. -func (k Key) SeqNum() uint64 { - return k.Trailer >> 8 +func (k Key) SeqNum() base.SeqNum { + return k.Trailer.SeqNum() } // VisibleAt returns true if the provided key is visible at the provided // snapshot sequence number. It interprets batch sequence numbers as always // visible, because non-visible batch span keys are filtered when they're // fragmented. -func (k Key) VisibleAt(snapshot uint64) bool { +func (k Key) VisibleAt(snapshot base.SeqNum) bool { seq := k.SeqNum() - return seq < snapshot || seq&base.InternalKeySeqNumBatch != 0 + return seq < snapshot || seq&base.SeqNumBatchBit != 0 } // Kind returns the kind component of the key. @@ -93,12 +100,47 @@ func (k Key) Kind() base.InternalKeyKind { // Equal returns true if this Key is equal to the given key. Two keys are said // to be equal if the two Keys have equal trailers, suffix and value. Suffix // comparison uses the provided base.Compare func. Value comparison is bytewise. 
-func (k Key) Equal(equal base.Equal, b Key) bool { +func (k Key) Equal(suffixCmp base.CompareRangeSuffixes, b Key) bool { return k.Trailer == b.Trailer && - equal(k.Suffix, b.Suffix) && + suffixCmp(k.Suffix, b.Suffix) == 0 && bytes.Equal(k.Value, b.Value) } +// CopyFrom copies the contents of another key, retaining the Suffix and Value slices. +func (k *Key) CopyFrom(other Key) { + k.Trailer = other.Trailer + k.Suffix = append(k.Suffix[:0], other.Suffix...) + k.Value = append(k.Value[:0], other.Value...) +} + +// Clone creates a deep clone of the key, copying the Suffix and Value +// slices. +func (k Key) Clone() Key { + res := Key{ + Trailer: k.Trailer, + } + if len(k.Suffix) > 0 { + res.Suffix = slices.Clone(k.Suffix) + } + if len(k.Value) > 0 { + res.Value = slices.Clone(k.Value) + } + return res +} + +func (k Key) String() string { + var b strings.Builder + fmt.Fprintf(&b, "(#%d,%s", k.SeqNum(), k.Kind()) + if len(k.Suffix) > 0 || len(k.Value) > 0 { + fmt.Fprintf(&b, ",%s", k.Suffix) + } + if len(k.Value) > 0 { + fmt.Fprintf(&b, ",%s", k.Value) + } + b.WriteString(")") + return b.String() +} + // Valid returns true if the span is defined. func (s *Span) Valid() bool { return s.Start != nil && s.End != nil @@ -113,6 +155,11 @@ func (s *Span) Empty() bool { return s == nil || len(s.Keys) == 0 } +// Bounds returns Start and End as UserKeyBounds. +func (s *Span) Bounds() base.UserKeyBounds { + return base.UserKeyBoundsEndExclusive(s.Start, s.End) +} + // SmallestKey returns the smallest internal key defined by the span's keys. // It requires the Span's keys be in ByTrailerDesc order. It panics if the span // contains no keys or its keys are sorted in a different order. @@ -151,7 +198,7 @@ func (s *Span) LargestKey() base.InternalKey { // SmallestSeqNum returns the smallest sequence number of a key contained within // the span. It requires the Span's keys be in ByTrailerDesc order. 
It panics if // the span contains no keys or its keys are sorted in a different order. -func (s *Span) SmallestSeqNum() uint64 { +func (s *Span) SmallestSeqNum() base.SeqNum { if len(s.Keys) == 0 { panic("pebble: Span contains no keys") } else if s.KeysOrder != ByTrailerDesc { @@ -164,7 +211,7 @@ func (s *Span) SmallestSeqNum() uint64 { // LargestSeqNum returns the largest sequence number of a key contained within // the span. It requires the Span's keys be in ByTrailerDesc order. It panics if // the span contains no keys or its keys are sorted in a different order. -func (s *Span) LargestSeqNum() uint64 { +func (s *Span) LargestSeqNum() base.SeqNum { if len(s.Keys) == 0 { panic("pebble: Span contains no keys") } else if s.KeysOrder != ByTrailerDesc { @@ -173,6 +220,26 @@ func (s *Span) LargestSeqNum() uint64 { return s.Keys[0].SeqNum() } +// LargestVisibleSeqNum returns the largest sequence number of a key contained +// within the span that's also visible at the provided snapshot sequence number. +// It requires the Span's keys be in ByTrailerDesc order. It panics if the span +// contains no keys or its keys are sorted in a different order. +func (s *Span) LargestVisibleSeqNum(snapshot base.SeqNum) (largest base.SeqNum, ok bool) { + if s == nil { + return 0, false + } else if len(s.Keys) == 0 { + panic("pebble: Span contains no keys") + } else if s.KeysOrder != ByTrailerDesc { + panic("pebble: span's keys unexpectedly not in trailer order") + } + for i := range s.Keys { + if s.Keys[i].VisibleAt(snapshot) { + return s.Keys[i].SeqNum(), true + } + } + return 0, false +} + // TODO(jackson): Replace most of the calls to Visible with more targeted calls // that avoid the need to construct a new Span. @@ -182,7 +249,7 @@ func (s *Span) LargestSeqNum() uint64 { // // Visible may incur an allocation, so callers should prefer targeted, // non-allocating methods when possible. 
-func (s Span) Visible(snapshot uint64) Span { +func (s Span) Visible(snapshot base.SeqNum) Span { if s.KeysOrder != ByTrailerDesc { panic("pebble: span's keys unexpectedly not in trailer order") } @@ -212,7 +279,7 @@ func (s Span) Visible(snapshot uint64) Span { lastBatchIdx := -1 lastNonVisibleIdx := -1 for i := range s.Keys { - if seqNum := s.Keys[i].SeqNum(); seqNum&base.InternalKeySeqNumBatch != 0 { + if seqNum := s.Keys[i].SeqNum(); seqNum&base.SeqNumBatchBit != 0 { // Batch key. Always visible. lastBatchIdx = i } else if seqNum >= snapshot { @@ -259,13 +326,13 @@ func (s Span) Visible(snapshot uint64) Span { // // VisibleAt requires the Span's keys be in ByTrailerDesc order. It panics if // the span's keys are sorted in a different order. -func (s *Span) VisibleAt(snapshot uint64) bool { +func (s *Span) VisibleAt(snapshot base.SeqNum) bool { if s.KeysOrder != ByTrailerDesc { panic("pebble: span's keys unexpectedly not in trailer order") } if len(s.Keys) == 0 { return false - } else if first := s.Keys[0].SeqNum(); first&base.InternalKeySeqNumBatch != 0 { + } else if first := s.Keys[0].SeqNum(); first&base.SeqNumBatchBit != 0 { // Only visible batch keys are included when an Iterator's batch spans // are fragmented. They must always be visible. return true @@ -279,41 +346,17 @@ func (s *Span) VisibleAt(snapshot uint64) bool { } } -// ShallowClone returns the span with a Keys slice owned by the span itself. -// None of the key byte slices are cloned (see Span.DeepClone). -func (s *Span) ShallowClone() Span { +// Clone clones the span, creating copies of all contained slices. Clone is +// allocation heavy and should not be used in hot paths. +func (s *Span) Clone() Span { c := Span{ - Start: s.Start, - End: s.End, - Keys: make([]Key, len(s.Keys)), + Start: slices.Clone(s.Start), + End: slices.Clone(s.End), KeysOrder: s.KeysOrder, } - copy(c.Keys, s.Keys) - return c -} - -// DeepClone clones the span, creating copies of all contained slices. 
DeepClone -// is intended for non-production code paths like tests, the level checker, etc -// because it is allocation heavy. -func (s *Span) DeepClone() Span { - c := Span{ - Start: make([]byte, len(s.Start)), - End: make([]byte, len(s.End)), - Keys: make([]Key, len(s.Keys)), - KeysOrder: s.KeysOrder, - } - copy(c.Start, s.Start) - copy(c.End, s.End) - for i := range s.Keys { - c.Keys[i].Trailer = s.Keys[i].Trailer - if len(s.Keys[i].Suffix) > 0 { - c.Keys[i].Suffix = make([]byte, len(s.Keys[i].Suffix)) - copy(c.Keys[i].Suffix, s.Keys[i].Suffix) - } - if len(s.Keys[i].Value) > 0 { - c.Keys[i].Value = make([]byte, len(s.Keys[i].Value)) - copy(c.Keys[i].Value, s.Keys[i].Value) - } + c.Keys = make([]Key, len(s.Keys)) + for i := range c.Keys { + c.Keys[i] = s.Keys[i].Clone() } return c } @@ -327,7 +370,7 @@ func (s *Span) Contains(cmp base.Compare, key []byte) bool { // // Covers requires the Span's keys be in ByTrailerDesc order. It panics if the // span's keys are sorted in a different order. -func (s Span) Covers(seqNum uint64) bool { +func (s Span) Covers(seqNum base.SeqNum) bool { if s.KeysOrder != ByTrailerDesc { panic("pebble: span's keys unexpectedly not in trailer order") } @@ -343,14 +386,14 @@ func (s Span) Covers(seqNum uint64) bool { // // CoversAt requires the Span's keys be in ByTrailerDesc order. It panics if the // span's keys are sorted in a different order. -func (s *Span) CoversAt(snapshot, seqNum uint64) bool { +func (s *Span) CoversAt(snapshot, seqNum base.SeqNum) bool { if s.KeysOrder != ByTrailerDesc { panic("pebble: span's keys unexpectedly not in trailer order") } // NB: A key is visible at `snapshot` if its sequence number is strictly // less than `snapshot`. See base.Visible. for i := range s.Keys { - if kseq := s.Keys[i].SeqNum(); kseq&base.InternalKeySeqNumBatch != 0 { + if kseq := s.Keys[i].SeqNum(); kseq&base.SeqNumBatchBit != 0 { // Only visible batch keys are included when an Iterator's batch spans // are fragmented. 
They must always be visible. return kseq > seqNum @@ -361,6 +404,33 @@ func (s *Span) CoversAt(snapshot, seqNum uint64) bool { return false } +// Reset clears the span's Start, End, and Keys fields, retaining the slices for +// reuse. +func (s *Span) Reset() { + s.Start = s.Start[:0] + s.End = s.End[:0] + s.Keys = s.Keys[:0] +} + +// CopyFrom deep-copies the contents of the other span, retaining the slices +// allocated in this span. +func (s *Span) CopyFrom(other *Span) { + s.Start = append(s.Start[:0], other.Start...) + s.End = append(s.End[:0], other.End...) + + // We want to preserve any existing Suffix/Value buffers. + if cap(s.Keys) >= len(other.Keys) { + s.Keys = s.Keys[:len(other.Keys)] + } else { + s.Keys = append(s.Keys[:cap(s.Keys)], make([]Key, len(other.Keys)-cap(s.Keys))...) + } + for i := range other.Keys { + s.Keys[i].CopyFrom(other.Keys[i]) + } + + s.KeysOrder = other.KeysOrder +} + // String returns a string representation of the span. func (s Span) String() string { return fmt.Sprint(prettySpan{Span: s, formatKey: base.DefaultFormatter}) @@ -387,36 +457,48 @@ func (s prettySpan) Format(fs fmt.State, c rune) { if i > 0 { fmt.Fprint(fs, " ") } - fmt.Fprintf(fs, "(#%d,%s", k.SeqNum(), k.Kind()) - if len(k.Suffix) > 0 || len(k.Value) > 0 { - fmt.Fprintf(fs, ",%s", k.Suffix) - } - if len(k.Value) > 0 { - fmt.Fprintf(fs, ",%s", k.Value) - } - fmt.Fprint(fs, ")") + fmt.Fprint(fs, k.String()) } fmt.Fprintf(fs, "}") } -// SortKeysByTrailer sorts a keys slice by trailer. -func SortKeysByTrailer(keys *[]Key) { - // NB: keys is a pointer to a slice instead of a slice to avoid `sorted` - // escaping to the heap. - sorted := (*keysBySeqNumKind)(keys) - sort.Sort(sorted) +// SortKeysByTrailer sorts a Keys slice by trailer. +func SortKeysByTrailer(keys []Key) { + slices.SortFunc(keys, func(a, b Key) int { + // Trailer are ordered in decreasing number order. 
+ return -cmp.Compare(a.Trailer, b.Trailer) + }) } -// KeysBySuffix implements sort.Interface, sorting its member Keys slice to by -// Suffix in the order dictated by Cmp. -type KeysBySuffix struct { - Cmp base.Compare - Keys []Key +// SortKeysByTrailerAndSuffix sorts a Keys slice by trailer, and among keys with +// equal trailers, by suffix. +func SortKeysByTrailerAndSuffix(suffixCmp base.CompareRangeSuffixes, keys []Key) { + slices.SortFunc(keys, func(a, b Key) int { + // Trailer are ordered in decreasing number order. + if v := cmp.Compare(b.Trailer, a.Trailer); v != 0 { + return v + } + return suffixCmp(a.Suffix, b.Suffix) + }) +} + +// SortSpansByStartKey sorts the spans by start key. +// +// This is the ordering required by the Fragmenter. Usually spans are naturally +// sorted by their start key, but that isn't true for range deletion tombstones +// in the legacy range-del-v1 block format. +func SortSpansByStartKey(cmp base.Compare, spans []Span) { + slices.SortFunc(spans, func(a, b Span) int { + return cmp(a.Start, b.Start) + }) } -func (s *KeysBySuffix) Len() int { return len(s.Keys) } -func (s *KeysBySuffix) Less(i, j int) bool { return s.Cmp(s.Keys[i].Suffix, s.Keys[j].Suffix) < 0 } -func (s *KeysBySuffix) Swap(i, j int) { s.Keys[i], s.Keys[j] = s.Keys[j], s.Keys[i] } +// SortSpansByEndKey sorts the spans by the end key. +func SortSpansByEndKey(cmp base.Compare, spans []Span) { + slices.SortFunc(spans, func(a, b Span) int { + return cmp(a.End, b.End) + }) +} // ParseSpan parses the string representation of a Span. It's intended for // tests. ParseSpan panics if passed a malformed span representation. @@ -435,9 +517,12 @@ func ParseSpan(input string) Span { // Each of the remaining parts represents a single Key. 
s.Keys = make([]Key, 0, len(parts)-2) for _, p := range parts[2:] { + if len(p) >= 2 && p[0] == '(' && p[len(p)-1] == ')' { + p = p[1 : len(p)-1] + } keyFields := strings.FieldsFunc(p, func(r rune) bool { switch r { - case '#', ',', '(', ')': + case '#', ',': return true default: return unicode.IsSpace(r) @@ -445,12 +530,7 @@ func ParseSpan(input string) Span { }) var k Key - // Parse the sequence number. - seqNum, err := strconv.ParseUint(keyFields[0], 10, 64) - if err != nil { - panic(fmt.Sprintf("invalid sequence number: %q: %s", keyFields[0], err)) - } - // Parse the key kind. + seqNum := base.ParseSeqNum(keyFields[0]) kind := base.ParseKind(keyFields[1]) k.Trailer = base.MakeTrailer(seqNum, kind) // Parse the optional suffix. @@ -463,5 +543,11 @@ func ParseSpan(input string) Span { } s.Keys = append(s.Keys, k) } + for i := 1; i < len(s.Keys); i++ { + if s.Keys[i-1].Trailer < s.Keys[i].Trailer { + panic(fmt.Sprintf("span keys not sorted: %s %s", s.Keys[i-1], s.Keys[i])) + } + } + s.KeysOrder = ByTrailerDesc return s } diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/test_utils.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/test_utils.go new file mode 100644 index 0000000..d588839 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/test_utils.go @@ -0,0 +1,590 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import ( + "bytes" + "context" + "fmt" + "go/token" + "io" + "reflect" + "strconv" + "strings" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/dsl" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +// This file contains testing facilities for Spans and FragmentIterators. It's +// defined here so that it may be used by the keyspan package to test its +// various FragmentIterator implementations. 
+// +// TODO(jackson): Move keyspan.{Span,Key,FragmentIterator} into internal/base, +// and then move the testing facilities to an independent package, eg +// internal/itertest. Alternatively, make all tests that use it use the +// keyspan_test package, which can then import a separate itertest package. + +// probe defines an interface for probes that may inspect or mutate internal +// span iterator behavior. +type probe interface { + // probe inspects, and possibly manipulates, iterator operations' results. + probe(*probeContext) +} + +func parseProbes(probeDSLs ...string) []probe { + probes := make([]probe, len(probeDSLs)) + var err error + for i := range probeDSLs { + probes[i], err = probeParser.Parse(probeDSLs[i]) + if err != nil { + panic(err) + } + } + return probes +} + +func attachProbes(iter FragmentIterator, pctx probeContext, probes ...probe) FragmentIterator { + if pctx.log == nil { + pctx.log = io.Discard + } + for i := range probes { + iter = &probeIterator{ + iter: iter, + probe: probes[i], + probeCtx: pctx, + } + } + return iter +} + +// ParseAndAttachProbes parses DSL probes and attaches them to an iterator. +func ParseAndAttachProbes( + iter FragmentIterator, log io.Writer, probeDSLs ...string, +) FragmentIterator { + pctx := probeContext{log: log} + return attachProbes(iter, pctx, parseProbes(probeDSLs...)...) +} + +// probeContext provides the context within which a probe is run. It includes +// information about the iterator operation in progress. +type probeContext struct { + op + log io.Writer +} + +type op struct { + Kind OpKind + SeekKey []byte + Span *Span + Err error +} + +// ErrInjected is an error artificially injected for testing. 
+var ErrInjected = &errorProbe{name: "ErrInjected", err: errors.New("injected error")} + +var probeParser = func() *dsl.Parser[probe] { + valuerParser := dsl.NewParser[valuer]() + valuerParser.DefineConstant("StartKey", func() valuer { return startKey{} }) + valuerParser.DefineFunc("Bytes", + func(p *dsl.Parser[valuer], s *dsl.Scanner) valuer { + v := bytesConstant{bytes: []byte(s.ConsumeString())} + s.Consume(token.RPAREN) + return v + }) + + predicateParser := dsl.NewPredicateParser[*probeContext]() + predicateParser.DefineFunc("Equal", + func(p *dsl.Parser[dsl.Predicate[*probeContext]], s *dsl.Scanner) dsl.Predicate[*probeContext] { + eq := equal{ + valuerParser.ParseFromPos(s, s.Scan()), + valuerParser.ParseFromPos(s, s.Scan()), + } + s.Consume(token.RPAREN) + return eq + }) + for i, name := range opNames { + opKind := OpKind(i) + predicateParser.DefineConstant(name, func() dsl.Predicate[*probeContext] { + // An OpKind implements dsl.Predicate[*probeContext]. + return opKind + }) + } + probeParser := dsl.NewParser[probe]() + probeParser.DefineConstant("ErrInjected", func() probe { return ErrInjected }) + probeParser.DefineConstant("noop", func() probe { return noop{} }) + probeParser.DefineFunc("If", + func(p *dsl.Parser[probe], s *dsl.Scanner) probe { + probe := ifProbe{ + predicateParser.ParseFromPos(s, s.Scan()), + probeParser.ParseFromPos(s, s.Scan()), + probeParser.ParseFromPos(s, s.Scan()), + } + s.Consume(token.RPAREN) + return probe + }) + probeParser.DefineFunc("Return", + func(p *dsl.Parser[probe], s *dsl.Scanner) (ret probe) { + switch tok := s.Scan(); tok.Kind { + case token.STRING: + str, err := strconv.Unquote(tok.Lit) + if err != nil { + panic(err) + } + span := ParseSpan(str) + ret = returnSpan{s: &span} + case token.IDENT: + switch tok.Lit { + case "nil": + ret = returnSpan{s: nil} + default: + panic(errors.Newf("unrecognized return value %q", tok.Lit)) + } + } + s.Consume(token.RPAREN) + return ret + }) + probeParser.DefineFunc("Log", + func(p 
*dsl.Parser[probe], s *dsl.Scanner) (ret probe) { + ret = loggingProbe{prefix: s.ConsumeString()} + s.Consume(token.RPAREN) + return ret + }) + return probeParser +}() + +// probe implementations + +type errorProbe struct { + name string + err error +} + +func (p *errorProbe) String() string { return p.name } +func (p *errorProbe) Error() error { return p.err } +func (p *errorProbe) probe(pctx *probeContext) { + pctx.op.Err = p.err + pctx.op.Span = nil +} + +// ifProbe is a conditional probe. If its predicate evaluates to true, it probes +// using its Then probe. If its predicate evalutes to false, it probes using its +// Else probe. +type ifProbe struct { + Predicate dsl.Predicate[*probeContext] + Then probe + Else probe +} + +func (p ifProbe) String() string { return fmt.Sprintf("(If %s %s %s)", p.Predicate, p.Then, p.Else) } +func (p ifProbe) probe(pctx *probeContext) { + if p.Predicate.Evaluate(pctx) { + p.Then.probe(pctx) + } else { + p.Else.probe(pctx) + } +} + +type returnSpan struct { + s *Span +} + +func (p returnSpan) String() string { + if p.s == nil { + return "(Return nil)" + } + return fmt.Sprintf("(Return %q)", p.s.String()) +} + +func (p returnSpan) probe(pctx *probeContext) { + pctx.op.Span = p.s + pctx.op.Err = nil +} + +type noop struct{} + +func (noop) String() string { return "Noop" } +func (noop) probe(pctx *probeContext) {} + +type loggingProbe struct { + prefix string +} + +func (lp loggingProbe) String() string { return fmt.Sprintf("(Log %q)", lp.prefix) } +func (lp loggingProbe) probe(pctx *probeContext) { + opStr := strings.TrimPrefix(pctx.op.Kind.String(), "Op") + fmt.Fprintf(pctx.log, "%s%s(", lp.prefix, opStr) + if pctx.op.SeekKey != nil { + fmt.Fprintf(pctx.log, "%q", pctx.op.SeekKey) + } + fmt.Fprint(pctx.log, ") = ") + if pctx.op.Span == nil { + fmt.Fprint(pctx.log, "nil") + if pctx.op.Err != nil { + fmt.Fprintf(pctx.log, " ", pctx.op.Err) + } + } else { + fmt.Fprint(pctx.log, pctx.op.Span.String()) + } + fmt.Fprintln(pctx.log) +} + 
+// dsl.Predicate[*probeContext] implementations. + +type equal struct { + a, b valuer +} + +func (e equal) String() string { return fmt.Sprintf("(Equal %s %s)", e.a, e.b) } +func (e equal) Evaluate(pctx *probeContext) bool { + return reflect.DeepEqual(e.a.value(pctx), e.b.value(pctx)) +} + +// OpKind indicates the type of iterator operation being performed. +type OpKind int8 + +// OpKind values. +const ( + OpSeekGE OpKind = iota + OpSeekLT + OpFirst + OpLast + OpNext + OpPrev + OpClose + numOpKinds +) + +func (o OpKind) String() string { return opNames[o] } + +// Evaluate implements dsl.Predicate. +func (o OpKind) Evaluate(pctx *probeContext) bool { return pctx.op.Kind == o } + +var opNames = [numOpKinds]string{ + OpSeekGE: "OpSeekGE", + OpSeekLT: "OpSeekLT", + OpFirst: "OpFirst", + OpLast: "OpLast", + OpNext: "OpNext", + OpPrev: "OpPrev", + OpClose: "OpClose", +} + +// valuer implementations + +type valuer interface { + fmt.Stringer + value(pctx *probeContext) any +} + +type bytesConstant struct { + bytes []byte +} + +func (b bytesConstant) String() string { return fmt.Sprintf("%q", string(b.bytes)) } +func (b bytesConstant) value(pctx *probeContext) any { return b.bytes } + +type startKey struct{} + +func (s startKey) String() string { return "StartKey" } +func (s startKey) value(pctx *probeContext) any { + if pctx.op.Span == nil { + return nil + } + return pctx.op.Span.Start +} + +type probeIterator struct { + iter FragmentIterator + probe probe + probeCtx probeContext +} + +// Assert that probeIterator implements the fragment iterator interface. 
+var _ FragmentIterator = (*probeIterator)(nil) + +func (p *probeIterator) handleOp(preProbeOp op) (*Span, error) { + p.probeCtx.op = preProbeOp + p.probe.probe(&p.probeCtx) + return p.probeCtx.op.Span, p.probeCtx.op.Err +} + +func (p *probeIterator) SeekGE(key []byte) (*Span, error) { + op := op{ + Kind: OpSeekGE, + SeekKey: key, + } + if p.iter != nil { + op.Span, op.Err = p.iter.SeekGE(key) + } + return p.handleOp(op) +} + +func (p *probeIterator) SeekLT(key []byte) (*Span, error) { + op := op{ + Kind: OpSeekLT, + SeekKey: key, + } + if p.iter != nil { + op.Span, op.Err = p.iter.SeekLT(key) + } + return p.handleOp(op) +} + +func (p *probeIterator) First() (*Span, error) { + op := op{Kind: OpFirst} + if p.iter != nil { + op.Span, op.Err = p.iter.First() + } + return p.handleOp(op) +} + +func (p *probeIterator) Last() (*Span, error) { + op := op{Kind: OpLast} + if p.iter != nil { + op.Span, op.Err = p.iter.Last() + } + return p.handleOp(op) +} + +func (p *probeIterator) Next() (*Span, error) { + op := op{Kind: OpNext} + if p.iter != nil { + op.Span, op.Err = p.iter.Next() + } + return p.handleOp(op) +} + +func (p *probeIterator) Prev() (*Span, error) { + op := op{Kind: OpPrev} + if p.iter != nil { + op.Span, op.Err = p.iter.Prev() + } + return p.handleOp(op) +} + +// SetContext is part of the FragmentIterator interface. +func (p *probeIterator) SetContext(ctx context.Context) { + p.iter.SetContext(ctx) +} + +func (p *probeIterator) Close() { + op := op{Kind: OpClose} + if p.iter != nil { + p.iter.Close() + } + _, _ = p.handleOp(op) +} + +func (p *probeIterator) WrapChildren(wrap WrapFn) { + p.iter = wrap(p.iter) +} + +// DebugTree is part of the FragmentIterator interface. 
+func (p *probeIterator) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", p, p) + if p.iter != nil { + p.iter.DebugTree(n) + } +} + +// RunIterCmd evaluates a datadriven command controlling an internal +// keyspan.FragmentIterator, writing the results of the iterator operations to +// the provided writer. +func RunIterCmd(tdInput string, iter FragmentIterator, w io.Writer) { + lines := strings.Split(strings.TrimSpace(tdInput), "\n") + for i, line := range lines { + if i > 0 { + fmt.Fprintln(w) + } + line = strings.TrimSpace(line) + i := strings.IndexByte(line, '#') + iterCmd := line + if i > 0 { + iterCmd = string(line[:i]) + } + runIterOp(w, iter, iterCmd) + } +} + +var iterDelim = map[rune]bool{',': true, ' ': true, '(': true, ')': true, '"': true} + +func runIterOp(w io.Writer, it FragmentIterator, op string) { + fields := strings.FieldsFunc(op, func(r rune) bool { return iterDelim[r] }) + var s *Span + var err error + switch strings.ToLower(fields[0]) { + case "first": + s, err = it.First() + case "last": + s, err = it.Last() + case "seekge", "seek-ge": + if len(fields) == 1 { + panic(fmt.Sprintf("unable to parse iter op %q", op)) + } + s, err = it.SeekGE([]byte(fields[1])) + case "seeklt", "seek-lt": + if len(fields) == 1 { + panic(fmt.Sprintf("unable to parse iter op %q", op)) + } + s, err = it.SeekLT([]byte(fields[1])) + case "next": + s, err = it.Next() + case "prev": + s, err = it.Prev() + default: + panic(fmt.Sprintf("unrecognized iter op %q", fields[0])) + } + switch { + case err != nil: + fmt.Fprintf(w, " err=<%s>", err) + case s == nil: + fmt.Fprint(w, "") + default: + fmt.Fprint(w, s) + } +} + +// RunFragmentIteratorCmd runs a command on an iterator; intended for testing. 
+func RunFragmentIteratorCmd(iter FragmentIterator, input string, extraInfo func() string) string { + var b bytes.Buffer + for _, line := range strings.Split(input, "\n") { + parts := strings.Fields(line) + if len(parts) == 0 { + continue + } + var span *Span + var err error + switch parts[0] { + case "seek-ge": + if len(parts) != 2 { + return "seek-ge \n" + } + span, err = iter.SeekGE([]byte(strings.TrimSpace(parts[1]))) + case "seek-lt": + if len(parts) != 2 { + return "seek-lt \n" + } + span, err = iter.SeekLT([]byte(strings.TrimSpace(parts[1]))) + case "first": + span, err = iter.First() + case "last": + span, err = iter.Last() + case "next": + span, err = iter.Next() + case "prev": + span, err = iter.Prev() + default: + return fmt.Sprintf("unknown op: %s", parts[0]) + } + switch { + case err != nil: + fmt.Fprintf(&b, "err=%v\n", err) + case span == nil: + fmt.Fprintf(&b, ".\n") + default: + fmt.Fprintf(&b, "%s", span) + if extraInfo != nil { + fmt.Fprintf(&b, " (%s)", extraInfo()) + } + b.WriteByte('\n') + } + } + return b.String() +} + +// NewInvalidatingIter wraps a FragmentIterator; spans surfaced by the inner +// iterator are copied to buffers that are zeroed by subsequent iterator +// positioning calls. This is intended to help surface bugs in improper lifetime +// expectations of Spans. +func NewInvalidatingIter(iter FragmentIterator) FragmentIterator { + return &invalidatingIter{ + iter: iter, + } +} + +type invalidatingIter struct { + iter FragmentIterator + bufs [][]byte + keys []Key + span Span +} + +// invalidatingIter implements FragmentIterator. +var _ FragmentIterator = (*invalidatingIter)(nil) + +func (i *invalidatingIter) invalidate(s *Span, err error) (*Span, error) { + // Mangle the entirety of the byte bufs and the keys slice. 
+ for j := range i.bufs { + for k := range i.bufs[j] { + i.bufs[j][k] = 0xff + } + i.bufs[j] = nil + } + for j := range i.keys { + i.keys[j] = Key{} + } + if s == nil { + return nil, err + } + + // Copy all of the span's slices into slices owned by the invalidating iter + // that we can invalidate on a subsequent positioning method. + i.bufs = i.bufs[:0] + i.keys = i.keys[:0] + i.span = Span{ + Start: i.saveBytes(s.Start), + End: i.saveBytes(s.End), + KeysOrder: s.KeysOrder, + } + for j := range s.Keys { + i.keys = append(i.keys, Key{ + Trailer: s.Keys[j].Trailer, + Suffix: i.saveBytes(s.Keys[j].Suffix), + Value: i.saveBytes(s.Keys[j].Value), + }) + } + i.span.Keys = i.keys + return &i.span, err +} + +func (i *invalidatingIter) saveBytes(b []byte) []byte { + if b == nil { + return nil + } + saved := append([]byte(nil), b...) + i.bufs = append(i.bufs, saved) + return saved +} + +func (i *invalidatingIter) SeekGE(key []byte) (*Span, error) { return i.invalidate(i.iter.SeekGE(key)) } +func (i *invalidatingIter) SeekLT(key []byte) (*Span, error) { return i.invalidate(i.iter.SeekLT(key)) } +func (i *invalidatingIter) First() (*Span, error) { return i.invalidate(i.iter.First()) } +func (i *invalidatingIter) Last() (*Span, error) { return i.invalidate(i.iter.Last()) } +func (i *invalidatingIter) Next() (*Span, error) { return i.invalidate(i.iter.Next()) } +func (i *invalidatingIter) Prev() (*Span, error) { return i.invalidate(i.iter.Prev()) } + +// SetContext is part of the FragmentIterator interface. +func (i *invalidatingIter) SetContext(ctx context.Context) { + i.iter.SetContext(ctx) +} + +func (i *invalidatingIter) Close() { + _, _ = i.invalidate(nil, nil) + i.iter.Close() +} + +func (i *invalidatingIter) WrapChildren(wrap WrapFn) { + i.iter = wrap(i.iter) +} + +// DebugTree is part of the FragmentIterator interface. 
+func (i *invalidatingIter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", i, i) + if i.iter != nil { + i.iter.DebugTree(n) + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/transformer.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/transformer.go new file mode 100644 index 0000000..7d789a1 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/transformer.go @@ -0,0 +1,139 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import "github.com/cockroachdb/pebble/v2/internal/base" + +// Transformer defines a transformation to be applied to a Span. +type Transformer interface { + // Transform takes a Span as input and writes the transformed Span to the + // provided output *Span pointer. The output Span's Keys slice may be reused + // by Transform to reduce allocations. + Transform(suffixCmp base.CompareRangeSuffixes, in Span, out *Span) error +} + +// The TransformerFunc type is an adapter to allow the use of ordinary functions +// as Transformers. If f is a function with the appropriate signature, +// TransformerFunc(f) is a Transformer that calls f. +type TransformerFunc func(base.CompareRangeSuffixes, Span, *Span) error + +// Transform calls f(cmp, in, out). +func (tf TransformerFunc) Transform(suffixCmp base.CompareRangeSuffixes, in Span, out *Span) error { + return tf(suffixCmp, in, out) +} + +// NoopTransform is a Transformer that performs no mutations. +var NoopTransform Transformer = TransformerFunc(func(_ base.CompareRangeSuffixes, s Span, dst *Span) error { + dst.Start, dst.End = s.Start, s.End + dst.Keys = append(dst.Keys[:0], s.Keys...) + return nil +}) + +// VisibleTransform filters keys that are invisible at the provided snapshot +// sequence number. 
+func VisibleTransform(snapshot base.SeqNum) Transformer { + return TransformerFunc(func(_ base.CompareRangeSuffixes, s Span, dst *Span) error { + dst.Start, dst.End = s.Start, s.End + dst.Keys = dst.Keys[:0] + for _, k := range s.Keys { + // NB: The InternalKeySeqNumMax value is used for the batch snapshot + // because a batch's visible span keys are filtered when they're + // fragmented. There's no requirement to enforce visibility at + // iteration time. + if base.Visible(k.SeqNum(), snapshot, base.SeqNumMax) { + dst.Keys = append(dst.Keys, k) + } + } + return nil + }) +} + +// TransformerIter is a FragmentIterator that applies a Transformer on all +// returned keys. Used for when a caller needs to apply a transformer on an +// iterator but does not otherwise need the mergingiter's merging ability. +type TransformerIter struct { + FragmentIterator + + // Transformer is applied on every Span returned by this iterator. + Transformer Transformer + // Suffix comparer in use for this keyspace. + SuffixCmp base.CompareRangeSuffixes + + span Span +} + +func (t *TransformerIter) applyTransform(span *Span) (*Span, error) { + if span == nil { + return nil, nil + } + t.span = Span{ + Start: t.span.Start[:0], + End: t.span.End[:0], + Keys: t.span.Keys[:0], + } + if err := t.Transformer.Transform(t.SuffixCmp, *span, &t.span); err != nil { + return nil, err + } + return &t.span, nil +} + +// SeekGE implements the FragmentIterator interface. +func (t *TransformerIter) SeekGE(key []byte) (*Span, error) { + span, err := t.FragmentIterator.SeekGE(key) + if err != nil { + return nil, err + } + return t.applyTransform(span) +} + +// SeekLT implements the FragmentIterator interface. +func (t *TransformerIter) SeekLT(key []byte) (*Span, error) { + span, err := t.FragmentIterator.SeekLT(key) + if err != nil { + return nil, err + } + return t.applyTransform(span) +} + +// First implements the FragmentIterator interface. 
+func (t *TransformerIter) First() (*Span, error) { + span, err := t.FragmentIterator.First() + if err != nil { + return nil, err + } + return t.applyTransform(span) +} + +// Last implements the FragmentIterator interface. +func (t *TransformerIter) Last() (*Span, error) { + span, err := t.FragmentIterator.Last() + if err != nil { + return nil, err + } + return t.applyTransform(span) +} + +// Next implements the FragmentIterator interface. +func (t *TransformerIter) Next() (*Span, error) { + span, err := t.FragmentIterator.Next() + if err != nil { + return nil, err + } + return t.applyTransform(span) +} + +// Prev implements the FragmentIterator interface. +func (t *TransformerIter) Prev() (*Span, error) { + span, err := t.FragmentIterator.Prev() + if err != nil { + return nil, err + } + return t.applyTransform(span) +} + +// Close implements the FragmentIterator interface. +func (t *TransformerIter) Close() { + t.FragmentIterator.Close() +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/truncate.go b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/truncate.go new file mode 100644 index 0000000..78fcfa9 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/keyspan/truncate.go @@ -0,0 +1,189 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package keyspan + +import ( + "context" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +// Truncate creates a new iterator where every span in the supplied iterator is +// truncated to be contained within the given user key bounds. 
+// +// Note that fragment iterator Spans always have exclusive end-keys; if the +// given bounds have an inclusive end key, then the input iterator must not +// produce a span that contains that key. The only difference between bounds.End +// being inclusive vs exclusive is this extra check. +func Truncate(cmp base.Compare, iter FragmentIterator, bounds base.UserKeyBounds) FragmentIterator { + return &truncatingIter{ + iter: iter, + cmp: cmp, + bounds: bounds, + } +} + +type truncatingIter struct { + iter FragmentIterator + cmp base.Compare + + bounds base.UserKeyBounds + + span Span +} + +// SeekGE implements FragmentIterator. +func (i *truncatingIter) SeekGE(key []byte) (*Span, error) { + span, err := i.iter.SeekGE(key) + if err != nil { + return nil, err + } + span, spanBoundsChanged, err := i.nextSpanWithinBounds(span, +1) + if err != nil { + return nil, err + } + // nextSpanWithinBounds could return a span that's less than key, if the end + // bound was truncated to end at a key less than or equal to `key`. Detect + // this case and next/invalidate the iter. + if spanBoundsChanged && i.cmp(span.End, key) <= 0 { + return i.Next() + } + return span, nil +} + +// SeekLT implements FragmentIterator. +func (i *truncatingIter) SeekLT(key []byte) (*Span, error) { + span, err := i.iter.SeekLT(key) + if err != nil { + return nil, err + } + span, spanBoundsChanged, err := i.nextSpanWithinBounds(span, -1) + if err != nil { + return nil, err + } + // nextSpanWithinBounds could return a span that's >= key, if the start bound + // was truncated to start at a key greater than or equal to `key`. Detect this + // case and prev/invalidate the iter. + if spanBoundsChanged && i.cmp(span.Start, key) >= 0 { + return i.Prev() + } + return span, nil +} + +// First implements FragmentIterator. 
+func (i *truncatingIter) First() (*Span, error) { + span, err := i.iter.First() + if err != nil { + return nil, err + } + span, _, err = i.nextSpanWithinBounds(span, +1) + return span, err +} + +// Last implements FragmentIterator. +func (i *truncatingIter) Last() (*Span, error) { + span, err := i.iter.Last() + if err != nil { + return nil, err + } + span, _, err = i.nextSpanWithinBounds(span, -1) + return span, err +} + +// Next implements FragmentIterator. +func (i *truncatingIter) Next() (*Span, error) { + span, err := i.iter.Next() + if err != nil { + return nil, err + } + span, _, err = i.nextSpanWithinBounds(span, +1) + return span, err +} + +// Prev implements FragmentIterator. +func (i *truncatingIter) Prev() (*Span, error) { + span, err := i.iter.Prev() + if err != nil { + return nil, err + } + span, _, err = i.nextSpanWithinBounds(span, -1) + return span, err +} + +// SetContext is part of the FragmentIterator interface. +func (i *truncatingIter) SetContext(ctx context.Context) { + i.iter.SetContext(ctx) +} + +// Close implements FragmentIterator. +func (i *truncatingIter) Close() { + i.iter.Close() +} + +// nextSpanWithinBounds returns the first span (starting with the given span and +// advancing in the given direction) that intersects the bounds. It returns a +// span that is entirely within the bounds; spanBoundsChanged indicates if the span +// bounds had to be truncated. +func (i *truncatingIter) nextSpanWithinBounds( + span *Span, dir int8, +) (_ *Span, spanBoundsChanged bool, _ error) { + var err error + for span != nil { + if i.bounds.End.Kind == base.Inclusive && span.Contains(i.cmp, i.bounds.End.Key) { + err := base.AssertionFailedf("inclusive upper bound %q inside span %s", i.bounds.End.Key, span) + if invariants.Enabled { + panic(err) + } + return nil, false, err + } + // Intersect [span.Start, span.End) with [i.bounds.Start, i.bounds.End.Key). 
+ spanBoundsChanged = false + start := span.Start + if i.cmp(start, i.bounds.Start) < 0 { + spanBoundsChanged = true + start = i.bounds.Start + } + end := span.End + if i.cmp(end, i.bounds.End.Key) > 0 { + spanBoundsChanged = true + end = i.bounds.End.Key + } + if !spanBoundsChanged { + return span, false, nil + } + if i.cmp(start, end) < 0 { + i.span = Span{ + Start: start, + End: end, + Keys: span.Keys, + KeysOrder: span.KeysOrder, + } + return &i.span, true, nil + } + // Span is outside of bounds, find the next one. + if dir == +1 { + span, err = i.iter.Next() + } else { + span, err = i.iter.Prev() + } + } + // NB: err may be nil or non-nil. + return nil, false, err +} + +// WrapChildren implements FragmentIterator. +func (i *truncatingIter) WrapChildren(wrap WrapFn) { + i.iter = wrap(i.iter) +} + +// DebugTree is part of the FragmentIterator interface. +func (i *truncatingIter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", i, i) + if i.iter != nil { + i.iter.DebugTree(n) + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/lsmview/data.go b/vendor/github.com/cockroachdb/pebble/v2/internal/lsmview/data.go new file mode 100644 index 0000000..f34318a --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/lsmview/data.go @@ -0,0 +1,30 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package lsmview + +// Data encodes the data necessary to generate an LSM diagram. +type Data struct { + // LSM levels in newest-to-oldest order. + Levels []Level `json:"levels"` + + // Keys contains all table boundary keys, in sorted key order. + Keys []string `json:"keys"` +} + +// Level contains the data for a level of the LSM. +type Level struct { + Name string `json:"level_name"` + Tables []Table `json:"tables"` +} + +// Table contains the data for a table. 
+type Table struct { + Label string `json:"label"` + Size uint64 `json:"size"` + // SmallestKey, LargestKey are indexes into the Data.Keys list. + SmallestKey int `json:"smallest_key"` + LargestKey int `json:"largest_key"` + Details []string `json:"details"` +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/lsmview/url.go b/vendor/github.com/cockroachdb/pebble/v2/internal/lsmview/url.go new file mode 100644 index 0000000..e852e83 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/lsmview/url.go @@ -0,0 +1,41 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package lsmview + +import ( + "bytes" + "compress/zlib" + "encoding/base64" + "encoding/json" + "net/url" +) + +// GenerateURL generates a URL showing the LSM diagram. The URL contains the +// encoded and compressed data as the URL fragment. +func GenerateURL(data Data) (url.URL, error) { + var jsonBuf bytes.Buffer + if err := json.NewEncoder(&jsonBuf).Encode(data); err != nil { + return url.URL{}, err + } + + var compressed bytes.Buffer + encoder := base64.NewEncoder(base64.URLEncoding, &compressed) + compressor := zlib.NewWriter(encoder) + if _, err := jsonBuf.WriteTo(compressor); err != nil { + return url.URL{}, err + } + if err := compressor.Close(); err != nil { + return url.URL{}, err + } + if err := encoder.Close(); err != nil { + return url.URL{}, err + } + return url.URL{ + Scheme: "https", + Host: "raduberinde.github.io", + Path: "lsmview/decode.html", + Fragment: compressed.String(), + }, nil +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/annotator.go b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/annotator.go new file mode 100644 index 0000000..7170bca --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/annotator.go @@ -0,0 +1,444 @@ +// Copyright 2024 The LevelDB-Go 
and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "sort" + "sync/atomic" + + "github.com/cockroachdb/pebble/v2/internal/base" +) + +// The Annotator type defined below is used by other packages to lazily +// compute a value over a B-Tree. Each node of the B-Tree stores one +// `annotation` per annotator, containing the result of the computation over +// the node's subtree. +// +// An annotation is marked as valid if it's current with the current subtree +// state. Annotations are marked as invalid whenever a node will be mutated +// (in mut). Annotators may also return `false` from `Accumulate` to signal +// that a computation for a file is not stable and may change in the future. +// Annotations that include these unstable values are also marked as invalid +// on the node, ensuring that future queries for the annotation will recompute +// the value. + +// An Annotator defines a computation over a level's TableMetadata. If the +// computation is stable and uses inputs that are fixed for the lifetime of a +// TableMetadata, the LevelMetadata's internal data structures are annotated +// with the intermediary computations. This allows the computation to be +// computed incrementally as edits are applied to a level. +type Annotator[T any] struct { + Aggregator AnnotationAggregator[T] +} + +// An AnnotationAggregator defines how an annotation should be accumulated from +// a single TableMetadata and merged with other annotated values. +type AnnotationAggregator[T any] interface { + // Zero returns the zero value of an annotation. This value is returned + // when a LevelMetadata is empty. The dst argument, if non-nil, is an + // obsolete value previously returned by this Annotator and may be + // overwritten and reused to avoid a memory allocation. 
+	Zero(dst *T) *T
+
+	// Accumulate computes the annotation for a single file in a level's
+	// metadata. It merges the file's value into dst and returns a bool flag
+	// indicating whether or not the value is stable and okay to cache as an
+	// annotation. If the file's value may change over the life of the file,
+	// the annotator must return false.
+	//
+	// Implementations may modify dst and return it to avoid an allocation.
+	Accumulate(f *TableMetadata, dst *T) (v *T, cacheOK bool)
+
+	// Merge combines two values src and dst, returning the result.
+	// Implementations may modify dst and return it to avoid an allocation.
+	Merge(src *T, dst *T) *T
+}
+
+// A PartialOverlapAnnotationAggregator is an extension of AnnotationAggregator
+// that allows for custom accumulation of range annotations for files that only
+// partially overlap with the range.
+type PartialOverlapAnnotationAggregator[T any] interface {
+	AnnotationAggregator[T]
+	AccumulatePartialOverlap(f *TableMetadata, dst *T, bounds base.UserKeyBounds) *T
+}
+
+type annotation struct {
+	// annotator is a pointer to the Annotator that computed this annotation.
+	// NB: This is untyped to allow AnnotationAggregator to use Go generics,
+	// since annotations are stored in a slice on each node and a single
+	// slice cannot contain elements with different type parameters.
+	annotator interface{}
+	// v contains the annotation value, the output of either
+	// AnnotationAggregator.Accumulate or AnnotationAggregator.Merge.
+	// NB: This is untyped for the same reason as annotator above.
+	v atomic.Value
+	// valid indicates whether future reads of the annotation may use the
+	// value as-is. If false, v will be zeroed and recalculated. 
+ valid atomic.Bool +} + +func (a *Annotator[T]) findExistingAnnotation(n *node[*TableMetadata]) *annotation { + n.annotMu.RLock() + defer n.annotMu.RUnlock() + for i := range n.annot { + if n.annot[i].annotator == a { + return &n.annot[i] + } + } + return nil +} + +// findAnnotation finds this Annotator's annotation on a node, creating +// one if it doesn't already exist. +func (a *Annotator[T]) findAnnotation(n *node[*TableMetadata]) *annotation { + if a := a.findExistingAnnotation(n); a != nil { + return a + } + n.annotMu.Lock() + defer n.annotMu.Unlock() + + // This node has never been annotated by a. Create a new annotation. + n.annot = append(n.annot, annotation{ + annotator: a, + v: atomic.Value{}, + }) + n.annot[len(n.annot)-1].v.Store(a.Aggregator.Zero(nil)) + return &n.annot[len(n.annot)-1] +} + +// nodeAnnotation computes this annotator's annotation of this node across all +// files in the node's subtree. The second return value indicates whether the +// annotation is stable and thus cacheable. +func (a *Annotator[T]) nodeAnnotation(n *node[*TableMetadata]) (t *T, cacheOK bool) { + annot := a.findAnnotation(n) + // If the annotation is already marked as valid, we can return it without + // recomputing anything. + if annot.valid.Load() { + // The load of these two atomics should be safe, as we don't need to + // guarantee invalidations are concurrent-safe. See the comment on + // InvalidateLevelAnnotation about why. + return annot.v.Load().(*T), true + } + + t = a.Aggregator.Zero(t) + valid := true + + for i := int16(0); i <= n.count; i++ { + if !n.leaf { + v, ok := a.nodeAnnotation(n.children[i]) + t = a.Aggregator.Merge(v, t) + valid = valid && ok + } + + if i < n.count { + var ok bool + t, ok = a.Aggregator.Accumulate(n.items[i], t) + valid = valid && ok + } + } + + if valid { + // Two valid annotations should be identical, so this is + // okay. 
+ annot.v.Store(t) + annot.valid.Store(valid) + } + + return t, valid +} + +// accumulateRangeAnnotation computes this annotator's annotation across all +// files in the node's subtree which overlap with the range defined by bounds. +// The computed annotation is accumulated into a.scratch. +func (a *Annotator[T]) accumulateRangeAnnotation( + n *node[*TableMetadata], + cmp base.Compare, + bounds base.UserKeyBounds, + // fullyWithinLowerBound and fullyWithinUpperBound indicate whether this + // node's subtree is already known to be within each bound. + fullyWithinLowerBound bool, + fullyWithinUpperBound bool, + dst *T, +) *T { + // If this node's subtree is fully within the bounds, compute a regular + // annotation. + if fullyWithinLowerBound && fullyWithinUpperBound { + v, _ := a.nodeAnnotation(n) + dst = a.Aggregator.Merge(v, dst) + return dst + } + + // We will accumulate annotations from each item in the end-exclusive + // range [leftItem, rightItem). + leftItem, rightItem := 0, int(n.count) + if !fullyWithinLowerBound { + // leftItem is the index of the first item that overlaps the lower bound. + leftItem = sort.Search(int(n.count), func(i int) bool { + return cmp(bounds.Start, n.items[i].Largest().UserKey) <= 0 + }) + } + if !fullyWithinUpperBound { + // rightItem is the index of the first item that does not overlap the + // upper bound. + rightItem = sort.Search(int(n.count), func(i int) bool { + return !bounds.End.IsUpperBoundFor(cmp, n.items[i].Smallest().UserKey) + }) + } + + // Accumulate annotations from every item that overlaps the bounds. 
+	for i := leftItem; i < rightItem; i++ {
+		if i == leftItem || i == rightItem-1 {
+			if agg, ok := a.Aggregator.(PartialOverlapAnnotationAggregator[T]); ok {
+				fb := n.items[i].UserKeyBounds()
+				if cmp(bounds.Start, fb.Start) > 0 || bounds.End.CompareUpperBounds(cmp, fb.End) < 0 {
+					dst = agg.AccumulatePartialOverlap(n.items[i], dst, bounds)
+					continue
+				}
+			}
+		}
+		v, _ := a.Aggregator.Accumulate(n.items[i], dst)
+		dst = v
+	}
+
+	if !n.leaf {
+		// We will accumulate annotations from each child in the end-inclusive
+		// range [leftChild, rightChild].
+		leftChild, rightChild := leftItem, rightItem
+		// If the lower bound overlaps with the child at leftItem, there is no
+		// need to accumulate annotations from the child to its left.
+		if leftItem < int(n.count) && cmp(bounds.Start, n.items[leftItem].Smallest().UserKey) >= 0 {
+			leftChild++
+		}
+		// If the upper bound spans beyond the child at rightItem, we must also
+		// accumulate annotations from the child to its right.
+		if rightItem < int(n.count) && bounds.End.IsUpperBoundFor(cmp, n.items[rightItem].Largest().UserKey) {
+			rightChild++
+		}
+
+		for i := leftChild; i <= rightChild; i++ {
+			dst = a.accumulateRangeAnnotation(
+				n.children[i],
+				cmp,
+				bounds,
+				// If this child is to the right of leftItem, then its entire
+				// subtree is within the lower bound.
+				fullyWithinLowerBound || i > leftItem,
+				// If this child is to the left of rightItem, then its entire
+				// subtree is within the upper bound.
+				fullyWithinUpperBound || i < rightItem,
+				dst,
+			)
+		}
+	}
+	return dst
+}
+
+// invalidateNodeAnnotation marks this annotator's cached annotations as
+// invalid throughout a node's subtree. 
+func (a *Annotator[T]) invalidateNodeAnnotation(n *node[*TableMetadata]) { + annot := a.findAnnotation(n) + annot.valid.Store(false) + if !n.leaf { + for i := int16(0); i <= n.count; i++ { + a.invalidateNodeAnnotation(n.children[i]) + } + } +} + +// LevelAnnotation calculates the annotation defined by this Annotator for all +// files in the given LevelMetadata. A pointer to the Annotator is used as the +// key for pre-calculated values, so the same Annotator must be used to avoid +// duplicate computation. +func (a *Annotator[T]) LevelAnnotation(lm LevelMetadata) *T { + if lm.Empty() { + return a.Aggregator.Zero(nil) + } + + v, _ := a.nodeAnnotation(lm.tree.root) + return v +} + +// MultiLevelAnnotation calculates the annotation defined by this Annotator for +// all files across the given levels. A pointer to the Annotator is used as the +// key for pre-calculated values, so the same Annotator must be used to avoid +// duplicate computation. +func (a *Annotator[T]) MultiLevelAnnotation(lms []LevelMetadata) *T { + aggregated := a.Aggregator.Zero(nil) + for l := 0; l < len(lms); l++ { + if !lms[l].Empty() { + v := a.LevelAnnotation(lms[l]) + aggregated = a.Aggregator.Merge(v, aggregated) + } + } + return aggregated +} + +// LevelRangeAnnotation calculates the annotation defined by this Annotator for +// the files within LevelMetadata which are within the range +// [lowerBound, upperBound). A pointer to the Annotator is used as the key for +// pre-calculated values, so the same Annotator must be used to avoid duplicate +// computation. 
+func (a *Annotator[T]) LevelRangeAnnotation( + cmp base.Compare, lm LevelMetadata, bounds base.UserKeyBounds, +) *T { + if lm.Empty() { + return a.Aggregator.Zero(nil) + } + + var dst *T + dst = a.Aggregator.Zero(dst) + dst = a.accumulateRangeAnnotation(lm.tree.root, cmp, bounds, false, false, dst) + return dst +} + +// VersionRangeAnnotation calculates the annotation defined by this Annotator +// for all files within the given Version which are within the range +// defined by bounds. +func (a *Annotator[T]) VersionRangeAnnotation(v *Version, bounds base.UserKeyBounds) *T { + var dst *T + dst = a.Aggregator.Zero(dst) + accumulateSlice := func(ls LevelSlice) { + if ls.Empty() { + return + } + dst = a.accumulateRangeAnnotation(ls.iter.r, v.cmp.Compare, bounds, false, false, dst) + } + for _, ls := range v.L0SublevelFiles { + accumulateSlice(ls) + } + for _, lm := range v.Levels[1:] { + accumulateSlice(lm.Slice()) + } + return dst +} + +// InvalidateLevelAnnotation clears any cached annotations defined by Annotator. +// A pointer to the Annotator is used as the key for pre-calculated values, so +// the same Annotator must be used to clear the appropriate cached annotation. +// Calls to InvalidateLevelAnnotation are *not* concurrent-safe with any other +// calls to Annotator methods for the same Annotator (concurrent calls from +// other annotators are fine). Any calls to this function must have some +// externally-guaranteed mutual exclusion. +func (a *Annotator[T]) InvalidateLevelAnnotation(lm LevelMetadata) { + if lm.Empty() { + return + } + a.invalidateNodeAnnotation(lm.tree.root) +} + +// SumAggregator defines an Aggregator which sums together a uint64 value +// across files. +type SumAggregator struct { + AccumulateFunc func(f *TableMetadata) (v uint64, cacheOK bool) + AccumulatePartialOverlapFunc func(f *TableMetadata, bounds base.UserKeyBounds) uint64 +} + +// Zero implements AnnotationAggregator.Zero, returning a new uint64 set to 0. 
+func (sa SumAggregator) Zero(dst *uint64) *uint64 { + if dst == nil { + return new(uint64) + } + *dst = 0 + return dst +} + +// Accumulate implements AnnotationAggregator.Accumulate, accumulating a single +// file's uint64 value. +func (sa SumAggregator) Accumulate(f *TableMetadata, dst *uint64) (v *uint64, cacheOK bool) { + accumulated, ok := sa.AccumulateFunc(f) + *dst += accumulated + return dst, ok +} + +// AccumulatePartialOverlap implements +// PartialOverlapAnnotationAggregator.AccumulatePartialOverlap, accumulating a +// single file's uint64 value for a file which only partially overlaps with the +// range defined by bounds. +func (sa SumAggregator) AccumulatePartialOverlap( + f *TableMetadata, dst *uint64, bounds base.UserKeyBounds, +) *uint64 { + if sa.AccumulatePartialOverlapFunc == nil { + v, _ := sa.Accumulate(f, dst) + return v + } + *dst += sa.AccumulatePartialOverlapFunc(f, bounds) + return dst +} + +// Merge implements AnnotationAggregator.Merge by summing two uint64 values. +func (sa SumAggregator) Merge(src *uint64, dst *uint64) *uint64 { + *dst += *src + return dst +} + +// SumAnnotator takes a function that computes a uint64 value from a single +// TableMetadata and returns an Annotator that sums together the values across +// files. +func SumAnnotator(accumulate func(f *TableMetadata) (v uint64, cacheOK bool)) *Annotator[uint64] { + return &Annotator[uint64]{ + Aggregator: SumAggregator{ + AccumulateFunc: accumulate, + }, + } +} + +// NumFilesAnnotator is an Annotator which computes an annotation value +// equal to the number of files included in the annotation. Particularly, it +// can be used to efficiently calculate the number of files in a given key +// range using range annotations. +var NumFilesAnnotator = SumAnnotator(func(f *TableMetadata) (uint64, bool) { + return 1, true +}) + +// PickFileAggregator implements the AnnotationAggregator interface. It defines +// an aggregator that picks a single file from a set of eligible files. 
+type PickFileAggregator struct { + // Filter takes a TableMetadata and returns whether it is eligible to be + // picked by this PickFileAggregator. The second return value indicates + // whether this eligibility is stable and thus cacheable. + Filter func(f *TableMetadata) (eligible bool, cacheOK bool) + // Compare compares two instances of TableMetadata and returns true if the + // first one should be picked over the second one. It may assume that both + // arguments are non-nil. + Compare func(f1 *TableMetadata, f2 *TableMetadata) bool +} + +// Zero implements AnnotationAggregator.Zero, returning nil as the zero value. +func (fa PickFileAggregator) Zero(dst *TableMetadata) *TableMetadata { + return nil +} + +func (fa PickFileAggregator) mergePickedFiles( + src *TableMetadata, dst *TableMetadata, +) *TableMetadata { + switch { + case src == nil: + return dst + case dst == nil: + return src + case fa.Compare(src, dst): + return src + default: + return dst + } +} + +// Accumulate implements AnnotationAggregator.Accumulate, accumulating a single +// file as long as it is eligible to be picked. +func (fa PickFileAggregator) Accumulate( + f *TableMetadata, dst *TableMetadata, +) (v *TableMetadata, cacheOK bool) { + eligible, ok := fa.Filter(f) + if eligible { + return fa.mergePickedFiles(f, dst), ok + } + return dst, ok +} + +// Merge implements AnnotationAggregator.Merge by picking a single file based +// on the output of PickFileAggregator.Compare. +func (fa PickFileAggregator) Merge(src *TableMetadata, dst *TableMetadata) *TableMetadata { + return fa.mergePickedFiles(src, dst) +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/blob_metadata.go b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/blob_metadata.go new file mode 100644 index 0000000..5b78e2d --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/blob_metadata.go @@ -0,0 +1,1008 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. 
All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + stdcmp "cmp" + "container/heap" + "fmt" + "iter" + "maps" + "slices" + "strings" + "sync/atomic" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/humanize" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/strparse" + "github.com/cockroachdb/pebble/v2/sstable" + "github.com/cockroachdb/pebble/v2/sstable/blob" + "github.com/cockroachdb/redact" +) + +// A BlobReference describes a sstable's reference to a blob value file. A +// BlobReference is immutable. +type BlobReference struct { + // FileID identifies the referenced blob file. FileID is stable. If a blob + // file is rewritten and a blob reference is preserved during a compaction, + // the new sstable's BlobReference will preserve the same FileID. + FileID base.BlobFileID + // ValueSize is the sum of the lengths of the uncompressed values within the + // blob file for which there exists a reference in the sstable. Note that if + // any of the referencing tables are virtualized tables, the ValueSize may + // be approximate. + ValueSize uint64 + // EstimatedPhysicalSize is an estimate of the physical size of the blob + // reference, in bytes. It's calculated by scaling the blob file's physical + // size according to the ValueSize of the blob reference relative to the + // total ValueSize of the blob file. + EstimatedPhysicalSize uint64 +} + +// MakeBlobReference creates a BlobReference from the given file ID, value size, +// and physical blob file. 
+func MakeBlobReference( + fileID base.BlobFileID, valueSize uint64, phys *PhysicalBlobFile, +) BlobReference { + if invariants.Enabled { + switch { + case valueSize > phys.ValueSize: + panic(errors.AssertionFailedf("pebble: blob reference value size %d is greater than the blob file's value size %d", + valueSize, phys.ValueSize)) + case valueSize == 0: + panic(errors.AssertionFailedf("pebble: blob reference value size %d is zero", valueSize)) + case phys.ValueSize == 0: + panic(errors.AssertionFailedf("pebble: blob file value size %d is zero", phys.ValueSize)) + } + } + return BlobReference{ + FileID: fileID, + ValueSize: valueSize, + // valueSize + // Reference size = ----------------- × phys.Size + // phys.ValueSize + // + // We perform the multiplication first to avoid floating point arithmetic. + EstimatedPhysicalSize: (valueSize * phys.Size) / phys.ValueSize, + } +} + +// BlobFileMetadata encapsulates a blob file ID used to identify a particular +// blob file, and a reference-counted physical blob file. Different Versions may +// contain different BlobFileMetadata with the same FileID, but if so they +// necessarily point to different PhysicalBlobFiles. +// +// See the BlobFileSet documentation for more details. +type BlobFileMetadata struct { + // FileID is a stable identifier for referencing a blob file containing + // values. It is the same domain as the BlobReference.FileID. Blob + // references use the FileID to look up the physical blob file containing + // referenced values. + FileID base.BlobFileID + // Physical is the metadata for the physical blob file. + // + // If the blob file has been replaced, Physical.FileNum ≠ FileID. Physical + // is always non-nil. + Physical *PhysicalBlobFile +} + +// SafeFormat implements redact.SafeFormatter. +func (m BlobFileMetadata) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("%s physical:{%s}", m.FileID, m.Physical) +} + +// String implements fmt.Stringer. 
+func (m BlobFileMetadata) String() string { + return redact.StringWithoutMarkers(m) +} + +// Ref increments the reference count for the physical blob file. +func (m BlobFileMetadata) Ref() { + m.Physical.ref() +} + +// Unref decrements the reference count for the physical blob file. If the +// reference count reaches zero, the blob file is added to the provided obsolete +// files set. +func (m BlobFileMetadata) Unref(of ObsoleteFilesSet) { + m.Physical.unref(of) +} + +// PhysicalBlobFile is metadata describing a physical blob value file. +type PhysicalBlobFile struct { + // FileNum is an ID that uniquely identifies the blob file. + FileNum base.DiskFileNum + // Size is the size of the file, in bytes. + Size uint64 + // ValueSize is the sum of the length of the uncompressed values stored in + // this blob file. + ValueSize uint64 + // File creation time in seconds since the epoch (1970-01-01 00:00:00 + // UTC). + CreationTime uint64 + + // Mutable state + + // refs holds the reference count for the blob file. Each ref is from a + // TableMetadata in a Version with a positive reference count. Each + // TableMetadata can refer to multiple blob files and will count as 1 ref + // for each of those blob files. The ref count is incremented when a new + // referencing TableMetadata is installed in a Version and decremented when + // that TableMetadata becomes obsolete. + refs atomic.Int32 +} + +// SafeFormat implements redact.SafeFormatter. +func (m *PhysicalBlobFile) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("%s size:[%d (%s)] vals:[%d (%s)]", + m.FileNum, redact.Safe(m.Size), humanize.Bytes.Uint64(m.Size), + redact.Safe(m.ValueSize), humanize.Bytes.Uint64(m.ValueSize)) +} + +// String implements fmt.Stringer. +func (m *PhysicalBlobFile) String() string { + return redact.StringWithoutMarkers(m) +} + +// ref increments the reference count for the blob file. 
+func (m *PhysicalBlobFile) ref() { + m.refs.Add(+1) +} + +// unref decrements the reference count for the blob file. If the reference +// count reaches zero, the blob file is added to the provided obsolete files +// set. +func (m *PhysicalBlobFile) unref(of ObsoleteFilesSet) { + refs := m.refs.Add(-1) + if refs < 0 { + panic(errors.AssertionFailedf("pebble: refs for blob file %s equal to %d", m.FileNum, refs)) + } else if refs == 0 { + of.AddBlob(m) + } +} + +// ParseBlobFileMetadataDebug parses a BlobFileMetadata from its string +// representation. This function is intended for use in tests. It's the inverse +// of BlobFileMetadata.String(). +func ParseBlobFileMetadataDebug(s string) (_ BlobFileMetadata, err error) { + defer func() { + if r := recover(); r != nil { + err = errors.CombineErrors(err, errFromPanic(r)) + } + }() + + // Input format: + // B000102 physical:{000000: size:[206536 (201KiB)], vals:[393256 (384KiB)] creationTime:1718851200} + p := strparse.MakeParser(debugParserSeparators, s) + fileID := p.BlobFileID() + p.Expect("physical") + p.Expect(":") + p.Expect("{") + physical, err := parsePhysicalBlobFileDebug(&p) + if err != nil { + return BlobFileMetadata{}, err + } + p.Expect("}") + return BlobFileMetadata{FileID: fileID, Physical: physical}, nil +} + +// ParsePhysicalBlobFileDebug parses a PhysicalBlobFile from its string +// representation. This function is intended for use in tests. It's the inverse +// of PhysicalBlobFile.String(). +// +// In production code paths, the PhysicalBlobFile is serialized in a binary +// format within a version edit under the tag tagNewBlobFile. 
+func ParsePhysicalBlobFileDebug(s string) (_ *PhysicalBlobFile, err error) { + defer func() { + if r := recover(); r != nil { + err = errors.CombineErrors(err, errFromPanic(r)) + } + }() + + // Input format: + // 000000: size:[206536 (201KiB)], vals:[393256 (384KiB)] creationTime:1718851200 + p := strparse.MakeParser(debugParserSeparators, s) + return parsePhysicalBlobFileDebug(&p) +} + +func parsePhysicalBlobFileDebug(p *strparse.Parser) (*PhysicalBlobFile, error) { + m := &PhysicalBlobFile{} + m.FileNum = p.DiskFileNum() + + maybeSkipParens := func() { + if p.Peek() != "(" { + return + } + for p.Next() != ")" { + // Skip. + } + } + + for !p.Done() && p.Peek() != "}" { + field := p.Next() + p.Expect(":") + switch field { + case "size": + p.Expect("[") + m.Size = p.Uint64() + maybeSkipParens() + p.Expect("]") + case "vals": + p.Expect("[") + m.ValueSize = p.Uint64() + maybeSkipParens() + p.Expect("]") + case "creationTime": + m.CreationTime = p.Uint64() + default: + p.Errf("unknown field %q", field) + } + } + return m, nil +} + +// BlobReferenceDepth is a statistic maintained per-sstable, indicating an upper +// bound on the number of blob files that a reader scanning the table would need +// to keep open if they only open and close referenced blob files once. In other +// words, it's the stack depth of blob files referenced by a sstable. If a +// flush or compaction rewrites an sstable's values to a new blob file, the +// resulting sstable has a blob reference depth of 1. When a compaction reuses +// blob references, the max blob reference depth of the files in each level is +// used, and then the depth is summed, and assigned to the output. This is a +// loose upper bound (assuming worst case distribution of keys in all inputs) +// but avoids tracking key spans for references and using key comparisons. 
+// +// Because the blob reference depth is the size of the working set of blob files +// referenced by the table, it cannot exceed the count of distinct blob file +// references. +// +// Example: Consider a compaction of file f0 from L0 and files f1, f2, f3 from +// L1, where the former has blob reference depth of 1 and files f1, f2, f3 all +// happen to have a blob-reference-depth of 1. Say we produce many output files, +// one of which is f4. We are assuming here that the blobs referenced by f0 +// whose keys happened to be written to f4 are spread all across the key span of +// f4. Say keys from f1 and f2 also made their way to f4. Then we will first +// have keys that refer to blobs referenced by f1,f0 and at some point once we +// move past the keys of f1, we will have keys that refer to blobs referenced by +// f2,f0. In some sense, we have a working set of 2 blob files at any point in +// time, and this is similar to the idea of level stack depth for reads -- hence +// we adopt the depth terminology. We want to keep this stack depth in check, +// since locality is important, while allowing it to be higher than 1, since +// otherwise we will need to rewrite blob files in every compaction (defeating +// the write amp benefit we are looking for). Similar to the level depth, this +// simplistic analysis does not take into account distribution of keys involved +// in the compaction and which of them have blob references. Also the locality +// is actually better than in this analysis because more of the keys will be +// from the lower level. +type BlobReferenceDepth int + +// BlobReferences is a slice of BlobReference. The order of the slice is +// significant and should be maintained. In practice, a sstable's BlobReferences +// are ordered by earliest appearance within the sstable. The ordering is +// persisted to the manifest. +type BlobReferences []BlobReference + +// Assert that *BlobReferences implements sstable.BlobReferences. 
+var _ sstable.BlobReferences = (*BlobReferences)(nil) + +// BlobFileIDByID returns the BlobFileID for the identified BlobReference. +func (br *BlobReferences) BlobFileIDByID(i blob.ReferenceID) base.BlobFileID { + return (*br)[i].FileID +} + +// IDByBlobFileID returns the reference ID for the given BlobFileID. If the +// blob file ID is not found, the second return value is false. +// IDByBlobFileID is linear in the length of the BlobReferences slice. +func (br *BlobReferences) IDByBlobFileID(fileID base.BlobFileID) (blob.ReferenceID, bool) { + for i, ref := range *br { + if ref.FileID == fileID { + return blob.ReferenceID(i), true + } + } + return blob.ReferenceID(len(*br)), false +} + +// BlobFileSet contains a set of blob files that are referenced by a version. +// It's used to maintain reference counts on blob files still in-use by some +// referenced version. +// +// It's backed by a copy-on-write B-Tree of BlobFileMetadata keyed by FileID. A +// version edit that adds or deletes m blob files updates m⋅log(n) nodes in the +// B-Tree. +// +// Initially a BlobFileMetadata has a FileID that matches the DiskFileNum of the +// backing physical blob file. However a blob file may be replaced without +// replacing the referencing TableMetadatas, which is recorded in the +// BlobFileSet by replacing the old BlobFileMetadata with a different +// PhysicalBlobFile. +type BlobFileSet struct { + tree btree[BlobFileMetadata] +} + +// MakeBlobFileSet creates a BlobFileSet from the given blob files. +func MakeBlobFileSet(entries []BlobFileMetadata) BlobFileSet { + return BlobFileSet{tree: makeBTree(btreeCmpBlobFileID, entries)} +} + +// All returns an iterator over all the blob files in the set. +func (s *BlobFileSet) All() iter.Seq[BlobFileMetadata] { + return s.tree.All() +} + +// Count returns the number of blob files in the set. 
+func (s *BlobFileSet) Count() int {
+	return s.tree.Count()
+}
+
+// Lookup returns the file number of the physical blob file backing the given
+// file ID. It returns false for the second return value if the FileID is not
+// present in the set.
+func (s *BlobFileSet) Lookup(fileID base.BlobFileID) (base.DiskFileNum, bool) {
+	phys, ok := s.LookupPhysical(fileID)
+	if !ok {
+		return 0, false
+	}
+	return phys.FileNum, true
+}
+
+// LookupPhysical returns the *PhysicalBlobFile backing the given file ID. It
+// returns false for the second return value if the FileID is not present in the
+// set.
+func (s *BlobFileSet) LookupPhysical(fileID base.BlobFileID) (*PhysicalBlobFile, bool) {
+	// LookupPhysical is performed during value retrieval to determine the
+	// physical blob file that should be read, so it's considered to be
+	// performance sensitive. We manually inline the B-Tree traversal and binary
+	// search with this in mind.
+	n := s.tree.root
+	for n != nil {
+		var h int
+		// Logic copied from sort.Search.
+		i, j := 0, int(n.count)
+		for i < j {
+			h = int(uint(i+j) >> 1) // avoid overflow when computing h
+			// i ≤ h < j
+			v := stdcmp.Compare(fileID, n.items[h].FileID)
+			if v == 0 {
+				// Found the sought blob file.
+				return n.items[h].Physical, true
+			} else if v > 0 {
+				i = h + 1
+			} else {
+				j = h
+			}
+		}
+		// If we've reached a leaf node without finding fileID, the file is not
+		// present.
+		if n.leaf {
+			return nil, false
+		}
+		n = n.children[i]
+	}
+	return nil, false
+}
+
+// Assert that (*BlobFileSet) implements blob.FileMapping.
+var _ blob.FileMapping = (*BlobFileSet)(nil)
+
+// clone returns a copy-on-write clone of the blob file set.
+func (s *BlobFileSet) clone() BlobFileSet {
+	return BlobFileSet{tree: s.tree.Clone()}
+}
+
+// insert inserts a blob file into the set.
+func (s *BlobFileSet) insert(entry BlobFileMetadata) error {
+	return s.tree.Insert(entry)
+}
+
+// remove removes a blob file from the set. 
+func (s *BlobFileSet) remove(entry BlobFileMetadata) { + // Removing an entry from the B-Tree may decrement file reference counts. + // However, the BlobFileSet is copy-on-write. We only mutate the BlobFileSet + // while constructing a new version edit. The current Version (to which the + // edit will be applied) should maintain a reference. So a call to remove() + // should never result in a file's reference count dropping to zero, and we + // pass assertNoObsoleteFiles{} to assert such. + s.tree.Delete(entry, assertNoObsoleteFiles{}) +} + +// release releases the blob file's references. It's called when unreferencing a +// Version. +func (s *BlobFileSet) release(of ObsoleteFilesSet) { + s.tree.Release(of) + s.tree = btree[BlobFileMetadata]{} +} + +// AggregateBlobFileStats records cumulative stats across blob files. +type AggregateBlobFileStats struct { + // Count is the number of blob files in the set. + Count uint64 + // PhysicalSize is the sum of the size of all blob files in the set. This + // is the size of the blob files on physical storage. Data within blob files + // is compressed, so this value may be less than ValueSize. + PhysicalSize uint64 + // ValueSize is the sum of the length of the uncompressed values in all blob + // files in the set. + ValueSize uint64 + // ReferencedValueSize is the sum of the length of the uncompressed values + // in all blob files in the set that are still referenced by live tables + // (i.e., in the latest version). + ReferencedValueSize uint64 + // ReferencesCount is the total number of tracked references in live tables + // (i.e., in the latest version). When virtual sstables are present, this + // count is per-virtual sstable (not per backing physical sstable). + ReferencesCount uint64 +} + +// String implements fmt.Stringer. 
+func (s AggregateBlobFileStats) String() string { + return fmt.Sprintf("Files:{Count: %d, Size: %d, ValueSize: %d}, References:{ValueSize: %d, Count: %d}", + s.Count, s.PhysicalSize, s.ValueSize, s.ReferencedValueSize, s.ReferencesCount) +} + +// CurrentBlobFileSet describes the set of blob files that are currently live in +// the latest Version. CurrentBlobFileSet is not thread-safe. In practice its +// use is protected by the versionSet logLock. +type CurrentBlobFileSet struct { + // files is a map of blob file IDs to *currentBlobFiles, recording metadata + // about the blob file's active references in the latest Version. + files map[base.BlobFileID]*currentBlobFile + // stats records cumulative stats across all blob files in the set. + stats AggregateBlobFileStats + // rewrite holds state and configuration in support of rewriting blob files + // to reclaim disk space. + rewrite struct { + // heuristic holds configuration of the heuristic used to determine + // which blob files are considered for rewrite. + heuristic BlobRewriteHeuristic + // candidates holds a set of blob files that a) contain some + // unreferenced values and b) are sufficiently old. The candidates are + // organized in a min heap of referenced ratios. + // + // Files with less referenced data relative to the total data set are at + // the top of the heap. Rewriting such files first maximizes the disk + // space reclaimed per byte written. + candidates currentBlobFileHeap[byReferencedRatio] + // recentlyCreated holds a set of blob files that contain unreferenced + // values but are not yet candidates for rewrite and replacement because + // they're too young. The set is organized as a min heap of creation + // times. + recentlyCreated currentBlobFileHeap[byCreationTime] + } +} + +type currentBlobFile struct { + // metadata is a copy of the current BlobFileMetadata in the latest Version. 
+	metadata BlobFileMetadata
+	// references holds pointers to TableMetadatas that exist in the latest
+	// version and reference this blob file. When the length of references falls
+	// to zero, the blob file is either a zombie file (if BlobFileMetadata.refs
+	// > 0), or obsolete and ready to be deleted.
+	//
+	// INVARIANT: BlobFileMetadata.refs >= len(references)
+	//
+	// TODO(jackson): Rather than using 1 map per blob file which needs to grow
+	// and shrink over the lifetime of the blob file, we could use a single
+	// B-Tree that holds all blob references, sorted by blob file then by
+	// referencing table number. This would likely be more memory efficient,
+	// reduce overall number of pointers to chase and suffer fewer allocations
+	// (and we can pool the B-Tree nodes to further reduce allocs)
+	references map[*TableMetadata]struct{}
+	// referencedValueSize is the sum of the length of uncompressed values in
+	// this blob file that are still live.
+	referencedValueSize uint64
+	// heapState holds a pointer to the heap that the blob file is in (if any)
+	// and its index in the heap's items slice. If referencedValueSize is less
+	// than metadata.Physical.ValueSize, the blob file belongs in one of the two
+	// heaps. If its physical file's creation time indicates it's less than
+	// MinimumAgeSecs seconds old, it belongs in the recentlyCreated heap.
+	// Otherwise, it belongs in the candidates heap.
+	heapState struct {
+		heap heap.Interface
+		index int
+	}
+}
+
+// BlobRewriteHeuristic configures the heuristic used to determine which blob
+// files should be rewritten and replaced in order to reduce value-separation
+// induced space amplification.
+//
+// The heuristic divides blob files into three categories:
+//
+// 1. Fully referenced: Blob files that are fully referenced by live tables.
+// 2. Recently created: Blob files with garbage that were recently created.
+// 3. Eligible: Blob files with garbage that are old. 
+// +// Files in the first category (fully referenced) should never be rewritten, +// because rewriting them has no impact on space amplification. +// +// Among files that are not fully referenced, the heuristic separates files into +// files that were recently created (less than MinimumAgeSecs seconds old) and +// files that are old and eligible for rewrite. We defer rewriting recently +// created files under the assumption that their references may be removed +// through ordinary compactions. The threshold for what is considered recent is +// the MinimumAgeSecs field. +type BlobRewriteHeuristic struct { + // CurrentTime returns the current time. + CurrentTime func() time.Time + // MinimumAge is the minimum age of a blob file that is considered for + // rewrite and replacement. + // + // TODO(jackson): Support updating this at runtime. Lowering the value is + // simple: pop from the recentlyCreated heap and push to the candidates + // heap. Raising the value is more complex: we would need to iterate over + // all the blob files in the candidates heap. + MinimumAge time.Duration +} + +// BlobRewriteHeuristicStats records statistics about the blob rewrite heuristic. +type BlobRewriteHeuristicStats struct { + CountFilesFullyReferenced int + CountFilesTooRecent int + CountFilesEligible int + NextEligible BlobFileMetadata + NextEligibleLivePct float64 + NextRewrite BlobFileMetadata + NextRewriteLivePct float64 +} + +// String implements fmt.Stringer. 
+func (s BlobRewriteHeuristicStats) String() string {
+	var sb strings.Builder
+	sb.WriteString(fmt.Sprintf("Counts:{FullyReferenced: %d, Eligible: %d, TooRecent: %d}",
+		s.CountFilesFullyReferenced, s.CountFilesEligible, s.CountFilesTooRecent))
+	if s.NextEligible.FileID != 0 {
+		sb.WriteString(fmt.Sprintf("\nNextEligible: %s (%.1f%% live, created at %d)",
+			s.NextEligible, s.NextEligibleLivePct, s.NextEligible.Physical.CreationTime))
+	}
+	if s.NextRewrite.FileID != 0 {
+		sb.WriteString(fmt.Sprintf("\nNextRewrite: %s (%.1f%% live, created at %d)",
+			s.NextRewrite, s.NextRewriteLivePct, s.NextRewrite.Physical.CreationTime))
+	}
+	return sb.String()
+}
+
+// Init initializes the CurrentBlobFileSet with the state of the provided
+// BulkVersionEdit. This is used after replaying a manifest.
+func (s *CurrentBlobFileSet) Init(bve *BulkVersionEdit, h BlobRewriteHeuristic) {
+	*s = CurrentBlobFileSet{files: make(map[base.BlobFileID]*currentBlobFile)}
+	s.rewrite.heuristic = h
+	if bve == nil {
+		return
+	}
+	for blobFileID, pbf := range bve.BlobFiles.Added {
+		s.files[blobFileID] = &currentBlobFile{
+			metadata: BlobFileMetadata{FileID: blobFileID, Physical: pbf},
+			references: make(map[*TableMetadata]struct{}),
+		}
+		s.stats.Count++
+		s.stats.PhysicalSize += pbf.Size
+		s.stats.ValueSize += pbf.ValueSize
+	}
+	// Record references to blob files from extant tables. Any referenced blob
+	// files should already exist in s.files.
+	for _, levelTables := range bve.AddedTables {
+		for _, table := range levelTables {
+			for _, ref := range table.BlobReferences {
+				cbf, ok := s.files[ref.FileID]
+				if !ok {
+					panic(errors.AssertionFailedf("pebble: referenced blob file %d not found", ref.FileID))
+				}
+				cbf.references[table] = struct{}{}
+				cbf.referencedValueSize += ref.ValueSize
+				s.stats.ReferencedValueSize += ref.ValueSize
+				s.stats.ReferencesCount++
+			}
+		}
+	}
+
+	// Initialize the heaps.
+	//
+	// Iterate through all the blob files. 
If a file is fully referenced, there's + // no need to track it in either heap. Otherwise, if the file is sufficiently + // old, add it to the heap of rewrite candidates. Otherwise, add it to the + // heap of recently created blob files. + now := s.rewrite.heuristic.CurrentTime() + s.rewrite.candidates.items = make([]*currentBlobFile, 0, 16) + s.rewrite.recentlyCreated.items = make([]*currentBlobFile, 0, 16) + for _, cbf := range s.files { + if cbf.referencedValueSize >= cbf.metadata.Physical.ValueSize { + // The blob file is fully referenced. There's no need to track it in + // either heap. + continue + } + + // If the blob file is sufficiently old, add it to the heap of rewrite + // candidates. Otherwise, add it to the heap of recently created blob + // files. + if now.Sub(time.Unix(int64(cbf.metadata.Physical.CreationTime), 0)) >= s.rewrite.heuristic.MinimumAge { + cbf.heapState.heap = &s.rewrite.candidates + cbf.heapState.index = len(s.rewrite.candidates.items) + s.rewrite.candidates.items = append(s.rewrite.candidates.items, cbf) + } else { + // This blob file is too young. Add it to the heap of recently + // created blob files. + cbf.heapState.heap = &s.rewrite.recentlyCreated + cbf.heapState.index = len(s.rewrite.recentlyCreated.items) + s.rewrite.recentlyCreated.items = append(s.rewrite.recentlyCreated.items, cbf) + } + } + // Establish the heap invariants. + heap.Init(&s.rewrite.candidates) + heap.Init(&s.rewrite.recentlyCreated) + s.maybeVerifyHeapStateInvariants() +} + +// Stats returns the cumulative stats across all blob files in the set and the +// stats for the rewrite heaps. 
+func (s *CurrentBlobFileSet) Stats() (AggregateBlobFileStats, BlobRewriteHeuristicStats) { + rewriteStats := BlobRewriteHeuristicStats{ + CountFilesTooRecent: len(s.rewrite.recentlyCreated.items), + CountFilesEligible: len(s.rewrite.candidates.items), + CountFilesFullyReferenced: int(s.stats.Count) - len(s.rewrite.candidates.items) - len(s.rewrite.recentlyCreated.items), + } + if len(s.rewrite.candidates.items) > 0 { + rewriteStats.NextRewrite = s.rewrite.candidates.items[0].metadata + rewriteStats.NextRewriteLivePct = 100 * float64(s.rewrite.candidates.items[0].referencedValueSize) / + float64(s.rewrite.candidates.items[0].metadata.Physical.ValueSize) + } + if len(s.rewrite.recentlyCreated.items) > 0 { + rewriteStats.NextEligible = s.rewrite.recentlyCreated.items[0].metadata + rewriteStats.NextEligibleLivePct = 100 * float64(s.rewrite.recentlyCreated.items[0].referencedValueSize) / + float64(s.rewrite.recentlyCreated.items[0].metadata.Physical.ValueSize) + } + return s.stats, rewriteStats +} + +// String implements fmt.Stringer. +func (s *CurrentBlobFileSet) String() string { + stats, rewriteStats := s.Stats() + var sb strings.Builder + sb.WriteString("CurrentBlobFileSet:\n") + sb.WriteString(stats.String()) + sb.WriteString("\n") + sb.WriteString(rewriteStats.String()) + return sb.String() +} + +// Metadatas returns a slice of all blob file metadata in the set, sorted by +// file number for determinism. +func (s *CurrentBlobFileSet) Metadatas() []BlobFileMetadata { + m := make([]BlobFileMetadata, 0, len(s.files)) + for _, cbf := range s.files { + m = append(m, cbf.metadata) + } + slices.SortFunc(m, func(a, b BlobFileMetadata) int { + return stdcmp.Compare(a.FileID, b.FileID) + }) + return m +} + +// ApplyAndUpdateVersionEdit applies a version edit to the current blob file +// set, updating its internal tracking of extant blob file references. 
If after +// applying the version edit a blob file has no more references, the version +// edit is modified to record the blob file removal. +func (s *CurrentBlobFileSet) ApplyAndUpdateVersionEdit(ve *VersionEdit) error { + defer s.maybeVerifyHeapStateInvariants() + + currentTime := s.rewrite.heuristic.CurrentTime() + + // Insert new blob files into the set. + for _, m := range ve.NewBlobFiles { + // Check whether we already have a blob file with this ID. This is + // possible if the blob file is being replaced. + if cbf, ok := s.files[m.FileID]; ok { + // There should be a DeletedBlobFileEntry for this file ID and the + // FileNum currently in the set. + dbfe := DeletedBlobFileEntry{FileID: m.FileID, FileNum: cbf.metadata.Physical.FileNum} + if _, ok := ve.DeletedBlobFiles[dbfe]; !ok { + return errors.AssertionFailedf("pebble: new blob file %d already exists", m.FileID) + } + // The file is being replaced. Update the statistics and cbf.Physical. + s.stats.PhysicalSize -= cbf.metadata.Physical.Size + s.stats.ValueSize -= cbf.metadata.Physical.ValueSize + cbf.metadata.Physical = m.Physical + s.stats.PhysicalSize += m.Physical.Size + s.stats.ValueSize += m.Physical.ValueSize + // Remove the blob file from the rewrite candidate heap. + if cbf.heapState.heap != nil { + heap.Remove(cbf.heapState.heap, cbf.heapState.index) + cbf.heapState.heap = nil + cbf.heapState.index = -1 + } + // It's possible that the new blob file is still not fully + // referenced. Additional references could have been removed while + // the blob file rewrite was occurring. We need to re-add it to the + // heap of recently created files. 
+		if cbf.referencedValueSize < cbf.metadata.Physical.ValueSize {
+			heap.Push(&s.rewrite.recentlyCreated, cbf)
+		}
+		continue
+	}
+
+	blobFileID := m.FileID
+	cbf := &currentBlobFile{references: make(map[*TableMetadata]struct{})}
+	cbf.metadata = BlobFileMetadata{FileID: blobFileID, Physical: m.Physical}
+	s.files[blobFileID] = cbf
+	s.stats.Count++
+	s.stats.PhysicalSize += m.Physical.Size
+	s.stats.ValueSize += m.Physical.ValueSize
+}
+
+// Update references to blob files from new tables. Any referenced blob
+// files should already exist in s.files.
+newTables := make(map[base.TableNum]struct{})
+for _, e := range ve.NewTables {
+	newTables[e.Meta.TableNum] = struct{}{}
+	for _, ref := range e.Meta.BlobReferences {
+		cbf, ok := s.files[ref.FileID]
+		if !ok {
+			return errors.AssertionFailedf("pebble: referenced blob file %d not found", ref.FileID)
+		}
+		cbf.references[e.Meta] = struct{}{}
+		cbf.referencedValueSize += ref.ValueSize
+		s.stats.ReferencedValueSize += ref.ValueSize
+		s.stats.ReferencesCount++
+	}
+}
+
+// Remove references to blob files from deleted tables. Any referenced blob
+// files should already exist in s.files. If the removal of a reference
+// causes the blob file's ref count to drop to zero, the blob file is a
+// zombie. We update the version edit to record the blob file removal and
+// remove it from the set. 
+ for _, meta := range ve.DeletedTables { + for _, ref := range meta.BlobReferences { + cbf, ok := s.files[ref.FileID] + if !ok { + return errors.AssertionFailedf("pebble: referenced blob file %d not found", ref.FileID) + } + if invariants.Enabled { + if ref.ValueSize > cbf.referencedValueSize { + return errors.AssertionFailedf("pebble: referenced value size %d for blob file %s is greater than the referenced value size %d", + ref.ValueSize, cbf.metadata.FileID, cbf.referencedValueSize) + } + if _, ok := cbf.references[meta]; !ok { + return errors.AssertionFailedf("pebble: deleted table %s's reference to blob file %s not known", + meta.TableNum, ref.FileID) + } + } + + // Decrement the stats for this reference. We decrement even if the + // table is being moved, because we incremented the stats when we + // iterated over the version edit's new tables. + cbf.referencedValueSize -= ref.ValueSize + s.stats.ReferencedValueSize -= ref.ValueSize + s.stats.ReferencesCount-- + if _, ok := newTables[meta.TableNum]; ok { + // This table was added to a different level of the LSM in the + // same version edit. It's being moved. We can preserve the + // existing reference. We still needed to reduce the counts + // above because we doubled it when we incremented stats on + // account of files in NewTables. + continue + } + // Remove the reference of this table to this blob file. + delete(cbf.references, meta) + + // If there are no more references to the blob file, remove it from + // the set and add the removal of the blob file to the version edit. + if len(cbf.references) == 0 { + if cbf.referencedValueSize != 0 { + return errors.AssertionFailedf("pebble: referenced value size %d is non-zero for blob file %s with no refs", + cbf.referencedValueSize, cbf.metadata.FileID) + } + + // Remove the blob file from any heap it's in. 
+ if cbf.heapState.heap != nil { + heap.Remove(cbf.heapState.heap, cbf.heapState.index) + cbf.heapState.heap = nil + cbf.heapState.index = -1 + } + + if ve.DeletedBlobFiles == nil { + ve.DeletedBlobFiles = make(map[DeletedBlobFileEntry]*PhysicalBlobFile) + } + + ve.DeletedBlobFiles[DeletedBlobFileEntry{ + FileID: cbf.metadata.FileID, + FileNum: cbf.metadata.Physical.FileNum, + }] = cbf.metadata.Physical + s.stats.Count-- + s.stats.PhysicalSize -= cbf.metadata.Physical.Size + s.stats.ValueSize -= cbf.metadata.Physical.ValueSize + delete(s.files, cbf.metadata.FileID) + continue + } + + if cbf.referencedValueSize >= cbf.metadata.Physical.ValueSize { + // This blob file is fully referenced. + continue + } + + // Update the heap position of the blob file. + if currentTime.Sub(time.Unix(int64(cbf.metadata.Physical.CreationTime), 0)) < s.rewrite.heuristic.MinimumAge { + // This blob file is too young. It belongs in the heap of + // recently created blob files. Add it to the heap if it's not + // already there. If it is already there, its position in the + // heap is unchanged. + if cbf.heapState.heap == nil { + heap.Push(&s.rewrite.recentlyCreated, cbf) + } + continue + } + + // This blob file is sufficiently old to be eligible for rewriting. + // It belongs in the rewrite candidates heap. + switch cbf.heapState.heap { + case &s.rewrite.recentlyCreated: + // We need to move it from the recently created heap to the + // rewrite candidates heap. + heap.Remove(cbf.heapState.heap, cbf.heapState.index) + heap.Push(&s.rewrite.candidates, cbf) + case &s.rewrite.candidates: + // This blob file is already in the rewrite candidates heap. We + // just need to fix up its position. + // + // TODO(jackson): When a version edit removes multiple + // references to a blob file, we'll fix it up multiple times. + // Should we remember the updated set of blob files and only fix + // up once at the end? 
+ heap.Fix(cbf.heapState.heap, cbf.heapState.index) + case nil: + // This blob file is not in any heap. This is possible if this + // is the first time a reference to the file has been removed. + heap.Push(&s.rewrite.candidates, cbf) + default: + panic("unreachable") + } + } + } + s.moveAgedBlobFilesToCandidatesHeap(currentTime) + return nil +} + +// ReplacementCandidate returns the next blob file that should be rewritten. If +// there are no candidates, the second return value is false. Successive calls +// to ReplacementCandidate may (but are not guaranteed to) return the same blob +// file until the blob file is replaced. +func (s *CurrentBlobFileSet) ReplacementCandidate() (BlobFileMetadata, bool) { + s.moveAgedBlobFilesToCandidatesHeap(s.rewrite.heuristic.CurrentTime()) + if len(s.rewrite.candidates.items) == 0 { + return BlobFileMetadata{}, false + } + return s.rewrite.candidates.items[0].metadata, true +} + +// ReferencingTables returns a slice containing the set of tables that reference +// the blob file with the provided file ID. The returned slice is sorted by +// table number. +func (s *CurrentBlobFileSet) ReferencingTables(fileID base.BlobFileID) []*TableMetadata { + cbf, ok := s.files[fileID] + if !ok { + return nil + } + tables := slices.Collect(maps.Keys(cbf.references)) + slices.SortFunc(tables, func(a, b *TableMetadata) int { + return stdcmp.Compare(a.TableNum, b.TableNum) + }) + return tables +} + +// moveAgedBlobFilesToCandidatesHeap moves blob files from the recentlyCreated +// heap to the candidates heap if at the provided timestamp they are considered +// old enough to be eligible for rewriting. 
+func (s *CurrentBlobFileSet) moveAgedBlobFilesToCandidatesHeap(now time.Time) { + defer s.maybeVerifyHeapStateInvariants() + for len(s.rewrite.recentlyCreated.items) > 0 { + root := s.rewrite.recentlyCreated.items[0] + if now.Sub(time.Unix(int64(root.metadata.Physical.CreationTime), 0)) < s.rewrite.heuristic.MinimumAge { + return + } + + heap.Remove(&s.rewrite.recentlyCreated, root.heapState.index) + heap.Push(&s.rewrite.candidates, root) + } +} + +func (s *CurrentBlobFileSet) maybeVerifyHeapStateInvariants() { + if invariants.Enabled { + for i, cbf := range s.rewrite.candidates.items { + if cbf.heapState.heap != &s.rewrite.candidates { + panic(errors.AssertionFailedf("pebble: heap state mismatch %v != %v", cbf.heapState.heap, &s.rewrite.candidates)) + } else if cbf.heapState.index != i { + panic(errors.AssertionFailedf("pebble: heap index mismatch %d != %d", cbf.heapState.index, i)) + } + } + for i, cbf := range s.rewrite.recentlyCreated.items { + if cbf.heapState.heap != &s.rewrite.recentlyCreated { + panic(errors.AssertionFailedf("pebble: heap state mismatch %v != %v", cbf.heapState.heap, &s.rewrite.recentlyCreated)) + } else if cbf.heapState.index != i { + panic(errors.AssertionFailedf("pebble: heap index mismatch %d != %d", cbf.heapState.index, i)) + } + } + } +} + +// byReferencedRatio is a currentBlobFileOrdering that orders blob files by +// the ratio of the blob file's referenced value size to its total value size. +type byReferencedRatio struct{} + +func (byReferencedRatio) less(a, b *currentBlobFile) bool { + // TODO(jackson): Consider calculating the ratio whenever the references are + // updated and saving it on the currentBlobFile, rather than recalculating it + // on every comparison. + return float64(a.referencedValueSize)/float64(a.metadata.Physical.ValueSize) < + float64(b.referencedValueSize)/float64(b.metadata.Physical.ValueSize) +} + +// byCreationTime is a currentBlobFileOrdering that orders blob files by +// creation time. 
+type byCreationTime struct{} + +func (byCreationTime) less(a, b *currentBlobFile) bool { + return a.metadata.Physical.CreationTime < b.metadata.Physical.CreationTime +} + +type currentBlobFileOrdering interface { + less(*currentBlobFile, *currentBlobFile) bool +} + +// currentBlobFileHeap is a heap of currentBlobFiles with a configurable +// ordering. +type currentBlobFileHeap[O currentBlobFileOrdering] struct { + items []*currentBlobFile + ordering O +} + +// Assert that *currentBlobFileHeap implements heap.Interface. +var _ heap.Interface = (*currentBlobFileHeap[currentBlobFileOrdering])(nil) + +func (s *currentBlobFileHeap[O]) Len() int { return len(s.items) } + +func (s *currentBlobFileHeap[O]) Less(i, j int) bool { + if invariants.Enabled { + if s.items[i].heapState.heap != s { + panic(errors.AssertionFailedf("pebble: heap state mismatch %v != %v", s.items[i].heapState.heap, s)) + } else if s.items[j].heapState.heap != s { + panic(errors.AssertionFailedf("pebble: heap state mismatch %v != %v", s.items[j].heapState.heap, s)) + } + } + return s.ordering.less(s.items[i], s.items[j]) +} + +func (s *currentBlobFileHeap[O]) Swap(i, j int) { + s.items[i], s.items[j] = s.items[j], s.items[i] + s.items[i].heapState.index = i + s.items[j].heapState.index = j + if invariants.Enabled { + if s.items[i].heapState.heap != s { + panic(errors.AssertionFailedf("pebble: heap state mismatch %v != %v", s.items[i].heapState.heap, s)) + } else if s.items[j].heapState.heap != s { + panic(errors.AssertionFailedf("pebble: heap state mismatch %v != %v", s.items[j].heapState.heap, s)) + } + } +} + +func (s *currentBlobFileHeap[O]) Push(x any) { + n := len(s.items) + item := x.(*currentBlobFile) + item.heapState.index = n + item.heapState.heap = s + s.items = append(s.items, item) +} + +func (s *currentBlobFileHeap[O]) Pop() any { + old := s.items + n := len(old) + item := old[n-1] + old[n-1] = nil + s.items = old[0 : n-1] + item.heapState.index = -1 + item.heapState.heap = nil + return 
item +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/manifest/btree.go b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/btree.go similarity index 68% rename from vendor/github.com/cockroachdb/pebble/internal/manifest/btree.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/manifest/btree.go index 6e56af8..2d8fa7d 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/manifest/btree.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/btree.go @@ -6,88 +6,58 @@ package manifest import ( "bytes" + stdcmp "cmp" "fmt" + "iter" "strings" + "sync" "sync/atomic" "unsafe" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/invariants" ) -// The Annotator type defined below is used by other packages to lazily -// compute a value over a B-Tree. Each node of the B-Tree stores one -// `annotation` per annotator, containing the result of the computation over -// the node's subtree. -// -// An annotation is marked as valid if it's current with the current subtree -// state. Annotations are marked as invalid whenever a node will be mutated -// (in mut). Annotators may also return `false` from `Accumulate` to signal -// that a computation for a file is not stable and may change in the future. -// Annotations that include these unstable values are also marked as invalid -// on the node, ensuring that future queries for the annotation will recompute -// the value. - -// An Annotator defines a computation over a level's FileMetadata. If the -// computation is stable and uses inputs that are fixed for the lifetime of -// a FileMetadata, the LevelMetadata's internal data structures are annotated -// with the intermediary computations. This allows the computation to be -// computed incrementally as edits are applied to a level. -type Annotator interface { - // Zero returns the zero value of an annotation. 
This value is returned - // when a LevelMetadata is empty. The dst argument, if non-nil, is an - // obsolete value previously returned by this Annotator and may be - // overwritten and reused to avoid a memory allocation. - Zero(dst interface{}) (v interface{}) - - // Accumulate computes the annotation for a single file in a level's - // metadata. It merges the file's value into dst and returns a bool flag - // indicating whether or not the value is stable and okay to cache as an - // annotation. If the file's value may change over the life of the file, - // the annotator must return false. - // - // Implementations may modify dst and return it to avoid an allocation. - Accumulate(m *FileMetadata, dst interface{}) (v interface{}, cacheOK bool) - - // Merge combines two values src and dst, returning the result. - // Implementations may modify dst and return it to avoid an allocation. - Merge(src interface{}, dst interface{}) interface{} -} - -type btreeCmp func(*FileMetadata, *FileMetadata) int +// btreeCmp is a comparator function used by the B-Tree implementation to +// compare items. +type btreeCmp[M fileMetadata] func(M, M) int -func btreeCmpSeqNum(a, b *FileMetadata) int { +// btreeCmpSeqNum is a comparator function that compares two TableMetadata items +// by their sequence number. It's used for L0, the only level that allows files +// to overlap. +func btreeCmpSeqNum(a, b *TableMetadata) int { return a.cmpSeqNum(b) } -func btreeCmpSmallestKey(cmp Compare) btreeCmp { - return func(a, b *FileMetadata) int { +// btreeCmpSmallestKey is a comparator function that compares two TableMetadata +// items by their smallest key. It's used for all levels except L0. +func btreeCmpSmallestKey(cmp Compare) btreeCmp[*TableMetadata] { + return func(a, b *TableMetadata) int { return a.cmpSmallestKey(b, cmp) } } -// btreeCmpSpecificOrder is used in tests to construct a B-Tree with a -// specific ordering of FileMetadata within the tree. 
It's typically used to -// test consistency checking code that needs to construct a malformed B-Tree. -func btreeCmpSpecificOrder(files []*FileMetadata) btreeCmp { - m := map[*FileMetadata]int{} +// btreeCmpBlobFileID is a comparator function that compares two BlobFileIDEntry +// items by their file ID. It's used for the blob file set's B-Tree. +func btreeCmpBlobFileID(a, b BlobFileMetadata) int { + return stdcmp.Compare(a.FileID, b.FileID) +} + +// btreeCmpSpecificOrder is used in tests to construct a B-Tree with a specific +// ordering of TableMetadata within the tree. It's typically used to test +// consistency checking code that needs to construct a malformed B-Tree. +func btreeCmpSpecificOrder(files []*TableMetadata) btreeCmp[*TableMetadata] { + m := map[*TableMetadata]int{} for i, f := range files { m[f] = i } - return func(a, b *FileMetadata) int { + return func(a, b *TableMetadata) int { ai, aok := m[a] bi, bok := m[b] if !aok || !bok { panic("btreeCmpSliceOrder called with unknown files") } - switch { - case ai < bi: - return -1 - case ai > bi: - return +1 - default: - return 0 - } + return stdcmp.Compare(ai, bi) } } @@ -97,17 +67,17 @@ const ( minItems = degree - 1 ) -type annotation struct { - annotator Annotator - // v is an annotation value, the output of either - // annotator.Value or annotator.Merge. - v interface{} - // valid indicates whether future reads of the annotation may use v as-is. - // If false, v will be zeroed and recalculated. - valid bool +// fileMetadata is the type of file metadata stored in the B-Tree. +type fileMetadata interface { + String() string + Ref() + Unref(ObsoleteFilesSet) } -type leafNode struct { +// Assert that TableMetadata implements fileMetadata. +var _ fileMetadata = (*TableMetadata)(nil) + +type leafNode[M fileMetadata] struct { ref atomic.Int32 count int16 leaf bool @@ -120,31 +90,33 @@ type leafNode struct { // count=subtreeCount, however the unsafe casting [leafToNode] performs make // it risky and cumbersome. 
subtreeCount int - items [maxItems]*FileMetadata + items [maxItems]M // annot contains one annotation per annotator, merged over the entire - // node's files (and all descendants for non-leaf nodes). - annot []annotation + // node's files (and all descendants for non-leaf nodes). Protected by + // annotMu. + annotMu sync.RWMutex + annot []annotation } -type node struct { - leafNode - children [maxItems + 1]*node +type node[M fileMetadata] struct { + leafNode[M] + children [maxItems + 1]*node[M] } //go:nocheckptr casts a ptr to a smaller struct to a ptr to a larger struct. -func leafToNode(ln *leafNode) *node { - return (*node)(unsafe.Pointer(ln)) +func leafToNode[M fileMetadata](ln *leafNode[M]) *node[M] { + return (*node[M])(unsafe.Pointer(ln)) } -func newLeafNode() *node { - n := leafToNode(new(leafNode)) +func newLeafNode[M fileMetadata]() *node[M] { + n := leafToNode(new(leafNode[M])) n.leaf = true n.ref.Store(1) return n } -func newNode() *node { - n := new(node) +func newNode[M fileMetadata]() *node[M] { + n := new(node[M]) n.ref.Store(1) return n } @@ -158,15 +130,18 @@ func newNode() *node { // // When a node is cloned, the provided pointer will be redirected to the new // mutable node. -func mut(n **node) *node { +func mut[M fileMetadata](n **node[M]) *node[M] { if (*n).ref.Load() == 1 { - // Exclusive ownership. Can mutate in place. + // Exclusive ownership. Can mutate in place. Still need to lock out + // any concurrent writes to annot. + (*n).annotMu.Lock() + defer (*n).annotMu.Unlock() // Whenever a node will be mutated, reset its annotations to be marked - // as uncached. This ensures any future calls to (*node).annotation - // will recompute annotations on the modified subtree. + // as uncached. This ensures any future calls to (*node).annotation will + // recompute annotations on the modified subtree. 
for i := range (*n).annot { - (*n).annot[i].valid = false + (*n).annot[i].valid.Store(false) } return *n } @@ -177,25 +152,72 @@ func mut(n **node) *node { // reference count to be greater than 1, we might be racing // with another call to decRef on this node. c := (*n).clone() - (*n).decRef(true /* contentsToo */, nil) + (*n).decRef(true /* contentsToo */, assertNoObsoleteFiles{}) *n = c // NB: We don't need to clear annotations, because (*node).clone does not // copy them. return *n } +// ObsoleteFilesSet accumulates files that now have zero references. +type ObsoleteFilesSet interface { + // AddBacking appends the provided TableBacking to the list of obsolete + // files. + AddBacking(*TableBacking) + // AddBlob appends the provided BlobFileMetadata to the list of obsolete + // files. + AddBlob(*PhysicalBlobFile) +} + +// assertNoObsoleteFiles is an obsoleteFiles implementation that panics if its +// methods are called. +// +// There are two sources of node dereferences: tree mutations and Version +// dereferences. Files should only be made obsolete during Version dereferences, +// during which this implementation will not be used (see Version.Unref). +type assertNoObsoleteFiles struct{} + +// Assert that assertNoObsoleteFiles implements ObsoleteFilesSet. +var _ ObsoleteFilesSet = assertNoObsoleteFiles{} + +// AddBacking appends the provided TableBacking to the list of obsolete files. +func (assertNoObsoleteFiles) AddBacking(fb *TableBacking) { + panic(errors.AssertionFailedf("file backing %s dereferenced to zero during tree mutation", fb.DiskFileNum)) +} + +// AddBlob appends the provided BlobFileMetadata to the list of obsolete files. +func (assertNoObsoleteFiles) AddBlob(bm *PhysicalBlobFile) { + panic(errors.AssertionFailedf("blob file %s dereferenced to zero during tree mutation", bm.FileNum)) +} + +// ignoreObsoleteFiles is an ObsoleteFilesSet implementation that ignores +// obsolete files. 
It's used in some contexts where we construct ephemeral +// B-Trees which do not need to track obsolete files and in tests. +type ignoreObsoleteFiles struct{} + +// Assert that ignoreObsoleteFiles implements ObsoleteFilesSet. +var _ ObsoleteFilesSet = ignoreObsoleteFiles{} + +// AddBacking appends the provided TableBacking to the list of obsolete files. +func (ignoreObsoleteFiles) AddBacking(fb *TableBacking) {} + +// AddBlob appends the provided BlobFileMetadata to the list of obsolete files. +func (ignoreObsoleteFiles) AddBlob(bm *PhysicalBlobFile) {} + // incRef acquires a reference to the node. -func (n *node) incRef() { +func (n *node[M]) incRef() { n.ref.Add(1) } -// decRef releases a reference to the node. If requested, the method will unref -// its items and recurse into child nodes and decrease their refcounts as well. +// decRef releases a reference to the node. If requested, the method will call +// the provided unref func on its items and recurse into child nodes and +// decrease their refcounts as well. +// // Some internal codepaths that manually copy the node's items or children to // new nodes pass contentsToo=false to preserve existing reference counts during -// operations that should yield a net-zero change to descendant refcounts. -// When a node is released, its contained files are dereferenced. -func (n *node) decRef(contentsToo bool, obsolete *[]*FileBacking) { +// operations that should yield a net-zero change to descendant refcounts. When +// a node is released, its contained files are dereferenced. +func (n *node[M]) decRef(contentsToo bool, obsolete ObsoleteFilesSet) { if n.ref.Add(-1) > 0 { // Other references remain. Can't free. return @@ -203,26 +225,11 @@ func (n *node) decRef(contentsToo bool, obsolete *[]*FileBacking) { // Dereference the node's metadata and release child references if // requested. 
Some internal callers may not want to propagate the deref - // because they're manually copying the filemetadata and children to other + // because they're manually copying the file metadata and children to other // nodes, and they want to preserve the existing reference count. if contentsToo { for _, f := range n.items[:n.count] { - if f.Unref() == 0 { - // There are two sources of node dereferences: tree mutations - // and Version dereferences. Files should only be made obsolete - // during Version dereferences, during which `obsolete` will be - // non-nil. - if obsolete == nil { - panic(fmt.Sprintf("file metadata %s dereferenced to zero during tree mutation", f.FileNum)) - } - // Reference counting is performed on the FileBacking. In the case - // of a virtual sstable, this reference counting is performed on - // a FileBacking which is shared by every single virtual sstable - // with the same backing sstable. If the reference count hits 0, - // then we know that the FileBacking won't be required by any - // sstable in Pebble, and that the backing sstable can be deleted. - *obsolete = append(*obsolete, f.FileBacking) - } + f.Unref(obsolete) } if !n.leaf { for i := int16(0); i <= n.count; i++ { @@ -233,12 +240,12 @@ func (n *node) decRef(contentsToo bool, obsolete *[]*FileBacking) { } // clone creates a clone of the receiver with a single reference count. -func (n *node) clone() *node { - var c *node +func (n *node[M]) clone() *node[M] { + var c *node[M] if n.leaf { - c = newLeafNode() + c = newLeafNode[M]() } else { - c = newNode() + c = newNode[M]() } // NB: copy field-by-field without touching n.ref to avoid // triggering the race detector and looking like a data race. @@ -262,7 +269,7 @@ func (n *node) clone() *node { // insertAt inserts the provided file and node at the provided index. This // function is for use only as a helper function for internal B-Tree code. // Clients should not invoke it directly. 
-func (n *node) insertAt(index int, item *FileMetadata, nd *node) { +func (n *node[M]) insertAt(index int, item M, nd *node[M]) { if index < int(n.count) { copy(n.items[index+1:n.count+1], n.items[index:n.count]) if !n.leaf { @@ -279,7 +286,7 @@ func (n *node) insertAt(index int, item *FileMetadata, nd *node) { // pushBack inserts the provided file and node at the tail of the node's items. // This function is for use only as a helper function for internal B-Tree code. // Clients should not invoke it directly. -func (n *node) pushBack(item *FileMetadata, nd *node) { +func (n *node[M]) pushBack(item M, nd *node[M]) { n.items[n.count] = item if !n.leaf { n.children[n.count+1] = nd @@ -290,7 +297,7 @@ func (n *node) pushBack(item *FileMetadata, nd *node) { // pushFront inserts the provided file and node at the head of the // node's items. This function is for use only as a helper function for internal B-Tree // code. Clients should not invoke it directly. -func (n *node) pushFront(item *FileMetadata, nd *node) { +func (n *node[M]) pushFront(item M, nd *node[M]) { if !n.leaf { copy(n.children[1:n.count+2], n.children[:n.count+1]) n.children[0] = nd @@ -303,8 +310,8 @@ func (n *node) pushFront(item *FileMetadata, nd *node) { // removeAt removes a value at a given index, pulling all subsequent values // back. This function is for use only as a helper function for internal B-Tree // code. Clients should not invoke it directly. 
-func (n *node) removeAt(index int) (*FileMetadata, *node) { - var child *node +func (n *node[M]) removeAt(index int) (M, *node[M]) { + var child *node[M] if !n.leaf { child = n.children[index+1] copy(n.children[index+1:n.count], n.children[index+2:n.count+1]) @@ -313,17 +320,17 @@ func (n *node) removeAt(index int) (*FileMetadata, *node) { n.count-- out := n.items[index] copy(n.items[index:n.count], n.items[index+1:n.count+1]) - n.items[n.count] = nil + clear(n.items[n.count : n.count+1]) return out, child } // popBack removes and returns the last element in the list. This function is // for use only as a helper function for internal B-Tree code. Clients should // not invoke it directly. -func (n *node) popBack() (*FileMetadata, *node) { +func (n *node[M]) popBack() (M, *node[M]) { n.count-- out := n.items[n.count] - n.items[n.count] = nil + clear(n.items[n.count : n.count+1]) if n.leaf { return out, nil } @@ -335,9 +342,9 @@ func (n *node) popBack() (*FileMetadata, *node) { // popFront removes and returns the first element in the list. This function is // for use only as a helper function for internal B-Tree code. Clients should // not invoke it directly. -func (n *node) popFront() (*FileMetadata, *node) { +func (n *node[M]) popFront() (M, *node[M]) { n.count-- - var child *node + var child *node[M] if !n.leaf { child = n.children[0] copy(n.children[:n.count+1], n.children[1:n.count+2]) @@ -345,7 +352,7 @@ func (n *node) popFront() (*FileMetadata, *node) { } out := n.items[0] copy(n.items[:n.count], n.items[1:n.count+1]) - n.items[n.count] = nil + clear(n.items[n.count : n.count+1]) return out, child } @@ -355,14 +362,14 @@ func (n *node) popFront() (*FileMetadata, *node) { // // This function is for use only as a helper function for internal B-Tree code. // Clients should not invoke it directly. 
-func (n *node) find(cmp btreeCmp, item *FileMetadata) (index int, found bool) { +func (n *node[M]) find(bcmp btreeCmp[M], item M) (index int, found bool) { // Logic copied from sort.Search. Inlining this gave // an 11% speedup on BenchmarkBTreeDeleteInsert. i, j := 0, int(n.count) for i < j { h := int(uint(i+j) >> 1) // avoid overflow when computing h // i ≤ h < j - v := cmp(item, n.items[h]) + v := bcmp(item, n.items[h]) if v == 0 { return h, true } else if v > 0 { @@ -406,19 +413,17 @@ func (n *node) find(cmp btreeCmp, item *FileMetadata) (index int, found bool) { // // This function is for use only as a helper function for internal B-Tree code. // Clients should not invoke it directly. -func (n *node) split(i int) (*FileMetadata, *node) { +func (n *node[M]) split(i int) (M, *node[M]) { out := n.items[i] - var next *node + var next *node[M] if n.leaf { - next = newLeafNode() + next = newLeafNode[M]() } else { - next = newNode() + next = newNode[M]() } next.count = n.count - int16(i+1) copy(next.items[:], n.items[i+1:n.count]) - for j := int16(i); j < n.count; j++ { - n.items[j] = nil - } + clear(n.items[i:n.count]) if !n.leaf { copy(next.children[:], n.children[i+1:n.count+1]) descendantsMoved := 0 @@ -441,14 +446,14 @@ func (n *node) split(i int) (*FileMetadata, *node) { // Insert inserts a item into the subtree rooted at this node, making sure no // nodes in the subtree exceed maxItems items. -func (n *node) Insert(cmp btreeCmp, item *FileMetadata) error { - i, found := n.find(cmp, item) +func (n *node[M]) Insert(bcmp btreeCmp[M], item M) error { + i, found := n.find(bcmp, item) if found { // cmp provides a total ordering of the files within a level. // If we're inserting a metadata that's equal to an existing item // in the tree, we're inserting a file into a level twice. 
return errors.Errorf("files %s and %s collided on sort keys", - errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum)) + item, n.items[i]) } if n.leaf { n.insertAt(i, item, nil) @@ -459,7 +464,7 @@ func (n *node) Insert(cmp btreeCmp, item *FileMetadata) error { splitLa, splitNode := mut(&n.children[i]).split(maxItems / 2) n.insertAt(i, splitLa, splitNode) - switch cmp := cmp(item, n.items[i]); { + switch cmp := bcmp(item, n.items[i]); { case cmp < 0: // no change, we want first split node case cmp > 0: @@ -468,12 +473,12 @@ func (n *node) Insert(cmp btreeCmp, item *FileMetadata) error { // cmp provides a total ordering of the files within a level. // If we're inserting a metadata that's equal to an existing item // in the tree, we're inserting a file into a level twice. - return errors.Errorf("files %s and %s collided on sort keys", - errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum)) + return errors.Errorf("metadatas %s and %s collided on keys", + item, n.items[i]) } } - err := mut(&n.children[i]).Insert(cmp, item) + err := mut(&n.children[i]).Insert(bcmp, item) if err == nil { n.subtreeCount++ } @@ -483,12 +488,12 @@ func (n *node) Insert(cmp btreeCmp, item *FileMetadata) error { // removeMax removes and returns the maximum item from the subtree rooted at // this node. This function is for use only as a helper function for internal // B-Tree code. Clients should not invoke it directly. -func (n *node) removeMax() *FileMetadata { +func (n *node[M]) removeMax() M { if n.leaf { n.count-- n.subtreeCount-- out := n.items[n.count] - n.items[n.count] = nil + clear(n.items[n.count : n.count+1]) return out } child := mut(&n.children[n.count]) @@ -500,22 +505,23 @@ func (n *node) removeMax() *FileMetadata { return child.removeMax() } -// Remove removes a item from the subtree rooted at this node. Returns -// the item that was removed or nil if no matching item was found. 
-func (n *node) Remove(cmp btreeCmp, item *FileMetadata) (out *FileMetadata) { - i, found := n.find(cmp, item) +// Remove removes a item from the subtree rooted at this node. Returns the item +// that was removed. It returns true for the second argument if an item was +// found. +func (n *node[M]) Remove(bcmp btreeCmp[M], item M) (out M, found bool) { + i, found := n.find(bcmp, item) if n.leaf { if found { out, _ = n.removeAt(i) n.subtreeCount-- - return out + return out, true } - return nil + return out, false } if n.children[i].count <= minItems { // Child not large enough to remove from. n.rebalanceOrMerge(i) - return n.Remove(cmp, item) + return n.Remove(bcmp, item) } child := mut(&n.children[i]) if found { @@ -523,21 +529,21 @@ func (n *node) Remove(cmp btreeCmp, item *FileMetadata) (out *FileMetadata) { out = n.items[i] n.items[i] = child.removeMax() n.subtreeCount-- - return out + return out, true } // File is not in this node and child is large enough to remove from. - out = child.Remove(cmp, item) - if out != nil { + out, found = child.Remove(bcmp, item) + if found { n.subtreeCount-- } - return out + return out, found } // rebalanceOrMerge grows child 'i' to ensure it has sufficient room to remove a // item from it while keeping it at or above minItems. This function is for use // only as a helper function for internal B-Tree code. Clients should not invoke // it directly. -func (n *node) rebalanceOrMerge(i int) { +func (n *node[M]) rebalanceOrMerge(i int) { switch { case i > 0 && n.children[i-1].count > minItems: // Rebalance from left sibling. @@ -661,83 +667,11 @@ func (n *node) rebalanceOrMerge(i int) { child.count += mergeChild.count + 1 child.subtreeCount += mergeChild.subtreeCount + 1 - mergeChild.decRef(false /* contentsToo */, nil) - } -} - -// InvalidateAnnotation removes any existing cached annotations for the provided -// annotator from this node's subtree. 
-func (n *node) InvalidateAnnotation(a Annotator) { - // Find this annotator's annotation on this node. - var annot *annotation - for i := range n.annot { - if n.annot[i].annotator == a { - annot = &n.annot[i] - } - } - - if annot != nil && annot.valid { - annot.valid = false - annot.v = a.Zero(annot.v) - } - if !n.leaf { - for i := int16(0); i <= n.count; i++ { - n.children[i].InvalidateAnnotation(a) - } + mergeChild.decRef(false /* contentsToo */, assertNoObsoleteFiles{}) } } -// Annotation retrieves, computing if not already computed, the provided -// annotator's annotation of this node. The second return value indicates -// whether the future reads of this annotation may use the first return value -// as-is. If false, the annotation is not stable and may change on a subsequent -// computation. -func (n *node) Annotation(a Annotator) (interface{}, bool) { - // Find this annotator's annotation on this node. - var annot *annotation - for i := range n.annot { - if n.annot[i].annotator == a { - annot = &n.annot[i] - } - } - - // If it exists and is marked as valid, we can return it without - // recomputing anything. - if annot != nil && annot.valid { - return annot.v, true - } - - if annot == nil { - // This is n's first time being annotated by a. - // Create a new zeroed annotation. - n.annot = append(n.annot, annotation{ - annotator: a, - v: a.Zero(nil), - }) - annot = &n.annot[len(n.annot)-1] - } else { - // There's an existing annotation that must be recomputed. - // Zero its value. 
- annot.v = a.Zero(annot.v) - } - - annot.valid = true - for i := int16(0); i <= n.count; i++ { - if !n.leaf { - v, ok := n.children[i].Annotation(a) - annot.v = a.Merge(v, annot.v) - annot.valid = annot.valid && ok - } - if i < n.count { - v, ok := a.Accumulate(n.items[i], annot.v) - annot.v = v - annot.valid = annot.valid && ok - } - } - return annot.v, annot.valid -} - -func (n *node) verifyInvariants() { +func (n *node[M]) verifyInvariants() { recomputedSubtreeCount := int(n.count) if !n.leaf { for i := int16(0); i <= n.count; i++ { @@ -753,31 +687,31 @@ func (n *node) verifyInvariants() { // btree is an implementation of a B-Tree. // -// btree stores FileMetadata in an ordered structure, allowing easy insertion, +// btree stores TableMetadata in an ordered structure, allowing easy insertion, // removal, and iteration. The B-Tree stores items in order based on cmp. The // first level of the LSM uses a cmp function that compares sequence numbers. -// All other levels compare using the FileMetadata.Smallest. +// All other levels compare using the TableMetadata.Smallest. // // Write operations are not safe for concurrent mutation by multiple // goroutines, but Read operations are. -type btree struct { - root *node - cmp btreeCmp +type btree[M fileMetadata] struct { + root *node[M] + bcmp btreeCmp[M] } // Release dereferences and clears the root node of the btree, removing all -// items from the btree. In doing so, it decrements contained file counts. -// It returns a slice of newly obsolete backing files, if any. -func (t *btree) Release() (obsolete []*FileBacking) { +// items from the btree. In doing so, it unrefs files associated with the +// tables. Any files that no longer have outstanding references are added to the +// provided obsoleteFiles. 
+func (t *btree[M]) Release(of ObsoleteFilesSet) { if t.root != nil { - t.root.decRef(true /* contentsToo */, &obsolete) + t.root.decRef(true /* contentsToo */, of) t.root = nil } - return obsolete } // Clone clones the btree, lazily. It does so in constant time. -func (t *btree) Clone() btree { +func (t *btree[M]) Clone() btree[M] { c := *t if c.root != nil { // Incrementing the reference count on the root node is sufficient to @@ -799,14 +733,15 @@ func (t *btree) Clone() btree { return c } -// Delete removes the provided file from the tree. -// It returns true if the file now has a zero reference count. -func (t *btree) Delete(item *FileMetadata) (obsolete bool) { +// Delete removes the provided table from the tree, unrefing the table's files +// if it's found. If any files are unreferenced to zero, they're added to the +// provided obsoleteFiles. +func (t *btree[M]) Delete(item M, of ObsoleteFilesSet) { if t.root == nil || t.root.count == 0 { - return false + return } - if out := mut(&t.root).Remove(t.cmp, item); out != nil { - obsolete = out.Unref() == 0 + if out, found := mut(&t.root).Remove(t.bcmp, item); found { + out.Unref(of) } if invariants.Enabled { t.root.verifyInvariants() @@ -818,19 +753,18 @@ func (t *btree) Delete(item *FileMetadata) (obsolete bool) { } else { t.root = t.root.children[0] } - old.decRef(false /* contentsToo */, nil) + old.decRef(false /* contentsToo */, assertNoObsoleteFiles{}) } - return obsolete } // Insert adds the given item to the tree. If a item in the tree already // equals the given one, Insert panics. 
-func (t *btree) Insert(item *FileMetadata) error { +func (t *btree[M]) Insert(item M) error { if t.root == nil { - t.root = newLeafNode() + t.root = newLeafNode[M]() } else if t.root.count >= maxItems { splitLa, splitNode := mut(&t.root).split(maxItems / 2) - newRoot := newNode() + newRoot := newNode[M]() newRoot.count = 1 newRoot.items[0] = splitLa newRoot.children[0] = t.root @@ -839,22 +773,36 @@ func (t *btree) Insert(item *FileMetadata) error { t.root = newRoot } item.Ref() - err := mut(&t.root).Insert(t.cmp, item) + err := mut(&t.root).Insert(t.bcmp, item) if invariants.Enabled { t.root.verifyInvariants() } return err } -// Iter returns a new iterator object. It is not safe to continue using an -// iterator after modifications are made to the tree. If modifications are made, -// create a new iterator. -func (t *btree) Iter() iterator { - return iterator{r: t.root, pos: -1, cmp: t.cmp} +// All returns an iterator over all the items in the tree. +func (t *btree[M]) All() iter.Seq[M] { + iter := iterator[M]{r: t.root, pos: -1, cmp: t.bcmp} + iter.first() + return func(yield func(M) bool) { + for iter.valid() { + if !yield(iter.cur()) { + return + } + iter.next() + } + } +} + +// tableMetadataIter returns a new iterator over a B-Tree of *TableMetadata. It +// is not safe to continue using an iterator after modifications are made to the +// tree. If modifications are made, create a new iterator. +func tableMetadataIter(tree *btree[*TableMetadata]) iterator[*TableMetadata] { + return iterator[*TableMetadata]{r: tree.root, pos: -1, cmp: tree.bcmp} } // Count returns the number of files contained within the B-Tree. -func (t *btree) Count() int { +func (t *btree[M]) Count() int { if t.root == nil { return 0 } @@ -863,7 +811,7 @@ func (t *btree) Count() int { // String returns a string description of the tree. The format is // similar to the https://en.wikipedia.org/wiki/Newick_format. 
-func (t *btree) String() string { +func (t *btree[M]) String() string { if t.Count() == 0 { return ";" } @@ -872,7 +820,7 @@ func (t *btree) String() string { return b.String() } -func (n *node) writeString(b *strings.Builder) { +func (n *node[M]) writeString(b *strings.Builder) { if n.leaf { for i := int16(0); i < n.count; i++ { if i != 0 { @@ -893,29 +841,29 @@ func (n *node) writeString(b *strings.Builder) { } // iterStack represents a stack of (node, pos) tuples, which captures -// iteration state as an iterator descends a btree. -type iterStack struct { +// iteration state as an iterator descends a btree of TableMetadata. +type iterStack[M fileMetadata] struct { // a contains aLen stack frames when an iterator stack is short enough. // If the iterator stack overflows the capacity of iterStackArr, the stack // is moved to s and aLen is set to -1. - a iterStackArr + a iterStackArr[M] aLen int16 // -1 when using s - s []iterFrame + s []iterFrame[M] } // Used to avoid allocations for stacks below a certain size. 
-type iterStackArr [3]iterFrame +type iterStackArr[M fileMetadata] [3]iterFrame[M] -type iterFrame struct { - n *node +type iterFrame[M fileMetadata] struct { + n *node[M] pos int16 } -func (is *iterStack) push(f iterFrame) { +func (is *iterStack[M]) push(f iterFrame[M]) { if is.aLen == -1 { is.s = append(is.s, f) } else if int(is.aLen) == len(is.a) { - is.s = make([]iterFrame, int(is.aLen)+1, 2*int(is.aLen)) + is.s = make([]iterFrame[M], int(is.aLen)+1, 2*int(is.aLen)) copy(is.s, is.a[:]) is.s[int(is.aLen)] = f is.aLen = -1 @@ -925,7 +873,7 @@ func (is *iterStack) push(f iterFrame) { } } -func (is *iterStack) pop() iterFrame { +func (is *iterStack[M]) pop() iterFrame[M] { if is.aLen == -1 { f := is.s[len(is.s)-1] is.s = is.s[:len(is.s)-1] @@ -935,26 +883,26 @@ func (is *iterStack) pop() iterFrame { return is.a[is.aLen] } -func (is *iterStack) len() int { +func (is *iterStack[M]) len() int { if is.aLen == -1 { return len(is.s) } return int(is.aLen) } -func (is *iterStack) clone() iterStack { +func (is *iterStack[M]) clone() iterStack[M] { // If the iterator is using the embedded iterStackArr, we only need to // copy the struct itself. if is.s == nil { return *is } clone := *is - clone.s = make([]iterFrame, len(is.s)) + clone.s = make([]iterFrame[M], len(is.s)) copy(clone.s, is.s) return clone } -func (is *iterStack) nth(n int) (f iterFrame, ok bool) { +func (is *iterStack[M]) nth(n int) (f iterFrame[M], ok bool) { if is.aLen == -1 { if n >= len(is.s) { return f, false @@ -967,7 +915,7 @@ func (is *iterStack) nth(n int) (f iterFrame, ok bool) { return is.a[n], true } -func (is *iterStack) reset() { +func (is *iterStack[M]) reset() { if is.aLen == -1 { is.s = is.s[:0] } else { @@ -975,27 +923,27 @@ func (is *iterStack) reset() { } } -// iterator is responsible for search and traversal within a btree. -type iterator struct { +// an iterator provides search and traversal within a btree of *TableMetadata. 
+type iterator[M fileMetadata] struct { // the root node of the B-Tree. - r *node + r *node[M] // n and pos make up the current position of the iterator. // If valid, n.items[pos] is the current value of the iterator. // // n may be nil iff i.r is nil. - n *node + n *node[M] pos int16 - // cmp dictates the ordering of the FileMetadata. - cmp func(*FileMetadata, *FileMetadata) int + // cmp dictates the ordering of the TableMetadata. + cmp func(M, M) int // a stack of n's ancestors within the B-Tree, alongside the position // taken to arrive at n. If non-empty, the bottommost frame of the stack // will always contain the B-Tree root. - s iterStack + s iterStack[M] } // countLeft returns the count of files that are to the left of the current // iterator position. -func (i *iterator) countLeft() int { +func (i *iterator[M]) countLeft() int { if i.r == nil { return 0 } @@ -1046,19 +994,19 @@ func (i *iterator) countLeft() int { return count } -func (i *iterator) clone() iterator { +func (i *iterator[M]) clone() iterator[M] { c := *i c.s = i.s.clone() return c } -func (i *iterator) reset() { +func (i *iterator[M]) reset() { i.n = i.r i.pos = -1 i.s.reset() } -func (i iterator) String() string { +func (i iterator[M]) String() string { var buf bytes.Buffer for n := 0; ; n++ { f, ok := i.s.nth(n) @@ -1075,7 +1023,7 @@ func (i iterator) String() string { return buf.String() } -func cmpIter(a, b iterator) int { +func cmpIter[M fileMetadata](a, b iterator[M]) int { if a.r != b.r { panic("compared iterators from different btrees") } @@ -1117,7 +1065,7 @@ func cmpIter(a, b iterator) int { // end sentinel state which sorts after everything else. 
var aok, bok bool for i := 0; ; i++ { - var af, bf iterFrame + var af, bf iterFrame[M] af, aok = a.s.nth(i) bf, bok = b.s.nth(i) if !aok || !bok { @@ -1138,15 +1086,11 @@ func cmpIter(a, b iterator) int { if af.n != bf.n { panic("nonmatching nodes during btree iterator comparison") } - switch { - case af.pos < bf.pos: - return -1 - case af.pos > bf.pos: - return +1 - default: - // Continue up both iterators' stacks (equivalently, down the - // B-Tree away from the root). + if v := stdcmp.Compare(af.pos, bf.pos); v != 0 { + return v } + // Otherwise continue up both iterators' stacks (equivalently, down the + // B-Tree away from the root). } if aok && bok { @@ -1155,79 +1099,60 @@ func cmpIter(a, b iterator) int { if an != bn { panic("nonmatching nodes during btree iterator comparison") } + if v := stdcmp.Compare(apos, bpos); v != 0 { + return v + } switch { - case apos < bpos: + case aok: + // a is positioned at a leaf child at this position and b is at an + // end sentinel state. return -1 - case apos > bpos: + case bok: + // b is positioned at a leaf child at this position and a is at an + // end sentinel state. return +1 default: - switch { - case aok: - // a is positioned at a leaf child at this position and b is at an - // end sentinel state. - return -1 - case bok: - // b is positioned at a leaf child at this position and a is at an - // end sentinel state. - return +1 - default: - return 0 - } + return 0 } } -func (i *iterator) descend(n *node, pos int16) { - i.s.push(iterFrame{n: n, pos: pos}) +func (i *iterator[M]) descend(n *node[M], pos int16) { + i.s.push(iterFrame[M]{n: n, pos: pos}) i.n = n.children[pos] i.pos = 0 } // ascend ascends up to the current node's parent and resets the position // to the one previously set for this parent node. 
-func (i *iterator) ascend() { +func (i *iterator[M]) ascend() { f := i.s.pop() i.n = f.n i.pos = f.pos } -// seek repositions the iterator over the first file for which fn returns -// true, mirroring the semantics of the standard library's sort.Search -// function. Like sort.Search, seek requires the iterator's B-Tree to be -// ordered such that fn returns false for some (possibly empty) prefix of the -// tree's files, and then true for the (possibly empty) remainder. -func (i *iterator) seek(fn func(*FileMetadata) bool) { +// find seeks the iterator to the provided table metadata if it exists in the +// tree. It returns true if the table metadata is found and false otherwise. If +// find returns false, the position of the iterator is undefined. +func (i *iterator[M]) find(m M) bool { i.reset() if i.r == nil { - return + return false } - + i.n = i.r for { - // Logic copied from sort.Search. - j, k := 0, int(i.n.count) - for j < k { - h := int(uint(j+k) >> 1) // avoid overflow when computing h - - // j ≤ h < k - if !fn(i.n.items[h]) { - j = h + 1 // preserves f(j-1) == false - } else { - k = h // preserves f(k) == true - } - } - + j, found := i.n.find(i.cmp, m) i.pos = int16(j) - if i.n.leaf { - if i.pos == i.n.count { - i.next() - } - return + if found { + return true + } else if i.n.leaf { + return false } i.descend(i.n, i.pos) } } // first seeks to the first item in the btree. -func (i *iterator) first() { +func (i *iterator[M]) first() { i.reset() if i.r == nil { return @@ -1239,7 +1164,7 @@ func (i *iterator) first() { } // last seeks to the last item in the btree. -func (i *iterator) last() { +func (i *iterator[M]) last() { i.reset() if i.r == nil { return @@ -1252,7 +1177,7 @@ func (i *iterator) last() { // next positions the iterator to the item immediately following // its current position. 
-func (i *iterator) next() { +func (i *iterator[M]) next() { if i.r == nil { return } @@ -1279,7 +1204,7 @@ func (i *iterator) next() { // prev positions the iterator to the item immediately preceding // its current position. -func (i *iterator) prev() { +func (i *iterator[M]) prev() { if i.r == nil { return } @@ -1304,13 +1229,13 @@ func (i *iterator) prev() { } // valid returns whether the iterator is positioned at a valid position. -func (i *iterator) valid() bool { +func (i *iterator[M]) valid() bool { return i.r != nil && i.pos >= 0 && i.pos < i.n.count } -// cur returns the item at the iterator's current position. It is illegal -// to call cur if the iterator is not valid. -func (i *iterator) cur() *FileMetadata { +// cur returns the table metadata at the iterator's current position. It is +// illegal to call cur if the iterator is not valid. +func (i *iterator[M]) cur() M { if invariants.Enabled && !i.valid() { panic("btree iterator.cur invoked on invalid iterator") } diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/doc.go b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/doc.go new file mode 100644 index 0000000..3a2d8ff --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/doc.go @@ -0,0 +1,264 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +/* + +Proof Sketch of Correctness of LSM with Sub-Levels (Incomplete): + +NB: Ordering of L0 files (from oldest to youngest) is done by (LargestSeqNum, +SmallestSeqNum, FileNum). This is used by sub-level initialization to do a +more sophisticated time and key-span based ordering. + +1. History + +Consider a scenario where there are no compactions out of L0. 
Consider the +history of files getting added to L0 and two ways of accumulating that history +into a stack of sstables: + +- Simple-stack: Files are simply stacked into levels based on their add + ordering (files added atomically can randomly be ordered in any way). + +- Seqnum-stack: Files are stacked based on (LargestSeqNum, SmallestSeqNum, + FileNum). + +These two stacks will not be identical after every file add, even though they +contain the same set of files. The trivial divergence is because of the atomic +adds of multiple files. But the real divergence is because we cannot claim +that files are added to L0 in increasing order of LargestSeqNum: ingests that +don't have the same keys as memtables can be added to L0 before the memtables +are flushed. + +We make some claims about the invariants at any point of the history, in these +stacks. + +Claim 1: If sstable S1 and sstable S2 contain the same user key k and the k +seqnum in S1 is < the k seqnum in S2, then S1 is added to L0 before S2. + +Proof by contradiction: + +- Flushes only: Consider that S2 is added to L0 before S1. + + - S2 and S1 are part of the same flush operation: All instances of k will be + in the same sst (flush splits are always on userkey boundaries). + Contradiction. + + - S2 and S1 are in different flushes: Since flushes are sequential, the + flush of S2 must end before the flush of S1 starts. Since we flush + memtables in order of their filling up and all seqnums inside older + memtables are less than all seqnums inside newer memtables, the seqnum of + k in S2 < the seqnum in S1. Contradiction. + +- Ingests and flushes: Consider that S2 is added to L0 before S1. + + - S2 and S1 are part of the same atomic ingest: Atomic ingests have ssts + with non-overlapping user key bounds, so they cannot contain k in + different ssts. Contradiction. + + - S2 and S1 are part of different ingests. They must be assigned different + sst seqnums. 
Ingests must be added to the LSM in order of their seqnum + (see the discussion here + https://github.com/cockroachdb/pebble/issues/2196#issuecomment-1523461535). + So seqnum of S2 < seqnum of S1. Contradiction. + + - S2 is an ingest and S1 is from a flush, and S2 is added first. Cases: + + - The seqnum of k in S1 was present in an unflushed memtable when S2 + ingestion was requested: So the memtable seqnum for k < S2 seqnum. And k + in the memtable will be flushed first (both with normal ingests and + flushable ingests). Contradiction. + + - The seqnum of k in S1 was not present in any memtable when S2 ingestion + was requested: The k in S1 will be assigned a higher seqnum than S2. + Contradiction. + + - S2 is a flush and S1 is an ingest, and S2 is added first to L0. + + - S2 is from memtable(s) that flushed before the ingest was requested. + Seqnum of k in S2 < Seqnum of k in S1. Contradiction. + + - S2 is from memtable(s) that flushed because the ingest was requested. + Seqnum of k in S2 < Seqnum of k in S1. Contradiction. + + - The k in S2 was added to the memtable after S1 was assigned a seqnum, + but S2 got added first. This is disallowed by the fix in + https://github.com/cockroachdb/pebble/issues/2196. Contradiction. + +Claim 1 is sufficient to prove the level invariant for key k for the +simple-stack, since when S1 has k#t1 and S2 has k#t2, and t1 < t2, S1 is added +first. However it is insufficient to prove the level invariant for +seqnum-stack: say S1 LargestSeqNum LSN1 > S2's LargestSeqNum LSN2. Then the +ordering of levels will be inverted after S2 is added. We address this with +claim 2. + +Claim 2: Consider sstable S1 and sstable S2, such that S1 is added to L0 +before S2. Then LSN(S1) < LSN(S2) or S1 and S2 have no userkeys in common. + +Proof sketch by contradiction: + +Consider LSN(S1) >= LSN(S2) and S1 has k#t1 and S2 has k#t2, where t1 < t2. We +will consider the case of LSN(S1) >= t2. 
If we can contradict this and show +LSN(S1) < t2, then by t2 <= LSN(S2), we have LSN(S1) < LSN(S2). + +Cases: + +- S1 and S2 are from different flushes: The seqnums are totally ordered across + memtables, so all seqnums in S1 < all seqnums in S2. So LSN(S1) < t2. + Contradiction. + +- S1 is a flush and S2 is an ingest: + + - S1 was the result of flushing memtables that were immutable when S2 + arrived, which has LSN(S2)=t2. Then all seqnums in those immutable + memtables < t2, i.e., LSN(S1) < t2, Contradiction. + + - S1 was the result of flushing a mutable memtable when S2 arrived for + ingestion. k#t1 was in this mutable memtable or one of the immutable + memtables that will be flushed together. Consider the sequence of such + memtables M1, M2, …, Mn, where Mn is the mutable memtable: + + - k#t1 is in Mn: S2 waits for the flush of Mn. Can Mn concurrently get a + higher seqnum write (of some other key) > t2 added to it before the + flush? No, because we are holding commitPipeline.mu when assigning t2 + and while holding it, we mark Mn as immutable. So the flush of M1 … Mn + has LSN(S1) < t2. Contradiction. + + - k#t1 is in M1: Mn becomes full and flushes together with M1. Can happen + but will be fixed in https://github.com/cockroachdb/pebble/issues/2196 + by preventing any memtable with seqnum > t2 from flushing. + +- S1 is an ingest and S2 is a flush: By definition LSN(S1)=t1. t1 >= t2 is not + possible since by definition t1 < t2. + +Claim 1 and claim 2 can together be used to prove the level invariant for the +seqnum-stack: We are given S1 is added before S2 and both have user key k, +with seqnum t1 and t2 respectively. From claim 1, t1 < t2. From claim 2, +LSN(S1) < LSN(S2). So the ordering based on the LSN will not violate the +seqnum invariant. + +Since we have the level-invariant for both simple-stack and seqnum-stack, +reads are consistent across both. 
+ +A real LSM behaves as a hybrid of these two stacks, since there are +compactions out from L0 at arbitrary points in time. So any reordering of the +stack that is possible in the seqnum-stack may not actually happen, since +those files may no longer be in L0. This hybrid can be shown to be correct +since both the simple-stack and seqnum-stack are correct. This correctness +argument predates the introduction of sub-levels. + +TODO(sumeer): proof of level-invariant for the hybrid. + +Because of the seqnum-stack being equivalent to the simple-stack, we no longer +worry about future file additions to L0 and only consider what is currently in +L0. We focus on the seqnum-stack and the current L0, and how it is organized +into sub-levels. Sub-levels is a conceptually simple reorganization of the +seqnum-stack in that files that don't overlap in the keyspans, so +pessimistically cannot have conflicting keys, no longer need to stack up on +top of each other. This cannot violate the level invariant since the key span +(in terms of user keys) of S1 which is at a higher level than S2 despite LSN(S1) +< LSN(S2), must be non-overlapping. + +2. L0=>Lbase compactions with sub-levels + +They cut out a triangle from the bottom of the sub-levels, so will never +compact out a higher seqnum of k while leaving behind a lower seqnum. Once in +Lbase, the seqnums of the files play no part and only the keyspans are used +for future maintenance of the already established level invariant. +TODO(needed): more details. + +3. Intra-L0 compactions with sub-levels + +They cut out an inverted triangle from the top of the sub-levels. Correctness +here is more complicated because (a) the use of earliest-unflushed-seq-num, +(b) the ordering of output files and untouched existing files is based on +(LargestSeqNum, SmallestSeqNum, FileNum). We consider these in turn. 
+ +3.1 Use of earliest-unflushed-seq-num to exclude files + +Consider a key span at sub-level i for which all files in the key-span have +LSN >= earliest-unflushed-seq-num (so are excluded). Extend this key span to +include any adjacent files on that sub-level that also have the same property, +then extend it until the end-bounds of the adjacent files that do not satisfy +this property. Consider the rectangle defined by this key-span going all the +way down to sub-level 0. And then start with that key-span in sub-level i-1. +In the following picture -- represents the key span in i and | bars represent +that rectangle defined. + +i +++++|----------------|++f2++ +i-1 --|--------------++|++f1+++ + + +We claim that the files in this key-span in sub-level i-1 that satisfy this +property cannot extend out of the key-span. This can be proved by +contradiction: if a file f1 at sub-level i-1 extends beyond, there must be a +file at sub-level i, say f2, that did not satisfy this property (otherwise the +maximal keyspan in i would have been wider). Now we know that +earliest-unflushed-seq-num > LSN(f2) and LSN(f1) >= +earliest-unflushed-seq-num. So LSN(f1) > LSN(f2) and they have overlapping +keyspans, which is not possible since f1 is in sub-level i-1 and f2 in +sub-level i. This argument can be continued to claim that the +earliest-unflushed-seq-num property cuts out an inverted triangle from the +sub-levels. Pretending these files are not in the history is ok, since the +final sub-levels will look the same if these were not yet known and we then +added them in the future in LSN order. + +3.2 Ordering of output files + +The actual files chosen for the intra-L0 compaction also follow the same +inverted triangle pattern. This means we have a contiguous history of the +seqnums for a key participating in the compaction, and anything not +participating has either lower or higher seqnums. The shape can look like the +following picture where . 
represents the spans chosen for the compaction and - +the spans ignored because of earliest-unflushed-seq-num and x that are older +and not participating. + +6 ------------------------ +5 -------------------- +4 .....----....... +3 ........... +2 ........ +1 xxxxxxx....xxxxxxxxxxxxxxxx +0 xxxxxxxxxxxxxxxxxxxxxxxxx + +We know the compaction input choice is sound, but the question is whether an +output . produced by the compaction can fall either too low, i.e., lower than +a conflicting x, or end up too high, above a conflicting -. This is because +the choice of sub-level depends on the LSN of the output and not the actual +conflicting key seqnums in the file (the LSN is just a summary). Claim 1 and 2 +are insufficient to prove this. Those claims allow for the following sequence +of files (from higher to lower sub-level): + +- a#20 f3 +. a#10 b#15 f2 +x a#5 c#12 f1 + +If the compaction separates a#10 from b#15 in the output, a#10 can fall below +f1. To prove this cannot happen we need another claim. + +Claim 3: if key k is in files S1 and S2, with k#t1, k#t2 with t1 < t2, then +LSN(S1) < t2. + +Proof: We have proved this stronger claim when proving claim 2. + +Based on claim 3, the above example is impossible, since it would require +LSN(f1) < 10. + +Using claim 3 we can prove that even if the intra-L0 compaction writes one +userkey per output sst, the LSN of that output sst will be > the LSN of ssts +with the same userkey that are categorized as x. + +Next we need to show that the output won't land higher than - with a +conflicting key, say when we produce a single output file. NB: the narrowest +possible outputs (considered in the previous paragraph, with one key per file) +were the risk in the output sinking too low, and the widest possible output +(considered now) is the risk in staying too high. + +The file that was excluded (- file) with the conflicting key has LSN >= +earliest-unflushed-seq-num. 
By definition there is no point in any of the +participating files that is >= earliest-unflushed-seq-num. So the LSN of this +single output file is < earliest-unflushed-seq-num. Hence the output can't +land higher than the excluded file with the conflicting key. + +*/ diff --git a/vendor/github.com/cockroachdb/pebble/internal/manifest/l0_sublevels.go b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/l0_sublevels.go similarity index 78% rename from vendor/github.com/cockroachdb/pebble/internal/manifest/l0_sublevels.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/manifest/l0_sublevels.go index 6f58149..0c45237 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/manifest/l0_sublevels.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/l0_sublevels.go @@ -6,34 +6,33 @@ package manifest import ( "bytes" + stdcmp "cmp" "fmt" "math" + "slices" "sort" "strings" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/problemspans" ) -// errInvalidL0SublevelsOpt is for use in AddL0Files when the incremental -// sublevel generation optimization failed, and NewL0Sublevels must be called. -var errInvalidL0SublevelsOpt = errors.New("pebble: L0 sublevel generation optimization cannot be used") - // Intervals are of the form [start, end) with no gap between intervals. Each // file overlaps perfectly with a sequence of intervals. This perfect overlap // occurs because the union of file boundary keys is used to pick intervals. -// However the largest key in a file is inclusive, so when it is used as -// an interval, the actual key is ImmediateSuccessor(key). 
We don't have the +// However when the largest key in a file is inclusive and it is used as an end +// of an interval, the actual key is ImmediateSuccessor(key). We don't have the // ImmediateSuccessor function to do this computation, so we instead keep an -// isLargest bool to remind the code about this fact. This is used for +// isInclusiveEndBound bool to remind the code about this fact. This is used for // comparisons in the following manner: // - intervalKey{k, false} < intervalKey{k, true} // - k1 < k2 -> intervalKey{k1, _} < intervalKey{k2, _}. // // Note that the file's largest key is exclusive if the internal key // has a trailer matching the rangedel sentinel key. In this case, we set -// isLargest to false for end interval computation. +// isInclusiveEndBound to false for end interval computation. // // For example, consider three files with bounds [a,e], [b,g], and [e,j]. The // interval keys produced would be intervalKey{a, false}, intervalKey{b, false}, @@ -52,15 +51,19 @@ var errInvalidL0SublevelsOpt = errors.New("pebble: L0 sublevel generation optimi // picking overlapping files for a compaction, only need to use the index // numbers and so avoid expensive byte slice comparisons. type intervalKey struct { - key []byte - isLargest bool + key []byte + isInclusiveEndBound bool +} + +func (k *intervalKey) toEndBoundary() base.UserKeyBoundary { + return base.UserKeyExclusiveIf(k.key, !k.isInclusiveEndBound) } // intervalKeyTemp is used in the sortAndSweep step. It contains additional metadata // which is used to generate the {min,max}IntervalIndex for files. 
type intervalKeyTemp struct { intervalKey intervalKey - fileMeta *FileMetadata + fileMeta *TableMetadata isEndKey bool } @@ -80,10 +83,10 @@ func (i *intervalKeyTemp) setFileIntervalIndex(idx int) { func intervalKeyCompare(cmp Compare, a, b intervalKey) int { rv := cmp(a.key, b.key) if rv == 0 { - if a.isLargest && !b.isLargest { + if a.isInclusiveEndBound && !b.isInclusiveEndBound { return +1 } - if !a.isLargest && b.isLargest { + if !a.isInclusiveEndBound && b.isInclusiveEndBound { return -1 } } @@ -174,7 +177,7 @@ type fileInterval struct { intervalRangeIsBaseCompacting bool // All files in this interval, in increasing sublevel order. - files []*FileMetadata + files []*TableMetadata // len(files) - compactingFileCount is the stack depth that requires // starting new compactions. This metric is not precise since the @@ -215,12 +218,11 @@ func (b *bitSet) clearAllBits() { // L0Compaction describes an active compaction with inputs from L0. type L0Compaction struct { - Smallest InternalKey - Largest InternalKey + Bounds base.UserKeyBounds IsIntraL0 bool } -// L0Sublevels represents a sublevel view of SSTables in L0. Tables in one +// l0Sublevels represents a sublevel view of SSTables in L0. Tables in one // sublevel are non-overlapping in key ranges, and keys in higher-indexed // sublevels shadow older versions in lower-indexed sublevels. These invariants // are similar to the regular level invariants, except with higher indexed @@ -229,21 +231,21 @@ type L0Compaction struct { // There is no limit to the number of sublevels that can exist in L0 at any // time, however read and compaction performance is best when there are as few // sublevels as possible. -type L0Sublevels struct { +type l0Sublevels struct { // Levels are ordered from oldest sublevel to youngest sublevel in the // outer slice, and the inner slice contains non-overlapping files for // that sublevel in increasing key order. 
Levels is constructed from // levelFiles and is used by callers that require a LevelSlice. The below two - // fields are treated as immutable once created in NewL0Sublevels. + // fields are treated as immutable once created in newL0Sublevels. Levels []LevelSlice - levelFiles [][]*FileMetadata + levelFiles [][]*TableMetadata cmp Compare formatKey base.FormatKey fileBytes uint64 // All the L0 files, ordered from oldest to youngest. - levelMetadata *LevelMetadata + levelMetadata LevelMetadata // The file intervals in increasing key order. orderedIntervals []fileInterval @@ -255,51 +257,40 @@ type L0Sublevels struct { addL0FilesCalled bool } -type sublevelSorter []*FileMetadata - -// Len implements sort.Interface. -func (sl sublevelSorter) Len() int { - return len(sl) -} - -// Less implements sort.Interface. -func (sl sublevelSorter) Less(i, j int) bool { - return sl[i].minIntervalIndex < sl[j].minIntervalIndex -} - -// Swap implements sort.Interface. -func (sl sublevelSorter) Swap(i, j int) { - sl[i], sl[j] = sl[j], sl[i] +func sortByMinIntervalIndex(files []*TableMetadata) { + slices.SortFunc(files, func(a, b *TableMetadata) int { + return stdcmp.Compare(a.minIntervalIndex, b.minIntervalIndex) + }) } -// NewL0Sublevels creates an L0Sublevels instance for a given set of L0 files. +// newL0Sublevels creates an l0Sublevels instance for a given set of L0 files. // These files must all be in L0 and must be sorted by seqnum (see // SortBySeqNum). During interval iteration, when flushSplitMaxBytes bytes are // exceeded in the range of intervals since the last flush split key, a flush // split key is added. // // This method can be called without DB.mu being held, so any DB.mu protected -// fields in FileMetadata cannot be accessed here, such as Compacting and +// fields in TableMetadata cannot be accessed here, such as Compacting and // IsIntraL0Compacting. Those fields are accessed in InitCompactingFileInfo // instead. 
-func NewL0Sublevels( +func newL0Sublevels( levelMetadata *LevelMetadata, cmp Compare, formatKey base.FormatKey, flushSplitMaxBytes int64, -) (*L0Sublevels, error) { - s := &L0Sublevels{cmp: cmp, formatKey: formatKey} - s.levelMetadata = levelMetadata +) (*l0Sublevels, error) { + s := &l0Sublevels{cmp: cmp, formatKey: formatKey} + s.levelMetadata = *levelMetadata keys := make([]intervalKeyTemp, 0, 2*s.levelMetadata.Len()) iter := levelMetadata.Iter() for i, f := 0, iter.First(); f != nil; i, f = i+1, iter.Next() { f.L0Index = i keys = append(keys, intervalKeyTemp{ - intervalKey: intervalKey{key: f.Smallest.UserKey}, + intervalKey: intervalKey{key: f.Smallest().UserKey}, fileMeta: f, isEndKey: false, }) keys = append(keys, intervalKeyTemp{ intervalKey: intervalKey{ - key: f.Largest.UserKey, - isLargest: !f.Largest.IsExclusiveSentinel(), + key: f.Largest().UserKey, + isInclusiveEndBound: !f.Largest().IsExclusiveSentinel(), }, fileMeta: f, isEndKey: true, @@ -319,24 +310,22 @@ func NewL0Sublevels( // Initialize minIntervalIndex and maxIntervalIndex for each file, and use that // to update intervals. for f := iter.First(); f != nil; f = iter.Next() { - if err := s.addFileToSublevels(f, false /* checkInvariant */); err != nil { - return nil, err - } + s.addFileToSublevels(f) } // Sort each sublevel in increasing key order. for i := range s.levelFiles { - sort.Sort(sublevelSorter(s.levelFiles[i])) + sortByMinIntervalIndex(s.levelFiles[i]) } // Construct a parallel slice of sublevel B-Trees. // TODO(jackson): Consolidate and only use the B-Trees. for _, sublevelFiles := range s.levelFiles { - tr, ls := makeBTree(btreeCmpSmallestKey(cmp), sublevelFiles) + ls := makeLevelSlice(btreeCmpSmallestKey(cmp), sublevelFiles) s.Levels = append(s.Levels, ls) - tr.Release() } s.calculateFlushSplitKeys(flushSplitMaxBytes) + s.Check() return s, nil } @@ -401,7 +390,7 @@ func mergeIntervals( // the file bytes are interpolated over has changed. 
estimatedBytes: 0, // Copy the below attributes from prevInterval. - files: append([]*FileMetadata(nil), prevInterval.files...), + files: append([]*TableMetadata(nil), prevInterval.files...), isBaseCompacting: prevInterval.isBaseCompacting, intervalRangeIsBaseCompacting: prevInterval.intervalRangeIsBaseCompacting, compactingFileCount: prevInterval.compactingFileCount, @@ -414,55 +403,78 @@ func mergeIntervals( return result, oldToNewMap } -// AddL0Files incrementally builds a new L0Sublevels for when the only change -// since the receiver L0Sublevels was an addition of the specified files, with +func (s *l0Sublevels) canUseAddL0Files( + addedTables map[base.TableNum]*TableMetadata, levelMetadata *LevelMetadata, +) (filesToAddInOrder []*TableMetadata, ok bool) { + if s.addL0FilesCalled { + if invariants.Enabled { + panic("addL0Files called twice on the same receiver") + } + return nil, false + } + if s.levelMetadata.Len()+len(addedTables) != levelMetadata.Len() { + if invariants.Enabled { + panic("levelMetadata mismatch") + } + return nil, false + } + + // addL0Files only works when the files we are adding match exactly the last + // files in the new levelMetadata (this is the case usually, but not always). + files := make([]*TableMetadata, len(addedTables)) + iter := levelMetadata.Iter() + t := iter.Last() + for i := len(addedTables) - 1; i >= 0; i-- { + if addedTables[t.TableNum] == nil { + // t is an existing table that sorts after some of the new tables + // (specifically the ones we haven't yet seen). + return nil, false + } + files[i] = t + t = iter.Prev() + } + return files, true +} + +// addL0Files incrementally builds a new l0Sublevels for when the only change +// since the receiver l0Sublevels was an addition of the specified tables, with // no L0 deletions. The common case of this is an ingestion or a flush. 
These // files can "sit on top" of existing sublevels, creating at most one new // sublevel for a flush (and possibly multiple for an ingestion), and at most // 2*len(files) additions to s.orderedIntervals. No files must have been deleted // from L0, and the added files must all be newer in sequence numbers than -// existing files in L0Sublevels. The files parameter must be sorted in seqnum -// order. The levelMetadata parameter corresponds to the new L0 post addition of -// files. This method is meant to be significantly more performant than -// NewL0Sublevels. +// existing files in l0Sublevels. The levelMetadata parameter corresponds to the +// new L0 post addition of files. This method is meant to be significantly more +// performant than newL0Sublevels. +// +// This function is intended to be called with the result of canUseAddL0Files(), +// which is the list of new L0 tables in increasing L0 order. // // Note that this function can only be called once on a given receiver; it // appends to some slices in s which is only safe when done once. This is okay, -// as the common case (generating a new L0Sublevels after a flush/ingestion) is +// as the common case (generating a new l0Sublevels after a flush/ingestion) is // only going to necessitate one call of this method on a given receiver. The -// returned value, if non-nil, can then have [*L0Sublevels.AddL0Files] called on -// it again, and so on. If [errInvalidL0SublevelsOpt] is returned as an error, -// it likely means the optimization could not be applied (i.e. files added were -// older than files already in the sublevels, which is possible around -// ingestions and in tests). Eg. it can happen when an ingested file was -// ingested without queueing a flush since it did not actually overlap with any -// keys in the memtable. 
Later on the memtable was flushed, and the memtable had -// keys spanning around the ingested file, producing a flushed file that -// overlapped with the ingested file in file bounds but not in keys. It's -// possible for that flushed file to have a lower LargestSeqNum than the -// ingested file if all the additions after the ingestion were to another -// flushed file that was split into a separate sstable during flush. Any other -// non-nil error means [L0Sublevels] generation failed in the same way as -// [NewL0Sublevels] would likely fail. -func (s *L0Sublevels) AddL0Files( - files []*FileMetadata, flushSplitMaxBytes int64, levelMetadata *LevelMetadata, -) (*L0Sublevels, error) { - if invariants.Enabled && s.addL0FilesCalled { - panic("AddL0Files called twice on the same receiver") +// returned value, if non-nil, can then have [*l0Sublevels.addL0Files] called on +// it again, and so on. +func (s *l0Sublevels) addL0Files( + files []*TableMetadata, flushSplitMaxBytes int64, levelMetadata *LevelMetadata, +) *l0Sublevels { + if s.addL0FilesCalled { + panic("addL0Files called twice on the same receiver") } s.addL0FilesCalled = true // Start with a shallow copy of s. - newVal := &L0Sublevels{} + newVal := &l0Sublevels{} *newVal = *s newVal.addL0FilesCalled = false - newVal.levelMetadata = levelMetadata + newVal.levelMetadata = *levelMetadata // Deep copy levelFiles and Levels, as they are mutated and sorted below. // Shallow copies of slices that we just append to, are okay. 
- newVal.levelFiles = make([][]*FileMetadata, len(s.levelFiles)) + newVal.levelFiles = make([][]*TableMetadata, len(s.levelFiles)) for i := range s.levelFiles { - newVal.levelFiles[i] = make([]*FileMetadata, len(s.levelFiles[i])) + newVal.levelFiles[i] = make([]*TableMetadata, len(s.levelFiles[i])) copy(newVal.levelFiles[i], s.levelFiles[i]) } newVal.Levels = make([]LevelSlice, len(s.Levels)) @@ -471,13 +483,13 @@ func (s *L0Sublevels) AddL0Files( fileKeys := make([]intervalKeyTemp, 0, 2*len(files)) for _, f := range files { left := intervalKeyTemp{ - intervalKey: intervalKey{key: f.Smallest.UserKey}, + intervalKey: intervalKey{key: f.Smallest().UserKey}, fileMeta: f, } right := intervalKeyTemp{ intervalKey: intervalKey{ - key: f.Largest.UserKey, - isLargest: !f.Largest.IsExclusiveSentinel(), + key: f.Largest().UserKey, + isInclusiveEndBound: !f.Largest().IsExclusiveSentinel(), }, fileMeta: f, isEndKey: true, @@ -562,7 +574,7 @@ func (s *L0Sublevels) AddL0Files( // with a binary search, or by only looping through files to the right of // the first interval touched by this method. for sublevel := range s.Levels { - s.Levels[sublevel].Each(func(f *FileMetadata) { + for f := range s.Levels[sublevel].All() { oldIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1 oldMinIntervalIndex := f.minIntervalIndex f.minIntervalIndex = oldToNewMap[f.minIntervalIndex] @@ -591,15 +603,13 @@ func (s *L0Sublevels) AddL0Files( newVal.orderedIntervals[i].estimatedBytes += f.Size / uint64(newIntervalDelta) } } - }) + } } updatedSublevels := make([]int, 0) // Update interval indices for new files. for i, f := range files { f.L0Index = s.levelMetadata.Len() + i - if err := newVal.addFileToSublevels(f, true /* checkInvariant */); err != nil { - return nil, err - } + newVal.addFileToSublevels(f) updatedSublevels = append(updatedSublevels, f.SubLevel) } @@ -618,13 +628,13 @@ func (s *L0Sublevels) AddL0Files( // Sort each updated sublevel in increasing key order. 
for _, sublevel := range updatedSublevels { - sort.Sort(sublevelSorter(newVal.levelFiles[sublevel])) + sortByMinIntervalIndex(newVal.levelFiles[sublevel]) } // Construct a parallel slice of sublevel B-Trees. // TODO(jackson): Consolidate and only use the B-Trees. for _, sublevel := range updatedSublevels { - tr, ls := makeBTree(btreeCmpSmallestKey(newVal.cmp), newVal.levelFiles[sublevel]) + ls := makeLevelSlice(btreeCmpSmallestKey(newVal.cmp), newVal.levelFiles[sublevel]) if sublevel == len(newVal.Levels) { newVal.Levels = append(newVal.Levels, ls) } else { @@ -632,21 +642,19 @@ func (s *L0Sublevels) AddL0Files( // populated correctly. newVal.Levels[sublevel] = ls } - tr.Release() } newVal.flushSplitUserKeys = nil newVal.calculateFlushSplitKeys(flushSplitMaxBytes) - return newVal, nil + newVal.Check() + return newVal } -// addFileToSublevels is called during L0Sublevels generation, and adds f to the +// addFileToSublevels is called during l0Sublevels generation, and adds f to the // correct sublevel's levelFiles, the relevant intervals' files slices, and sets // interval indices on f. This method, if called successively on multiple files, -// _must_ be called on successively newer files (by seqnum). If checkInvariant -// is true, it could check for this in some cases and return -// [errInvalidL0SublevelsOpt] if that invariant isn't held. -func (s *L0Sublevels) addFileToSublevels(f *FileMetadata, checkInvariant bool) error { +// _must_ be called on successively newer files (by seqnum). +func (s *l0Sublevels) addFileToSublevels(f *TableMetadata) { // This is a simple and not very accurate estimate of the number of // bytes this SSTable contributes to the intervals it is a part of. 
// @@ -659,13 +667,11 @@ func (s *L0Sublevels) addFileToSublevels(f *FileMetadata, checkInvariant bool) e for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { interval := &s.orderedIntervals[i] if len(interval.files) > 0 { - if checkInvariant && interval.files[len(interval.files)-1].LargestSeqNum > f.LargestSeqNum { - // We are sliding this file "underneath" an existing file. Throw away - // and start over in NewL0Sublevels. - return errInvalidL0SublevelsOpt + if interval.files[len(interval.files)-1].LargestSeqNum > f.LargestSeqNum { + panic(errors.AssertionFailedf("addFileToSublevels found existing newer file")) } // interval.files is sorted by sublevels, from lowest to highest. - // AddL0Files can only add files at sublevels higher than existing files + // addL0Files can only add files at sublevels higher than existing files // in the same key intervals. if maxSublevel := interval.files[len(interval.files)-1].SubLevel; subLevel <= maxSublevel { subLevel = maxSublevel + 1 @@ -682,17 +688,16 @@ func (s *L0Sublevels) addFileToSublevels(f *FileMetadata, checkInvariant bool) e } f.SubLevel = subLevel if subLevel > len(s.levelFiles) { - return errors.Errorf("chose a sublevel beyond allowed range of sublevels: %d vs 0-%d", subLevel, len(s.levelFiles)) + panic(errors.AssertionFailedf("chose a sublevel beyond allowed range of sublevels: %d vs 0-%d", subLevel, len(s.levelFiles))) } if subLevel == len(s.levelFiles) { - s.levelFiles = append(s.levelFiles, []*FileMetadata{f}) + s.levelFiles = append(s.levelFiles, []*TableMetadata{f}) } else { s.levelFiles[subLevel] = append(s.levelFiles[subLevel], f) } - return nil } -func (s *L0Sublevels) calculateFlushSplitKeys(flushSplitMaxBytes int64) { +func (s *l0Sublevels) calculateFlushSplitKeys(flushSplitMaxBytes int64) { var cumulativeBytes uint64 // Multiply flushSplitMaxBytes by the number of sublevels. This prevents // excessive flush splitting when the number of sublevels increases. 
@@ -713,33 +718,35 @@ func (s *L0Sublevels) calculateFlushSplitKeys(flushSplitMaxBytes int64) { // files. Must be called after sublevel initialization. // // Requires DB.mu *and* the manifest lock to be held. -func (s *L0Sublevels) InitCompactingFileInfo(inProgress []L0Compaction) { +func (s *l0Sublevels) InitCompactingFileInfo(inProgress []L0Compaction) { for i := range s.orderedIntervals { s.orderedIntervals[i].compactingFileCount = 0 s.orderedIntervals[i].isBaseCompacting = false s.orderedIntervals[i].intervalRangeIsBaseCompacting = false } - iter := s.levelMetadata.Iter() - for f := iter.First(); f != nil; f = iter.Next() { + for f := range s.levelMetadata.All() { if invariants.Enabled { - if !bytes.Equal(s.orderedIntervals[f.minIntervalIndex].startKey.key, f.Smallest.UserKey) { - panic(fmt.Sprintf("f.minIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s", - s.formatKey(s.orderedIntervals[f.minIntervalIndex].startKey.key), s.formatKey(f.Smallest.UserKey))) + bounds := f.UserKeyBounds() + if !bytes.Equal(s.orderedIntervals[f.minIntervalIndex].startKey.key, bounds.Start) { + panic(fmt.Sprintf("f.minIntervalIndex in TableMetadata out of sync with intervals in L0Sublevels: %s != %s", + s.formatKey(s.orderedIntervals[f.minIntervalIndex].startKey.key), s.formatKey(bounds.Start))) } - if !bytes.Equal(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, f.Largest.UserKey) { - panic(fmt.Sprintf("f.maxIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s", - s.formatKey(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key), s.formatKey(f.Smallest.UserKey))) + if !bytes.Equal(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, bounds.End.Key) { + panic(fmt.Sprintf("f.maxIntervalIndex in TableMetadata out of sync with intervals in L0Sublevels: %s != %s", + s.formatKey(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key), s.formatKey(bounds.Start))) } } if !f.IsCompacting() { continue } if invariants.Enabled 
{ - if s.cmp(s.orderedIntervals[f.minIntervalIndex].startKey.key, f.Smallest.UserKey) != 0 || s.cmp(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, f.Largest.UserKey) != 0 { - panic(fmt.Sprintf("file %s has inconsistent L0 Sublevel interval bounds: %s-%s, %s-%s", f.FileNum, + bounds := f.UserKeyBounds() + if s.cmp(s.orderedIntervals[f.minIntervalIndex].startKey.key, bounds.Start) != 0 || + s.cmp(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, bounds.End.Key) != 0 { + panic(fmt.Sprintf("file %s has inconsistent L0 Sublevel interval bounds: %s-%s, %s-%s", f.TableNum, s.orderedIntervals[f.minIntervalIndex].startKey.key, s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, - f.Smallest.UserKey, f.Largest.UserKey)) + bounds.Start, bounds.End.Key)) } } for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { @@ -758,13 +765,13 @@ func (s *L0Sublevels) InitCompactingFileInfo(inProgress []L0Compaction) { // were added after the compaction initiated, and the active compaction // files straddle the input file. Mark these intervals as base compacting. 
for _, c := range inProgress { - startIK := intervalKey{key: c.Smallest.UserKey, isLargest: false} - endIK := intervalKey{key: c.Largest.UserKey, isLargest: !c.Largest.IsExclusiveSentinel()} - start := sort.Search(len(s.orderedIntervals), func(i int) bool { - return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, startIK) >= 0 + startIK := intervalKey{key: c.Bounds.Start, isInclusiveEndBound: false} + endIK := intervalKey{key: c.Bounds.End.Key, isInclusiveEndBound: c.Bounds.End.Kind == base.Inclusive} + start, _ := slices.BinarySearchFunc(s.orderedIntervals, startIK, func(a fileInterval, b intervalKey) int { + return intervalKeyCompare(s.cmp, a.startKey, b) }) - end := sort.Search(len(s.orderedIntervals), func(i int) bool { - return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, endIK) >= 0 + end, _ := slices.BinarySearchFunc(s.orderedIntervals, endIK, func(a fileInterval, b intervalKey) int { + return intervalKeyCompare(s.cmp, a.startKey, b) }) for i := start; i < end && i < len(s.orderedIntervals); i++ { interval := &s.orderedIntervals[i] @@ -790,13 +797,40 @@ func (s *L0Sublevels) InitCompactingFileInfo(inProgress []L0Compaction) { } } +// Check performs sanity checks on l0Sublevels in invariants mode. +func (s *l0Sublevels) Check() { + if !invariants.Enabled { + return + } + iter := s.levelMetadata.Iter() + n := 0 + for t := iter.First(); t != nil; n, t = n+1, iter.Next() { + if t.L0Index != n { + panic(fmt.Sprintf("t.L0Index out of sync (%d vs %d)", t.L0Index, n)) + } + } + if len(s.Levels) != len(s.levelFiles) { + panic("Levels and levelFiles inconsistency") + } + for i := range s.Levels { + if s.Levels[i].Len() != len(s.levelFiles[i]) { + panic("Levels and levelFiles inconsistency") + } + for _, t := range s.levelFiles[i] { + if t.SubLevel != i { + panic("t.SubLevel out of sync") + } + } + } +} + // String produces a string containing useful debug information. Useful in test // code and debugging. 
-func (s *L0Sublevels) String() string { +func (s *l0Sublevels) String() string { return s.describe(false) } -func (s *L0Sublevels) describe(verbose bool) string { +func (s *l0Sublevels) describe(verbose bool) string { var buf strings.Builder fmt.Fprintf(&buf, "file count: %d, sublevels: %d, intervals: %d\nflush split keys(%d): [", s.levelMetadata.Len(), len(s.levelFiles), len(s.orderedIntervals), len(s.flushSplitUserKeys)) @@ -837,7 +871,7 @@ func (s *L0Sublevels) describe(verbose bool) string { intervalsBytes += s.orderedIntervals[k].estimatedBytes } fmt.Fprintf(&buf, "wide file: %d, [%d, %d], byte fraction: %f\n", - f.FileNum, f.minIntervalIndex, f.maxIntervalIndex, + f.TableNum, f.minIntervalIndex, f.maxIntervalIndex, float64(intervalsBytes)/float64(s.fileBytes)) } } @@ -879,11 +913,11 @@ func (s *L0Sublevels) describe(verbose bool) string { return buf.String() } -// ReadAmplification returns the contribution of L0Sublevels to the read +// ReadAmplification returns the contribution of l0Sublevels to the read // amplification for any particular point key. It is the maximum height of any // tracked fileInterval. This is always less than or equal to the number of // sublevels. -func (s *L0Sublevels) ReadAmplification() int { +func (s *l0Sublevels) ReadAmplification() int { amp := 0 for i := range s.orderedIntervals { interval := &s.orderedIntervals[i] @@ -895,19 +929,13 @@ func (s *L0Sublevels) ReadAmplification() int { return amp } -// UserKeyRange encodes a key range in user key space. A UserKeyRange's Start -// and End boundaries are both inclusive. -type UserKeyRange struct { - Start, End []byte -} - // InUseKeyRanges returns the merged table bounds of L0 files overlapping the // provided user key range. The returned key ranges are sorted and // nonoverlapping. 
-func (s *L0Sublevels) InUseKeyRanges(smallest, largest []byte) []UserKeyRange { +func (s *l0Sublevels) InUseKeyRanges(smallest, largest []byte) []base.UserKeyBounds { // Binary search to find the provided keys within the intervals. - startIK := intervalKey{key: smallest, isLargest: false} - endIK := intervalKey{key: largest, isLargest: true} + startIK := intervalKey{key: smallest, isInclusiveEndBound: false} + endIK := intervalKey{key: largest, isInclusiveEndBound: true} start := sort.Search(len(s.orderedIntervals), func(i int) bool { return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, startIK) > 0 }) @@ -919,8 +947,8 @@ func (s *L0Sublevels) InUseKeyRanges(smallest, largest []byte) []UserKeyRange { return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, endIK) > 0 }) - var keyRanges []UserKeyRange - var curr *UserKeyRange + var keyRanges []base.UserKeyBounds + var curr *base.UserKeyBounds for i := start; i < end; { // Intervals with no files are not in use and can be skipped, once we // end the current UserKeyRange. @@ -932,7 +960,7 @@ func (s *L0Sublevels) InUseKeyRanges(smallest, largest []byte) []UserKeyRange { // If curr is nil, start a new in-use key range. if curr == nil { - keyRanges = append(keyRanges, UserKeyRange{ + keyRanges = append(keyRanges, base.UserKeyBounds{ Start: s.orderedIntervals[i].startKey.key, }) curr = &keyRanges[len(keyRanges)-1] @@ -947,7 +975,7 @@ func (s *L0Sublevels) InUseKeyRanges(smallest, largest []byte) []UserKeyRange { // maxIdx starts. We must set curr.End now, before making that leap, // because this iteration may be the last. i = maxIdx - curr.End = s.orderedIntervals[i+1].startKey.key + curr.End = s.orderedIntervals[i+1].startKey.toEndBoundary() continue } @@ -955,7 +983,7 @@ func (s *L0Sublevels) InUseKeyRanges(smallest, largest []byte) []UserKeyRange { // interval. Update the current end to be the next interval's start key. 
// Note that curr is not necessarily finished, because there may be an // abutting non-empty interval. - curr.End = s.orderedIntervals[i+1].startKey.key + curr.End = s.orderedIntervals[i+1].startKey.toEndBoundary() i++ } return keyRanges @@ -967,7 +995,7 @@ func (s *L0Sublevels) InUseKeyRanges(smallest, largest []byte) []UserKeyRange { // to include in the prev sstable). These are user keys so that range tombstones // can be properly truncated (untruncated range tombstones are not permitted for // L0 files). -func (s *L0Sublevels) FlushSplitKeys() [][]byte { +func (s *l0Sublevels) FlushSplitKeys() [][]byte { return s.flushSplitUserKeys } @@ -976,7 +1004,7 @@ func (s *L0Sublevels) FlushSplitKeys() [][]byte { // picker to decide compaction score for L0. There is no scoring for intra-L0 // compactions -- they only run if L0 score is high but we're unable to pick an // L0 -> Lbase compaction. -func (s *L0Sublevels) MaxDepthAfterOngoingCompactions() int { +func (s *l0Sublevels) MaxDepthAfterOngoingCompactions() int { depth := 0 for i := range s.orderedIntervals { interval := &s.orderedIntervals[i] @@ -994,7 +1022,7 @@ func (s *L0Sublevels) MaxDepthAfterOngoingCompactions() int { // this a pure sanity checker. 
// //lint:ignore U1000 - useful for debugging -func (s *L0Sublevels) checkCompaction(c *L0CompactionFiles) error { +func (s *l0Sublevels) checkCompaction(c *L0CompactionFiles) error { includedFiles := newBitSet(s.levelMetadata.Len()) fileIntervalsByLevel := make([]struct { min int @@ -1047,8 +1075,8 @@ func (s *L0Sublevels) checkCompaction(c *L0CompactionFiles) error { if fileIntervalsByLevel[level].max > max { max = fileIntervalsByLevel[level].max } - index := sort.Search(len(s.levelFiles[level]), func(i int) bool { - return s.levelFiles[level][i].maxIntervalIndex >= min + index, _ := slices.BinarySearchFunc(s.levelFiles[level], min, func(a *TableMetadata, b int) int { + return stdcmp.Compare(a.maxIntervalIndex, b) }) // start := index for ; index < len(s.levelFiles[level]); index++ { @@ -1059,7 +1087,7 @@ func (s *L0Sublevels) checkCompaction(c *L0CompactionFiles) error { if c.isIntraL0 && f.LargestSeqNum >= c.earliestUnflushedSeqNum { return errors.Errorf( "sstable %s in compaction has sequence numbers higher than the earliest unflushed seqnum %d: %d-%d", - f.FileNum, c.earliestUnflushedSeqNum, f.SmallestSeqNum, + f.TableNum, c.earliestUnflushedSeqNum, f.SmallestSeqNum, f.LargestSeqNum) } if !includedFiles[f.L0Index] { @@ -1067,16 +1095,16 @@ func (s *L0Sublevels) checkCompaction(c *L0CompactionFiles) error { fmt.Fprintf(&buf, "bug %t, seed interval: %d: level %d, sl index %d, f.index %d, min %d, max %d, pre-min %d, pre-max %d, f.min %d, f.max %d, filenum: %d, isCompacting: %t\n%s\n", c.isIntraL0, c.seedInterval, level, index, f.L0Index, min, max, c.preExtensionMinInterval, c.preExtensionMaxInterval, f.minIntervalIndex, f.maxIntervalIndex, - f.FileNum, f.IsCompacting(), s) + f.TableNum, f.IsCompacting(), s) fmt.Fprintf(&buf, "files included:\n") for _, f := range c.Files { fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n", - f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex) + f.TableNum, f.SubLevel, f.L0Index, 
f.minIntervalIndex, f.maxIntervalIndex) } fmt.Fprintf(&buf, "files added:\n") for _, f := range c.filesAdded { fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n", - f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex) + f.TableNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex) } return errors.New(buf.String()) } @@ -1085,18 +1113,17 @@ func (s *L0Sublevels) checkCompaction(c *L0CompactionFiles) error { return nil } -// UpdateStateForStartedCompaction updates internal L0Sublevels state for a +// UpdateStateForStartedCompaction updates internal l0Sublevels state for a // recently started compaction. isBase specifies if this is a base compaction; // if false, this is assumed to be an intra-L0 compaction. The specified // compaction must be involving L0 SSTables. It's assumed that the Compacting -// and IsIntraL0Compacting fields are already set on all [FileMetadata]s passed +// and IsIntraL0Compacting fields are already set on all [TableMetadata]s passed // in. -func (s *L0Sublevels) UpdateStateForStartedCompaction(inputs []LevelSlice, isBase bool) error { +func (s *l0Sublevels) UpdateStateForStartedCompaction(inputs []LevelSlice, isBase bool) error { minIntervalIndex := -1 maxIntervalIndex := 0 for i := range inputs { - iter := inputs[i].Iter() - for f := iter.First(); f != nil; f = iter.Next() { + for f := range inputs[i].All() { for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { interval := &s.orderedIntervals[i] interval.compactingFileCount++ @@ -1126,7 +1153,7 @@ func (s *L0Sublevels) UpdateStateForStartedCompaction(inputs []LevelSlice, isBas // compaction (such as Files), as well as for picking between candidate // compactions (eg. fileBytes and seedIntervalStackDepthReduction). 
type L0CompactionFiles struct { - Files []*FileMetadata + Files []*TableMetadata FilesIncluded bitSet // A "seed interval" is an interval with a high stack depth that was chosen @@ -1152,12 +1179,12 @@ type L0CompactionFiles struct { // Set for intra-L0 compactions. SSTables with sequence numbers greater // than earliestUnflushedSeqNum cannot be a part of intra-L0 compactions. isIntraL0 bool - earliestUnflushedSeqNum uint64 + earliestUnflushedSeqNum base.SeqNum // For debugging purposes only. Used in checkCompaction(). preExtensionMinInterval int preExtensionMaxInterval int - filesAdded []*FileMetadata + filesAdded []*TableMetadata } // Clone allocates a new L0CompactionFiles, with the same underlying data. Note @@ -1178,7 +1205,7 @@ func (l *L0CompactionFiles) String() string { } // addFile adds the specified file to the LCF. -func (l *L0CompactionFiles) addFile(f *FileMetadata) { +func (l *L0CompactionFiles) addFile(f *TableMetadata) { if l.FilesIncluded[f.L0Index] { return } @@ -1369,9 +1396,13 @@ func (is intervalSorterByDecreasingScore) Swap(i, j int) { // heuristics, for the specified Lbase files and a minimum depth of overlapping // files that can be selected for compaction. Returns nil if no compaction is // possible. -func (s *L0Sublevels) PickBaseCompaction( - minCompactionDepth int, baseFiles LevelSlice, -) (*L0CompactionFiles, error) { +func (s *l0Sublevels) PickBaseCompaction( + logger base.Logger, + minCompactionDepth int, + baseFiles LevelSlice, + baseLevel int, + problemSpans *problemspans.ByLevel, +) *L0CompactionFiles { // For LBase compactions, we consider intervals in a greedy manner in the // following order: // - Intervals that are unlikely to be blocked due @@ -1386,12 +1417,21 @@ func (s *L0Sublevels) PickBaseCompaction( // this cost we can eliminate this heuristic. 
scoredIntervals := make([]intervalAndScore, 0, len(s.orderedIntervals)) sublevelCount := len(s.levelFiles) - for i := range s.orderedIntervals { + // The last orderedInterval does not contain any files and only provides the + // end key for the preceding interval, so we exclude it from this iteration. + for i := range s.orderedIntervals[:len(s.orderedIntervals)-1] { interval := &s.orderedIntervals[i] depth := len(interval.files) - interval.compactingFileCount - if interval.isBaseCompacting || minCompactionDepth > depth { + if interval.isBaseCompacting || depth < minCompactionDepth { continue } + if problemSpans != nil { + endKey := s.orderedIntervals[i+1].startKey + bounds := base.UserKeyBoundsEndExclusiveIf(interval.startKey.key, endKey.key, !endKey.isInclusiveEndBound) + if problemSpans.Overlaps(baseLevel, bounds) { + continue + } + } if interval.intervalRangeIsBaseCompacting { scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth}) } else { @@ -1419,21 +1459,19 @@ func (s *L0Sublevels) PickBaseCompaction( // file since they are likely nearby. Note that it is possible that // those intervals have seed files at lower sub-levels so could be // viable for compaction. - if f == nil { - return nil, errors.New("no seed file found in sublevel intervals") - } consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1) if f.IsCompacting() { if f.IsIntraL0Compacting { // If we're picking a base compaction and we came across a seed // file candidate that's being intra-L0 compacted, skip the - // interval instead of erroring out. + // interval instead of emitting an error. continue } - // We chose a compaction seed file that should not be compacting. - // Usually means the score is not accurately accounting for files - // already compacting, or internal state is inconsistent. 
- return nil, errors.Errorf("file %s chosen as seed file for compaction should not be compacting", f.FileNum) + // We chose a compaction seed file that should not be compacting; this + // indicates that the the internal state is inconsistent. Note that + // base.AssertionFailedf panics in invariant builds. + logger.Errorf("%v", base.AssertionFailedf("seed file %s should not be compacting", f.TableNum)) + continue } c := s.baseCompactionUsingSeed(f, interval.index, minCompactionDepth) @@ -1449,9 +1487,9 @@ func (s *L0Sublevels) PickBaseCompaction( var baseCompacting bool for ; m != nil && !baseCompacting; m = baseIter.Next() { - cmp := s.cmp(m.Smallest.UserKey, s.orderedIntervals[c.maxIntervalIndex+1].startKey.key) + cmp := s.cmp(m.Smallest().UserKey, s.orderedIntervals[c.maxIntervalIndex+1].startKey.key) // Compaction is ending at exclusive bound of c.maxIntervalIndex+1 - if cmp > 0 || (cmp == 0 && !s.orderedIntervals[c.maxIntervalIndex+1].startKey.isLargest) { + if cmp > 0 || (cmp == 0 && !s.orderedIntervals[c.maxIntervalIndex+1].startKey.isInclusiveEndBound) { break } baseCompacting = baseCompacting || m.IsCompacting() @@ -1459,16 +1497,16 @@ func (s *L0Sublevels) PickBaseCompaction( if baseCompacting { continue } - return c, nil + return c } } - return nil, nil + return nil } // Helper function for building an L0 -> Lbase compaction using a seed interval // and seed file in that seed interval. -func (s *L0Sublevels) baseCompactionUsingSeed( - f *FileMetadata, intervalIndex int, minCompactionDepth int, +func (s *l0Sublevels) baseCompactionUsingSeed( + f *TableMetadata, intervalIndex int, minCompactionDepth int, ) *L0CompactionFiles { c := &L0CompactionFiles{ FilesIncluded: newBitSet(s.levelMetadata.Len()), @@ -1562,11 +1600,11 @@ func (s *L0Sublevels) baseCompactionUsingSeed( // include overlapping files in the specified sublevel. Returns true if the // compaction is possible (i.e. does not conflict with any base/intra-L0 // compacting files). 
-func (s *L0Sublevels) extendFiles( - sl int, earliestUnflushedSeqNum uint64, cFiles *L0CompactionFiles, +func (s *l0Sublevels) extendFiles( + sl int, earliestUnflushedSeqNum base.SeqNum, cFiles *L0CompactionFiles, ) bool { - index := sort.Search(len(s.levelFiles[sl]), func(i int) bool { - return s.levelFiles[sl][i].maxIntervalIndex >= cFiles.minIntervalIndex + index, _ := slices.BinarySearchFunc(s.levelFiles[sl], cFiles.minIntervalIndex, func(a *TableMetadata, b int) int { + return stdcmp.Compare(a.maxIntervalIndex, b) }) for ; index < len(s.levelFiles[sl]); index++ { f := s.levelFiles[sl][index] @@ -1594,16 +1632,25 @@ func (s *L0Sublevels) extendFiles( // sublevel. This method is only called when a base compaction cannot be chosen. // See comment above [PickBaseCompaction] for heuristics involved in this // selection. -func (s *L0Sublevels) PickIntraL0Compaction( - earliestUnflushedSeqNum uint64, minCompactionDepth int, -) (*L0CompactionFiles, error) { +func (s *l0Sublevels) PickIntraL0Compaction( + earliestUnflushedSeqNum base.SeqNum, minCompactionDepth int, problemSpans *problemspans.ByLevel, +) *L0CompactionFiles { scoredIntervals := make([]intervalAndScore, len(s.orderedIntervals)) - for i := range s.orderedIntervals { + // The last orderedInterval does not contain any files and only provides the + // end key for the preceding interval, so we exclude it from this iteration. 
+ for i := range s.orderedIntervals[:len(s.orderedIntervals)-1] { interval := &s.orderedIntervals[i] depth := len(interval.files) - interval.compactingFileCount if minCompactionDepth > depth { continue } + if problemSpans != nil { + endKey := s.orderedIntervals[i+1].startKey + bounds := base.UserKeyBoundsEndExclusiveIf(interval.startKey.key, endKey.key, !endKey.isInclusiveEndBound) + if problemSpans.Overlaps(0, bounds) { + continue + } + } scoredIntervals[i] = intervalAndScore{interval: i, score: depth} } sort.Sort(intervalSorterByDecreasingScore(scoredIntervals)) @@ -1617,53 +1664,48 @@ func (s *L0Sublevels) PickIntraL0Compaction( continue } - var f *FileMetadata // Pick the seed file for the interval as the file in the highest // sub-level. - stackDepthReduction := scoredInterval.score - for i := len(interval.files) - 1; i >= 0; i-- { - f = interval.files[i] - if f.IsCompacting() { - break - } - consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1) - // Can this be the seed file? Files with newer sequence numbers than - // earliestUnflushedSeqNum cannot be in the compaction. - if f.LargestSeqNum >= earliestUnflushedSeqNum { + seedFile := func() *TableMetadata { + stackDepthReduction := scoredInterval.score + for i := len(interval.files) - 1; i >= 0; i-- { + f := interval.files[i] + if f.IsCompacting() { + // This file could be in a concurrent intra-L0 or base compaction; we + // can't use this interval. + return nil + } + consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1) + // Can this be the seed file? Files with newer sequence numbers than + // earliestUnflushedSeqNum cannot be in the compaction. + if f.LargestSeqNum < earliestUnflushedSeqNum { + return f + } stackDepthReduction-- - if stackDepthReduction == 0 { - break + if stackDepthReduction < minCompactionDepth { + // Can't use this interval. + return nil } - } else { - break } - } - if stackDepthReduction < minCompactionDepth { - // Can't use this interval. 
- continue - } - - if f == nil { - return nil, errors.New("no seed file found in sublevel intervals") - } - if f.IsCompacting() { - // This file could be in a concurrent intra-L0 or base compaction. + return nil + }() + if seedFile == nil { // Try another interval. continue } // We have a seed file. Build a compaction off of that seed. c := s.intraL0CompactionUsingSeed( - f, interval.index, earliestUnflushedSeqNum, minCompactionDepth) + seedFile, interval.index, earliestUnflushedSeqNum, minCompactionDepth) if c != nil { - return c, nil + return c } } - return nil, nil + return nil } -func (s *L0Sublevels) intraL0CompactionUsingSeed( - f *FileMetadata, intervalIndex int, earliestUnflushedSeqNum uint64, minCompactionDepth int, +func (s *l0Sublevels) intraL0CompactionUsingSeed( + f *TableMetadata, intervalIndex int, earliestUnflushedSeqNum base.SeqNum, minCompactionDepth int, ) *L0CompactionFiles { // We know that all the files that overlap with intervalIndex have // LargestSeqNum < earliestUnflushedSeqNum, but for other intervals @@ -1762,7 +1804,7 @@ func (s *L0Sublevels) intraL0CompactionUsingSeed( // including any user keys for those internal keys could require choosing more // files in LBase which is undesirable. Unbounded start/end keys are indicated // by passing in the InvalidInternalKey. -func (s *L0Sublevels) ExtendL0ForBaseCompactionTo( +func (s *l0Sublevels) ExtendL0ForBaseCompactionTo( smallest, largest InternalKey, candidate *L0CompactionFiles, ) bool { firstIntervalIndex := 0 @@ -1880,7 +1922,7 @@ func (s *L0Sublevels) ExtendL0ForBaseCompactionTo( // // TODO(bilal): Add more targeted tests for this method, through // ExtendL0ForBaseCompactionTo and intraL0CompactionUsingSeed. 
-func (s *L0Sublevels) extendCandidateToRectangle( +func (s *l0Sublevels) extendCandidateToRectangle( minIntervalIndex int, maxIntervalIndex int, candidate *L0CompactionFiles, isBase bool, ) bool { candidate.preExtensionMinInterval = candidate.minIntervalIndex @@ -2025,7 +2067,7 @@ func (s *L0Sublevels) extendCandidateToRectangle( if f.IsCompacting() { // TODO(bilal): Do a logger.Fatalf instead of a panic, for // cleaner unwinding and error messages. - panic(fmt.Sprintf("expected %s to not be compacting", f.FileNum)) + panic(fmt.Sprintf("expected %s to not be compacting", f.TableNum)) } if candidate.isIntraL0 && f.LargestSeqNum >= candidate.earliestUnflushedSeqNum { continue @@ -2038,3 +2080,199 @@ func (s *L0Sublevels) extendCandidateToRectangle( } return addedCount > 0 } + +// L0Organizer keeps track of L0 state, including the subdivision into +// sublevels. +// +// It is designed to be used as a singleton (per DB) which gets updated as +// the version changes. It is used to initialize L0-related Version fields. +// +// The level 0 sstables are organized in a series of sublevels. Similar to the +// seqnum invariant in normal levels, there is no internal key in a lower +// sublevel table that has both the same user key and a higher sequence number. +// Within a sublevel, tables are sorted by their internal key range and any two +// tables at the same sublevel do not overlap. Unlike the normal levels, +// sublevel n contains older tables (lower sequence numbers) than sublevel n+1 +// (this is because the number of sublevels is variable). +type L0Organizer struct { + cmp base.Compare + formatKey base.FormatKey + flushSplitBytes int64 + generation int64 + + // levelMetadata is the current L0. + levelMetadata LevelMetadata + + // l0Sublevels reflects the current L0. + *l0Sublevels +} + +// NewL0Organizer creates the L0 organizer. The L0 organizer is responsible for +// maintaining the current L0 state and is kept in-sync with the current Version. 
+// +// flushSplitBytes denotes the target number of bytes per sublevel in each flush +// split interval (i.e. range between two flush split keys) in L0 sstables. When +// set to zero, only a single sstable is generated by each flush. When set to a +// non-zero value, flushes are split at points to meet L0's TargetFileSize, any +// grandparent-related overlap options, and at boundary keys of L0 flush split +// intervals (which are targeted to contain around FlushSplitBytes bytes in each +// sublevel between pairs of boundary keys). Splitting sstables during flush +// allows increased compaction flexibility and concurrency when those tables are +// compacted to lower levels. +func NewL0Organizer(comparer *base.Comparer, flushSplitBytes int64) *L0Organizer { + o := &L0Organizer{ + cmp: comparer.Compare, + formatKey: comparer.FormatKey, + flushSplitBytes: flushSplitBytes, + levelMetadata: MakeLevelMetadata(comparer.Compare, 0, nil), + } + var err error + o.l0Sublevels, err = newL0Sublevels(&o.levelMetadata, o.cmp, o.formatKey, o.flushSplitBytes) + if err != nil { + panic(errors.AssertionFailedf("error generating empty L0Sublevels: %s", err)) + } + return o +} + +// PrepareUpdate is the first step in the two-step process to update the +// L0Organizer. This first step performs as much work as it can without +// modifying the L0Organizer. +// +// This method can be called concurrently with other methods (other than +// PerformUpdate). It allows doing most of the update work outside an important +// lock. +func (o *L0Organizer) PrepareUpdate(bve *BulkVersionEdit, newVersion *Version) L0PreparedUpdate { + addedL0Tables := bve.AddedTables[0] + deletedL0Tables := bve.DeletedTables[0] + newLevelMeta := &newVersion.Levels[0] + if invariants.Enabled && invariants.Sometimes(10) { + // Verify that newLevelMeta = m.levelMetadata + addedL0Tables - deletedL0Tables. 
+ verifyLevelMetadataTransition(&o.levelMetadata, newLevelMeta, addedL0Tables, deletedL0Tables) + } + + if len(addedL0Tables) == 0 && len(deletedL0Tables) == 0 { + return L0PreparedUpdate{ + generation: o.generation, + newSublevels: o.l0Sublevels, + } + } + + if len(deletedL0Tables) == 0 { + if files, ok := o.l0Sublevels.canUseAddL0Files(addedL0Tables, newLevelMeta); ok { + return L0PreparedUpdate{ + generation: o.generation, + addL0Files: files, + } + } + } + newSublevels, err := newL0Sublevels(newLevelMeta, o.cmp, o.formatKey, o.flushSplitBytes) + if err != nil { + panic(errors.AssertionFailedf("error generating L0Sublevels: %s", err)) + } + + return L0PreparedUpdate{ + generation: o.generation, + newSublevels: newSublevels, + } +} + +// L0PreparedUpdate is returned by L0Organizer.PrepareUpdate(), to be passed to +// PerformUpdate(). +type L0PreparedUpdate struct { + generation int64 + + // Exactly one of the following fields will be set. + addL0Files []*TableMetadata + newSublevels *l0Sublevels +} + +// PerformUpdate applies an update the L0 organizer which was previously +// prepared using PrepareUpdate. +// +// Sets newVersion.L0SublevelFiles (which is immutable once set). +// +// This method cannot be called concurrently with any other methods. +func (o *L0Organizer) PerformUpdate(prepared L0PreparedUpdate, newVersion *Version) { + if prepared.generation != o.generation { + panic("invalid L0 update generation") + } + o.levelMetadata = newVersion.Levels[0] + o.generation++ + if prepared.addL0Files != nil { + newSublevels := o.l0Sublevels.addL0Files(prepared.addL0Files, o.flushSplitBytes, &o.levelMetadata) + // In invariants mode, sometimes rebuild from scratch to verify that + // AddL0Files did the right thing. Note that NewL0Sublevels updates + // fields in TableMetadata like L0Index, so we don't want to do this + // every time. 
+ if invariants.Enabled && invariants.Sometimes(10) { + expectedSublevels, err := newL0Sublevels(&o.levelMetadata, o.cmp, o.formatKey, o.flushSplitBytes) + if err != nil { + panic(fmt.Sprintf("error when regenerating sublevels: %s", err)) + } + s1 := describeSublevels(o.formatKey, false /* verbose */, expectedSublevels.Levels) + s2 := describeSublevels(o.formatKey, false /* verbose */, newSublevels.Levels) + if s1 != s2 { + // Add verbosity. + s1 := describeSublevels(o.formatKey, true /* verbose */, expectedSublevels.Levels) + s2 := describeSublevels(o.formatKey, true /* verbose */, newSublevels.Levels) + panic(fmt.Sprintf("incremental L0 sublevel generation produced different output than regeneration: %s != %s", s1, s2)) + } + } + o.l0Sublevels = newSublevels + } else { + o.l0Sublevels = prepared.newSublevels + } + newVersion.L0SublevelFiles = o.l0Sublevels.Levels +} + +// ResetForTesting reinitializes the L0Organizer to reflect the given version. +// Sets v.L0SublevelFiles. +func (o *L0Organizer) ResetForTesting(v *Version) { + o.levelMetadata = v.Levels[0] + o.generation = 0 + var err error + o.l0Sublevels, err = newL0Sublevels(&v.Levels[0], o.cmp, o.formatKey, o.flushSplitBytes) + if err != nil { + panic(errors.AssertionFailedf("error generating L0Sublevels: %s", err)) + } + v.L0SublevelFiles = o.l0Sublevels.Levels +} + +// verifyLevelMetadataTransition verifies that newLevel matches oldLevel after +// adding and removing the specified tables. 
+func verifyLevelMetadataTransition( + oldLevel, newLevel *LevelMetadata, + addedTables map[base.TableNum]*TableMetadata, + deletedTables map[base.TableNum]*TableMetadata, +) { + m := make(map[base.TableNum]*TableMetadata, oldLevel.Len()) + iter := oldLevel.Iter() + for t := iter.First(); t != nil; t = iter.Next() { + m[t.TableNum] = t + } + for n, t := range addedTables { + if m[n] != nil { + panic("added table that already exists in old level") + } + m[n] = t + } + for n, t := range deletedTables { + if m[n] == nil { + panic("deleted table not in old level") + } + if m[n] != t { + panic("deleted table does not match old level") + } + delete(m, n) + } + iter = newLevel.Iter() + for t := iter.First(); t != nil; t = iter.Next() { + if m[t.TableNum] == nil { + panic("unknown table in new level") + } + delete(m, t.TableNum) + } + if len(m) != 0 { + panic("tables missing from the new level") + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/layer.go b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/layer.go new file mode 100644 index 0000000..12e824d --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/layer.go @@ -0,0 +1,121 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "fmt" + "math" + + "github.com/cockroachdb/redact" +) + +// Layer represents a section of the logical sstable hierarchy. It can represent: +// - a level L1 through L6, or +// - the entire L0 level, or +// - a specific L0 sublevel, or +// - the layer of flushable ingests (which is conceptually above the LSM). +type Layer struct { + kind layerKind + value uint16 +} + +// Level returns a Layer that represents an entire level (L0 through L6). 
+func Level(level int) Layer { + if level < 0 || level >= NumLevels { + panic("invalid level") + } + return Layer{ + kind: levelLayer, + value: uint16(level), + } +} + +// L0Sublevel returns a Layer that represents a specific L0 sublevel. +func L0Sublevel(sublevel int) Layer { + // Note: Pebble stops writes once we get to 1000 sublevels. + if sublevel < 0 || sublevel > math.MaxUint16 { + panic("invalid sublevel") + } + return Layer{ + kind: l0SublevelLayer, + value: uint16(sublevel), + } +} + +// FlushableIngestsLayer returns a Layer that represents the flushable ingests +// layer (which is logically above L0). +func FlushableIngestsLayer() Layer { + return Layer{ + kind: flushableIngestsLayer, + } +} + +// IsSet returns true if l has been initialized. +func (l Layer) IsSet() bool { + return l.kind != 0 +} + +// IsFlushableIngests returns true if the layer represents flushable ingests. +func (l Layer) IsFlushableIngests() bool { + return l.kind == flushableIngestsLayer +} + +// IsL0Sublevel returns true if the layer represents an L0 sublevel. +func (l Layer) IsL0Sublevel() bool { + return l.kind == l0SublevelLayer +} + +// Level returns the level for the layer. Must not be called if +// the layer represents flushable ingests. +func (l Layer) Level() int { + switch l.kind { + case levelLayer: + return int(l.value) + case l0SublevelLayer: + return 0 + case flushableIngestsLayer: + panic("flushable ingests layer") + default: + panic("invalid layer") + } +} + +// Sublevel returns the L0 sublevel. Can only be called if the layer represents +// an L0 sublevel. 
+func (l Layer) Sublevel() int { + if !l.IsL0Sublevel() { + panic("not an L0 sublevel layer") + } + return int(l.value) +} + +func (l Layer) String() string { + switch l.kind { + case levelLayer: + return fmt.Sprintf("L%d", l.value) + case l0SublevelLayer: + return fmt.Sprintf("L0.%d", l.value) + case flushableIngestsLayer: + return "flushable-ingests" + default: + return "unknown" + } +} + +// SafeFormat implements redact.SafeFormatter. +func (l Layer) SafeFormat(s redact.SafePrinter, verb rune) { + s.SafeString(redact.SafeString(l.String())) +} + +type layerKind uint8 + +const ( + // Entire level: value contains the level number (0 through 6). + levelLayer layerKind = iota + 1 + // L0 sublevel: value contains the sublevel number. + l0SublevelLayer + // Flushable ingests layer: value is unused. + flushableIngestsLayer +) diff --git a/vendor/github.com/cockroachdb/pebble/internal/manifest/level_metadata.go b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/level_metadata.go similarity index 54% rename from vendor/github.com/cockroachdb/pebble/internal/manifest/level_metadata.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/manifest/level_metadata.go index d48e277..aab5636 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/manifest/level_metadata.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/level_metadata.go @@ -7,85 +7,104 @@ package manifest import ( "bytes" "fmt" + "iter" + "reflect" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" ) // LevelMetadata contains metadata for all of the files within // a level of the LSM. type LevelMetadata struct { - level int - totalSize uint64 + level int + totalTableSize uint64 + totalRefSize uint64 // NumVirtual is the number of virtual sstables in the level. 
NumVirtual uint64 - // VirtualSize is the size of the virtual sstables in the level. - VirtualSize uint64 - tree btree + // VirtualTableSize is the size of the virtual sstables in the level. + VirtualTableSize uint64 + tree btree[*TableMetadata] } // clone makes a copy of the level metadata, implicitly increasing the ref // count of every file contained within lm. func (lm *LevelMetadata) clone() LevelMetadata { return LevelMetadata{ - level: lm.level, - totalSize: lm.totalSize, - NumVirtual: lm.NumVirtual, - VirtualSize: lm.VirtualSize, - tree: lm.tree.Clone(), + level: lm.level, + totalTableSize: lm.totalTableSize, + totalRefSize: lm.totalRefSize, + NumVirtual: lm.NumVirtual, + VirtualTableSize: lm.VirtualTableSize, + tree: lm.tree.Clone(), } } -func (lm *LevelMetadata) release() (obsolete []*FileBacking) { - return lm.tree.Release() +func (lm *LevelMetadata) release(of ObsoleteFilesSet) { + lm.tree.Release(of) } -func makeLevelMetadata(cmp Compare, level int, files []*FileMetadata) LevelMetadata { +// MakeLevelMetadata creates a LevelMetadata with the given files. 
+func MakeLevelMetadata(cmp Compare, level int, files []*TableMetadata) LevelMetadata { bcmp := btreeCmpSeqNum if level > 0 { bcmp = btreeCmpSmallestKey(cmp) } var lm LevelMetadata lm.level = level - lm.tree, _ = makeBTree(bcmp, files) + lm.tree = makeBTree(bcmp, files) for _, f := range files { - lm.totalSize += f.Size + lm.totalTableSize += f.Size + lm.totalRefSize += f.EstimatedReferenceSize() if f.Virtual { lm.NumVirtual++ - lm.VirtualSize += f.Size + lm.VirtualTableSize += f.Size } } return lm } -func makeBTree(cmp btreeCmp, files []*FileMetadata) (btree, LevelSlice) { - var t btree - t.cmp = cmp +func makeBTree[M fileMetadata](bcmp btreeCmp[M], files []M) btree[M] { + t := btree[M]{bcmp: bcmp} for _, f := range files { - t.Insert(f) + if err := t.Insert(f); err != nil { + panic(err) + } } - return t, newLevelSlice(t.Iter()) + return t +} + +func makeLevelSlice(bcmp btreeCmp[*TableMetadata], files []*TableMetadata) LevelSlice { + t := makeBTree(bcmp, files) + slice := newLevelSlice(tableMetadataIter(&t)) + slice.verifyInvariants() + // We can release the tree because the nodes that are referenced by the + // LevelSlice are immutable and we never recycle them. 
+ t.Release(ignoreObsoleteFiles{}) + return slice } -func (lm *LevelMetadata) insert(f *FileMetadata) error { +func (lm *LevelMetadata) insert(f *TableMetadata) error { if err := lm.tree.Insert(f); err != nil { return err } - lm.totalSize += f.Size + lm.totalTableSize += f.Size + lm.totalRefSize += f.EstimatedReferenceSize() if f.Virtual { lm.NumVirtual++ - lm.VirtualSize += f.Size + lm.VirtualTableSize += f.Size } return nil } -func (lm *LevelMetadata) remove(f *FileMetadata) bool { - lm.totalSize -= f.Size +func (lm *LevelMetadata) remove(f *TableMetadata) { + lm.totalTableSize -= f.Size + lm.totalRefSize -= f.EstimatedReferenceSize() if f.Virtual { lm.NumVirtual-- - lm.VirtualSize -= f.Size + lm.VirtualTableSize -= f.Size } - return lm.tree.Delete(f) + lm.tree.Delete(f, assertNoObsoleteFiles{}) } // Empty indicates whether there are any files in the level. @@ -98,69 +117,62 @@ func (lm *LevelMetadata) Len() int { return lm.tree.Count() } -// Size returns the cumulative size of all the files within the level. -func (lm *LevelMetadata) Size() uint64 { - return lm.totalSize +// AggregateSize returns the aggregate size estimate of all sstables within the +// level, plus an estimate of the physical size of values stored externally in +// blob files. This quantity is equal to TableSize() + EstimatedReferenceSize(). +func (lm *LevelMetadata) AggregateSize() uint64 { + return lm.totalTableSize + lm.totalRefSize } -// Iter constructs a LevelIterator over the entire level. -func (lm *LevelMetadata) Iter() LevelIterator { - return LevelIterator{iter: lm.tree.Iter()} +// TableSize returns the cumulative size of all sstables within the level. This +// quantity does NOT include the size of values stored externally in blob files. +func (lm *LevelMetadata) TableSize() uint64 { + return lm.totalTableSize } -// Slice constructs a slice containing the entire level. 
-func (lm *LevelMetadata) Slice() LevelSlice { - return newLevelSlice(lm.tree.Iter()) +// EstimatedReferenceSize returns an estimate of the physical size of all the +// file's blob references in the table. This sum, added to TableSize(), yields +// AggregateSize(). +func (lm *LevelMetadata) EstimatedReferenceSize() uint64 { + return lm.totalRefSize } -// Find finds the provided file in the level if it exists. -func (lm *LevelMetadata) Find(cmp base.Compare, m *FileMetadata) *LevelFile { - iter := lm.Iter() - if lm.level != 0 { - // If lm holds files for levels >0, we can narrow our search by binary - // searching by bounds. - o := overlaps(iter, cmp, m.Smallest.UserKey, - m.Largest.UserKey, m.Largest.IsExclusiveSentinel()) - iter = o.Iter() - } - for f := iter.First(); f != nil; f = iter.Next() { - if f == m { - lf := iter.Take() - return &lf +// Iter constructs a LevelIterator over the entire level. +func (lm *LevelMetadata) Iter() LevelIterator { + return LevelIterator{iter: tableMetadataIter(&lm.tree)} +} + +// All returns an iterator over all files in the level. +func (lm *LevelMetadata) All() iter.Seq[*TableMetadata] { + return func(yield func(*TableMetadata) bool) { + iter := lm.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if !yield(f) { + break + } } } - return nil } -// Annotation lazily calculates and returns the annotation defined by -// Annotator. The Annotator is used as the key for pre-calculated -// values, so equal Annotators must be used to avoid duplicate computations -// and cached annotations. Annotation must not be called concurrently, and in -// practice this is achieved by requiring callers to hold DB.mu. -func (lm *LevelMetadata) Annotation(annotator Annotator) interface{} { - if lm.Empty() { - return annotator.Zero(nil) - } - v, _ := lm.tree.root.Annotation(annotator) - return v +// Slice constructs a slice containing the entire level. 
+func (lm *LevelMetadata) Slice() LevelSlice { + return newLevelSlice(tableMetadataIter(&lm.tree)) } -// InvalidateAnnotation clears any cached annotations defined by Annotator. The -// Annotator is used as the key for pre-calculated values, so equal Annotators -// must be used to clear the appropriate cached annotation. InvalidateAnnotation -// must not be called concurrently, and in practice this is achieved by -// requiring callers to hold DB.mu. -func (lm *LevelMetadata) InvalidateAnnotation(annotator Annotator) { - if lm.Empty() { - return +// Find finds the provided file in the level. If it exists, returns a LevelSlice +// that contains just that file; otherwise, returns an empty LevelSlice. +func (lm *LevelMetadata) Find(cmp base.Compare, m *TableMetadata) LevelSlice { + iter := lm.Iter() + if iter.find(m) { + return iter.Take().slice } - lm.tree.root.InvalidateAnnotation(annotator) + return LevelSlice{} } // LevelFile holds a file's metadata along with its position // within a level of the LSM. type LevelFile struct { - *FileMetadata + *TableMetadata slice LevelSlice } @@ -173,37 +185,30 @@ func (lf LevelFile) Slice() LevelSlice { // sorted by the L0 sequence number sort order. // TODO(jackson): Can we improve this interface or avoid needing to export // a slice constructor like this? -func NewLevelSliceSeqSorted(files []*FileMetadata) LevelSlice { - tr, slice := makeBTree(btreeCmpSeqNum, files) - tr.Release() - slice.verifyInvariants() - return slice +func NewLevelSliceSeqSorted(files []*TableMetadata) LevelSlice { + return makeLevelSlice(btreeCmpSeqNum, files) } // NewLevelSliceKeySorted constructs a LevelSlice over the provided files, // sorted by the files smallest keys. // TODO(jackson): Can we improve this interface or avoid needing to export // a slice constructor like this? 
-func NewLevelSliceKeySorted(cmp base.Compare, files []*FileMetadata) LevelSlice { - tr, slice := makeBTree(btreeCmpSmallestKey(cmp), files) - tr.Release() - slice.verifyInvariants() - return slice +func NewLevelSliceKeySorted(cmp base.Compare, files []*TableMetadata) LevelSlice { + return makeLevelSlice(btreeCmpSmallestKey(cmp), files) } // NewLevelSliceSpecificOrder constructs a LevelSlice over the provided files, // ordering the files by their order in the provided slice. It's used in // tests. // TODO(jackson): Update tests to avoid requiring this and remove it. -func NewLevelSliceSpecificOrder(files []*FileMetadata) LevelSlice { - tr, slice := makeBTree(btreeCmpSpecificOrder(files), files) - tr.Release() +func NewLevelSliceSpecificOrder(files []*TableMetadata) LevelSlice { + slice := makeLevelSlice(btreeCmpSpecificOrder(files), files) slice.verifyInvariants() return slice } // newLevelSlice constructs a new LevelSlice backed by iter. -func newLevelSlice(iter iterator) LevelSlice { +func newLevelSlice(iter iterator[*TableMetadata]) LevelSlice { s := LevelSlice{iter: iter} if iter.r != nil { s.length = iter.r.subtreeCount @@ -216,7 +221,9 @@ func newLevelSlice(iter iterator) LevelSlice { // by the provided start and end bounds. The provided startBound and endBound // iterators must be iterators over the same B-Tree. Both start and end bounds // are inclusive. -func newBoundedLevelSlice(iter iterator, startBound, endBound *iterator) LevelSlice { +func newBoundedLevelSlice( + iter iterator[*TableMetadata], startBound, endBound *iterator[*TableMetadata], +) LevelSlice { s := LevelSlice{ iter: iter, start: startBound, @@ -248,20 +255,19 @@ func newBoundedLevelSlice(iter iterator, startBound, endBound *iterator) LevelSl // LevelSlices should be constructed through one of the existing constructors, // not manually initialized. 
type LevelSlice struct { - iter iterator + iter iterator[*TableMetadata] length int // start and end form the inclusive bounds of a slice of files within a // level of the LSM. They may be nil if the entire B-Tree backing iter is // accessible. - start *iterator - end *iterator + start *iterator[*TableMetadata] + end *iterator[*TableMetadata] } func (ls LevelSlice) verifyInvariants() { if invariants.Enabled { - i := ls.Iter() var length int - for f := i.First(); f != nil; f = i.Next() { + for range ls.All() { length++ } if ls.length != length { @@ -270,11 +276,15 @@ func (ls LevelSlice) verifyInvariants() { } } -// Each invokes fn for each element in the slice. -func (ls LevelSlice) Each(fn func(*FileMetadata)) { - iter := ls.Iter() - for f := iter.First(); f != nil; f = iter.Next() { - fn(f) +// All returns an iterator over all files in the slice. +func (ls LevelSlice) All() iter.Seq[*TableMetadata] { + return func(yield func(*TableMetadata) bool) { + iter := ls.Iter() + for f := iter.First(); f != nil; f = iter.Next() { + if !yield(f) { + break + } + } } } @@ -282,12 +292,12 @@ func (ls LevelSlice) Each(fn func(*FileMetadata)) { func (ls LevelSlice) String() string { var buf bytes.Buffer fmt.Fprintf(&buf, "%d files: ", ls.length) - ls.Each(func(f *FileMetadata) { + for f := range ls.All() { if buf.Len() > 0 { fmt.Fprintf(&buf, " ") } fmt.Fprint(&buf, f) - }) + } return buf.String() } @@ -310,12 +320,22 @@ func (ls *LevelSlice) Len() int { return ls.length } -// SizeSum sums the size of all files in the slice. Its runtime is linear in +// AggregateSizeSum sums the size of all sstables in the slice, inclusive of the +// estimated physical size of tables' blob references. Its runtime is linear in // the length of the slice. 
-func (ls *LevelSlice) SizeSum() uint64 { +func (ls *LevelSlice) AggregateSizeSum() uint64 { var sum uint64 - iter := ls.Iter() - for f := iter.First(); f != nil; f = iter.Next() { + for f := range ls.All() { + sum += f.Size + f.EstimatedReferenceSize() + } + return sum +} + +// TableSizeSum sums the size of all sstables in the slice. Its runtime is +// linear in the length of the slice. +func (ls *LevelSlice) TableSizeSum() uint64 { + var sum uint64 + for f := range ls.All() { sum += f.Size } return sum @@ -325,8 +345,7 @@ func (ls *LevelSlice) SizeSum() uint64 { // linear in the length of the slice. func (ls *LevelSlice) NumVirtual() uint64 { var n uint64 - iter := ls.Iter() - for f := iter.First(); f != nil; f = iter.Next() { + for f := range ls.All() { if f.Virtual { n++ } @@ -334,12 +353,11 @@ func (ls *LevelSlice) NumVirtual() uint64 { return n } -// VirtualSizeSum returns the sum of the sizes of the virtual sstables in the -// level. -func (ls *LevelSlice) VirtualSizeSum() uint64 { +// VirtualTableSizeSum returns the sum of the sizes of the virtual sstables in +// the level. +func (ls *LevelSlice) VirtualTableSizeSum() uint64 { var sum uint64 - iter := ls.Iter() - for f := iter.First(); f != nil; f = iter.Next() { + for f := range ls.All() { if f.Virtual { sum += f.Size } @@ -376,6 +394,24 @@ func (ls LevelSlice) Reslice(resliceFunc func(start, end *LevelIterator)) LevelS return newBoundedLevelSlice(start.iter.clone(), &start.iter, &end.iter) } +// Overlaps returns a new LevelSlice that reflects the portion of files with +// boundaries that overlap with the provided bounds. +func (ls LevelSlice) Overlaps(cmp Compare, bounds base.UserKeyBounds) LevelSlice { + startIter := ls.Iter() + startIter.SeekGE(cmp, bounds.Start) + + // Note: newBoundedLevelSlice uses inclusive bounds, so we need to position + // endIter at the last overlapping file. 
+ endIter := ls.Iter() + endIterFile := endIter.SeekGE(cmp, bounds.End.Key) + // The first file that ends at/after bounds.End.Key might or might not overlap + // the bounds; we need to check the start key. + if endIterFile == nil || !bounds.End.IsUpperBoundFor(cmp, endIterFile.Smallest().UserKey) { + endIter.Prev() + } + return newBoundedLevelSlice(startIter.iter.clone(), &startIter.iter, &endIter.iter) +} + // KeyType is used to specify the type of keys we're looking for in // LevelIterator positioning operations. Files not containing any keys of the // desired type are skipped. @@ -393,58 +429,14 @@ const ( KeyTypeRange ) -type keyTypeAnnotator struct{} - -var _ Annotator = keyTypeAnnotator{} - -func (k keyTypeAnnotator) Zero(dst interface{}) interface{} { - var val *KeyType - if dst != nil { - val = dst.(*KeyType) - } else { - val = new(KeyType) - } - *val = KeyTypePoint - return val -} - -func (k keyTypeAnnotator) Accumulate(m *FileMetadata, dst interface{}) (interface{}, bool) { - v := dst.(*KeyType) - switch *v { - case KeyTypePoint: - if m.HasRangeKeys { - *v = KeyTypePointAndRange - } - case KeyTypePointAndRange: - // Do nothing. - default: - panic("unexpected key type") - } - return v, true -} - -func (k keyTypeAnnotator) Merge(src interface{}, dst interface{}) interface{} { - v := dst.(*KeyType) - srcVal := src.(*KeyType) - switch *v { - case KeyTypePoint: - if *srcVal == KeyTypePointAndRange { - *v = KeyTypePointAndRange - } - case KeyTypePointAndRange: - // Do nothing. - default: - panic("unexpected key type") - } - return v -} - // LevelIterator iterates over a set of files' metadata. Its zero value is an // empty iterator. type LevelIterator struct { - iter iterator - start *iterator - end *iterator + iter iterator[*TableMetadata] + // If set, start is an inclusive lower bound on the iterator. + start *iterator[*TableMetadata] + // If set, end is an inclusive upper bound on the iterator. 
+ end *iterator[*TableMetadata] filter KeyType } @@ -502,23 +494,14 @@ func (i *LevelIterator) Clone() LevelIterator { } } -// Current returns the item at the current iterator position. -// -// Current is deprecated. Callers should instead use the return value of a -// positioning operation. -func (i *LevelIterator) Current() *FileMetadata { - if !i.iter.valid() || - (i.end != nil && cmpIter(i.iter, *i.end) > 0) || - (i.start != nil && cmpIter(i.iter, *i.start) < 0) { - return nil - } - return i.iter.cur() -} - func (i *LevelIterator) empty() bool { return emptyWithBounds(i.iter, i.start, i.end) } +func (i *LevelIterator) find(m *TableMetadata) bool { + return i.iter.find(m) +} + // Filter clones the iterator and sets the desired KeyType as the key to filter // files on. func (i *LevelIterator) Filter(keyType KeyType) LevelIterator { @@ -527,7 +510,7 @@ func (i *LevelIterator) Filter(keyType KeyType) LevelIterator { return l } -func emptyWithBounds(i iterator, start, end *iterator) bool { +func emptyWithBounds(i iterator[*TableMetadata], start, end *iterator[*TableMetadata]) bool { // If i.r is nil, the iterator was constructed from an empty btree. // If the end bound is before the start bound, the bounds represent an // empty slice of the B-Tree. @@ -535,7 +518,7 @@ func emptyWithBounds(i iterator, start, end *iterator) bool { } // First seeks to the first file in the iterator and returns it. -func (i *LevelIterator) First() *FileMetadata { +func (i *LevelIterator) First() *TableMetadata { if i.empty() { return nil } @@ -551,7 +534,7 @@ func (i *LevelIterator) First() *FileMetadata { } // Last seeks to the last file in the iterator and returns it. -func (i *LevelIterator) Last() *FileMetadata { +func (i *LevelIterator) Last() *TableMetadata { if i.empty() { return nil } @@ -567,7 +550,7 @@ func (i *LevelIterator) Last() *FileMetadata { } // Next advances the iterator to the next file and returns it. 
-func (i *LevelIterator) Next() *FileMetadata { +func (i *LevelIterator) Next() *TableMetadata { if i.iter.r == nil { return nil } @@ -582,7 +565,7 @@ func (i *LevelIterator) Next() *FileMetadata { } // Prev moves the iterator the previous file and returns it. -func (i *LevelIterator) Prev() *FileMetadata { +func (i *LevelIterator) Prev() *TableMetadata { if i.iter.r == nil { return nil } @@ -596,70 +579,156 @@ func (i *LevelIterator) Prev() *FileMetadata { return i.skipFilteredBackward(i.iter.cur()) } -// SeekGE seeks to the first file in the iterator's file set with a largest -// user key greater than or equal to the provided user key. The iterator must -// have been constructed from L1+, because it requires the underlying files to -// be sorted by user keys and non-overlapping. -func (i *LevelIterator) SeekGE(cmp Compare, userKey []byte) *FileMetadata { - // TODO(jackson): Assert that i.iter.cmp == btreeCmpSmallestKey. +// SeekGE seeks to the first file with a largest key (of the desired type) that +// is an upper bound for the given user key. This is the first file that could +// contain a user key that is greater than or equal to userKey. +// +// More specifically, userKey is less than the file's largest.UserKey or they +// are equal and largest is not an exclusive sentinel. +// +// The iterator must have been constructed from L1+ or from a single sublevel of +// L0, because it requires the underlying files to be sorted by user keys and +// non-overlapping. +func (i *LevelIterator) SeekGE(cmp Compare, userKey []byte) *TableMetadata { if i.iter.r == nil { return nil } - m := i.seek(func(m *FileMetadata) bool { - return cmp(m.Largest.UserKey, userKey) >= 0 - }) - if i.filter != KeyTypePointAndRange && m != nil { - b, ok := m.LargestBound(i.filter) - if !ok { - m = i.Next() - } else if c := cmp(b.UserKey, userKey); c < 0 || c == 0 && b.IsExclusiveSentinel() { - // This file does not contain any keys of the type ≥ lower. 
It - // should be filtered, even though it does contain point keys. - m = i.Next() + i.assertNotL0Cmp() + i.iter.reset() + for { + // Logic copied from sort.Search. + // + // INVARIANT A: items[j-1].Largest().IsUpperBoundFor(cmp, userKey) == false + // INVARIANT B: items[k].Largest().IsUpperBoundFor(cmp, userKey) == true + j, k := 0, int(i.iter.n.count) + for j < k { + h := int(uint(j+k) >> 1) // avoid overflow when computing h + // j ≤ h < k + ik := &i.iter.n.items[h].PointKeyBounds + if i.iter.n.items[h].boundTypeLargest == boundTypeRangeKey { + ik = i.iter.n.items[h].RangeKeyBounds + } + c := cmp(userKey, ik.LargestUserKey()) + if c > 0 || (c == 0 && ik.largestTrailer.IsExclusiveSentinel()) { + j = h + 1 // preserves INVARIANT A + } else { + k = h // preserves INVARIANT B + } + } + i.iter.pos = int16(j) + if i.iter.n.leaf { + if i.iter.pos == i.iter.n.count { + // next, which will ascend and descend to move to the next node. + i.iter.next() + } + break } + i.iter.descend(i.iter.n, i.iter.pos) + } + + // If the iterator is filtered or has bounds, we fall into a slow path that + // filters based on the current file and constraints the iterator's position + // according to the configured bounds. + if i.filter != KeyTypePointAndRange || i.start != nil || i.end != nil { + m := i.constrainToIteratorBounds() + if i.filter != KeyTypePointAndRange && m != nil { + b, ok := m.LargestBound(i.filter) + if !ok || !b.IsUpperBoundFor(cmp, userKey) { + // The file does not contain any keys of desired key types + // that are >= userKey. + return i.Next() + } + } + return i.skipFilteredForward(m) + } + // If the iterator is not filtered and has no bounds, we fall into a fast + // path that returns the current file. + if !i.iter.valid() { + return nil } - return i.skipFilteredForward(m) + return i.iter.cur() } -// SeekLT seeks to the last file in the iterator's file set with a smallest -// user key less than the provided user key. 
The iterator must have been
-// constructed from L1+, because it requires the underlying files to be sorted
-// by user keys and non-overlapping.
-func (i *LevelIterator) SeekLT(cmp Compare, userKey []byte) *FileMetadata {
-	// TODO(jackson): Assert that i.iter.cmp == btreeCmpSmallestKey.
+// SeekLT seeks to the last file with a smallest key (of the desired type) that
+// is less than the given user key. This is the last file that could contain a
+// key less than userKey.
+//
+// The iterator must have been constructed from L1+ or from a single sublevel of
+// L0, because it requires the underlying files to be sorted by user keys and
+// non-overlapping.
+func (i *LevelIterator) SeekLT(cmp Compare, userKey []byte) *TableMetadata {
 	if i.iter.r == nil {
 		return nil
 	}
-	i.seek(func(m *FileMetadata) bool {
-		return cmp(m.Smallest.UserKey, userKey) >= 0
-	})
+	i.assertNotL0Cmp()
+	i.iter.reset()
+	for {
+		// Logic copied from sort.Search.
+		//
+		// INVARIANT A: items[j-1].Smallest().UserKey < userKey
+		// INVARIANT B: items[k].Smallest().UserKey >= userKey
+		j, k := 0, int(i.iter.n.count)
+		for j < k {
+			h := int(uint(j+k) >> 1) // avoid overflow when computing h
+			// j ≤ h < k
+			if cmp(i.iter.n.items[h].Smallest().UserKey, userKey) < 0 {
+				j = h + 1 // preserves INVARIANT A
+			} else {
+				k = h // preserves INVARIANT B
+			}
+		}
+		i.iter.pos = int16(j)
+		if i.iter.n.leaf {
+			if i.iter.pos == i.iter.n.count {
+				i.iter.next()
+			}
+			break
+		}
+		i.iter.descend(i.iter.n, i.iter.pos)
+	}
+	_ = i.constrainToIteratorBounds()
 	m := i.Prev()
+
 	// Although i.Prev() guarantees that the current file contains keys of the
 	// relevant type, it doesn't guarantee that the keys of the relevant type
-	// are < userKey.
+	// are < userKey. For example, say that we have these two files:
+	//   f1: [a, f) with keys of the desired type in the range [c, d)
+	//   f2: [h, k)
+	// and userKey is b. The seek call above will position us at f2 and Prev will
+	// position us at f1. 
if i.filter != KeyTypePointAndRange && m != nil {
 		b, ok := m.SmallestBound(i.filter)
 		if !ok {
 			panic("unreachable")
 		}
-		if c := cmp(b.UserKey, userKey); c >= 0 {
-			// This file does not contain any keys of the type ≥ lower. It
-			// should be filtered, even though it does contain point keys.
-			m = i.Prev()
+		if cmp(b.UserKey, userKey) >= 0 {
+			// This file does not contain any keys of desired key types
+			// that are < userKey.
+			return i.Prev()
 		}
 	}
-	return i.skipFilteredBackward(m)
+	return m
 }
 
-// skipFilteredForward takes the file metadata at the iterator's current
+// assertNotL0Cmp verifies that the btree associated with the iterator is
+// ordered by Smallest key (i.e. L1+ or L0 sublevel) and not by LargestSeqNum
+// (L0).
+func (i *LevelIterator) assertNotL0Cmp() {
+	if invariants.Enabled {
+		if reflect.ValueOf(i.iter.cmp).Pointer() == reflect.ValueOf(btreeCmpSeqNum).Pointer() {
+			panic("Seek used with btreeCmpSeqNum")
+		}
+	}
+}
+
+// skipFilteredForward takes the table metadata at the iterator's current
 // position, and skips forward if the current key-type filter (i.filter)
 // excludes the file. It skips until it finds an unfiltered file or exhausts the
-// level. If lower is != nil, skipFilteredForward skips any files that do not
-// contain keys with the provided key-type ≥ lower.
+// level.
 //
 // skipFilteredForward also enforces the upper bound, returning nil if at any
 // point the upper bound is exceeded. 
-func (i *LevelIterator) skipFilteredForward(meta *FileMetadata) *FileMetadata { +func (i *LevelIterator) skipFilteredForward(meta *TableMetadata) *TableMetadata { for meta != nil && !meta.ContainsKeyType(i.filter) { i.iter.next() if !i.iter.valid() { @@ -675,15 +744,14 @@ func (i *LevelIterator) skipFilteredForward(meta *FileMetadata) *FileMetadata { return meta } -// skipFilteredBackward takes the file metadata at the iterator's current +// skipFilteredBackward takes the table metadata at the iterator's current // position, and skips backward if the current key-type filter (i.filter) // excludes the file. It skips until it finds an unfiltered file or exhausts the -// level. If upper is != nil, skipFilteredBackward skips any files that do not -// contain keys with the provided key-type < upper. +// level. // // skipFilteredBackward also enforces the lower bound, returning nil if at any // point the lower bound is exceeded. -func (i *LevelIterator) skipFilteredBackward(meta *FileMetadata) *FileMetadata { +func (i *LevelIterator) skipFilteredBackward(meta *TableMetadata) *TableMetadata { for meta != nil && !meta.ContainsKeyType(i.filter) { i.iter.prev() if !i.iter.valid() { @@ -699,10 +767,10 @@ func (i *LevelIterator) skipFilteredBackward(meta *FileMetadata) *FileMetadata { return meta } -func (i *LevelIterator) seek(fn func(*FileMetadata) bool) *FileMetadata { - i.iter.seek(fn) - - // i.iter.seek seeked in the unbounded underlying B-Tree. If the iterator +// constrainToIteratorBounds adjusts the iterator position to ensure it's +// positioned within the iterator's bounds. +func (i *LevelIterator) constrainToIteratorBounds() *TableMetadata { + // seek operations seek in the unbounded underlying B-Tree. If the iterator // has start or end bounds, we may have exceeded them. Reset to the bounds // if necessary. // @@ -732,17 +800,19 @@ func (i *LevelIterator) seek(fn func(*FileMetadata) bool) *FileMetadata { // position. 
Take panics if the iterator is not currently positioned over a // file. func (i *LevelIterator) Take() LevelFile { - m := i.Current() - if m == nil { + if !i.iter.valid() || + (i.end != nil && cmpIter(i.iter, *i.end) > 0) || + (i.start != nil && cmpIter(i.iter, *i.start) < 0) { panic("Take called on invalid LevelIterator") } + m := i.iter.cur() // LevelSlice's start and end fields are immutable and are positioned to // the same position for a LevelFile because they're inclusive, so we can // share one iterator stack between the two bounds. boundsIter := i.iter.clone() s := newBoundedLevelSlice(i.iter.clone(), &boundsIter, &boundsIter) return LevelFile{ - FileMetadata: m, - slice: s, + TableMetadata: m, + slice: s, } } diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/table_metadata.go b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/table_metadata.go new file mode 100644 index 0000000..66e3e7b --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/table_metadata.go @@ -0,0 +1,1224 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "bytes" + stdcmp "cmp" + "fmt" + "sync/atomic" + "unsafe" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/strparse" + "github.com/cockroachdb/pebble/v2/sstable" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/sstable/virtual" +) + +// TableMetadata is maintained for leveled-ssts, i.e., they belong to a level of +// some version. TableMetadata does not contain the actual level of the sst, +// since such leveled-ssts can move across levels in different versions, while +// sharing the same TableMetadata. 
// There are two kinds of leveled-ssts, physical
// and virtual. Underlying both leveled-ssts is a backing-sst, for which the
// only state is TableBacking. A backing-sst is level-less. It is possible for a
// backing-sst to be referred to by a physical sst in one version and by one or
// more virtual ssts in one or more versions. A backing-sst becomes obsolete and
// can be deleted once it is no longer required by any physical or virtual sst
// in any version.
//
// We maintain some invariants:
//
// 1. Each physical and virtual sst will have a unique TableMetadata.TableNum,
// and there will be exactly one TableMetadata associated with the TableNum.
//
// 2. Within a version, a backing-sst is either only referred to by one
// physical sst or one or more virtual ssts.
//
// 3. Once a backing-sst is referred to by a virtual sst in the latest version,
// it cannot go back to being referred to by a physical sst in any future
// version.
//
// Once a physical sst is no longer needed by any version, we will no longer
// maintain the table metadata associated with it. We will still maintain the
// TableBacking associated with the physical sst if the backing sst is required
// by any virtual ssts in any version.
//
// When using these fields in the context of a Virtual Table, these fields
// have additional invariants imposed on them, and/or slightly varying meanings:
//   - boundTypeSmallest and boundTypeLargest (and their counterparts
//     {Point,Range}KeyBounds.{Smallest(), Largest()}) remain tight bounds that represent a
//     key at that exact bound. We make the effort to determine the next smallest
//     or largest key in an sstable after virtualizing it, to maintain this
//     tightness. If the largest is a sentinel key (IsExclusiveSentinel()), it
//     could mean that a rangedel or range key ends at that user key, or has been
//     truncated to that user key.
//   - One invariant is that if a rangedel or range key is truncated on its
//     upper bound, the virtual sstable *must* have a rangedel or range key
//     sentinel key as its upper bound. This is because truncation yields
//     an exclusive upper bound for the rangedel/rangekey, and if there are
//     any points at that exclusive upper bound within the same virtual
//     sstable, those could get uncovered by this truncation. We enforce this
//     invariant in calls to keyspan.Truncate.
//   - Size is an estimate of the size of the virtualized portion of this sstable.
//     The underlying file's size is stored in TableBacking.Size, though it could
//     also be estimated or could correspond to just the referenced portion of
//     a file (eg. if the file originated on another node).
//   - Size must be > 0.
//   - SmallestSeqNum and LargestSeqNum are loose bounds for virtual sstables.
//     This means that all keys in the virtual sstable must have seqnums within
//     [SmallestSeqNum, LargestSeqNum], however there's no guarantee that there's
//     a key with a seqnum at either of the bounds. Calculating tight seqnum
//     bounds would be too expensive and deliver little value.
//   - Note: These properties do not apply to external sstables, whose bounds are
//     loose rather than tight, as we do not open them on ingest.
type TableMetadata struct {
	// AllowedSeeks is used to determine if a file should be picked for
	// a read triggered compaction. It is decremented when read sampling
	// in pebble.Iterator after every positioning operation
	// that returns a user key (eg. Next, Prev, SeekGE, SeekLT, etc).
	AllowedSeeks atomic.Int64

	// statsValid indicates if stats have been loaded for the table. The
	// TableStats structure is populated only if valid is true.
	statsValid atomic.Bool

	// TableBacking is the physical file that backs either physical or virtual
	// sstables.
	TableBacking *TableBacking

	// InitAllowedSeeks is the initial value of allowed seeks. This is used
	// to re-set allowed seeks on a file once it hits 0.
	InitAllowedSeeks int64
	// TableNum is the table number, unique across the lifetime of a DB.
	//
	// INVARIANT: when !TableMetadata.Virtual, TableNum == TableBacking.DiskFileNum.
	TableNum base.TableNum
	// Size is the size of the file, in bytes. Size is an approximate value for
	// virtual sstables.
	//
	// INVARIANTS:
	// - When !TableMetadata.Virtual, Size == TableBacking.Size.
	// - Size should be non-zero. Size 0 virtual sstables must not be created.
	Size uint64
	// File creation time in seconds since the epoch (1970-01-01 00:00:00
	// UTC). For ingested sstables, this corresponds to the time the file was
	// ingested. For virtual sstables, this corresponds to the wall clock time
	// when the TableMetadata for the virtual sstable was first created.
	CreationTime int64
	// LargestSeqNumAbsolute is an upper bound for the largest sequence number
	// in the table. This upper bound is guaranteed to be higher than any
	// sequence number any of the table's keys have held at any point in time
	// while the database has been open. Specifically, if the table contains
	// keys that have had their sequence numbers zeroed during a compaction,
	// LargestSeqNumAbsolute will be at least as high as the pre-zeroing
	// sequence number. LargestSeqNumAbsolute is NOT durably persisted, so after
	// a database restart it takes on the value of LargestSeqNum.
	LargestSeqNumAbsolute base.SeqNum
	// Lower and upper bounds for the smallest and largest sequence numbers in
	// the table, across both point and range keys. For physical sstables, these
	// values are tight bounds. For virtual sstables, there is no guarantee that
	// there will be keys with SmallestSeqNum or LargestSeqNum within virtual
	// sstable bounds.
	SmallestSeqNum base.SeqNum
	LargestSeqNum  base.SeqNum
	// PointKeyBounds.Smallest() and PointKeyBounds.Largest() are the inclusive bounds for the
	// internal point keys stored in the table. This includes RANGEDELs, which
	// alter point keys.
	// NB: this field should be set using ExtendPointKeyBounds. It is left
	// exported for reads as an optimization.
	PointKeyBounds InternalKeyBounds
	// RangeKeyBounds.Smallest() and RangeKeyBounds.Largest() are the inclusive bounds for the
	// internal range keys stored in the table.
	// NB: this field should be set using ExtendRangeKeyBounds. It is left
	// exported for reads as an optimization.
	RangeKeyBounds *InternalKeyBounds
	// BlobReferences is a list of blob files containing values that are
	// referenced by this sstable.
	BlobReferences BlobReferences
	// BlobReferenceDepth is the stack depth of blob files referenced by this
	// sstable. See the comment on the BlobReferenceDepth type for more details.
	//
	// INVARIANT: BlobReferenceDepth == 0 iff len(BlobReferences) == 0
	// INVARIANT: BlobReferenceDepth <= len(BlobReferences)
	BlobReferenceDepth BlobReferenceDepth

	// refs is the reference count for the table, used to determine when a table
	// is obsolete. When a table's reference count falls to zero, the table is
	// considered obsolete and the table's references on its associated files
	// (backing file, blob references) are released.
	//
	// The tables in each version are maintained in a copy-on-write B-tree and
	// each B-tree node keeps a reference on the contained tables.
	refs atomic.Int32

	// Stats describe table statistics. Protected by DB.mu.
	//
	// For virtual sstables, set stats upon virtual sstable creation as
	// asynchronous computation of stats is not currently supported.
	//
	// TODO(bananabrick): To support manifest replay for virtual sstables, we
	// probably need to compute virtual sstable stats asynchronously. Otherwise,
	// we'd have to write virtual sstable stats to the version edit.
	Stats TableStats

	// For L0 files only. Protected by DB.mu. Used to generate L0 sublevels and
	// pick L0 compactions. Only accurate for the most recent Version.
	// TODO(radu): this is very hacky and fragile. This information should live
	// inside l0Sublevels.
	SubLevel         int
	L0Index          int
	minIntervalIndex int
	maxIntervalIndex int

	// NB: the alignment of this struct is 8 bytes. We pack all the bools to
	// ensure an optimal packing.

	// IsIntraL0Compacting is set to True if this file is part of an intra-L0
	// compaction. When it's true, IsCompacting must also return true. If
	// Compacting is true and IsIntraL0Compacting is false for an L0 file, the
	// file must be part of a compaction to Lbase.
	IsIntraL0Compacting bool
	CompactionState     CompactionState
	// True if compaction of this file has been explicitly requested.
	// Previously, RocksDB and earlier versions of Pebble allowed this
	// flag to be set by a user table property collector. Some earlier
	// versions of Pebble respected this flag, while other more recent
	// versions ignored this flag.
	//
	// More recently this flag has been repurposed to facilitate the
	// compaction of 'atomic compaction units'. Files marked for
	// compaction are compacted in a rewrite compaction at the lowest
	// possible compaction priority.
	//
	// NB: A count of files marked for compaction is maintained on
	// Version, and compaction picking reads cached annotations
	// determined by this field.
	//
	// Protected by DB.mu.
	MarkedForCompaction bool
	// HasPointKeys tracks whether the table contains point keys (including
	// RANGEDELs). If a table contains only range deletions, HasPointKeys is
	// still true.
	HasPointKeys bool
	// HasRangeKeys tracks whether the table contains any range keys.
	HasRangeKeys bool
	// Virtual is true if the TableMetadata belongs to a virtual sstable.
	Virtual bool
	// boundsSet track whether the overall bounds have been set.
	boundsSet bool
	// boundTypeSmallest and boundTypeLargest provide an indication as to which
	// key type (point or range) corresponds to the smallest and largest overall
	// table bounds.
	boundTypeSmallest, boundTypeLargest boundType
	// VirtualParams are set only when Virtual is true.
	VirtualParams *virtual.VirtualReaderParams

	// SyntheticPrefixAndSuffix is used to prepend a prefix to all keys and/or
	// override all suffixes in a table; used for some virtual tables.
	SyntheticPrefixAndSuffix sstable.SyntheticPrefixAndSuffix
}

// Ref increments the table's ref count. If this is the table's first reference,
// Ref will increment the reference of the table's TableBacking.
func (m *TableMetadata) Ref() {
	if v := m.refs.Add(1); v == 1 {
		m.TableBacking.Ref()
	}
}

// Unref decrements the table's reference count. If the count reaches zero, the
// table releases its references on associated files. If the table's backing
// file becomes obsolete, it's inserted into the provided ObsoleteFiles.
func (m *TableMetadata) Unref(obsoleteFiles ObsoleteFilesSet) {
	v := m.refs.Add(-1)
	if invariants.Enabled && v < 0 {
		panic(errors.AssertionFailedf("pebble: invalid TableMetadata refcounting for table %s", m.TableNum))
	}
	// When the reference count reaches zero, release the table's references.
	if v == 0 {
		if m.TableBacking.Unref() == 0 {
			obsoleteFiles.AddBacking(m.TableBacking)
		}
	}
}

// InternalKeyBounds returns the set of overall table bounds.
func (m *TableMetadata) InternalKeyBounds() (InternalKey, InternalKey) {
	return m.Smallest(), m.Largest()
}

// UserKeyBounds returns the user key bounds that correspond to m.Smallest and
// Largest. Because we do not allow split user keys, the user key bounds of
// files within a level do not overlap.
+func (m *TableMetadata) UserKeyBounds() base.UserKeyBounds { + return base.UserKeyBoundsFromInternal(m.Smallest(), m.Largest()) +} + +// UserKeyBoundsByType returns the user key bounds for the given key types. +// Note that the returned bounds are invalid when requesting KeyTypePoint but +// HasPointKeys is false, or when requesting KeyTypeRange and HasRangeKeys is +// false. +func (m *TableMetadata) UserKeyBoundsByType(keyType KeyType) base.UserKeyBounds { + switch keyType { + case KeyTypePoint: + return base.UserKeyBoundsFromInternal(m.PointKeyBounds.Smallest(), m.PointKeyBounds.Largest()) + case KeyTypeRange: + if !m.HasRangeKeys { + return base.UserKeyBounds{} + } + return base.UserKeyBoundsFromInternal(m.RangeKeyBounds.Smallest(), m.RangeKeyBounds.Largest()) + default: + return base.UserKeyBoundsFromInternal(m.Smallest(), m.Largest()) + } +} + +// SyntheticSeqNum returns a SyntheticSeqNum which is set when SmallestSeqNum +// equals LargestSeqNum. +func (m *TableMetadata) SyntheticSeqNum() sstable.SyntheticSeqNum { + if m.SmallestSeqNum == m.LargestSeqNum { + return sstable.SyntheticSeqNum(m.SmallestSeqNum) + } + return sstable.NoSyntheticSeqNum +} + +// IterTransforms returns an sstable.IterTransforms populated according to the +// file. +func (m *TableMetadata) IterTransforms() sstable.IterTransforms { + return sstable.IterTransforms{ + SyntheticSeqNum: m.SyntheticSeqNum(), + SyntheticPrefixAndSuffix: m.SyntheticPrefixAndSuffix, + } +} + +// FragmentIterTransforms returns an sstable.FragmentIterTransforms populated +// according to the file. 
func (m *TableMetadata) FragmentIterTransforms() sstable.FragmentIterTransforms {
	return sstable.FragmentIterTransforms{
		SyntheticSeqNum:          m.SyntheticSeqNum(),
		SyntheticPrefixAndSuffix: m.SyntheticPrefixAndSuffix,
	}
}

// PhysicalMeta asserts that the metadata belongs to a physical sstable
// (panicking otherwise) and returns the receiver.
func (m *TableMetadata) PhysicalMeta() *TableMetadata {
	if m.Virtual {
		panic("pebble: table metadata does not belong to a physical sstable")
	}
	return m
}

// VirtualMeta asserts that the metadata belongs to a virtual sstable
// (panicking otherwise) and returns the receiver.
func (m *TableMetadata) VirtualMeta() *TableMetadata {
	if !m.Virtual {
		panic("pebble: table metadata does not belong to a virtual sstable")
	}
	return m
}

// EstimatedReferenceSize returns the estimated physical size of all the file's
// blob references in the table. This sum, added to the sstable's size, yields
// an approximation of the overall size of the data represented by the table.
//
// EstimatedReferenceSize is an estimate, but it's guaranteed to be stable over
// the lifetime of the table. This is necessary to correctly maintain
// incrementally-updated metrics.
func (m *TableMetadata) EstimatedReferenceSize() uint64 {
	var size uint64
	for i := range m.BlobReferences {
		size += m.BlobReferences[i].EstimatedPhysicalSize
	}
	return size
}

// TableBacking either backs a single physical sstable, or one or more virtual
// sstables.
//
// See the comment above the TableMetadata type for sstable terminology.
type TableBacking struct {
	DiskFileNum base.DiskFileNum
	Size        uint64

	// Reference count for the backing file, used to determine when a backing file
	// is obsolete and can be removed.
	//
	// The reference count is at least the number of distinct tables that use this
	// backing across all versions that have a non-zero reference count. The tables
	// in each version are maintained in a copy-on-write B-tree and each B-tree node
	// keeps a reference on the respective backings.
	//
	// In addition, a reference count is taken for every backing in the latest
	// version's VirtualBackings (necessary to support Protect/Unprotect).
	refs atomic.Int32
}

// MustHaveRefs asserts that the backing has a positive refcount.
func (b *TableBacking) MustHaveRefs() {
	if refs := b.refs.Load(); refs <= 0 {
		panic(errors.AssertionFailedf("backing %s must have positive refcount (refs=%d)",
			b.DiskFileNum, refs))
	}
}

// Ref increments the backing's ref count.
func (b *TableBacking) Ref() {
	b.refs.Add(1)
}

// IsUnused returns if the backing is not being used by any tables in a version
// or btree.
func (b *TableBacking) IsUnused() bool {
	return b.refs.Load() == 0
}

// Unref decrements the backing's ref count (and returns the new count).
func (b *TableBacking) Unref() int32 {
	v := b.refs.Add(-1)
	if invariants.Enabled && v < 0 {
		panic(errors.AssertionFailedf("pebble: invalid TableBacking refcounting: file %s has refcount %d", b.DiskFileNum, v))
	}
	return v
}

// InitPhysicalBacking allocates and sets the TableBacking which is required by a
// physical sstable TableMetadata.
//
// Ensure that the state required by TableBacking, such as the TableNum, is
// already set on the TableMetadata before InitPhysicalBacking is called.
// Calling InitPhysicalBacking only after the relevant state has been set in the
// TableMetadata is not necessary in tests which don't rely on TableBacking.
func (m *TableMetadata) InitPhysicalBacking() {
	if m.Virtual {
		panic("pebble: virtual sstables should use a pre-existing TableBacking")
	}
	if m.TableBacking != nil {
		panic("backing already initialized")
	}
	m.TableBacking = &TableBacking{
		DiskFileNum: base.PhysicalTableDiskFileNum(m.TableNum),
		Size:        m.Size,
	}
}

// InitVirtualBacking creates a new TableBacking for a virtual table.
//
// The Smallest/Largest bounds must already be set to their final values.
func (m *TableMetadata) InitVirtualBacking(fileNum base.DiskFileNum, size uint64) {
	m.AttachVirtualBacking(&TableBacking{
		DiskFileNum: fileNum,
		Size:        size,
	})
}

// AttachVirtualBacking attaches an existing TableBacking for a virtual table.
//
// The Smallest/Largest bounds must already be set to their final values.
func (m *TableMetadata) AttachVirtualBacking(backing *TableBacking) {
	if !m.Virtual {
		panic("pebble: provider-backed sstables must be virtual")
	}
	if m.TableBacking != nil {
		panic("backing already initialized")
	}
	m.TableBacking = backing
	// The bounds feed VirtualParams below, so they must be final before the
	// backing is attached.
	if m.Smallest().UserKey == nil || m.Largest().UserKey == nil {
		panic("bounds must be set before attaching backing")
	}
	m.VirtualParams = &virtual.VirtualReaderParams{
		Lower:   m.Smallest(),
		Upper:   m.Largest(),
		FileNum: m.TableNum,
	}
}

// ValidateVirtual should be called once the TableMetadata for a virtual sstable
// is created to verify that the fields of the virtual sstable are sound.
func (m *TableMetadata) ValidateVirtual(createdFrom *TableMetadata) {
	switch {
	case !m.Virtual:
		panic("pebble: invalid virtual sstable")
	case createdFrom.SmallestSeqNum != m.SmallestSeqNum:
		panic("pebble: invalid smallest sequence number for virtual sstable")
	case createdFrom.LargestSeqNum != m.LargestSeqNum:
		panic("pebble: invalid largest sequence number for virtual sstable")
	case createdFrom.LargestSeqNumAbsolute != m.LargestSeqNumAbsolute:
		panic("pebble: invalid largest absolute sequence number for virtual sstable")
	case createdFrom.TableBacking != nil && createdFrom.TableBacking != m.TableBacking:
		panic("pebble: invalid physical sstable state for virtual sstable")
	case m.Size == 0:
		panic("pebble: virtual sstable size must be set upon creation")
	}
}

// SetCompactionState transitions this file's compaction state to the given
// state. Protected by DB.mu.
func (m *TableMetadata) SetCompactionState(to CompactionState) {
	if invariants.Enabled {
		// Under invariants, verify the transition is one of the legal ones:
		// NotCompacting -> Compacting, Compacting -> {Compacted, NotCompacting}.
		transitionErr := func() error {
			return errors.Newf("pebble: invalid compaction state transition: %s -> %s", m.CompactionState, to)
		}
		switch m.CompactionState {
		case CompactionStateNotCompacting:
			if to != CompactionStateCompacting {
				panic(transitionErr())
			}
		case CompactionStateCompacting:
			if to != CompactionStateCompacted && to != CompactionStateNotCompacting {
				panic(transitionErr())
			}
		case CompactionStateCompacted:
			// Compacted is terminal; any further transition is invalid.
			panic(transitionErr())
		default:
			panic(fmt.Sprintf("pebble: unknown compaction state: %d", m.CompactionState))
		}
	}
	m.CompactionState = to
}

// IsCompacting returns true if this file's compaction state is
// CompactionStateCompacting. Protected by DB.mu.
func (m *TableMetadata) IsCompacting() bool {
	return m.CompactionState == CompactionStateCompacting
}

// StatsValid returns true if the table stats have been populated. If StatValid
// returns true, the Stats field may be read (with or without holding the
// database mutex).
func (m *TableMetadata) StatsValid() bool {
	return m.statsValid.Load()
}

// StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu
// while populating TableStats and calling StatsMarkValid. Once stats are
// populated, they must not be mutated.
func (m *TableMetadata) StatsMarkValid() {
	m.statsValid.Store(true)
}

// ExtendPointKeyBounds attempts to extend the lower and upper point key bounds
// and overall table bounds with the given smallest and largest keys. The
// smallest and largest bounds may not be extended if the table already has a
// bound that is smaller or larger, respectively. The receiver is returned.
// NB: calling this method should be preferred to manually setting the bounds by
// manipulating the fields directly, to maintain certain invariants.
func (m *TableMetadata) ExtendPointKeyBounds(
	cmp Compare, smallest, largest InternalKey,
) *TableMetadata {
	// Update the point key bounds.
	if !m.HasPointKeys {
		// First point keys seen for this table: take the bounds verbatim.
		m.PointKeyBounds.SetInternalKeyBounds(smallest, largest)
		m.HasPointKeys = true
	} else {
		isSmallestPoint := base.InternalCompare(cmp, smallest, m.PointKeyBounds.Smallest()) < 0
		isLargestPoint := base.InternalCompare(cmp, largest, m.PointKeyBounds.Largest()) > 0
		if isSmallestPoint && isLargestPoint {
			m.PointKeyBounds.SetInternalKeyBounds(smallest, largest)
		} else if isSmallestPoint {
			m.PointKeyBounds.SetSmallest(smallest)
		} else if isLargestPoint {
			m.PointKeyBounds.SetLargest(largest)
		}
	}
	// Update the overall bounds.
	m.extendOverallBounds(cmp, m.PointKeyBounds.Smallest(), m.PointKeyBounds.Largest(), boundTypePointKey)
	return m
}

// ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds
// and overall table bounds with the given smallest and largest keys. The
// smallest and largest bounds may not be extended if the table already has a
// bound that is smaller or larger, respectively. The receiver is returned.
// NB: calling this method should be preferred to manually setting the bounds by
// manipulating the fields directly, to maintain certain invariants.
func (m *TableMetadata) ExtendRangeKeyBounds(
	cmp Compare, smallest, largest InternalKey,
) *TableMetadata {
	// Update the range key bounds.
	if !m.HasRangeKeys {
		// First range keys seen for this table: allocate and take the bounds
		// verbatim.
		m.RangeKeyBounds = &InternalKeyBounds{}
		m.RangeKeyBounds.SetInternalKeyBounds(smallest, largest)
		m.HasRangeKeys = true
	} else {
		isSmallestRange := base.InternalCompare(cmp, smallest, m.RangeKeyBounds.Smallest()) < 0
		isLargestRange := base.InternalCompare(cmp, largest, m.RangeKeyBounds.Largest()) > 0
		if isSmallestRange && isLargestRange {
			m.RangeKeyBounds.SetInternalKeyBounds(smallest, largest)
		} else if isSmallestRange {
			m.RangeKeyBounds.SetSmallest(smallest)
		} else if isLargestRange {
			m.RangeKeyBounds.SetLargest(largest)
		}
	}
	// Update the overall bounds.
	m.extendOverallBounds(cmp, m.RangeKeyBounds.Smallest(), m.RangeKeyBounds.Largest(), boundTypeRangeKey)
	return m
}

// extendOverallBounds attempts to extend the overall table lower and upper
// bounds. The given bounds may not be used if a lower or upper bound already
// exists that is smaller or larger than the given keys, respectively. The given
// boundType will be used if the bounds are updated.
func (m *TableMetadata) extendOverallBounds(
	cmp Compare, smallest, largest InternalKey, bTyp boundType,
) {
	if !m.boundsSet {
		m.boundsSet = true
		m.boundTypeSmallest, m.boundTypeLargest = bTyp, bTyp
	} else {
		if base.InternalCompare(cmp, smallest, m.Smallest()) < 0 {
			m.boundTypeSmallest = bTyp
		}
		if base.InternalCompare(cmp, largest, m.Largest()) > 0 {
			m.boundTypeLargest = bTyp
		}
	}
}

// Overlaps returns true if the file key range overlaps with the given user key bounds.
func (m *TableMetadata) Overlaps(cmp Compare, bounds *base.UserKeyBounds) bool {
	b := m.UserKeyBounds()
	return b.Overlaps(cmp, bounds)
}

// ContainedWithinSpan returns true if the file key range completely overlaps with the
// given range ("end" is assumed to be exclusive).
+func (m *TableMetadata) ContainedWithinSpan(cmp Compare, start, end []byte) bool { + lowerCmp, upperCmp := cmp(m.Smallest().UserKey, start), cmp(m.Largest().UserKey, end) + return lowerCmp >= 0 && (upperCmp < 0 || (upperCmp == 0 && m.Largest().IsExclusiveSentinel())) +} + +// ContainsKeyType returns whether or not the file contains keys of the provided +// type. +func (m *TableMetadata) ContainsKeyType(kt KeyType) bool { + switch kt { + case KeyTypePointAndRange: + return true + case KeyTypePoint: + return m.HasPointKeys + case KeyTypeRange: + return m.HasRangeKeys + default: + panic("unrecognized key type") + } +} + +// SmallestBound returns the file's smallest bound of the key type. It returns a +// false second return value if the file does not contain any keys of the key +// type. +func (m *TableMetadata) SmallestBound(kt KeyType) (InternalKey, bool) { + switch kt { + case KeyTypePointAndRange: + return m.Smallest(), true + case KeyTypePoint: + return m.PointKeyBounds.Smallest(), m.HasPointKeys + case KeyTypeRange: + if !m.HasRangeKeys { + return InternalKey{}, m.HasRangeKeys + } + return m.RangeKeyBounds.Smallest(), m.HasRangeKeys + default: + panic("unrecognized key type") + } +} + +// LargestBound returns the file's largest bound of the key type. It returns a +// false second return value if the file does not contain any keys of the key +// type. 
func (m *TableMetadata) LargestBound(kt KeyType) (InternalKey, bool) {
	switch kt {
	case KeyTypePointAndRange:
		ik := m.Largest()
		return ik, true
	case KeyTypePoint:
		return m.PointKeyBounds.Largest(), m.HasPointKeys
	case KeyTypeRange:
		if !m.HasRangeKeys {
			return InternalKey{}, m.HasRangeKeys
		}
		return m.RangeKeyBounds.Largest(), m.HasRangeKeys
	default:
		panic("unrecognized key type")
	}
}

// Bit positions used by boundsMarker to encode bound provenance.
const (
	maskContainsPointKeys = 1 << 0
	maskSmallest          = 1 << 1
	maskLargest           = 1 << 2
)

// boundsMarker returns a marker byte whose bits encode the following
// information (in order from least significant bit):
// - if the table contains point keys
// - if the table's smallest key is a point key
// - if the table's largest key is a point key
func (m *TableMetadata) boundsMarker() (sentinel uint8, err error) {
	if m.HasPointKeys {
		sentinel |= maskContainsPointKeys
	}
	switch m.boundTypeSmallest {
	case boundTypePointKey:
		sentinel |= maskSmallest
	case boundTypeRangeKey:
		// No op - leave bit unset.
	default:
		return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.TableNum)
	}
	switch m.boundTypeLargest {
	case boundTypePointKey:
		sentinel |= maskLargest
	case boundTypeRangeKey:
		// No op - leave bit unset.
	default:
		return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.TableNum)
	}
	return
}

// String implements fmt.Stringer, printing the file number and the overall
// table bounds.
func (m *TableMetadata) String() string {
	return fmt.Sprintf("%s:[%s-%s]", m.TableNum, m.Smallest(), m.Largest())
}

// DebugString returns a verbose representation of TableMetadata, typically for
// use in tests and debugging, returning the file number and the point, range
// and overall bounds for the table.
func (m *TableMetadata) DebugString(format base.FormatKey, verbose bool) string {
	var b bytes.Buffer
	if m.Virtual {
		fmt.Fprintf(&b, "%s(%s):[%s-%s]",
			m.TableNum, m.TableBacking.DiskFileNum, m.Smallest().Pretty(format), m.Largest().Pretty(format))
	} else {
		fmt.Fprintf(&b, "%s:[%s-%s]",
			m.TableNum, m.Smallest().Pretty(format), m.Largest().Pretty(format))
	}
	if !verbose {
		return b.String()
	}
	fmt.Fprintf(&b, " seqnums:[%d-%d]", m.SmallestSeqNum, m.LargestSeqNum)
	if m.HasPointKeys {
		fmt.Fprintf(&b, " points:[%s-%s]",
			m.PointKeyBounds.Smallest().Pretty(format), m.PointKeyBounds.Largest().Pretty(format))
	}
	if m.HasRangeKeys {
		fmt.Fprintf(&b, " ranges:[%s-%s]",
			m.RangeKeyBounds.Smallest().Pretty(format), m.RangeKeyBounds.Largest().Pretty(format))
	}
	if m.Size != 0 {
		fmt.Fprintf(&b, " size:%d", m.Size)
		if m.Virtual && m.TableBacking != nil {
			fmt.Fprintf(&b, "(%d)", m.TableBacking.Size)
		}
	}
	if len(m.BlobReferences) > 0 {
		fmt.Fprint(&b, " blobrefs:[")
		for i, r := range m.BlobReferences {
			if i > 0 {
				fmt.Fprint(&b, ", ")
			}
			fmt.Fprintf(&b, "(%s: %d)", r.FileID, r.ValueSize)
		}
		fmt.Fprintf(&b, "; depth:%d]", m.BlobReferenceDepth)
	}
	return b.String()
}

// debugParserSeparators lists the punctuation treated as token separators when
// parsing the DebugString representation.
const debugParserSeparators = ":-[]();{}"

// errFromPanic can be used in a recover block to convert panics into errors.
func errFromPanic(r any) error {
	if err, ok := r.(error); ok {
		return err
	}
	return errors.Errorf("%v", r)
}

// ParseTableMetadataDebug parses a TableMetadata from its DebugString
// representation.
func ParseTableMetadataDebug(s string) (_ *TableMetadata, err error) {
	// The strparse parser reports failures by panicking; convert them into a
	// returned error.
	defer func() {
		if r := recover(); r != nil {
			err = errors.CombineErrors(err, errFromPanic(r))
		}
	}()

	// Input format:
	//   000000:[a#0,SET-z#0,SET] seqnums:[5-5] points:[...] ranges:[...] size:5
	m := &TableMetadata{}
	p := strparse.MakeParser(debugParserSeparators, s)
	m.TableNum = p.FileNum()
	var backingNum base.DiskFileNum
	if p.Peek() == "(" {
		p.Expect("(")
		backingNum = p.DiskFileNum()
		p.Expect(")")
	}
	p.Expect(":", "[")

	smallest := p.InternalKey()
	p.Expect("-")
	largest := p.InternalKey()
	p.Expect("]")

	for !p.Done() {
		field := p.Next()
		p.Expect(":")
		switch field {
		case "seqnums":
			p.Expect("[")
			m.SmallestSeqNum = p.SeqNum()
			p.Expect("-")
			m.LargestSeqNum = p.SeqNum()
			p.Expect("]")
			// The absolute bound is not persisted in this format; it defaults
			// to the largest seqnum.
			m.LargestSeqNumAbsolute = m.LargestSeqNum

		case "points":
			p.Expect("[")
			smallestPoint := p.InternalKey()
			p.Expect("-")
			m.PointKeyBounds.SetInternalKeyBounds(smallestPoint, p.InternalKey())
			m.HasPointKeys = true
			p.Expect("]")

		case "ranges":
			m.RangeKeyBounds = &InternalKeyBounds{}
			p.Expect("[")
			smallest := p.InternalKey()
			p.Expect("-")
			m.RangeKeyBounds.SetInternalKeyBounds(smallest, p.InternalKey())
			m.HasRangeKeys = true
			p.Expect("]")

		case "size":
			m.Size = p.Uint64()

		case "blobrefs":
			p.Expect("[")
			for p.Peek() != ";" {
				if p.Peek() == "," {
					p.Expect(",")
				}
				p.Expect("(")
				var ref BlobReference
				ref.FileID = p.BlobFileID()
				p.Expect(":")
				ref.ValueSize = p.Uint64()
				m.BlobReferences = append(m.BlobReferences, ref)
				p.Expect(")")
			}
			p.Expect(";")
			p.Expect("depth")
			p.Expect(":")
			m.BlobReferenceDepth = BlobReferenceDepth(p.Uint64())
			p.Expect("]")

		default:
			p.Errf("unknown field %q", field)
		}
	}

	// Derive which key type (point or range) produced each overall bound.
	cmp := base.DefaultComparer.Compare
	if base.InternalCompare(cmp, smallest, m.PointKeyBounds.Smallest()) == 0 {
		m.boundTypeSmallest = boundTypePointKey
	} else if m.HasRangeKeys && base.InternalCompare(cmp, smallest, m.RangeKeyBounds.Smallest()) == 0 {
		m.boundTypeSmallest = boundTypeRangeKey
	}
	if base.InternalCompare(cmp, largest, m.PointKeyBounds.Largest()) == 0 {
		m.boundTypeLargest = boundTypePointKey
	} else if m.HasRangeKeys && base.InternalCompare(cmp, largest, m.RangeKeyBounds.Largest()) == 0 {
		m.boundTypeLargest = boundTypeRangeKey
	}

	// By default, when the parser sees just the overall bounds, we set the point
	// keys. This preserves backwards compatibility with existing test cases that
	// specify only the overall bounds.
	if !m.HasPointKeys && !m.HasRangeKeys {
		m.PointKeyBounds.SetInternalKeyBounds(smallest, largest)
		m.HasPointKeys = true
		m.boundTypeSmallest, m.boundTypeLargest = boundTypePointKey, boundTypePointKey
	}
	if backingNum == 0 {
		m.InitPhysicalBacking()
	} else {
		m.Virtual = true
		m.InitVirtualBacking(backingNum, 0 /* size */)
	}
	return m, nil
}

// Validate validates the metadata for consistency with itself, returning an
// error if inconsistent.
func (m *TableMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
	// Combined range and point key validation.

	if !m.HasPointKeys && !m.HasRangeKeys {
		return base.CorruptionErrorf("file %s has neither point nor range keys",
			errors.Safe(m.TableNum))
	}
	if base.InternalCompare(cmp, m.Smallest(), m.Largest()) > 0 {
		return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
			errors.Safe(m.TableNum), m.Smallest().Pretty(formatKey),
			m.Largest().Pretty(formatKey))
	}
	if m.SmallestSeqNum > m.LargestSeqNum {
		return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d",
			errors.Safe(m.TableNum), m.SmallestSeqNum, m.LargestSeqNum)
	}
	if m.LargestSeqNumAbsolute < m.LargestSeqNum {
		return base.CorruptionErrorf("file %s has inconsistent absolute largest seqnum bounds: %d vs %d",
			errors.Safe(m.TableNum), m.LargestSeqNumAbsolute, m.LargestSeqNum)
	}

	// Point key validation.

	if m.HasPointKeys {
		if base.InternalCompare(cmp, m.PointKeyBounds.Smallest(), m.PointKeyBounds.Largest()) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
				errors.Safe(m.TableNum), m.PointKeyBounds.Smallest().Pretty(formatKey),
				m.PointKeyBounds.Largest().Pretty(formatKey))
		}
		if base.InternalCompare(cmp, m.PointKeyBounds.Smallest(), m.Smallest()) < 0 ||
			base.InternalCompare(cmp, m.PointKeyBounds.Largest(), m.Largest()) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent point key bounds relative to overall bounds: "+
					"overall = [%s-%s], point keys = [%s-%s]",
				errors.Safe(m.TableNum),
				m.Smallest().Pretty(formatKey), m.Largest().Pretty(formatKey),
				m.PointKeyBounds.Smallest().Pretty(formatKey), m.PointKeyBounds.Largest().Pretty(formatKey),
			)
		}
		if !isValidPointBoundKeyKind[m.PointKeyBounds.Smallest().Kind()] {
			return base.CorruptionErrorf("file %s has invalid smallest point key kind", m)
		}
		if !isValidPointBoundKeyKind[m.PointKeyBounds.Largest().Kind()] {
			return base.CorruptionErrorf("file %s has invalid largest point key kind", m)
		}
	}

	// Range key validation.

	if m.HasRangeKeys {
		if base.InternalCompare(cmp, m.RangeKeyBounds.Smallest(), m.RangeKeyBounds.Largest()) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
				errors.Safe(m.TableNum), m.RangeKeyBounds.Smallest().Pretty(formatKey),
				m.RangeKeyBounds.Largest().Pretty(formatKey))
		}
		if base.InternalCompare(cmp, m.RangeKeyBounds.Smallest(), m.Smallest()) < 0 ||
			base.InternalCompare(cmp, m.RangeKeyBounds.Largest(), m.Largest()) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent range key bounds relative to overall bounds: "+
					"overall = [%s-%s], range keys = [%s-%s]",
				errors.Safe(m.TableNum),
				m.Smallest().Pretty(formatKey), m.Largest().Pretty(formatKey),
				m.RangeKeyBounds.Smallest().Pretty(formatKey), m.RangeKeyBounds.Largest().Pretty(formatKey),
			)
		}
		if !isValidRangeKeyBoundKeyKind[m.RangeKeyBounds.Smallest().Kind()] {
			return base.CorruptionErrorf("file %s has invalid smallest range key kind", m)
		}
		if !isValidRangeKeyBoundKeyKind[m.RangeKeyBounds.Largest().Kind()] {
			return base.CorruptionErrorf("file %s has invalid largest range key kind", m)
		}
	}

	// Ensure that TableMetadata.Init was called.
	if m.TableBacking == nil {
		return base.CorruptionErrorf("table metadata TableBacking not set")
	}
	// Assert that there's a nonzero blob reference depth if and only if the
	// table has a nonzero count of blob references. Additionally, the file's
	// blob reference depth should be bounded by the number of blob references.
	if (len(m.BlobReferences) == 0) != (m.BlobReferenceDepth == 0) || m.BlobReferenceDepth > BlobReferenceDepth(len(m.BlobReferences)) {
		return base.CorruptionErrorf("table %s with %d blob refs but %d blob ref depth",
			m.TableNum, len(m.BlobReferences), m.BlobReferenceDepth)
	}
	if m.SyntheticPrefixAndSuffix.HasPrefix() {
		if !m.Virtual {
			return base.CorruptionErrorf("non-virtual file with synthetic prefix")
		}
		if !bytes.HasPrefix(m.Smallest().UserKey, m.SyntheticPrefixAndSuffix.Prefix()) {
			return base.CorruptionErrorf("virtual file with synthetic prefix has smallest key with a different prefix: %s", m.Smallest().Pretty(formatKey))
		}
		if !bytes.HasPrefix(m.Largest().UserKey, m.SyntheticPrefixAndSuffix.Prefix()) {
			return base.CorruptionErrorf("virtual file with synthetic prefix has largest key with a different prefix: %s", m.Largest().Pretty(formatKey))
		}
	}
	if m.SyntheticPrefixAndSuffix.HasSuffix() {
		if !m.Virtual {
			return base.CorruptionErrorf("non-virtual file with synthetic suffix")
		}
	}

	return nil
}

// Lookup tables of the internal key kinds that may legally appear as point and
// range key bounds, respectively.
var (
	isValidPointBoundKeyKind = [base.InternalKeyKindMax + 1]bool{
		base.InternalKeyKindDelete:        true,
		base.InternalKeyKindSet:           true,
		base.InternalKeyKindMerge:         true,
		base.InternalKeyKindSingleDelete:  true,
		base.InternalKeyKindRangeDelete:   true,
		base.InternalKeyKindSetWithDelete: true,
		base.InternalKeyKindDeleteSized:   true,
	}
	isValidRangeKeyBoundKeyKind = [base.InternalKeyKindMax + 1]bool{
		base.InternalKeyKindRangeKeySet:    true,
		base.InternalKeyKindRangeKeyUnset:  true,
		base.InternalKeyKindRangeKeyDelete: true,
	}
)

// TableInfo returns a subset of the TableMetadata state formatted as a
// TableInfo.
+func (m *TableMetadata) TableInfo() TableInfo { + return TableInfo{ + FileNum: m.TableNum, + Size: m.Size, + Smallest: m.Smallest(), + Largest: m.Largest(), + SmallestSeqNum: m.SmallestSeqNum, + LargestSeqNum: m.LargestSeqNum, + blobReferences: m.BlobReferences, + } +} + +func (m *TableMetadata) cmpSeqNum(b *TableMetadata) int { + // NB: This is the same ordering that RocksDB uses for L0 files. + + // Sort first by largest sequence number. + if v := stdcmp.Compare(m.LargestSeqNum, b.LargestSeqNum); v != 0 { + return v + } + // Then by smallest sequence number. + if v := stdcmp.Compare(m.SmallestSeqNum, b.SmallestSeqNum); v != 0 { + return v + } + // Break ties by file number. + return stdcmp.Compare(m.TableNum, b.TableNum) +} + +func (m *TableMetadata) cmpSmallestKey(b *TableMetadata, cmp Compare) int { + return base.InternalCompare(cmp, m.Smallest(), b.Smallest()) +} + +// boundType represents the type of key (point or range) present as the smallest +// and largest keys. +type boundType uint8 + +const ( + boundTypePointKey boundType = iota + 1 + boundTypeRangeKey +) + +// Smallest returns the smallest key based on the bound type of +// boundTypeSmallest. +// +//gcassert:inline +func (m *TableMetadata) Smallest() InternalKey { + x := &m.PointKeyBounds + if m.boundTypeSmallest == boundTypeRangeKey { + x = m.RangeKeyBounds + } + return x.Smallest() +} + +// Largest returns the largest key based on the bound type of +// boundTypeLargest. +// +//gcassert:inline +func (m *TableMetadata) Largest() InternalKey { + x := &m.PointKeyBounds + if m.boundTypeLargest == boundTypeRangeKey { + x = m.RangeKeyBounds + } + return x.Largest() +} + +// InternalKeyBounds represents set of keys (smallest, largest) used for the +// in-memory and on-disk partial DBs that make up a pebble DB. +// +// It consists of the smallest, largest keys and their respective trailers. 
+// The keys are represented as a single string; their individual representations +// are given by the userKeySeparatorIdx as: +// - smallest: [0, userKeySeparatorIdx) +// - largest: [userKeySeparatorIdx, len(userKeyData)) +// +// This format allows us to save a couple of bytes that will add up +// proportionally to the amount of sstables we have. +type InternalKeyBounds struct { + userKeyData string + userKeySeparatorIdx int + smallestTrailer base.InternalKeyTrailer + largestTrailer base.InternalKeyTrailer +} + +func (ikr *InternalKeyBounds) SetInternalKeyBounds(smallest, largest InternalKey) { + ikr.userKeyData = string(smallest.UserKey) + string(largest.UserKey) + ikr.smallestTrailer = smallest.Trailer + ikr.largestTrailer = largest.Trailer + ikr.userKeySeparatorIdx = len(smallest.UserKey) +} + +//gcassert:inline +func (ikr *InternalKeyBounds) SmallestUserKey() []byte { + return unsafe.Slice(unsafe.StringData(ikr.userKeyData), ikr.userKeySeparatorIdx) +} + +//gcassert:inline +func (ikr *InternalKeyBounds) Smallest() InternalKey { + return InternalKey{ + UserKey: ikr.SmallestUserKey(), + Trailer: ikr.smallestTrailer, + } +} + +//gcassert:inline +func (ikr *InternalKeyBounds) LargestUserKey() []byte { + largestStart := unsafe.StringData(ikr.userKeyData[ikr.userKeySeparatorIdx:]) + return unsafe.Slice(largestStart, len(ikr.userKeyData)-ikr.userKeySeparatorIdx) +} + +//gcassert:inline +func (ikr *InternalKeyBounds) Largest() InternalKey { + ik := InternalKey{ + UserKey: ikr.LargestUserKey(), + Trailer: ikr.largestTrailer, + } + return ik +} + +func (ikr *InternalKeyBounds) SetSmallest(ik InternalKey) { + ikr.userKeyData = string(ik.UserKey) + string(ikr.LargestUserKey()) + ikr.smallestTrailer = ik.Trailer + ikr.userKeySeparatorIdx = len(ik.UserKey) +} + +func (ikr *InternalKeyBounds) SetLargest(ik InternalKey) { + smallestUserKey := ikr.SmallestUserKey() + ikr.userKeyData = string(smallestUserKey) + string(ik.UserKey) + ikr.largestTrailer = ik.Trailer + 
ikr.userKeySeparatorIdx = len(smallestUserKey) +} + +// TableInfo contains the common information for table related events. +type TableInfo struct { + // FileNum is the internal DB identifier for the table. + FileNum base.FileNum + // Size is the size of the file in bytes. + Size uint64 + // Smallest is the smallest internal key in the table. + Smallest InternalKey + // Largest is the largest internal key in the table. + Largest InternalKey + // SmallestSeqNum is the smallest sequence number in the table. + SmallestSeqNum base.SeqNum + // LargestSeqNum is the largest sequence number in the table. + LargestSeqNum base.SeqNum + // blobReferences is the list of blob files referenced by the table. + blobReferences BlobReferences +} + +// GetBlobReferenceFiles returns the list of blob file numbers referenced by +// the table. +func (t *TableInfo) GetBlobReferenceFiles() []base.BlobFileID { + files := make([]base.BlobFileID, 0, len(t.blobReferences)) + for _, blob := range t.blobReferences { + files = append(files, blob.FileID) + } + return files +} + +// TableStats contains statistics on a table used for compaction heuristics, +// and export via Metrics. +type TableStats struct { + // The total number of entries in the table. + NumEntries uint64 + // The number of point and range deletion entries in the table. + NumDeletions uint64 + // NumRangeKeySets is the total number of range key sets in the table. + // + // NB: If there's a chance that the sstable contains any range key sets, + // then NumRangeKeySets must be > 0. + NumRangeKeySets uint64 + // Estimate of the total disk space that may be dropped by this table's + // point deletions by compacting them. + PointDeletionsBytesEstimate uint64 + // Estimate of the total disk space that may be dropped by this table's + // range deletions by compacting them. This estimate is at data-block + // granularity and is not updated if compactions beneath the table reduce + // the amount of reclaimable disk space. 
It also does not account for
+ // overlapping data in L0 and ignores L0 sublevels, but the error that
+ // introduces is expected to be small. Similarly, multiple overlapping
+ // RANGEDELs in different levels can count the same data to be deleted
+ // multiple times.
+ //
+ // Tables in the bottommost level of the LSM may have a nonzero estimate if
+ // snapshots or move compactions prevented the elision of their range
+ // tombstones. A table in the bottommost level that was ingested into L6
+ // will have a zero estimate, because the file's sequence numbers indicate
+ // that the tombstone cannot drop any data contained within the file itself.
+ RangeDeletionsBytesEstimate uint64
+ // Total size of value blocks and value index block.
+ ValueBlocksSize uint64
+ // CompressionType is the compression profile used for the table (or nil if
+ // the profile name is not recognized).
+ CompressionType *block.CompressionProfile
+ // TombstoneDenseBlocksRatio is the ratio of data blocks in this table that
+ // fulfills at least one of the following:
+ // 1. The block contains at least options.Experimental.NumDeletionsThreshold
+ // point tombstones.
+ // 2. The ratio of the uncompressed size of point tombstones to the
+ // uncompressed size of the block is at least
+ // options.Experimental.DeletionSizeRatioThreshold.
+ // This statistic is used to determine eligibility for a tombstone density
+ // compaction.
+ TombstoneDenseBlocksRatio float64
+ RawKeySize uint64
+ RawValueSize uint64
+}
+
+// CompactionState is the compaction state of a file.
+//
+// The following shows the valid state transitions:
+//
+// NotCompacting --> Compacting --> Compacted
+// ^ |
+// | |
+// +-------<-------+
+//
+// Input files to a compaction transition to Compacting when a compaction is
+// picked. A file that has finished compacting typically transitions into the
+// Compacted state, at which point it is effectively obsolete ("zombied") and
+// will eventually be removed from the LSM. 
A file that has been move-compacted +// will transition from Compacting back into the NotCompacting state, signaling +// that the file may be selected for a subsequent compaction. A failed +// compaction will result in all input tables transitioning from Compacting to +// NotCompacting. +// +// This state is in-memory only. It is not persisted to the manifest. +type CompactionState uint8 + +// CompactionStates. +const ( + CompactionStateNotCompacting CompactionState = iota + CompactionStateCompacting + CompactionStateCompacted +) + +// String implements fmt.Stringer. +func (s CompactionState) String() string { + switch s { + case CompactionStateNotCompacting: + return "NotCompacting" + case CompactionStateCompacting: + return "Compacting" + case CompactionStateCompacted: + return "Compacted" + default: + panic(fmt.Sprintf("pebble: unknown compaction state %d", s)) + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/version.go b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/version.go new file mode 100644 index 0000000..377e3a7 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/version.go @@ -0,0 +1,803 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "bytes" + "fmt" + "iter" + "maps" + "slices" + "strings" + "sync" + "sync/atomic" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/strparse" +) + +// Compare exports the base.Compare type. +type Compare = base.Compare + +// InternalKey exports the base.InternalKey type. +type InternalKey = base.InternalKey + +// KeyRange returns the narrowest UserKeyBounds that encompass the bounds of all +// the TableMetadata in iters. 
+func KeyRange(ucmp Compare, iters ...iter.Seq[*TableMetadata]) base.UserKeyBounds { + var bounds base.UserKeyBounds + for _, iter := range iters { + for meta := range iter { + bounds = bounds.Union(ucmp, meta.UserKeyBounds()) + } + } + return bounds +} + +// ExtendKeyRange returns the narrowest UserKeyBounds that encompass the +// provided bounds and the bounds of all the TableMetadata in iters. +func ExtendKeyRange( + ucmp Compare, bounds base.UserKeyBounds, iters ...iter.Seq[*TableMetadata], +) base.UserKeyBounds { + for _, iter := range iters { + for meta := range iter { + bounds = bounds.Union(ucmp, meta.UserKeyBounds()) + } + } + return bounds +} + +// SortBySmallest sorts the specified files by smallest key using the supplied +// comparison function to order user keys. +func SortBySmallest(files []*TableMetadata, cmp Compare) { + slices.SortFunc(files, func(a, b *TableMetadata) int { + return a.cmpSmallestKey(b, cmp) + }) +} + +// NumLevels is the number of levels a Version contains. +const NumLevels = 7 + +// NewInitialVersion creates a version with no files. The L0Organizer should be freshly created. +func NewInitialVersion(comparer *base.Comparer) *Version { + v := &Version{ + cmp: comparer, + BlobFiles: MakeBlobFileSet(nil), + } + for level := range v.Levels { + v.Levels[level] = MakeLevelMetadata(comparer.Compare, level, nil /* files */) + v.RangeKeyLevels[level] = MakeLevelMetadata(comparer.Compare, level, nil /* files */) + } + return v +} + +// NewVersionForTesting constructs a new Version with the provided files. It +// requires the provided files are already well-ordered. The L0Organizer should +// be freshly created. +func NewVersionForTesting( + comparer *base.Comparer, l0Organizer *L0Organizer, files [7][]*TableMetadata, +) *Version { + v := &Version{ + cmp: comparer, + BlobFiles: MakeBlobFileSet(nil), + } + for l := range files { + // NB: We specifically insert `files` into the B-Tree in the order + // they appear within `files`. 
Some tests depend on this behavior in + // order to test consistency checking, etc. Once we've constructed the + // initial B-Tree, we swap out the btreeCmp for the correct one. + // TODO(jackson): Adjust or remove the tests and remove this. + v.Levels[l].tree = makeBTree(btreeCmpSpecificOrder(files[l]), files[l]) + v.Levels[l].level = l + if l == 0 { + v.Levels[l].tree.bcmp = btreeCmpSeqNum + } else { + v.Levels[l].tree.bcmp = btreeCmpSmallestKey(comparer.Compare) + } + for _, f := range files[l] { + v.Levels[l].totalTableSize += f.Size + } + } + l0Organizer.ResetForTesting(v) + return v +} + +// Version is a collection of table metadata for on-disk tables at various +// levels. In-memory DBs are written to level-0 tables, and compactions +// migrate data from level N to level N+1. The tables map internal keys (which +// are a user key, a delete or set bit, and a sequence number) to user values. +// +// The tables at level 0 are sorted by largest sequence number. Due to file +// ingestion, there may be overlap in the ranges of sequence numbers contain in +// level 0 sstables. In particular, it is valid for one level 0 sstable to have +// the seqnum range [1,100] while an adjacent sstable has the seqnum range +// [50,50]. This occurs when the [50,50] table was ingested and given a global +// seqnum. The ingestion code will have ensured that the [50,50] sstable will +// not have any keys that overlap with the [1,100] in the seqnum range +// [1,49]. The range of internal keys [fileMetadata.smallest, +// fileMetadata.largest] in each level 0 table may overlap. +// +// The tables at any non-0 level are sorted by their internal key range and any +// two tables at the same non-0 level do not overlap. +// +// The internal key ranges of two tables at different levels X and Y may +// overlap, for any X != Y. 
+// +// Finally, for every internal key in a table at level X, there is no internal +// key in a higher level table that has both the same user key and a higher +// sequence number. +type Version struct { + refs atomic.Int32 + + // L0SublevelFiles contains the L0 sublevels. + L0SublevelFiles []LevelSlice + + Levels [NumLevels]LevelMetadata + + // RangeKeyLevels holds a subset of the same files as Levels that contain range + // keys (i.e. fileMeta.HasRangeKeys == true). The memory amplification of this + // duplication should be minimal, as range keys are expected to be rare. + RangeKeyLevels [NumLevels]LevelMetadata + + // BlobFiles holds the set of physical blob files that are referenced by the + // version. The BlobFileSet is responsible for maintaining reference counts + // on physical blob files so that they remain on storage until they're no + // longer referenced by any version. + BlobFiles BlobFileSet + + // The callback to invoke when the last reference to a version is + // removed. Will be called with list.mu held. + Deleted func(obsolete ObsoleteFiles) + + // Stats holds aggregated stats about the version maintained from + // version to version. + Stats struct { + // MarkedForCompaction records the count of files marked for + // compaction within the version. + MarkedForCompaction int + } + + cmp *base.Comparer + + // The list the version is linked into. + list *VersionList + + // The next/prev link for the versionList doubly-linked list of versions. + prev, next *Version +} + +// String implements fmt.Stringer, printing the TableMetadata for each level in +// the Version. +func (v *Version) String() string { + return v.string(v.cmp.FormatKey, false) +} + +// DebugString returns an alternative format to String() which includes sequence +// number and kind information for the sstable boundaries. 
+func (v *Version) DebugString() string { + return v.string(v.cmp.FormatKey, true) +} + +// DebugStringFormatKey is like DebugString but allows overriding key formatting +// with the provided FormatKey. +func (v *Version) DebugStringFormatKey(fmtKey base.FormatKey) string { + return v.string(fmtKey, true) +} + +func describeSublevels(format base.FormatKey, verbose bool, sublevels []LevelSlice) string { + var buf bytes.Buffer + for sublevel := len(sublevels) - 1; sublevel >= 0; sublevel-- { + fmt.Fprintf(&buf, "L0.%d:\n", sublevel) + for f := range sublevels[sublevel].All() { + fmt.Fprintf(&buf, " %s\n", f.DebugString(format, verbose)) + } + } + return buf.String() +} + +func (v *Version) string(fmtKey base.FormatKey, verbose bool) string { + var buf bytes.Buffer + if len(v.L0SublevelFiles) > 0 { + fmt.Fprintf(&buf, "%s", describeSublevels(fmtKey, verbose, v.L0SublevelFiles)) + } else if !v.Levels[0].Empty() { + // Depending on where within the version lifecycle we're printing the + // Version, we may not have the sublevels structure populated yet. If + // L0SublevelFiles wasn't populated, print the L0 files without any L0 + // structure. + fmt.Fprintf(&buf, "L0 (no sublevels yet):\n") + for f := range v.Levels[0].All() { + fmt.Fprintf(&buf, " %s\n", f.DebugString(fmtKey, verbose)) + } + } + for level := 1; level < NumLevels; level++ { + if v.Levels[level].Empty() { + continue + } + fmt.Fprintf(&buf, "L%d:\n", level) + for f := range v.Levels[level].All() { + fmt.Fprintf(&buf, " %s\n", f.DebugString(fmtKey, verbose)) + } + } + if v.BlobFiles.Count() > 0 { + fmt.Fprintf(&buf, "Blob files:\n") + for f := range v.BlobFiles.All() { + fmt.Fprintf(&buf, " %s\n", f.String()) + } + } + return buf.String() +} + +// ParseVersionDebug parses a Version from its DebugString output. 
+func ParseVersionDebug( + comparer *base.Comparer, l0Organizer *L0Organizer, s string, +) (*Version, error) { + var files [NumLevels][]*TableMetadata + level := -1 + for _, l := range strings.Split(s, "\n") { + if l == "" { + continue + } + p := strparse.MakeParser(debugParserSeparators, l) + if l, ok := p.TryLevel(); ok { + level = l + continue + } + + if level == -1 { + return nil, errors.Errorf("version string must start with a level") + } + m, err := ParseTableMetadataDebug(l) + if err != nil { + return nil, err + } + files[level] = append(files[level], m) + } + // L0 files are printed from higher sublevel to lower, which means in a + // partial order that represents newest to oldest. Reverse the order of L0 + // files to ensure we construct the same sublevels. + slices.Reverse(files[0]) + v := NewVersionForTesting(comparer, l0Organizer, files) + if err := v.CheckOrdering(); err != nil { + return nil, err + } + return v, nil +} + +// Refs returns the number of references to the version. +func (v *Version) Refs() int32 { + return v.refs.Load() +} + +// Ref increments the version refcount. +func (v *Version) Ref() { + v.refs.Add(1) +} + +// Unref decrements the version refcount. If the last reference to the version +// was removed, the version is removed from the list of versions and the +// Deleted callback is invoked. Requires that the VersionList mutex is NOT +// locked. +func (v *Version) Unref() { + if v.refs.Add(-1) == 0 { + l := v.list + l.mu.Lock() + l.Remove(v) + v.Deleted(v.unrefFiles()) + l.mu.Unlock() + } +} + +// UnrefLocked decrements the version refcount. If the last reference to the +// version was removed, the version is removed from the list of versions and +// the Deleted callback is invoked. Requires that the VersionList mutex is +// already locked. 
+func (v *Version) UnrefLocked() { + if v.refs.Add(-1) == 0 { + v.list.Remove(v) + v.Deleted(v.unrefFiles()) + } +} + +func (v *Version) unrefFiles() ObsoleteFiles { + var obsoleteFiles ObsoleteFiles + for _, lm := range v.Levels { + lm.release(&obsoleteFiles) + } + for _, lm := range v.RangeKeyLevels { + lm.release(&obsoleteFiles) + } + v.BlobFiles.release(&obsoleteFiles) + return obsoleteFiles +} + +// ObsoleteFiles holds a set of files that are no longer referenced by any +// referenced Version. +type ObsoleteFiles struct { + TableBackings []*TableBacking + BlobFiles []*PhysicalBlobFile +} + +// AddBacking appends the provided TableBacking to the list of obsolete files. +func (of *ObsoleteFiles) AddBacking(fb *TableBacking) { + of.TableBackings = append(of.TableBackings, fb) +} + +// AddBlob appends the provided BlobFileMetadata to the list of obsolete files. +func (of *ObsoleteFiles) AddBlob(bm *PhysicalBlobFile) { + of.BlobFiles = append(of.BlobFiles, bm) +} + +// Count returns the number of files in the ObsoleteFiles. +func (of *ObsoleteFiles) Count() int { + return len(of.TableBackings) + len(of.BlobFiles) +} + +// Assert that ObsoleteFiles implements the obsoleteFiles interface. +var _ ObsoleteFilesSet = (*ObsoleteFiles)(nil) + +// Next returns the next version in the list of versions. +func (v *Version) Next() *Version { + return v.next +} + +// CalculateInuseKeyRanges examines table metadata in levels [level, maxLevel] +// within bounds [smallest,largest], returning an ordered slice of key ranges +// that include all keys that exist within levels [level, maxLevel] and within +// [smallest,largest]. +func (v *Version) CalculateInuseKeyRanges( + l0Organizer *L0Organizer, level, maxLevel int, smallest, largest []byte, +) []base.UserKeyBounds { + // Use two slices, alternating which one is input and which one is output + // as we descend the LSM. 
+ var input, output []base.UserKeyBounds + + // L0 requires special treatment, since sstables within L0 may overlap. + // We use the L0 Sublevels structure to efficiently calculate the merged + // in-use key ranges. + if level == 0 { + output = l0Organizer.InUseKeyRanges(smallest, largest) + level++ + } + + // NB: We always treat `largest` as inclusive for simplicity, because + // there's little consequence to calculating slightly broader in-use key + // ranges. + bounds := base.UserKeyBoundsInclusive(smallest, largest) + for ; level <= maxLevel; level++ { + overlaps := v.Overlaps(level, bounds) + iter := overlaps.Iter() + + // We may already have in-use key ranges from higher levels. Iterate + // through both our accumulated in-use key ranges and this level's + // files, merging the two. + // + // Tables higher within the LSM have broader key spaces. We use this + // when possible to seek past a level's files that are contained by + // our current accumulated in-use key ranges. This helps avoid + // per-sstable work during flushes or compactions in high levels which + // overlap the majority of the LSM's sstables. + input, output = output, input + output = output[:0] + + cmp := v.cmp.Compare + inputIdx := 0 + var currFile *TableMetadata + // If we have an accumulated key range and its start is ≤ smallest, + // we can seek to the accumulated range's end. Otherwise, we need to + // start at the first overlapping file within the level. + if len(input) > 0 && cmp(input[0].Start, smallest) <= 0 { + currFile = seekGT(&iter, cmp, input[0].End) + } else { + currFile = iter.First() + } + + for currFile != nil && inputIdx < len(input) { + // Invariant: Neither currFile nor input[inputIdx] overlaps any earlier + // ranges. + switch { + case cmp(currFile.Largest().UserKey, input[inputIdx].Start) < 0: + // File is completely before input range. 
+ output = append(output, currFile.UserKeyBounds()) + currFile = iter.Next() + + case cmp(input[inputIdx].End.Key, currFile.Smallest().UserKey) < 0: + // Input range is completely before the next file. + output = append(output, input[inputIdx]) + inputIdx++ + + default: + // Input range and file range overlap or touch. We will maximally extend + // the range with more overlapping inputs and files. + currAccum := currFile.UserKeyBounds() + if cmp(input[inputIdx].Start, currAccum.Start) < 0 { + currAccum.Start = input[inputIdx].Start + } + currFile = iter.Next() + + // Extend curAccum with any overlapping (or touching) input intervals or + // files. Note that we will always consume at least input[inputIdx]. + for { + if inputIdx < len(input) && cmp(input[inputIdx].Start, currAccum.End.Key) <= 0 { + if currAccum.End.CompareUpperBounds(cmp, input[inputIdx].End) < 0 { + currAccum.End = input[inputIdx].End + // Skip over files that are entirely inside this newly extended + // accumulated range; we expect ranges to be wider in levels that + // are higher up so this might skip over a non-trivial number of + // files. + currFile = seekGT(&iter, cmp, currAccum.End) + } + inputIdx++ + } else if currFile != nil && cmp(currFile.Smallest().UserKey, currAccum.End.Key) <= 0 { + if b := currFile.UserKeyBounds(); currAccum.End.CompareUpperBounds(cmp, b.End) < 0 { + currAccum.End = b.End + } + currFile = iter.Next() + } else { + // No overlaps remaining. + break + } + } + output = append(output, currAccum) + } + } + // If we have either files or input ranges left over, add them to the + // output. + output = append(output, input[inputIdx:]...) + for ; currFile != nil; currFile = iter.Next() { + output = append(output, currFile.UserKeyBounds()) + } + } + return output +} + +// seekGT seeks to the first file that ends with a boundary that is after the +// given boundary. 
Specifically: +// - if boundary.End is inclusive, the returned file ending boundary is strictly +// greater than boundary.End.Key +// - if boundary.End is exclusive, the returned file ending boundary is either +// greater than boundary.End.Key, or it's inclusive at boundary.End.Key. +func seekGT(iter *LevelIterator, cmp base.Compare, boundary base.UserKeyBoundary) *TableMetadata { + f := iter.SeekGE(cmp, boundary.Key) + if f == nil { + return nil + } + // If boundary is inclusive or the file boundary is exclusive we do not + // tolerate an equal largest key. + // Note: we know f.Largest.UserKey >= boundary.End.Key so this condition is + // equivalent to boundary.End.IsUpperBoundForInternalKey(cmp, f.Largest). + if (boundary.Kind == base.Inclusive || f.Largest().IsExclusiveSentinel()) && cmp(boundary.Key, f.Largest().UserKey) == 0 { + return iter.Next() + } + return f +} + +// Contains returns a boolean indicating whether the provided file exists in +// the version at the given level. If level is non-zero then Contains binary +// searches among the files. If level is zero, Contains scans the entire +// level. +func (v *Version) Contains(level int, m *TableMetadata) bool { + if level == 0 { + for f := range v.Levels[0].All() { + if f == m { + return true + } + } + return false + } + for f := range v.Overlaps(level, m.UserKeyBounds()).All() { + if f == m { + return true + } + } + return false +} + +// Overlaps returns all elements of v.files[level] whose user key range +// intersects the given bounds. If level is non-zero then the user key bounds of +// v.files[level] are assumed to not overlap (although they may touch). If level +// is zero then that assumption cannot be made, and the given bounds are +// expanded to the union of those matching bounds so far and the computation is +// repeated until the bounds stabilize. +// The returned files are a subsequence of the input files, i.e., the ordering +// is not changed. 
+func (v *Version) Overlaps(level int, bounds base.UserKeyBounds) LevelSlice {
+	if level == 0 {
+		// Indices that have been selected as overlapping.
+		l0 := v.Levels[level]
+		l0Iter := l0.Iter()
+		selectedIndices := make([]bool, l0.Len())
+		// NOTE(review): numSelected is maintained but never read in this
+		// function; it is kept as-is to preserve behavior.
+		numSelected := 0
+		var slice LevelSlice
+		for {
+			restart := false
+			for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
+				selected := selectedIndices[i]
+				if selected {
+					continue
+				}
+				if !meta.Overlaps(v.cmp.Compare, &bounds) {
+					// meta is completely outside the specified range; skip it.
+					continue
+				}
+				// Overlaps.
+				selectedIndices[i] = true
+				numSelected++
+
+				// Since this is L0, check if the newly added fileMetadata has expanded
+				// the range. We expand the range immediately for files we have
+				// remaining to check in this loop. All already checked and unselected
+				// files will need to be rechecked via the restart below.
+				if v.cmp.Compare(meta.Smallest().UserKey, bounds.Start) < 0 {
+					bounds.Start = meta.Smallest().UserKey
+					restart = true
+				}
+				if !bounds.End.IsUpperBoundForInternalKey(v.cmp.Compare, meta.Largest()) {
+					bounds.End = base.UserKeyExclusiveIf(meta.Largest().UserKey, meta.Largest().IsExclusiveSentinel())
+					restart = true
+				}
+			}
+
+			if !restart {
+				// Construct a B-Tree containing only the matching items.
+				var tr btree[*TableMetadata]
+				tr.bcmp = v.Levels[level].tree.bcmp
+				for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
+					if selectedIndices[i] {
+						err := tr.Insert(meta)
+						if err != nil {
+							panic(err)
+						}
+					}
+				}
+				slice = newLevelSlice(tableMetadataIter(&tr))
+				// TODO(jackson): Avoid the oddity of constructing and
+				// immediately releasing a B-Tree. Make LevelSlice an
+				// interface?
+				tr.Release(assertNoObsoleteFiles{})
+				break
+			}
+			// Continue looping to retry the files that were not selected.
+		}
+		return slice
+	}
+
+	return v.Levels[level].Slice().Overlaps(v.cmp.Compare, bounds)
+}
+
+// AllLevelsAndSublevels returns an iterator that produces a Layer, LevelSlice
+// pair for each L0 sublevel (from top to bottom) and each level below L0.
+func (v *Version) AllLevelsAndSublevels() iter.Seq2[Layer, LevelSlice] {
+	return func(yield func(Layer, LevelSlice) bool) {
+		for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
+			if !yield(L0Sublevel(sublevel), v.L0SublevelFiles[sublevel]) {
+				return
+			}
+		}
+		for level := 1; level < NumLevels; level++ {
+			if !yield(Level(level), v.Levels[level].Slice()) {
+				return
+			}
+		}
+	}
+}
+
+// CheckOrdering checks that the files are consistent with respect to
+// increasing file numbers (for level 0 files) and increasing and non-
+// overlapping internal key ranges (for level non-0 files).
+func (v *Version) CheckOrdering() error {
+	for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- {
+		sublevelIter := v.L0SublevelFiles[sublevel].Iter()
+		if err := CheckOrdering(v.cmp, L0Sublevel(sublevel), sublevelIter); err != nil {
+			return base.CorruptionErrorf("%s\n%s", err, v.DebugString())
+		}
+	}
+
+	for level, lm := range v.Levels {
+		if err := CheckOrdering(v.cmp, Level(level), lm.Iter()); err != nil {
+			return base.CorruptionErrorf("%s\n%s", err, v.DebugString())
+		}
+	}
+	return nil
+}
+
+// validateBlobFileInvariants validates invariants around blob files. Currently
+// it validates that the set of BlobFileIDs referenced by the Version's tables'
+// blob references is exactly the same as the set of BlobFileIDs present in the
+// Version's blob files B-Tree.
+func (v *Version) validateBlobFileInvariants() error {
+	// Collect all the blob file IDs that are referenced by the Version's
+	// tables' blob references.
+	var referencedFileIDs []base.BlobFileID
+	{
+		referencedFileIDsMap := make(map[base.BlobFileID]struct{}, v.BlobFiles.tree.Count())
+		for i := 0; i < len(v.Levels); i++ {
+			for table := range v.Levels[i].All() {
+				for _, br := range table.BlobReferences {
+					referencedFileIDsMap[br.FileID] = struct{}{}
+				}
+			}
+		}
+		referencedFileIDs = slices.Collect(maps.Keys(referencedFileIDsMap))
+		slices.Sort(referencedFileIDs)
+	}
+
+	// Collect all the blob file IDs that are present in the Version's blob
+	// files B-Tree.
+	var versionBlobFileIDs []base.BlobFileID
+	{
+		versionBlobFileIDsMap := make(map[base.BlobFileID]struct{}, v.BlobFiles.tree.Count())
+		for bf := range v.BlobFiles.All() {
+			versionBlobFileIDsMap[bf.FileID] = struct{}{}
+		}
+		versionBlobFileIDs = slices.Collect(maps.Keys(versionBlobFileIDsMap))
+		slices.Sort(versionBlobFileIDs)
+	}
+
+	if !slices.Equal(referencedFileIDs, versionBlobFileIDs) {
+		return base.AssertionFailedf("divergence between referenced BlobFileIDs and Version's BlobFiles B-Tree: %v vs %v",
+			referencedFileIDs, versionBlobFileIDs)
+	}
+	return nil
+}
+
+// VersionList holds a list of versions. The versions are ordered from oldest
+// to newest.
+type VersionList struct {
+	mu   *sync.Mutex
+	root Version
+}
+
+// Init initializes the version list.
+func (l *VersionList) Init(mu *sync.Mutex) {
+	l.mu = mu
+	l.root.next = &l.root
+	l.root.prev = &l.root
+}
+
+// Empty returns true if the list is empty, and false otherwise.
+func (l *VersionList) Empty() bool {
+	return l.root.next == &l.root
+}
+
+// Front returns the oldest version in the list. Note that this version is only
+// valid if Empty() returns false; when the list is empty the returned node is
+// the list's root sentinel.
+func (l *VersionList) Front() *Version {
+	return l.root.next
+}
+
+// Back returns the newest version in the list. Note that this version is only
+// valid if Empty() returns false; when the list is empty the returned node is
+// the list's root sentinel.
+func (l *VersionList) Back() *Version {
+	return l.root.prev
+}
+
+// PushBack adds a new version to the back of the list.
This new version +// becomes the "newest" version in the list. +func (l *VersionList) PushBack(v *Version) { + if v.list != nil || v.prev != nil || v.next != nil { + panic("pebble: version list is inconsistent") + } + v.prev = l.root.prev + v.prev.next = v + v.next = &l.root + v.next.prev = v + v.list = l +} + +// Remove removes the specified version from the list. +func (l *VersionList) Remove(v *Version) { + if v == &l.root { + panic("pebble: cannot remove version list root node") + } + if v.list != l { + panic("pebble: version list is inconsistent") + } + v.prev.next = v.next + v.next.prev = v.prev + v.next = nil // avoid memory leaks + v.prev = nil // avoid memory leaks + v.list = nil // avoid memory leaks +} + +// CheckOrdering checks that the files are consistent with respect to +// seqnums (for level 0 files -- see detailed comment below) and increasing and non- +// overlapping internal key ranges (for non-level 0 files). +func CheckOrdering(comparer *base.Comparer, level Layer, files LevelIterator) error { + cmp := comparer.Compare + format := comparer.FormatKey + // The invariants to check for L0 sublevels are the same as the ones to + // check for all other levels. However, if L0 is not organized into + // sublevels, or if all L0 files are being passed in, we do the legacy L0 + // checks, defined in the detailed comment below. + if level == Level(0) { + // We have 2 kinds of files: + // - Files with exactly one sequence number: these could be either ingested files + // or flushed files. We cannot tell the difference between them based on TableMetadata, + // so our consistency checking here uses the weaker checks assuming it is a narrow + // flushed file. We cannot error on ingested files having sequence numbers coincident + // with flushed files as the seemingly ingested file could just be a flushed file + // with just one key in it which is a truncated range tombstone sharing sequence numbers + // with other files in the same flush. 
+ // - Files with multiple sequence numbers: these are necessarily flushed files. + // + // Three cases of overlapping sequence numbers: + // Case 1: + // An ingested file contained in the sequence numbers of the flushed file -- it must be + // fully contained (not coincident with either end of the flushed file) since the memtable + // must have been at [a, b-1] (where b > a) when the ingested file was assigned sequence + // num b, and the memtable got a subsequent update that was given sequence num b+1, before + // being flushed. + // + // So a sequence [1000, 1000] [1002, 1002] [1000, 2000] is invalid since the first and + // third file are inconsistent with each other. So comparing adjacent files is insufficient + // for consistency checking. + // + // Visually we have something like + // x------y x-----------yx-------------y (flushed files where x, y are the endpoints) + // y y y y (y's represent ingested files) + // And these are ordered in increasing order of y. Note that y's must be unique. + // + // Case 2: + // A flushed file that did not overlap in keys with any file in any level, but does overlap + // in the file key intervals. This file is placed in L0 since it overlaps in the file + // key intervals but since it has no overlapping data, it is assigned a sequence number + // of 0 in RocksDB. We handle this case for compatibility with RocksDB. + // + // Case 3: + // A sequence of flushed files that overlap in sequence numbers with one another, + // but do not overlap in keys inside the sstables. These files correspond to + // partitioned flushes or the results of intra-L0 compactions of partitioned + // flushes. + // + // Since these types of SSTables violate most other sequence number + // overlap invariants, and handling this case is important for compatibility + // with future versions of pebble, this method relaxes most L0 invariant + // checks. 
+ + var prev *TableMetadata + for f := files.First(); f != nil; f, prev = files.Next(), f { + if prev == nil { + continue + } + // Validate that the sorting is sane. + if prev.LargestSeqNum == 0 && f.LargestSeqNum == prev.LargestSeqNum { + // Multiple files satisfying case 2 mentioned above. + } else if prev.cmpSeqNum(f) >= 0 { + return base.CorruptionErrorf("L0 files %s and %s are not properly ordered: <#%d-#%d> vs <#%d-#%d>", + errors.Safe(prev.TableNum), errors.Safe(f.TableNum), + errors.Safe(prev.SmallestSeqNum), errors.Safe(prev.LargestSeqNum), + errors.Safe(f.SmallestSeqNum), errors.Safe(f.LargestSeqNum)) + } + } + } else { + var prev *TableMetadata + for f := files.First(); f != nil; f, prev = files.Next(), f { + if err := f.Validate(cmp, format); err != nil { + return errors.Wrapf(err, "%s ", level) + } + if prev != nil { + if prev.cmpSmallestKey(f, cmp) >= 0 { + return base.CorruptionErrorf("%s files %s and %s are not properly ordered: [%s-%s] vs [%s-%s]", + errors.Safe(level), errors.Safe(prev.TableNum), errors.Safe(f.TableNum), + prev.Smallest().Pretty(format), prev.Largest().Pretty(format), + f.Smallest().Pretty(format), f.Largest().Pretty(format)) + } + + // In all supported format major version, split user keys are + // prohibited, so both files cannot contain keys with the same user + // keys. If the bounds have the same user key, the previous file's + // boundary must have a InternalKeyTrailer indicating that it's exclusive. 
+ if v := cmp(prev.Largest().UserKey, f.Smallest().UserKey); v > 0 || (v == 0 && !prev.Largest().IsExclusiveSentinel()) { + return base.CorruptionErrorf("%s files %s and %s have overlapping ranges: [%s-%s] vs [%s-%s]", + errors.Safe(level), errors.Safe(prev.TableNum), errors.Safe(f.TableNum), + prev.Smallest().Pretty(format), prev.Largest().Pretty(format), + f.Smallest().Pretty(format), f.Largest().Pretty(format)) + } + } + } + } + return nil +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/version_edit.go b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/version_edit.go new file mode 100644 index 0000000..00f62af --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/version_edit.go @@ -0,0 +1,1322 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "bufio" + "bytes" + stdcmp "cmp" + "encoding/binary" + "fmt" + "io" + "maps" + "slices" + "strings" + "time" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/strparse" + "github.com/cockroachdb/pebble/v2/sstable" +) + +// TODO(peter): describe the MANIFEST file format, independently of the C++ +// project. + +type byteReader interface { + io.ByteReader + io.Reader +} + +// Tags for the versionEdit disk format. +// Tag 8 is no longer used. +const ( + // LevelDB tags. + tagComparator = 1 + tagLogNumber = 2 + tagNextFileNumber = 3 + tagLastSequence = 4 + tagCompactPointer = 5 + tagDeletedFile = 6 + tagNewFile = 7 + tagPrevLogNumber = 9 + + // RocksDB tags. + tagNewFile2 = 100 + tagNewFile3 = 102 + tagNewFile4 = 103 + tagColumnFamily = 200 + tagColumnFamilyAdd = 201 + tagColumnFamilyDrop = 202 + tagMaxColumnFamily = 203 + + // Pebble tags. 
+ tagNewFile5 = 104 // Range keys. + tagCreatedBackingTable = 105 + tagRemovedBackingTable = 106 + tagNewBlobFile = 107 + tagDeletedBlobFile = 108 + + // The custom tags sub-format used by tagNewFile4 and above. All tags less + // than customTagNonSafeIgnoreMask are safe to ignore and their format must be + // a single bytes field. + customTagTerminate = 1 + customTagNeedsCompaction = 2 + customTagCreationTime = 6 + customTagPathID = 65 + customTagNonSafeIgnoreMask = 1 << 6 + customTagVirtual = 66 + customTagSyntheticPrefix = 67 + customTagSyntheticSuffix = 68 + customTagBlobReferences = 69 +) + +// DeletedTableEntry holds the state for a sstable deletion from a level. The +// table itself might still be referenced by another level. +type DeletedTableEntry struct { + Level int + FileNum base.FileNum +} + +// DeletedBlobFileEntry holds the state for a blob file deletion. The blob file +// ID may still be in-use with a different physical blob file. +type DeletedBlobFileEntry struct { + FileID base.BlobFileID + FileNum base.DiskFileNum +} + +// NewTableEntry holds the state for a new sstable or one moved from a different +// level. +type NewTableEntry struct { + Level int + Meta *TableMetadata + // BackingFileNum is only set during manifest replay, and only for virtual + // sstables. + BackingFileNum base.DiskFileNum +} + +// VersionEdit holds the state for an edit to a Version along with other +// on-disk state (log numbers, next file number, and the last sequence number). +type VersionEdit struct { + // ComparerName is the value of Options.Comparer.Name. This is only set in + // the first VersionEdit in a manifest (either when the DB is created, or + // when a new manifest is created) and is used to verify that the comparer + // specified at Open matches the comparer that was previously used. + ComparerName string + + // MinUnflushedLogNum is the smallest WAL log file number corresponding to + // mutations that have not been flushed to an sstable. 
+ // + // This is an optional field, and 0 represents it is not set. + MinUnflushedLogNum base.DiskFileNum + + // ObsoletePrevLogNum is a historic artifact from LevelDB that is not used by + // Pebble, RocksDB, or even LevelDB. Its use in LevelDB was deprecated in + // 6/2011. We keep it around purely for informational purposes when + // displaying MANIFEST contents. + ObsoletePrevLogNum uint64 + + // The next file number. A single counter is used to assign file numbers + // for the WAL, MANIFEST, sstable, and OPTIONS files. + NextFileNum uint64 + + // LastSeqNum is an upper bound on the sequence numbers that have been + // assigned in flushed WALs. Unflushed WALs (that will be replayed during + // recovery) may contain sequence numbers greater than this value. + LastSeqNum base.SeqNum + + // A file num may be present in both deleted files and new files when it + // is moved from a lower level to a higher level (when the compaction + // found that there was no overlapping file at the higher level). + DeletedTables map[DeletedTableEntry]*TableMetadata + NewTables []NewTableEntry + // CreatedBackingTables can be used to preserve the TableBacking associated + // with a physical sstable. This is useful when virtual sstables in the + // latest version are reconstructed during manifest replay, and we also need + // to reconstruct the TableBacking which is required by these virtual + // sstables. + // + // INVARIANT: The TableBacking associated with a physical sstable must only + // be added as a backing file in the same version edit where the physical + // sstable is first virtualized. This means that the physical sstable must + // be present in DeletedFiles and that there must be at least one virtual + // sstable with the same TableBacking as the physical sstable in NewFiles. A + // file must be present in CreatedBackingTables in exactly one version edit. + // The physical sstable associated with the TableBacking must also not be + // present in NewFiles. 
+ CreatedBackingTables []*TableBacking + // RemovedBackingTables is used to remove the TableBacking associated with a + // virtual sstable. Note that a backing sstable can be removed as soon as + // there are no virtual sstables in the latest version which are using the + // backing sstable, but the backing sstable doesn't necessarily have to be + // removed atomically with the version edit which removes the last virtual + // sstable associated with the backing sstable. The removal can happen in a + // future version edit. + // + // INVARIANT: A file must only be added to RemovedBackingTables if it was + // added to CreateBackingTables in a prior version edit. The same version + // edit also cannot have the same file present in both CreateBackingTables + // and RemovedBackingTables. A file must be present in RemovedBackingTables + // in exactly one version edit. + RemovedBackingTables []base.DiskFileNum + // NewBlobFiles holds the metadata for all new blob files introduced within + // the version edit. + NewBlobFiles []BlobFileMetadata + // DeletedBlobFiles holds all physical blob files that became unused during + // the version edit. + // + // A physical blob file may become unused if the corresponding BlobFileID + // becomes unreferenced during the version edit. In this case the BlobFileID + // is not referenced by any sstable in the resulting Version. + // + // A physical blob file may also become unused if it is being replaced by a + // new physical blob file. In this case NewBlobFiles must contain a + // BlobFileMetadata with the same BlobFileID. + // + // While replaying a MANIFEST, the values are nil. Otherwise the values must + // not be nil. + DeletedBlobFiles map[DeletedBlobFileEntry]*PhysicalBlobFile +} + +// Decode decodes an edit from the specified reader. +// +// Note that the Decode step will not set the TableBacking for virtual sstables +// and the responsibility is left to the caller. 
+// However, the Decode step will populate the NewTableEntry.BackingFileNum in
+// VersionEdit.NewTables.
+func (v *VersionEdit) Decode(r io.Reader) error {
+	br, ok := r.(byteReader)
+	if !ok {
+		br = bufio.NewReader(r)
+	}
+	d := versionEditDecoder{br}
+	// Each iteration decodes one tag-prefixed record; a clean EOF terminates
+	// the edit.
+	for {
+		tag, err := binary.ReadUvarint(br)
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return err
+		}
+		switch tag {
+		case tagComparator:
+			s, err := d.readBytes()
+			if err != nil {
+				return err
+			}
+			v.ComparerName = string(s)
+
+		case tagLogNumber:
+			n, err := d.readUvarint()
+			if err != nil {
+				return err
+			}
+			v.MinUnflushedLogNum = base.DiskFileNum(n)
+
+		case tagNextFileNumber:
+			n, err := d.readUvarint()
+			if err != nil {
+				return err
+			}
+			v.NextFileNum = n
+
+		case tagLastSequence:
+			n, err := d.readUvarint()
+			if err != nil {
+				return err
+			}
+			v.LastSeqNum = base.SeqNum(n)
+
+		case tagCompactPointer:
+			// Consume and discard the level and key fields.
+			if _, err := d.readLevel(); err != nil {
+				return err
+			}
+			if _, err := d.readBytes(); err != nil {
+				return err
+			}
+			// NB: RocksDB does not use compaction pointers anymore.
+
+		case tagRemovedBackingTable:
+			n, err := d.readUvarint()
+			if err != nil {
+				return err
+			}
+			v.RemovedBackingTables = append(
+				v.RemovedBackingTables, base.DiskFileNum(n),
+			)
+		case tagCreatedBackingTable:
+			dfn, err := d.readUvarint()
+			if err != nil {
+				return err
+			}
+			size, err := d.readUvarint()
+			if err != nil {
+				return err
+			}
+			fileBacking := &TableBacking{
+				DiskFileNum: base.DiskFileNum(dfn),
+				Size:        size,
+			}
+			v.CreatedBackingTables = append(v.CreatedBackingTables, fileBacking)
+		case tagDeletedFile:
+			level, err := d.readLevel()
+			if err != nil {
+				return err
+			}
+			fileNum, err := d.readFileNum()
+			if err != nil {
+				return err
+			}
+			if v.DeletedTables == nil {
+				v.DeletedTables = make(map[DeletedTableEntry]*TableMetadata)
+			}
+			v.DeletedTables[DeletedTableEntry{level, fileNum}] = nil
+
+		case tagNewFile, tagNewFile2, tagNewFile3, tagNewFile4, tagNewFile5:
+			level, err := d.readLevel()
+			if err != nil {
+				return err
+			}
+			fileNum, err := d.readFileNum()
+			if err != nil {
+				return err
+			}
+			if tag == tagNewFile3 {
+				// The pathID field appears unused in RocksDB.
+				_ /* pathID */, err := d.readUvarint()
+				if err != nil {
+					return err
+				}
+			}
+			size, err := d.readUvarint()
+			if err != nil {
+				return err
+			}
+			// We read the smallest / largest key bounds differently depending on
+			// whether we have point, range or both types of keys present in the
+			// table.
+			var (
+				smallestPointKey, largestPointKey []byte
+				smallestRangeKey, largestRangeKey []byte
+				parsedPointBounds                 bool
+				boundsMarker                      byte
+			)
+			if tag != tagNewFile5 {
+				// Range keys not present in the table. Parse the point key bounds.
+				smallestPointKey, err = d.readBytes()
+				if err != nil {
+					return err
+				}
+				largestPointKey, err = d.readBytes()
+				if err != nil {
+					return err
+				}
+			} else {
+				// Range keys are present in the table. Determine whether we have point
+				// keys to parse, in addition to the bounds.
+				boundsMarker, err = d.ReadByte()
+				if err != nil {
+					return err
+				}
+				// Parse point key bounds, if present.
+				if boundsMarker&maskContainsPointKeys > 0 {
+					smallestPointKey, err = d.readBytes()
+					if err != nil {
+						return err
+					}
+					largestPointKey, err = d.readBytes()
+					if err != nil {
+						return err
+					}
+					parsedPointBounds = true
+				} else {
+					// The table does not have point keys.
+					// Sanity check: the bounds must be range keys.
+					if boundsMarker&maskSmallest != 0 || boundsMarker&maskLargest != 0 {
+						return base.CorruptionErrorf(
+							"new-file-4-range-keys: table without point keys has point key bounds: marker=%x",
+							boundsMarker,
+						)
+					}
+				}
+				// Parse range key bounds.
+				smallestRangeKey, err = d.readBytes()
+				if err != nil {
+					return err
+				}
+				largestRangeKey, err = d.readBytes()
+				if err != nil {
+					return err
+				}
+			}
+			// Sequence numbers were introduced after the original tagNewFile.
+			var smallestSeqNum base.SeqNum
+			var largestSeqNum base.SeqNum
+			if tag != tagNewFile {
+				n, err := d.readUvarint()
+				if err != nil {
+					return err
+				}
+				smallestSeqNum = base.SeqNum(n)
+				n, err = d.readUvarint()
+				if err != nil {
+					return err
+				}
+				largestSeqNum = base.SeqNum(n)
+			}
+			var markedForCompaction bool
+			var creationTime uint64
+			virtualState := struct {
+				virtual        bool
+				backingFileNum uint64
+			}{}
+			var syntheticPrefix sstable.SyntheticPrefix
+			var syntheticSuffix sstable.SyntheticSuffix
+			var blobReferences BlobReferences
+			var blobReferenceDepth BlobReferenceDepth
+			// tagNewFile4 and tagNewFile5 carry an open-ended list of custom
+			// tags, terminated by customTagTerminate.
+			if tag == tagNewFile4 || tag == tagNewFile5 {
+				for {
+					customTag, err := d.readUvarint()
+					if err != nil {
+						return err
+					}
+					if customTag == customTagTerminate {
+						break
+					}
+					switch customTag {
+					case customTagNeedsCompaction:
+						field, err := d.readBytes()
+						if err != nil {
+							return err
+						}
+						if len(field) != 1 {
+							return base.CorruptionErrorf("new-file4: need-compaction field wrong size")
+						}
+						markedForCompaction = (field[0] == 1)
+
+					case customTagCreationTime:
+						field, err := d.readBytes()
+						if err != nil {
+							return err
+						}
+						var n int
+						creationTime, n = binary.Uvarint(field)
+						if n != len(field) {
+							return base.CorruptionErrorf("new-file4: invalid file creation time")
+						}
+
+					case customTagPathID:
+						return base.CorruptionErrorf("new-file4: path-id field not supported")
+
+					case customTagVirtual:
+						virtualState.virtual = true
+						if virtualState.backingFileNum, err = d.readUvarint(); err != nil {
+							return err
+						}
+
+					case customTagSyntheticPrefix:
+						synthetic, err := d.readBytes()
+						if err != nil {
+							return err
+						}
+						syntheticPrefix = synthetic
+
+					case customTagSyntheticSuffix:
+						if syntheticSuffix, err = d.readBytes(); err != nil {
+							return err
+						}
+
+					case customTagBlobReferences:
+						// The first varint encodes the 'blob reference depth'
+						// of the table.
+						v, err := d.readUvarint()
+						if err != nil {
+							return err
+						}
+						blobReferenceDepth = BlobReferenceDepth(v)
+						n, err := d.readUvarint()
+						if err != nil {
+							return err
+						}
+						blobReferences = make([]BlobReference, n)
+						for i := 0; i < int(n); i++ {
+							fileID, err := d.readUvarint()
+							if err != nil {
+								return err
+							}
+							valueSize, err := d.readUvarint()
+							if err != nil {
+								return err
+							}
+							blobReferences[i] = BlobReference{
+								FileID:    base.BlobFileID(fileID),
+								ValueSize: valueSize,
+							}
+						}
+						continue
+
+					default:
+						// Unknown tags in the safe-to-ignore range carry a
+						// single bytes field; consume and discard it.
+						if (customTag & customTagNonSafeIgnoreMask) != 0 {
+							return base.CorruptionErrorf("new-file4: custom field not supported: %d", customTag)
+						}
+						if _, err := d.readBytes(); err != nil {
+							return err
+						}
+					}
+				}
+			}
+			m := &TableMetadata{
+				TableNum:                 fileNum,
+				Size:                     size,
+				CreationTime:             int64(creationTime),
+				SmallestSeqNum:           smallestSeqNum,
+				LargestSeqNum:            largestSeqNum,
+				LargestSeqNumAbsolute:    largestSeqNum,
+				BlobReferences:           blobReferences,
+				BlobReferenceDepth:       blobReferenceDepth,
+				MarkedForCompaction:      markedForCompaction,
+				Virtual:                  virtualState.virtual,
+				SyntheticPrefixAndSuffix: sstable.MakeSyntheticPrefixAndSuffix(syntheticPrefix, syntheticSuffix),
+			}
+
+			if tag != tagNewFile5 { // no range keys present
+				m.PointKeyBounds.SetInternalKeyBounds(base.DecodeInternalKey(smallestPointKey),
+					base.DecodeInternalKey(largestPointKey))
+				m.HasPointKeys = true
+				m.boundTypeSmallest, m.boundTypeLargest = boundTypePointKey, boundTypePointKey
+			} else { // range keys present
+				// Set point key bounds, if parsed.
+				if parsedPointBounds {
+					m.PointKeyBounds.SetInternalKeyBounds(base.DecodeInternalKey(smallestPointKey),
+						base.DecodeInternalKey(largestPointKey))
+					m.HasPointKeys = true
+				}
+				// Set range key bounds.
+				m.RangeKeyBounds = &InternalKeyBounds{}
+				m.RangeKeyBounds.SetInternalKeyBounds(base.DecodeInternalKey(smallestRangeKey),
+					base.DecodeInternalKey(largestRangeKey))
+				m.HasRangeKeys = true
+				// Set overall bounds (by default assume range keys).
+				m.boundTypeSmallest, m.boundTypeLargest = boundTypeRangeKey, boundTypeRangeKey
+				if boundsMarker&maskSmallest == maskSmallest {
+					m.boundTypeSmallest = boundTypePointKey
+				}
+				if boundsMarker&maskLargest == maskLargest {
+					m.boundTypeLargest = boundTypePointKey
+				}
+			}
+			m.boundsSet = true
+			if !virtualState.virtual {
+				m.InitPhysicalBacking()
+			}
+
+			nfe := NewTableEntry{
+				Level: level,
+				Meta:  m,
+			}
+			if virtualState.virtual {
+				nfe.BackingFileNum = base.DiskFileNum(virtualState.backingFileNum)
+			}
+			v.NewTables = append(v.NewTables, nfe)
+
+		case tagNewBlobFile:
+			fileID, err := d.readUvarint()
+			if err != nil {
+				return err
+			}
+			diskFileNum, err := d.readFileNum()
+			if err != nil {
+				return err
+			}
+			size, err := d.readUvarint()
+			if err != nil {
+				return err
+			}
+			valueSize, err := d.readUvarint()
+			if err != nil {
+				return err
+			}
+			creationTime, err := d.readUvarint()
+			if err != nil {
+				return err
+			}
+			v.NewBlobFiles = append(v.NewBlobFiles, BlobFileMetadata{
+				FileID: base.BlobFileID(fileID),
+				Physical: &PhysicalBlobFile{
+					FileNum:      base.DiskFileNum(diskFileNum),
+					Size:         size,
+					ValueSize:    valueSize,
+					CreationTime: creationTime,
+				},
+			})
+
+		case tagDeletedBlobFile:
+			fileID, err := d.readUvarint()
+			if err != nil {
+				return err
+			}
+			fileNum, err := d.readFileNum()
+			if err != nil {
+				return err
+			}
+			if v.DeletedBlobFiles == nil {
+				v.DeletedBlobFiles = make(map[DeletedBlobFileEntry]*PhysicalBlobFile)
+			}
+			v.DeletedBlobFiles[DeletedBlobFileEntry{
+				FileID:  base.BlobFileID(fileID),
+				FileNum: base.DiskFileNum(fileNum),
+			}] = nil
+
+		case tagPrevLogNumber:
+			n, err := d.readUvarint()
+			if err != nil {
+				return err
+			}
+			v.ObsoletePrevLogNum = n
+
+		case tagColumnFamily, tagColumnFamilyAdd, tagColumnFamilyDrop, tagMaxColumnFamily:
+			return base.CorruptionErrorf("column families are not supported")
+
+		default:
+			return base.CorruptionErrorf("MANIFEST: unknown tag: %d", tag)
+		}
+	}
+	return nil
+}
+
+// string renders a human-readable, line-per-field summary of the edit; it
+// backs both String and DebugString.
+func (v *VersionEdit) string(verbose bool, fmtKey base.FormatKey) string {
+	var buf bytes.Buffer
+	if v.ComparerName != "" {
+		fmt.Fprintf(&buf, " comparer: %s\n", v.ComparerName)
+	}
+	if v.MinUnflushedLogNum != 0 {
+		fmt.Fprintf(&buf, " log-num: %d\n", v.MinUnflushedLogNum)
+	}
+	if v.ObsoletePrevLogNum != 0 {
+		fmt.Fprintf(&buf, " prev-log-num: %d\n", v.ObsoletePrevLogNum)
+	}
+	if v.NextFileNum != 0 {
+		fmt.Fprintf(&buf, " next-file-num: %d\n", v.NextFileNum)
+	}
+	if v.LastSeqNum != 0 {
+		fmt.Fprintf(&buf, " last-seq-num: %d\n", v.LastSeqNum)
+	}
+	// Deleted tables are printed in (level, file number) order for
+	// deterministic output (map iteration order is random).
+	entries := slices.Collect(maps.Keys(v.DeletedTables))
+	slices.SortFunc(entries, func(a, b DeletedTableEntry) int {
+		if v := stdcmp.Compare(a.Level, b.Level); v != 0 {
+			return v
+		}
+		return stdcmp.Compare(a.FileNum, b.FileNum)
+	})
+	for _, df := range entries {
+		fmt.Fprintf(&buf, " del-table: L%d %s\n", df.Level, df.FileNum)
+	}
+	for _, nf := range v.NewTables {
+		fmt.Fprintf(&buf, " add-table: L%d", nf.Level)
+		fmt.Fprintf(&buf, " %s", nf.Meta.DebugString(fmtKey, verbose))
+		if nf.Meta.CreationTime != 0 {
+			fmt.Fprintf(&buf, " (%s)",
+				time.Unix(nf.Meta.CreationTime, 0).UTC().Format(time.RFC3339))
+		}
+		fmt.Fprintln(&buf)
+	}
+
+	for _, b := range v.CreatedBackingTables {
+		fmt.Fprintf(&buf, " add-backing: %s\n", b.DiskFileNum)
+	}
+	for _, n := range v.RemovedBackingTables {
+		fmt.Fprintf(&buf, " del-backing: %s\n", n)
+	}
+	for _, f := range v.NewBlobFiles {
+		fmt.Fprintf(&buf, " add-blob-file: %s\n", f.String())
+	}
+	// Same determinism treatment for deleted blob files.
+	deletedBlobFileEntries := slices.Collect(maps.Keys(v.DeletedBlobFiles))
+	slices.SortFunc(deletedBlobFileEntries, func(a, b DeletedBlobFileEntry) int {
+		if v := stdcmp.Compare(a.FileID, b.FileID); v != 0 {
+			return v
+		}
+		return stdcmp.Compare(a.FileNum, b.FileNum)
+	})
+	for _, df := range deletedBlobFileEntries {
+		fmt.Fprintf(&buf, " del-blob-file: %s %s\n", df.FileID, df.FileNum)
+	}
+	return buf.String()
+}
+
+// DebugString is a more verbose version of String(). Use this in tests.
+func (v *VersionEdit) DebugString(fmtKey base.FormatKey) string {
+	return v.string(true /* verbose */, fmtKey)
+}
+
+// String implements fmt.Stringer for a VersionEdit.
+func (v *VersionEdit) String() string {
+	return v.string(false /* verbose */, base.DefaultFormatter)
+}
+
+// ParseVersionEditDebug parses a VersionEdit from its DebugString
+// implementation.
+//
+// It doesn't recognize all fields; this implementation can be filled in as
+// needed.
+func ParseVersionEditDebug(s string) (_ *VersionEdit, err error) {
+	// Parser helpers panic on malformed input; convert panics into errors.
+	defer func() {
+		if r := recover(); r != nil {
+			err = errors.CombineErrors(err, errFromPanic(r))
+		}
+	}()
+
+	var ve VersionEdit
+	for _, l := range strings.Split(s, "\n") {
+		l = strings.TrimSpace(l)
+		if l == "" {
+			continue
+		}
+		field, value, ok := strings.Cut(l, ":")
+		if !ok {
+			return nil, errors.Errorf("malformed line %q", l)
+		}
+		field = strings.TrimSpace(field)
+		p := strparse.MakeParser(debugParserSeparators, value)
+		switch field {
+		case "add-table":
+			level := p.Level()
+			meta, err := ParseTableMetadataDebug(p.Remaining())
+			if err != nil {
+				return nil, err
+			}
+			ve.NewTables = append(ve.NewTables, NewTableEntry{
+				Level: level,
+				Meta:  meta,
+			})
+
+		case "del-table":
+			level := p.Level()
+			num := p.FileNum()
+			if ve.DeletedTables == nil {
+				ve.DeletedTables = make(map[DeletedTableEntry]*TableMetadata)
+			}
+			ve.DeletedTables[DeletedTableEntry{
+				Level:   level,
+				FileNum: num,
+			}] = nil
+
+		case "add-backing":
+			n := p.DiskFileNum()
+			// NOTE(review): the backing size is not present in the debug
+			// output; 100 appears to be a placeholder for tests — confirm.
+			ve.CreatedBackingTables = append(ve.CreatedBackingTables, &TableBacking{
+				DiskFileNum: n,
+				Size:        100,
+			})
+
+		case "del-backing":
+			n := p.DiskFileNum()
+			ve.RemovedBackingTables = append(ve.RemovedBackingTables, n)
+
+		case "add-blob-file":
+			meta, err := ParseBlobFileMetadataDebug(p.Remaining())
+			if err != nil {
+				return nil, err
+			}
+			ve.NewBlobFiles = append(ve.NewBlobFiles, meta)
+
+		case "del-blob-file":
+			if ve.DeletedBlobFiles == nil {
+				ve.DeletedBlobFiles = make(map[DeletedBlobFileEntry]*PhysicalBlobFile)
+			}
+			ve.DeletedBlobFiles[DeletedBlobFileEntry{
+				FileID:  p.BlobFileID(),
+				FileNum: p.DiskFileNum(),
+			}] = nil
+
+		default:
+			return nil, errors.Errorf("field %q not implemented", field)
+		}
+	}
+	return &ve, nil
+}
+
+// Encode encodes an edit to the specified writer.
+func (v *VersionEdit) Encode(w io.Writer) error { + e := versionEditEncoder{new(bytes.Buffer)} + + if v.ComparerName != "" { + e.writeUvarint(tagComparator) + e.writeString(v.ComparerName) + } + if v.MinUnflushedLogNum != 0 { + e.writeUvarint(tagLogNumber) + e.writeUvarint(uint64(v.MinUnflushedLogNum)) + } + if v.ObsoletePrevLogNum != 0 { + e.writeUvarint(tagPrevLogNumber) + e.writeUvarint(v.ObsoletePrevLogNum) + } + if v.NextFileNum != 0 { + e.writeUvarint(tagNextFileNumber) + e.writeUvarint(uint64(v.NextFileNum)) + } + for _, dfn := range v.RemovedBackingTables { + e.writeUvarint(tagRemovedBackingTable) + e.writeUvarint(uint64(dfn)) + } + for _, fileBacking := range v.CreatedBackingTables { + e.writeUvarint(tagCreatedBackingTable) + e.writeUvarint(uint64(fileBacking.DiskFileNum)) + e.writeUvarint(fileBacking.Size) + } + // RocksDB requires LastSeqNum to be encoded for the first MANIFEST entry, + // even though its value is zero. We detect this by encoding LastSeqNum when + // ComparerName is set. + if v.LastSeqNum != 0 || v.ComparerName != "" { + e.writeUvarint(tagLastSequence) + e.writeUvarint(uint64(v.LastSeqNum)) + } + for x := range v.DeletedTables { + e.writeUvarint(tagDeletedFile) + e.writeUvarint(uint64(x.Level)) + e.writeUvarint(uint64(x.FileNum)) + } + for _, x := range v.NewTables { + customFields := x.Meta.MarkedForCompaction || x.Meta.CreationTime != 0 || x.Meta.Virtual || len(x.Meta.BlobReferences) > 0 + var tag uint64 + switch { + case x.Meta.HasRangeKeys: + tag = tagNewFile5 + case customFields: + tag = tagNewFile4 + default: + tag = tagNewFile2 + } + e.writeUvarint(tag) + e.writeUvarint(uint64(x.Level)) + e.writeUvarint(uint64(x.Meta.TableNum)) + e.writeUvarint(x.Meta.Size) + if !x.Meta.HasRangeKeys { + // If we have no range keys, preserve the original format and write the + // smallest and largest point keys. 
+ e.writeKey(x.Meta.PointKeyBounds.Smallest()) + e.writeKey(x.Meta.PointKeyBounds.Largest()) + } else { + // When range keys are present, we first write a marker byte that + // indicates if the table also contains point keys, in addition to how the + // overall bounds for the table should be reconstructed. This byte is + // followed by the keys themselves. + b, err := x.Meta.boundsMarker() + if err != nil { + return err + } + if err = e.WriteByte(b); err != nil { + return err + } + // Write point key bounds (if present). + if x.Meta.HasPointKeys { + e.writeKey(x.Meta.PointKeyBounds.Smallest()) + e.writeKey(x.Meta.PointKeyBounds.Largest()) + } + // Write range key bounds (if present). + if x.Meta.HasRangeKeys { + e.writeKey(x.Meta.RangeKeyBounds.Smallest()) + e.writeKey(x.Meta.RangeKeyBounds.Largest()) + } + } + e.writeUvarint(uint64(x.Meta.SmallestSeqNum)) + e.writeUvarint(uint64(x.Meta.LargestSeqNum)) + if customFields { + if x.Meta.CreationTime != 0 { + e.writeUvarint(customTagCreationTime) + var buf [binary.MaxVarintLen64]byte + n := binary.PutUvarint(buf[:], uint64(x.Meta.CreationTime)) + e.writeBytes(buf[:n]) + } + if x.Meta.MarkedForCompaction { + e.writeUvarint(customTagNeedsCompaction) + e.writeBytes([]byte{1}) + } + if x.Meta.Virtual { + e.writeUvarint(customTagVirtual) + e.writeUvarint(uint64(x.Meta.TableBacking.DiskFileNum)) + } + if x.Meta.SyntheticPrefixAndSuffix.HasPrefix() { + e.writeUvarint(customTagSyntheticPrefix) + e.writeBytes(x.Meta.SyntheticPrefixAndSuffix.Prefix()) + } + if x.Meta.SyntheticPrefixAndSuffix.HasSuffix() { + e.writeUvarint(customTagSyntheticSuffix) + e.writeBytes(x.Meta.SyntheticPrefixAndSuffix.Suffix()) + } + if len(x.Meta.BlobReferences) > 0 { + e.writeUvarint(customTagBlobReferences) + e.writeUvarint(uint64(x.Meta.BlobReferenceDepth)) + e.writeUvarint(uint64(len(x.Meta.BlobReferences))) + for _, ref := range x.Meta.BlobReferences { + e.writeUvarint(uint64(ref.FileID)) + e.writeUvarint(ref.ValueSize) + } + } + 
e.writeUvarint(customTagTerminate) + } + } + for _, x := range v.NewBlobFiles { + e.writeUvarint(tagNewBlobFile) + e.writeUvarint(uint64(x.FileID)) + e.writeUvarint(uint64(x.Physical.FileNum)) + e.writeUvarint(x.Physical.Size) + e.writeUvarint(x.Physical.ValueSize) + e.writeUvarint(x.Physical.CreationTime) + } + for x := range v.DeletedBlobFiles { + e.writeUvarint(tagDeletedBlobFile) + e.writeUvarint(uint64(x.FileID)) + e.writeUvarint(uint64(x.FileNum)) + } + _, err := w.Write(e.Bytes()) + return err +} + +// versionEditDecoder should be used to decode version edits. +type versionEditDecoder struct { + byteReader +} + +func (d versionEditDecoder) readBytes() ([]byte, error) { + n, err := d.readUvarint() + if err != nil { + return nil, err + } + s := make([]byte, n) + _, err = io.ReadFull(d, s) + if err != nil { + if err == io.ErrUnexpectedEOF { + return nil, base.CorruptionErrorf("pebble: corrupt manifest: failed to read %d bytes", n) + } + return nil, err + } + return s, nil +} + +func (d versionEditDecoder) readLevel() (int, error) { + u, err := d.readUvarint() + if err != nil { + return 0, err + } + if u >= NumLevels { + return 0, base.CorruptionErrorf("pebble: corrupt manifest: level %d >= %d", u, NumLevels) + } + return int(u), nil +} + +func (d versionEditDecoder) readFileNum() (base.FileNum, error) { + u, err := d.readUvarint() + if err != nil { + return 0, err + } + return base.FileNum(u), nil +} + +func (d versionEditDecoder) readUvarint() (uint64, error) { + u, err := binary.ReadUvarint(d) + if err != nil { + if err == io.EOF || err == io.ErrUnexpectedEOF { + return 0, base.CorruptionErrorf("pebble: corrupt manifest: failed to read uvarint") + } + return 0, err + } + return u, nil +} + +type versionEditEncoder struct { + *bytes.Buffer +} + +func (e versionEditEncoder) writeBytes(p []byte) { + e.writeUvarint(uint64(len(p))) + e.Write(p) +} + +func (e versionEditEncoder) writeKey(k InternalKey) { + e.writeUvarint(uint64(k.Size())) + e.Write(k.UserKey) + buf 
:= k.EncodeTrailer() + e.Write(buf[:]) +} + +func (e versionEditEncoder) writeString(s string) { + e.writeUvarint(uint64(len(s))) + e.WriteString(s) +} + +func (e versionEditEncoder) writeUvarint(u uint64) { + var buf [binary.MaxVarintLen64]byte + n := binary.PutUvarint(buf[:], u) + e.Write(buf[:n]) +} + +// BulkVersionEdit summarizes the files added and deleted from a set of version +// edits. +// +// INVARIANTS: +// No file can be added to a level more than once. This is true globally, and +// also true for all of the calls to Accumulate for a single bulk version edit. +// +// No file can be removed from a level more than once. This is true globally, +// and also true for all of the calls to Accumulate for a single bulk version +// edit. +// +// A sstable file must not be added and removed from a given level in the same +// version edit, and a blob file must not be both added and deleted in the same +// version edit. +// +// A file that is being removed from a level must have been added to that level +// before (in a prior version edit). Note that a given file can be deleted from +// a level and added to another level in a single version edit +type BulkVersionEdit struct { + AddedTables [NumLevels]map[base.FileNum]*TableMetadata + DeletedTables [NumLevels]map[base.FileNum]*TableMetadata + + BlobFiles struct { + // Added holds the metadata of all new blob files introduced within the + // aggregated version edit, keyed by file number. + Added map[base.BlobFileID]*PhysicalBlobFile + // Deleted holds a list of all blob files that became unreferenced by + // any sstables, making them obsolete within the resulting version (a + // zombie if still referenced by previous versions). Deleted file + // numbers must not exist in Added. + // + // Deleted is keyed by blob file ID and points to the physical blob file. 
+ Deleted map[base.BlobFileID]*PhysicalBlobFile + } + + // AddedFileBacking is a map to support lookup so that we can populate the + // TableBacking of virtual sstables during manifest replay. + AddedFileBacking map[base.DiskFileNum]*TableBacking + RemovedFileBacking []base.DiskFileNum + + // AllAddedTables maps table number to table metadata for all added sstables + // from accumulated version edits. AllAddedTables is only populated if set to + // non-nil by a caller. It must be set to non-nil when replaying version edits + // read from a MANIFEST (as opposed to VersionEdits constructed in-memory). + // While replaying a MANIFEST file, VersionEdit.DeletedFiles map entries have + // nil values, because the on-disk deletion record encodes only the file + // number. Accumulate uses AllAddedTables to correctly populate the + // BulkVersionEdit's Deleted field with non-nil *TableMetadata. + AllAddedTables map[base.FileNum]*TableMetadata + + // MarkedForCompactionCountDiff holds the aggregated count of files + // marked for compaction added or removed. + MarkedForCompactionCountDiff int +} + +// Accumulate adds the file addition and deletions in the specified version +// edit to the bulk edit's internal state. +// +// INVARIANTS: +// (1) If a table is added to a given level in a call to Accumulate and then +// removed from that level in a subsequent call, the file will not be present in +// the resulting BulkVersionEdit.Deleted for that level. +// (2) If a new table is added and it includes a reference to a blob file, that +// blob file must either appear in BlobFiles.Added, or the blob file must be +// referenced by a table deleted in the same bulk version edit. +// +// After accumulation of version edits, the bulk version edit may have +// information about a file which has been deleted from a level, but it may not +// have information about the same file added to the same level. The add +// could've occurred as part of a previous bulk version edit. 
In this case, the +// deleted file must be present in BulkVersionEdit.Deleted, at the end of the +// accumulation, because we need to decrease the refcount of the deleted file in +// Apply. +func (b *BulkVersionEdit) Accumulate(ve *VersionEdit) error { + // Add any blob files that were introduced. + for _, nbf := range ve.NewBlobFiles { + if b.BlobFiles.Added == nil { + b.BlobFiles.Added = make(map[base.BlobFileID]*PhysicalBlobFile) + } + b.BlobFiles.Added[nbf.FileID] = nbf.Physical + } + + for entry, physicalBlobFile := range ve.DeletedBlobFiles { + // If the blob file was added in a prior, accumulated version edit we + // can resolve the deletion by removing it from the added files map. + if b.BlobFiles.Added != nil { + added := b.BlobFiles.Added[entry.FileID] + if added != nil && added.FileNum == entry.FileNum { + delete(b.BlobFiles.Added, entry.FileID) + continue + } + } + // Otherwise the blob file deleted was added prior to this bulk edit, + // and we insert it into BlobFiles.Deleted so that Apply may remove it + // from the resulting version. + if b.BlobFiles.Deleted == nil { + b.BlobFiles.Deleted = make(map[base.BlobFileID]*PhysicalBlobFile) + } + b.BlobFiles.Deleted[entry.FileID] = physicalBlobFile + + } + + for df, m := range ve.DeletedTables { + dmap := b.DeletedTables[df.Level] + if dmap == nil { + dmap = make(map[base.FileNum]*TableMetadata) + b.DeletedTables[df.Level] = dmap + } + + if m == nil { + // m is nil only when replaying a MANIFEST. 
+ if b.AllAddedTables == nil { + return errors.Errorf("deleted file L%d.%s's metadata is absent and bve.AddedByFileNum is nil", df.Level, df.FileNum) + } + m = b.AllAddedTables[df.FileNum] + if m == nil { + return base.CorruptionErrorf("pebble: file deleted L%d.%s before it was inserted", df.Level, df.FileNum) + } + } + if m.MarkedForCompaction { + b.MarkedForCompactionCountDiff-- + } + if _, ok := b.AddedTables[df.Level][df.FileNum]; !ok { + dmap[df.FileNum] = m + } else { + // Present in b.Added for the same level. + delete(b.AddedTables[df.Level], df.FileNum) + } + } + + // Generate state for Added backing files. Note that these must be generated + // before we loop through the NewFiles, because we need to populate the + // FileBackings which might be used by the NewFiles loop. + if b.AddedFileBacking == nil { + b.AddedFileBacking = make(map[base.DiskFileNum]*TableBacking) + } + for _, fb := range ve.CreatedBackingTables { + if _, ok := b.AddedFileBacking[fb.DiskFileNum]; ok { + // There is already a TableBacking associated with fb.DiskFileNum. + // This should never happen. There must always be only one TableBacking + // associated with a backing sstable. + panic(fmt.Sprintf("pebble: duplicate file backing %s", fb.DiskFileNum.String())) + } + b.AddedFileBacking[fb.DiskFileNum] = fb + } + + for _, nf := range ve.NewTables { + // A new file should not have been deleted in this or a preceding + // VersionEdit at the same level (though files can move across levels). + if dmap := b.DeletedTables[nf.Level]; dmap != nil { + if _, ok := dmap[nf.Meta.TableNum]; ok { + return base.CorruptionErrorf("pebble: file deleted L%d.%s before it was inserted", nf.Level, nf.Meta.TableNum) + } + } + if nf.Meta.Virtual && nf.Meta.TableBacking == nil { + // TableBacking for a virtual sstable must only be nil if we're performing + // manifest replay. 
+			backing := b.AddedFileBacking[nf.BackingFileNum]
+			if backing == nil {
+				return errors.Errorf("TableBacking for virtual sstable must not be nil")
+			}
+			nf.Meta.AttachVirtualBacking(backing)
+		} else if nf.Meta.TableBacking == nil {
+			return errors.Errorf("Added file L%d.%s's has no TableBacking", nf.Level, nf.Meta.TableNum)
+		}
+
+		if b.AddedTables[nf.Level] == nil {
+			b.AddedTables[nf.Level] = make(map[base.FileNum]*TableMetadata)
+		}
+		b.AddedTables[nf.Level][nf.Meta.TableNum] = nf.Meta
+		if b.AllAddedTables != nil {
+			b.AllAddedTables[nf.Meta.TableNum] = nf.Meta
+		}
+		if nf.Meta.MarkedForCompaction {
+			b.MarkedForCompactionCountDiff++
+		}
+	}
+
+	for _, n := range ve.RemovedBackingTables {
+		if _, ok := b.AddedFileBacking[n]; ok {
+			delete(b.AddedFileBacking, n)
+		} else {
+			// Since a file can be removed from backing files in exactly one version
+			// edit it is safe to just append without any de-duplication.
+			b.RemovedFileBacking = append(b.RemovedFileBacking, n)
+		}
+	}
+
+	return nil
+}
+
+// Apply applies the delta b to the current version to produce a new version.
+// The ordering of tables within the new version is consistent with respect to
+// the comparer.
+//
+// Apply updates the backing refcounts (Ref/Unref) as files are installed into
+// the levels.
+//
+// curr may be nil, which is equivalent to a pointer to a zero version.
+//
+// Note that L0SublevelFiles is not initialized in the returned version; it is
+// the caller's responsibility to set it using L0Organizer.PerformUpdate().
+func (b *BulkVersionEdit) Apply(curr *Version, readCompactionRate int64) (*Version, error) {
+	comparer := curr.cmp
+	v := &Version{
+		BlobFiles: curr.BlobFiles.clone(),
+		cmp:       comparer,
+	}
+
+	// Adjust the count of files marked for compaction.
+ v.Stats.MarkedForCompaction = curr.Stats.MarkedForCompaction + v.Stats.MarkedForCompaction += b.MarkedForCompactionCountDiff + if v.Stats.MarkedForCompaction < 0 { + return nil, base.CorruptionErrorf("pebble: version marked for compaction count negative") + } + + // Update the BlobFileSet to record blob files added and deleted. The + // BlobFileSet ensures any physical blob files that are referenced by the + // version remain on storage until they're no longer referenced by any + // version. + // + // We remove deleted blob files first, because during a blob file + // replacement the BlobFileID is reused. The B-Tree insert will fail if the + // old blob file is still present in the tree. + for blobFileID := range b.BlobFiles.Deleted { + v.BlobFiles.remove(BlobFileMetadata{FileID: blobFileID}) + } + for blobFileID, physical := range b.BlobFiles.Added { + if err := v.BlobFiles.insert(BlobFileMetadata{ + FileID: blobFileID, + Physical: physical, + }); err != nil { + return nil, err + } + } + + for level := range v.Levels { + v.Levels[level] = curr.Levels[level].clone() + v.RangeKeyLevels[level] = curr.RangeKeyLevels[level].clone() + + if len(b.AddedTables[level]) == 0 && len(b.DeletedTables[level]) == 0 { + // There are no edits on this level. + continue + } + + // Some edits on this level. + lm := &v.Levels[level] + lmRange := &v.RangeKeyLevels[level] + + addedTablesMap := b.AddedTables[level] + deletedTablesMap := b.DeletedTables[level] + if n := v.Levels[level].Len() + len(addedTablesMap); n == 0 { + return nil, base.CorruptionErrorf( + "pebble: internal error: No current or added files but have deleted files: %d", + errors.Safe(len(deletedTablesMap))) + } + + // NB: addedFilesMap may be empty. If a file is present in addedFilesMap + // for a level, it won't be present in deletedFilesMap for the same + // level. + + for _, f := range deletedTablesMap { + // Removing a table from the B-Tree may decrement file reference + // counts. 
However, because we cloned the previous level's B-Tree, + // this should never result in a file's reference count dropping to + // zero. The remove call will panic if this happens. + v.Levels[level].remove(f) + v.RangeKeyLevels[level].remove(f) + } + + addedTables := make([]*TableMetadata, 0, len(addedTablesMap)) + for _, f := range addedTablesMap { + addedTables = append(addedTables, f) + } + // Sort addedFiles by file number. This isn't necessary, but tests which + // replay invalid manifests check the error output, and the error output + // depends on the order in which files are added to the btree. + slices.SortFunc(addedTables, func(a, b *TableMetadata) int { + return stdcmp.Compare(a.TableNum, b.TableNum) + }) + + var sm, la *TableMetadata + for _, f := range addedTables { + // NB: allowedSeeks is used for read triggered compactions. It is set using + // Options.Experimental.ReadCompactionRate which defaults to 32KB. + var allowedSeeks int64 + if readCompactionRate != 0 { + allowedSeeks = int64(f.Size) / readCompactionRate + } + if allowedSeeks < 100 { + allowedSeeks = 100 + } + f.AllowedSeeks.Store(allowedSeeks) + f.InitAllowedSeeks = allowedSeeks + + // Validate that all referenced blob files exist. + for i, ref := range f.BlobReferences { + phys, ok := v.BlobFiles.LookupPhysical(ref.FileID) + if !ok { + return nil, errors.AssertionFailedf("pebble: blob file %s referenced by L%d.%s not found", + ref.FileID, level, f.TableNum) + } + // NB: It's possible that the reference already has an estimated + // physical size if the table was moved. + if ref.EstimatedPhysicalSize == 0 { + // We must call MakeBlobReference so that we compute the + // reference's physical estimated size. 
+ f.BlobReferences[i] = MakeBlobReference(ref.FileID, ref.ValueSize, phys) + } + } + + err := lm.insert(f) + if err != nil { + return nil, errors.Wrap(err, "pebble") + } + if f.HasRangeKeys { + err = lmRange.insert(f) + if err != nil { + return nil, errors.Wrap(err, "pebble") + } + } + // Track the keys with the smallest and largest keys, so that we can + // check consistency of the modified span. + if sm == nil || base.InternalCompare(comparer.Compare, sm.Smallest(), f.Smallest()) > 0 { + + sm = f + } + if la == nil || base.InternalCompare(comparer.Compare, la.Largest(), f.Largest()) < 0 { + la = f + } + } + + if level == 0 { + if err := CheckOrdering(comparer, Level(0), v.Levels[level].Iter()); err != nil { + return nil, errors.Wrap(err, "pebble: internal error") + } + continue + } + + // Check consistency of the level in the vicinity of our edits. + if sm != nil && la != nil { + overlap := v.Levels[level].Slice().Overlaps(comparer.Compare, sm.UserKeyBounds()) + // overlap contains all of the added tables. We want to ensure that + // the added tables are consistent with neighboring existing tables + // too, so reslice the overlap to pull in a neighbor on each side. + check := overlap.Reslice(func(start, end *LevelIterator) { + if m := start.Prev(); m == nil { + start.Next() + } + if m := end.Next(); m == nil { + end.Prev() + } + }) + if err := CheckOrdering(comparer, Level(level), check.Iter()); err != nil { + return nil, errors.Wrap(err, "pebble: internal error") + } + } + } + + // In invariants builds, sometimes check invariants across all blob files + // and their references. 
+ if invariants.Sometimes(20) { + if err := v.validateBlobFileInvariants(); err != nil { + return nil, err + } + } + + return v, nil +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/virtual_backings.go b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/virtual_backings.go new file mode 100644 index 0000000..9abed2b --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/manifest/virtual_backings.go @@ -0,0 +1,310 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "bytes" + stdcmp "cmp" + "fmt" + "slices" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" +) + +// VirtualBackings maintains information about the set of backings that support +// virtual tables in the latest version. +// +// The VirtualBackings set internally maintains for each backing the number of +// virtual tables that use that backing and the sum of their virtual sizes. When +// a backing is added to the set, it initially is not associated with any +// tables. AddTable/RemoveTable are used to maintain the set of tables that are +// associated with a backing. Finally, a backing can only be removed from the +// set when it is no longer in use. +// +// -- Protection API -- +// +// VirtualBackings exposes a Protect/Unprotect API. This is used to allow +// external file ingestions to reuse existing virtual backings. Because +// ingestions can run in parallel with other operations like compactions, it is +// possible for a backing to "go away" in-between the time the ingestion decides +// to use it and the time the ingestion installs a new version. The protection +// API solves this problem by keeping backings alive, even if they become +// otherwise unused by any tables. 
+// +// Backing protection achieves two goals: +// - it must prevent the removal of the backing from the latest version, where +// removal means becoming part of a VersionEdit.RemovedBackingTables. This +// is achieved by treating the backing as "in use", preventing Unused() from +// reporting it. +// - it must prevent the backing from becoming obsolete (i.e. reaching a ref +// count of 0). To achieve this, VirtualBackings takes a ref on each backing +// when it is added; this ref must be released after the backing is removed +// (when it is ok for the backing to be reported as obsolete). +// +// For example, say we have virtual table T1 with backing B1 and an ingestion tries +// to reuse the file. This is what will usually happen (the happy case): +// - latest version is V1 and it contains T1(B1). +// - ingestion request comes for another virtual portion of B1. Ingestion process +// finds B1 and calls Protect(B1). +// - ingestion completes, installs version V2 which has T1(B1) and a new +// T2(B1), and calls Unprotect(B1). +// +// In this path, the Protect/Unprotect calls do nothing. But here is what could +// happen (the corner case): +// - latest version is V1 and it contains T1(B1). +// - ingestion request comes for another virtual portion of B1. Ingestion process +// finds B1 and calls Protect(B1). +// - compaction completes and installs version V2 which no longer has T1. +// But because B1 is protected, V2 still has B1. +// - ingestion completes, installs version V3 which has a new T2(B1) and calls +// Unprotect(B1). +// +// If instead the ingestion fails to complete, the last step becomes: +// - ingestion fails, calls Unprotect(B1). B1 is now Unused() and the next +// version (applied by whatever next operation is) will remove B1. +type VirtualBackings struct { + m map[base.DiskFileNum]backingWithMetadata + + // unused are all the backings in m that are not inUse(). Used for + // implementing Unused() efficiently. 
+	unused map[*TableBacking]struct{}
+
+	totalSize uint64
+}
+
+// MakeVirtualBackings returns empty initialized VirtualBackings.
+func MakeVirtualBackings() VirtualBackings {
+	return VirtualBackings{
+		m:      make(map[base.DiskFileNum]backingWithMetadata),
+		unused: make(map[*TableBacking]struct{}),
+	}
+}
+
+type backingWithMetadata struct {
+	backing *TableBacking
+
+	// A backing initially has a useCount of 0. The useCount is increased by
+	// AddTable and decreased by RemoveTable. Backings that have useCount=0 are
+	// not associated with any virtual tables.
+	useCount int32
+	// protectionCount is used by Protect to temporarily prevent a backing from
+	// being reported as unused.
+	protectionCount int32
+	// virtualizedSize is the sum of the sizes of the useCount virtual tables
+	// associated with this backing.
+	virtualizedSize uint64
+}
+
+// AddAndRef adds a new backing to the set and takes a reference on it. Another
+// backing for the same DiskFileNum must not exist.
+//
+// The added backing is unused until it is associated with a table via AddTable
+// or protected via Protect.
+func (bv *VirtualBackings) AddAndRef(backing *TableBacking) {
+	// We take a reference on the backing because in case of protected backings
+	// (see Protect), we might be the only ones holding on to a backing.
+	backing.Ref()
+	bv.mustAdd(backingWithMetadata{
+		backing: backing,
+	})
+	bv.unused[backing] = struct{}{}
+	bv.totalSize += backing.Size
+}
+
+// Remove removes a backing. The backing must not be in use; normally backings
+// are removed once they are reported by Unused().
+//
+// It is up to the caller to release the reference taken by AddAndRef.
+func (bv *VirtualBackings) Remove(n base.DiskFileNum) {
+	v := bv.mustGet(n)
+	if v.inUse() {
+		panic(errors.AssertionFailedf(
+			"backing %s still in use (useCount=%d protectionCount=%d)",
+			v.backing.DiskFileNum, v.useCount, v.protectionCount,
+		))
+	}
+	delete(bv.m, n)
+	delete(bv.unused, v.backing)
+	bv.totalSize -= v.backing.Size
+}
+
+// AddTable is used when a new table is using an existing backing. The backing
+// must be in the set already.
+func (bv *VirtualBackings) AddTable(m *TableMetadata) {
+	if !m.Virtual {
+		panic(errors.AssertionFailedf("table %s not virtual", m.TableNum))
+	}
+	v := bv.mustGet(m.TableBacking.DiskFileNum)
+	if !v.inUse() {
+		delete(bv.unused, v.backing)
+	}
+	v.useCount++
+	v.virtualizedSize += m.Size
+	bv.m[m.TableBacking.DiskFileNum] = v
+}
+
+// RemoveTable is used when a table using a backing is removed. The backing is
+// not removed from the set, even if it becomes unused.
+func (bv *VirtualBackings) RemoveTable(m *TableMetadata) {
+	if !m.Virtual {
+		panic(errors.AssertionFailedf("table %s not virtual", m.TableNum))
+	}
+	v := bv.mustGet(m.TableBacking.DiskFileNum)
+
+	if v.useCount <= 0 {
+		panic(errors.AssertionFailedf("invalid useCount"))
+	}
+	v.useCount--
+	v.virtualizedSize -= m.Size
+	bv.m[m.TableBacking.DiskFileNum] = v
+	if !v.inUse() {
+		bv.unused[v.backing] = struct{}{}
+	}
+}
+
+// Protect prevents a backing from being reported as unused until a
+// corresponding Unprotect call is made. The backing must be in the set.
+//
+// Multiple Protect calls can be made for the same backing; each must have a
+// corresponding Unprotect call before the backing can become unused.
+func (bv *VirtualBackings) Protect(n base.DiskFileNum) {
+	v := bv.mustGet(n)
+	if !v.inUse() {
+		delete(bv.unused, v.backing)
+	}
+	v.protectionCount++
+	bv.m[n] = v
+}
+
+// Unprotect reverses a Protect call.
+func (bv *VirtualBackings) Unprotect(n base.DiskFileNum) {
+	v := bv.mustGet(n)
+
+	if v.protectionCount <= 0 {
+		panic(errors.AssertionFailedf("invalid protectionCount"))
+	}
+	v.protectionCount--
+	bv.m[n] = v
+	if !v.inUse() {
+		bv.unused[v.backing] = struct{}{}
+	}
+}
+
+// Stats returns the number and total size of all the virtual backings.
+func (bv *VirtualBackings) Stats() (count int, totalSize uint64) {
+	return len(bv.m), bv.totalSize
+}
+
+// Usage returns information about the usage of a backing, specifically:
+//   - useCount: the number of virtual tables that use this backing;
+//   - virtualizedSize: the sum of sizes of virtual tables that use the
+//     backing.
+//
+// During compaction picking, we compensate a virtual sstable file size by
+// (TableBacking.Size - virtualizedSize) / useCount.
+// The intuition is that if TableBacking.Size - virtualizedSize is high, then the
+// space amplification due to virtual sstables is high, and we should pick the
+// virtual sstable with a higher priority.
+func (bv *VirtualBackings) Usage(n base.DiskFileNum) (useCount int, virtualizedSize uint64) {
+	v := bv.mustGet(n)
+	return int(v.useCount), v.virtualizedSize
+}
+
+// Unused returns all backings that are no longer used by the latest version
+// and are not protected, in DiskFileNum order.
+func (bv *VirtualBackings) Unused() []*TableBacking {
+	res := make([]*TableBacking, 0, len(bv.unused))
+	for b := range bv.unused {
+		res = append(res, b)
+	}
+	slices.SortFunc(res, func(a, b *TableBacking) int {
+		return stdcmp.Compare(a.DiskFileNum, b.DiskFileNum)
+	})
+	return res
+}
+
+// Get returns the backing with the given DiskFileNum, if it is in the set.
+func (bv *VirtualBackings) Get(n base.DiskFileNum) (_ *TableBacking, ok bool) {
+	v, ok := bv.m[n]
+	if ok {
+		return v.backing, true
+	}
+	return nil, false
+}
+
+// ForEach calls fn on each backing, in unspecified order.
+func (bv *VirtualBackings) ForEach(fn func(backing *TableBacking)) { + for _, v := range bv.m { + fn(v.backing) + } +} + +// DiskFileNums returns disk file nums of all the backing in the set, in sorted +// order. +func (bv *VirtualBackings) DiskFileNums() []base.DiskFileNum { + res := make([]base.DiskFileNum, 0, len(bv.m)) + for n := range bv.m { + res = append(res, n) + } + slices.Sort(res) + return res +} + +// Backings returns all backings in the set, in unspecified order. +func (bv *VirtualBackings) Backings() []*TableBacking { + res := make([]*TableBacking, 0, len(bv.m)) + for _, v := range bv.m { + res = append(res, v.backing) + } + return res +} + +func (bv *VirtualBackings) String() string { + nums := bv.DiskFileNums() + + var buf bytes.Buffer + count, totalSize := bv.Stats() + if count == 0 { + fmt.Fprintf(&buf, "no virtual backings\n") + } else { + fmt.Fprintf(&buf, "%d virtual backings, total size %d:\n", count, totalSize) + for _, n := range nums { + v := bv.m[n] + fmt.Fprintf(&buf, " %s: size=%d useCount=%d protectionCount=%d virtualizedSize=%d\n", + n, v.backing.Size, v.useCount, v.protectionCount, v.virtualizedSize) + } + } + unused := bv.Unused() + if len(unused) > 0 { + fmt.Fprintf(&buf, "unused virtual backings:") + for _, b := range unused { + fmt.Fprintf(&buf, " %s", b.DiskFileNum) + } + fmt.Fprintf(&buf, "\n") + } + return buf.String() +} + +func (bv *VirtualBackings) mustAdd(v backingWithMetadata) { + _, ok := bv.m[v.backing.DiskFileNum] + if ok { + panic("pebble: trying to add an existing file backing") + } + bv.m[v.backing.DiskFileNum] = v +} + +func (bv *VirtualBackings) mustGet(n base.DiskFileNum) backingWithMetadata { + v, ok := bv.m[n] + if !ok { + panic(fmt.Sprintf("unknown backing %s", n)) + } + return v +} + +// inUse returns true if b is used to back at least one virtual table. 
+func (v *backingWithMetadata) inUse() bool { + return v.useCount > 0 || v.protectionCount > 0 +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/manual/manual.go b/vendor/github.com/cockroachdb/pebble/v2/internal/manual/manual.go new file mode 100644 index 0000000..e866dd9 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/manual/manual.go @@ -0,0 +1,93 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manual + +import ( + "fmt" + "sync/atomic" + "unsafe" + + "github.com/cockroachdb/pebble/v2/internal/invariants" +) + +// Buf is a buffer allocated using this package. +type Buf struct { + data unsafe.Pointer + n uintptr +} + +// MakeBufUnsafe should be used with caution: the given data and n must match +// exactly the data and length of a Buf obtained from New. It is useful when +// these are stored implicitly in another type (like a []byte) and we want to +// reconstruct the Buf. +func MakeBufUnsafe(data unsafe.Pointer, n uintptr) Buf { + return Buf{data: data, n: n} +} + +// Data returns a pointer to the buffer data. If the buffer is not initialized +// (or is the result of calling New with a zero length), returns nil. +func (b Buf) Data() unsafe.Pointer { + return b.data +} + +func (b Buf) Len() uintptr { + return b.n +} + +// Slice converts the buffer to a byte slice. +func (b Buf) Slice() []byte { + return unsafe.Slice((*byte)(b.data), b.n) +} + +// Purpose identifies the use-case for an allocation. +type Purpose uint8 + +const ( + _ Purpose = iota + + BlockCacheMap + BlockCacheEntry + BlockCacheData + MemTable + + NumPurposes +) + +// Metrics contains memory statistics by purpose. +type Metrics [NumPurposes]struct { + // InUseBytes is the total number of bytes currently allocated. 
This is just + // the sum of the lengths of the allocations and does not include any overhead + // or fragmentation. + InUseBytes uint64 +} + +var counters [NumPurposes]struct { + InUseBytes atomic.Int64 + // Pad to separate counters into cache lines. This reduces the overhead when + // multiple purposes are used frequently. We assume 64 byte cache line size + // which is the case for ARM64 servers and AMD64. + _ [7]uint64 +} + +func recordAlloc(purpose Purpose, n uintptr) { + counters[purpose].InUseBytes.Add(int64(n)) +} + +func recordFree(purpose Purpose, n uintptr) { + newVal := counters[purpose].InUseBytes.Add(-int64(n)) + if invariants.Enabled && newVal < 0 { + panic(fmt.Sprintf("negative counter value %d", newVal)) + } +} + +// GetMetrics returns manual memory usage statistics. +func GetMetrics() Metrics { + var res Metrics + for i := range res { + // We load the freed count first to avoid a negative value, since we don't load both counters atomically. + res[i].InUseBytes = uint64(counters[i].InUseBytes.Load()) + } + return res +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/manual/manual.go b/vendor/github.com/cockroachdb/pebble/v2/internal/manual/manual_cgo.go similarity index 50% rename from vendor/github.com/cockroachdb/pebble/internal/manual/manual.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/manual/manual_cgo.go index 640816a..d54ab55 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/manual/manual.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/manual/manual_cgo.go @@ -6,7 +6,12 @@ package manual // #include import "C" -import "unsafe" +import ( + "math/rand/v2" + "unsafe" + + "github.com/cockroachdb/pebble/v2/internal/invariants" +) // The go:linkname directives provides backdoor access to private functions in // the runtime. Below we're accessing the throw function. 
@@ -14,16 +19,35 @@ import "unsafe" //go:linkname throw runtime.throw func throw(s string) +// useGoAllocation is used in race-enabled builds to configure the package to +// use ordinary Go allocations with make([]byte, n). This is done under the +// assumption that the Go race detector will not detect races within +// cgo-allocated memory. Performing some allocations using Go allows the race +// detector to observe concurrent memory access to memory allocated by this +// package. +// +// TODO(jackson): Confirm that the race detector does not detect races within +// cgo-allocated memory. +var useGoAllocation = invariants.RaceEnabled && rand.Uint32()%2 == 0 + // TODO(peter): Rather than relying an C malloc/free, we could fork the Go // runtime page allocator and allocate large chunks of memory using mmap or // similar. -// New allocates a slice of size n. The returned slice is from manually managed -// memory and MUST be released by calling Free. Failure to do so will result in -// a memory leak. -func New(n int) []byte { +// New allocates a slice of size n. The returned slice is from manually +// managed memory and MUST be released by calling Free. Failure to do so will +// result in a memory leak. +func New(purpose Purpose, n uintptr) Buf { if n == 0 { - return make([]byte, 0) + return Buf{} + } + recordAlloc(purpose, n) + + // In race-enabled builds, we sometimes make allocations using Go to allow + // the race detector to observe concurrent memory access to memory allocated + // by this package. See the definition of useGoAllocation for more details. + if invariants.RaceEnabled && useGoAllocation { + b := make([]byte, n) + return Buf{data: unsafe.Pointer(&b[0]), n: n} } // We need to be conscious of the Cgo pointer passing rules: // @@ -44,17 +68,17 @@ func New(n int) []byte { // it cannot allocate memory. throw("out of memory") } - // Interpret the C pointer as a pointer to a Go array, then slice. 
- return (*[MaxArrayLen]byte)(unsafe.Pointer(ptr))[:n:n] + return Buf{data: ptr, n: n} } -// Free frees the specified slice. -func Free(b []byte) { - if cap(b) != 0 { - if len(b) == 0 { - b = b[:cap(b)] +// Free frees the specified slice. It has to be exactly the slice that was +// returned by New. +func Free(purpose Purpose, b Buf) { + if b.n != 0 { + recordFree(purpose, b.n) + + if !invariants.RaceEnabled || !useGoAllocation { + C.free(b.data) } - ptr := unsafe.Pointer(&b[0]) - C.free(ptr) } } diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/manual/manual_nocgo.go b/vendor/github.com/cockroachdb/pebble/v2/internal/manual/manual_nocgo.go new file mode 100644 index 0000000..5c38f67 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/manual/manual_nocgo.go @@ -0,0 +1,31 @@ +// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +//go:build !cgo + +package manual + +import "unsafe" + +// Provides versions of New and Free when cgo is not available (e.g. cross +// compilation). + +// New allocates a slice of size n. +func New(purpose Purpose, n uintptr) Buf { + if n == 0 { + return Buf{} + } + recordAlloc(purpose, n) + slice := make([]byte, n) + return Buf{ + data: unsafe.Pointer(unsafe.SliceData(slice)), + n: n, + } +} + +// Free frees the specified slice. It has to be exactly the slice that was +// returned by New. +func Free(purpose Purpose, b Buf) { + recordFree(purpose, b.n) +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/overlap/checker.go b/vendor/github.com/cockroachdb/pebble/v2/internal/overlap/checker.go new file mode 100644 index 0000000..d48617e --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/overlap/checker.go @@ -0,0 +1,263 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package overlap provides facilities for checking whether tables have data +// overlap. +package overlap + +import ( + "context" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/manifest" +) + +// WithLSM stores the result of checking for boundary and data overlap between a +// region of key space and the LSM levels, starting from the top (L0) and +// stopping at the highest level with data overlap. +type WithLSM [manifest.NumLevels]WithLevel + +// WithLevel is the result of checking overlap against an LSM level. +type WithLevel struct { + Result Kind + // SplitFile can be set only when result is OnlyBoundary. If it is set, this + // file can be split to free up the range of interest. + SplitFile *manifest.TableMetadata +} + +// Kind indicates the kind of overlap detected between a key range and a level. +// We check two types of overlap: +// +// - file boundary overlap: whether the key range overlaps any of the level's +// user key boundaries; +// +// - data overlap: whether the key range overlaps any keys or ranges in the +// level. Data overlap implies file boundary overlap. +type Kind uint8 + +const ( + // None indicates that the key range of interest doesn't overlap any tables on + // the level. + None Kind = iota + 1 + // OnlyBoundary indicates that there is boundary overlap but no data overlap. + OnlyBoundary + // Data indicates that at least a key or range in the level overlaps with the + // key range of interest. Note that the data overlap check is best-effort and + // there could be false positives. + Data +) + +// Checker is used to check for data overlap between tables in the LSM and a +// user key region of interest. 
+type Checker struct { + cmp base.Compare + iteratorFactory IteratorFactory +} + +// IteratorFactory is an interface that is used by the Checker to create +// iterators for a given table. All methods can return nil as an empty iterator. +type IteratorFactory interface { + Points(ctx context.Context, m *manifest.TableMetadata) (base.InternalIterator, error) + RangeDels(ctx context.Context, m *manifest.TableMetadata) (keyspan.FragmentIterator, error) + RangeKeys(ctx context.Context, m *manifest.TableMetadata) (keyspan.FragmentIterator, error) +} + +// MakeChecker initializes a new Checker. +func MakeChecker(cmp base.Compare, iteratorFactory IteratorFactory) Checker { + return Checker{ + cmp: cmp, + iteratorFactory: iteratorFactory, + } +} + +// LSMOverlap calculates the LSM overlap for the given region. +func (c *Checker) LSMOverlap( + ctx context.Context, region base.UserKeyBounds, v *manifest.Version, +) (WithLSM, error) { + var result WithLSM + result[0].Result = None + for sublevel := 0; sublevel < len(v.L0SublevelFiles); sublevel++ { + res, err := c.LevelOverlap(ctx, region, v.L0SublevelFiles[sublevel]) + if err != nil { + return WithLSM{}, err + } + if res.Result == Data { + result[0].Result = Data + return result, nil + } + if res.Result == OnlyBoundary { + result[0].Result = OnlyBoundary + } + } + for level := 1; level < manifest.NumLevels; level++ { + var err error + result[level], err = c.LevelOverlap(ctx, region, v.Levels[level].Slice()) + if err != nil { + return WithLSM{}, err + } + if result[level].Result == Data { + return result, err + } + } + return result, nil +} + +// LevelOverlap determines the overlap (boundary and/or data) between a user +// key region and an L0 sublevel or L1+ level. +func (c *Checker) LevelOverlap( + ctx context.Context, region base.UserKeyBounds, ls manifest.LevelSlice, +) (WithLevel, error) { + // Quick check: if the target region contains any file boundaries, we assume + // data overlap. 
This is a correct assumption in most cases; it is pessimistic + // only for external ingestions which could have "loose" boundaries. External + // ingestions are also the most expensive to look at, so we don't want to do + // that just in the off chance that we'll find a significant empty region at + // the boundary. + // + // This check is important because the region can be very large in the key + // space and encompass many files, and we don't want to open any of them in + // that case. + startIter := ls.Iter() + file := startIter.SeekGE(c.cmp, region.Start) + if file == nil { + // No overlapping files. + return WithLevel{Result: None}, nil + } + fileBounds := file.UserKeyBounds() + if !region.End.IsUpperBoundFor(c.cmp, fileBounds.Start) { + // No overlapping files. + return WithLevel{Result: None}, nil + } + if c.cmp(fileBounds.Start, region.Start) >= 0 || region.End.CompareUpperBounds(c.cmp, fileBounds.End) >= 0 { + // The file ends or starts inside our region; we assume data overlap. + return WithLevel{Result: Data}, nil + } + // We have a single file to look at; its boundaries enclose our region. + empty, err := c.EmptyRegion(ctx, region, file) + if err != nil { + return WithLevel{}, err + } + if !empty { + return WithLevel{Result: Data}, nil + } + return WithLevel{ + Result: OnlyBoundary, + SplitFile: file, + }, nil +} + +// EmptyRegion returns true if the given region doesn't overlap with any keys or +// ranges in the given table. +func (c *Checker) EmptyRegion( + ctx context.Context, region base.UserKeyBounds, m *manifest.TableMetadata, +) (bool, error) { + empty, err := c.emptyRegionPointsAndRangeDels(ctx, region, m) + if err != nil || !empty { + return empty, err + } + return c.emptyRegionRangeKeys(ctx, region, m) +} + +// emptyRegionPointsAndRangeDels returns true if the file doesn't contain any +// point keys or range del spans that overlap with region. 
+func (c *Checker) emptyRegionPointsAndRangeDels( + ctx context.Context, region base.UserKeyBounds, m *manifest.TableMetadata, +) (bool, error) { + if !m.HasPointKeys { + return true, nil + } + pointBounds := m.UserKeyBoundsByType(manifest.KeyTypePoint) + if !pointBounds.Overlaps(c.cmp, &region) { + return true, nil + } + points, err := c.iteratorFactory.Points(ctx, m) + if err != nil { + return false, err + } + if points != nil { + defer func() { _ = points.Close() }() + var kv *base.InternalKV + if c.cmp(region.Start, pointBounds.Start) <= 0 { + kv = points.First() + } else { + kv = points.SeekGE(region.Start, base.SeekGEFlagsNone) + } + if kv == nil && points.Error() != nil { + return false, points.Error() + } + if kv != nil && region.End.IsUpperBoundForInternalKey(c.cmp, kv.K) { + // Found overlap. + return false, nil + } + } + rangeDels, err := c.iteratorFactory.RangeDels(ctx, m) + if err != nil { + return false, err + } + if rangeDels != nil { + defer rangeDels.Close() + empty, err := c.emptyFragmentRegion(region, pointBounds.Start, rangeDels) + if err != nil || !empty { + return empty, err + } + } + // Found no overlap. + return true, nil +} + +// emptyRegionRangeKeys returns true if the file doesn't contain any range key +// spans that overlap with region. +func (c *Checker) emptyRegionRangeKeys( + ctx context.Context, region base.UserKeyBounds, m *manifest.TableMetadata, +) (bool, error) { + if !m.HasRangeKeys { + return true, nil + } + rangeKeyBounds := m.UserKeyBoundsByType(manifest.KeyTypeRange) + if !rangeKeyBounds.Overlaps(c.cmp, &region) { + return true, nil + } + rangeKeys, err := c.iteratorFactory.RangeKeys(ctx, m) + if err != nil { + return false, err + } + if rangeKeys != nil { + defer rangeKeys.Close() + empty, err := c.emptyFragmentRegion(region, rangeKeyBounds.Start, rangeKeys) + if err != nil || !empty { + return empty, err + } + } + // Found no overlap. 
+ return true, nil +} + +// emptyFragmentRegion returns true if the given iterator doesn't contain any +// spans that overlap with region. The fragmentLowerBounds is a known lower +// bound for all the spans. +func (c *Checker) emptyFragmentRegion( + region base.UserKeyBounds, fragmentLowerBound []byte, fragments keyspan.FragmentIterator, +) (bool, error) { + var span *keyspan.Span + var err error + if c.cmp(region.Start, fragmentLowerBound) <= 0 { + // This is an optimization: we know there are no spans before region.Start, + // so we can use First. + span, err = fragments.First() + } else { + span, err = fragments.SeekGE(region.Start) + } + if err != nil { + return false, err + } + if span != nil && span.Empty() { + return false, base.AssertionFailedf("fragment iterator produced empty span") + } + if span != nil && region.End.IsUpperBoundFor(c.cmp, span.Start) { + // Found overlap. + return false, nil + } + return true, nil +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/private/batch.go b/vendor/github.com/cockroachdb/pebble/v2/internal/private/batch.go similarity index 83% rename from vendor/github.com/cockroachdb/pebble/internal/private/batch.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/private/batch.go index dcdd1f1..5156032 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/private/batch.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/private/batch.go @@ -5,8 +5,8 @@ package private import ( - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/keyspan" ) // BatchSort is a hook for constructing iterators over the point and range diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/problemspans/by_level.go b/vendor/github.com/cockroachdb/pebble/v2/internal/problemspans/by_level.go new file mode 100644 index 0000000..d8aa6c9 --- /dev/null +++ 
b/vendor/github.com/cockroachdb/pebble/v2/internal/problemspans/by_level.go @@ -0,0 +1,133 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package problemspans + +import ( + "fmt" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/cockroachdb/crlib/crstrings" + "github.com/cockroachdb/crlib/crtime" + "github.com/cockroachdb/pebble/v2/internal/base" +) + +// ByLevel maintains a set of spans (separated by LSM level) with expiration +// times and allows checking for overlap against active (non-expired) spans. +// +// When the spans added to the set are not overlapping, all operations are +// logarithmic. +// +// ByLevel is safe for concurrent use. +type ByLevel struct { + empty atomic.Bool + mu sync.Mutex + levels []Set +} + +// Init must be called before using the ByLevel. +func (bl *ByLevel) Init(numLevels int, cmp base.Compare) { + bl.empty.Store(false) + bl.levels = make([]Set, numLevels) + for i := range bl.levels { + bl.levels[i].Init(cmp) + } +} + +// InitForTesting is used by tests which mock the time source. +func (bl *ByLevel) InitForTesting(numLevels int, cmp base.Compare, nowFn func() crtime.Mono) { + bl.empty.Store(false) + bl.levels = make([]Set, numLevels) + for i := range bl.levels { + bl.levels[i].init(cmp, nowFn) + } +} + +// IsEmpty returns true if there are no problem spans (the "normal" case). It +// can be used in fast paths to avoid checking for specific overlaps. +func (bl *ByLevel) IsEmpty() bool { + if bl.empty.Load() { + // Fast path. + return true + } + bl.mu.Lock() + defer bl.mu.Unlock() + for i := range bl.levels { + if !bl.levels[i].IsEmpty() { + return false + } + } + bl.empty.Store(true) + return true +} + +// Add a span on a specific level. The span automatically expires after the +// given duration. 
+func (bl *ByLevel) Add(level int, bounds base.UserKeyBounds, expiration time.Duration) { + bl.mu.Lock() + defer bl.mu.Unlock() + bl.empty.Store(false) + bl.levels[level].Add(bounds, expiration) +} + +// Overlaps returns true if any active (non-expired) span on the given level +// overlaps the given bounds. +func (bl *ByLevel) Overlaps(level int, bounds base.UserKeyBounds) bool { + if bl.empty.Load() { + // Fast path. + return false + } + bl.mu.Lock() + defer bl.mu.Unlock() + return bl.levels[level].Overlaps(bounds) +} + +// Excise a span from all levels. Any overlapping active (non-expired) spans are +// split or trimmed accordingly. +func (bl *ByLevel) Excise(bounds base.UserKeyBounds) { + bl.mu.Lock() + defer bl.mu.Unlock() + for i := range bl.levels { + bl.levels[i].Excise(bounds) + } +} + +// Len returns the number of non-overlapping spans that have not expired. Two +// spans that touch are both counted if they have different expiration times. +func (bl *ByLevel) Len() int { + if bl.empty.Load() { + // Fast path. + return 0 + } + bl.mu.Lock() + defer bl.mu.Unlock() + n := 0 + for i := range bl.levels { + n += bl.levels[i].Len() + } + return n +} + +// String prints all active (non-expired) span fragments. +func (bl *ByLevel) String() string { + bl.mu.Lock() + defer bl.mu.Unlock() + var buf strings.Builder + + for i := range bl.levels { + if !bl.levels[i].IsEmpty() { + fmt.Fprintf(&buf, "L%d:\n", i) + for _, line := range crstrings.Lines(bl.levels[i].String()) { + fmt.Fprintf(&buf, " %s\n", line) + } + } + } + if buf.Len() == 0 { + return "" + } + return buf.String() +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/problemspans/doc.go b/vendor/github.com/cockroachdb/pebble/v2/internal/problemspans/doc.go new file mode 100644 index 0000000..7ece1b6 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/problemspans/doc.go @@ -0,0 +1,28 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package problemspans provides functionality for tracking and managing key +// spans that have been identified as problematic. It allows users to add spans +// with associated expiration times, check if a given key range overlaps any +// active (non-expired) spans, and remove spans when issues are resolved. +// +// This package is designed for efficiently tracking key ranges that may need +// special handling. +// +// Key Attributes: +// +// - **Span Registration:** +// Add spans with specified expiration times so that they automatically +// become inactive after a set duration. +// +// - **Overlap Detection:** +// Quickly check if a key range overlaps with any active problematic spans. +// +// - **Span Excise:** +// Remove or adjust spans to reflect changes as issues are resolved. +// +// - **Level-Based Organization:** +// The package offers a structure to organize and manage problematic spans +// per level, with built-in support for concurrent operations. +package problemspans diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/problemspans/set.go b/vendor/github.com/cockroachdb/pebble/v2/internal/problemspans/set.go new file mode 100644 index 0000000..69a096d --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/problemspans/set.go @@ -0,0 +1,134 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package problemspans + +import ( + "fmt" + "strings" + "time" + + "github.com/RaduBerinde/axisds" + "github.com/RaduBerinde/axisds/regiontree" + "github.com/cockroachdb/crlib/crtime" + "github.com/cockroachdb/pebble/v2/internal/base" +) + +// Set maintains a set of spans with expiration times and allows checking for +// overlap against non-expired spans. 
+// +// When the spans added to the set are not overlapping, all operations are +// logarithmic. +// +// Set is not safe for concurrent use. +type Set struct { + cmp base.Compare + nowFn func() crtime.Mono + + now crtime.Mono + + // We use a region tree with key boundaries and the expirationTime as a + // property. + rt regiontree.T[axisds.Endpoint[[]byte], expirationTime] +} + +// expirationTime of a problem span. 0 means that there is no problem span in a +// region. Expiration times <= Set.now are equivalent to 0. +type expirationTime crtime.Mono + +// Init must be called before a Set can be used. +func (s *Set) Init(cmp base.Compare) { + s.init(cmp, crtime.NowMono) +} + +func (s *Set) init(cmp base.Compare, nowFn func() crtime.Mono) { + *s = Set{} + s.cmp = cmp + s.nowFn = nowFn + // The region tree supports a property equality function that "evolves" over + // time, in that some properties that used to not be equal become equal. In + // our case expired properties become equal to 0. + // + // Note that the region tree automatically removes boundaries between two + // regions that have expired, even during enumeration. + propEqFn := func(a, b expirationTime) bool { + return a == b || + crtime.Mono(a) <= s.now && crtime.Mono(b) <= s.now // Both are expired or 0. + } + endpointCmp := axisds.EndpointCompareFn(axisds.CompareFn[[]byte](cmp)) + s.rt = regiontree.Make(endpointCmp, propEqFn) +} + +func boundsToEndpoints(bounds base.UserKeyBounds) (start, end axisds.Endpoint[[]byte]) { + start = axisds.MakeStartEndpoint(bounds.Start, axisds.Inclusive) + end = axisds.MakeEndEndpoint(bounds.End.Key, axisds.InclusiveIf(bounds.End.Kind == base.Inclusive)) + return start, end +} + +// Add a span to the set. The span automatically expires after the given duration. 
+func (s *Set) Add(bounds base.UserKeyBounds, expiration time.Duration) { + s.now = s.nowFn() + expTime := expirationTime(s.now + crtime.Mono(expiration)) + start, end := boundsToEndpoints(bounds) + s.rt.Update(start, end, func(p expirationTime) expirationTime { + return max(p, expTime) + }) +} + +// Overlaps returns true if the bounds overlap with a non-expired span. +func (s *Set) Overlaps(bounds base.UserKeyBounds) bool { + s.now = s.nowFn() + start, end := boundsToEndpoints(bounds) + return s.rt.AnyWithGC(start, end, func(exp expirationTime) bool { + return crtime.Mono(exp) > s.now + }) +} + +// Excise removes a span fragment from all spans in the set. Any overlapping +// non-expired spans are cut accordingly. +func (s *Set) Excise(bounds base.UserKeyBounds) { + s.now = s.nowFn() + start, end := boundsToEndpoints(bounds) + s.rt.Update(start, end, func(p expirationTime) expirationTime { + return 0 + }) +} + +// IsEmpty returns true if the set contains no non-expired spans. +func (s *Set) IsEmpty() bool { + s.now = s.nowFn() + return s.rt.IsEmpty() +} + +// Len returns the number of non-overlapping spans that have not expired. Two +// spans that touch are both counted if they have different expiration times. +func (s *Set) Len() int { + s.now = s.nowFn() + n := 0 + s.rt.EnumerateAll(func(start, end axisds.Endpoint[[]byte], prop expirationTime) bool { + n++ + return true + }) + return n +} + +// String prints all active (non-expired) span fragments. 
+func (s *Set) String() string { + var buf strings.Builder + s.now = s.nowFn() + s.rt.EnumerateAll(func(start, end axisds.Endpoint[[]byte], prop expirationTime) bool { + fmt.Fprintf(&buf, "%s expires in: %s\n", keyEndpointIntervalFormatter(start, end), time.Duration(prop)-time.Duration(s.now)) + return true + }) + if buf.Len() == 0 { + return "" + } + return buf.String() +} + +var keyBoundaryFormatter axisds.BoundaryFormatter[[]byte] = func(b []byte) string { + return string(b) +} + +var keyEndpointIntervalFormatter = axisds.MakeEndpointIntervalFormatter(keyBoundaryFormatter) diff --git a/vendor/github.com/cockroachdb/pebble/internal/rangedel/rangedel.go b/vendor/github.com/cockroachdb/pebble/v2/internal/rangedel/rangedel.go similarity index 53% rename from vendor/github.com/cockroachdb/pebble/internal/rangedel/rangedel.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/rangedel/rangedel.go index f8504bb..2ea9b08 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/rangedel/rangedel.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/rangedel/rangedel.go @@ -5,15 +5,16 @@ package rangedel import ( - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" ) // Encode takes a Span containing only range deletions. It invokes the provided // closure with the encoded internal keys that represent the Span's state. The // keys and values passed to emit are only valid until the closure returns. If // emit returns an error, Encode stops and returns the error. 
-func Encode(s *keyspan.Span, emit func(k base.InternalKey, v []byte) error) error { +func Encode(s keyspan.Span, emit func(k base.InternalKey, v []byte) error) error { for _, k := range s.Keys { if k.Kind() != base.InternalKeyKindRangeDelete { return base.CorruptionErrorf("pebble: rangedel.Encode cannot encode %s key", k.Kind()) @@ -41,3 +42,22 @@ func Decode(ik base.InternalKey, v []byte, keysDst []keyspan.Key) keyspan.Span { }), } } + +// DecodeIntoSpan decodes an internal key pair encoding a range deletion and +// appends a key to the given span. The start and end keys must match those in +// the span. +func DecodeIntoSpan(cmp base.Compare, ik base.InternalKey, v []byte, s *keyspan.Span) error { + // This function should only be called when ik.UserKey matches the Start of + // the span we already have. If this is not the case, it is a bug in the + // calling code. + if invariants.Enabled && cmp(s.Start, ik.UserKey) != 0 { + return base.AssertionFailedf("DecodeIntoSpan called with different start key") + } + // The value can come from disk or from the user, so we want to check the end + // key in all builds. + if cmp(s.End, v) != 0 { + return base.CorruptionErrorf("pebble: corrupt range key fragmentation") + } + s.Keys = append(s.Keys, keyspan.Key{Trailer: ik.Trailer}) + return nil +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/rangekey/coalesce.go b/vendor/github.com/cockroachdb/pebble/v2/internal/rangekey/coalesce.go new file mode 100644 index 0000000..ac5fb7f --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/rangekey/coalesce.go @@ -0,0 +1,203 @@ +// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package rangekey + +import ( + "math" + "slices" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" +) + +// Coalesce imposes range key semantics and coalesces range keys with the same +// bounds. Coalesce drops any keys shadowed by more recent sets, unsets or +// deletes. Coalesce modifies the provided span's Keys slice, reslicing the +// slice to remove dropped keys. +// +// Coalescence has subtle behavior with respect to sequence numbers. Coalesce +// depends on a keyspan.Span's Keys being sorted in sequence number descending +// order. The first key has the largest sequence number. The returned coalesced +// span includes only the largest sequence number. All other sequence numbers +// are forgotten. When a compaction constructs output range keys from a +// coalesced span, it produces at most one RANGEKEYSET, one RANGEKEYUNSET and +// one RANGEKEYDEL. Each one of these keys adopt the largest sequence number. +// +// This has the potentially surprising effect of 'promoting' a key to a higher +// sequence number. This is okay, because: +// - There are no other overlapping keys within the coalesced span of +// sequence numbers (otherwise they would be in the compaction, due to +// the LSM invariant). +// - Range key sequence numbers are never compared to point key sequence +// numbers. Range keys and point keys have parallel existences. +// - Compactions only coalesce within snapshot stripes. +// +// Additionally, internal range keys at the same sequence number have subtle +// mechanics: +// - RANGEKEYSETs shadow RANGEKEYUNSETs of the same suffix. +// - RANGEKEYDELs only apply to keys at lower sequence numbers. +// +// This is required for ingestion. Ingested sstables are assigned a single +// sequence number for the file, at which all of the file's keys are visible. 
+// The RANGEKEYSET, RANGEKEYUNSET and RANGEKEYDEL key kinds are ordered such +// that among keys with equal sequence numbers (thus ordered by their kinds) the +// keys do not affect one another. Ingested sstables are expected to be +// consistent with respect to the set/unset suffixes: A given suffix should be +// set or unset but not both. +// +// The resulting dst Keys slice is sorted by InternalKeyTrailer. +func Coalesce(suffixCmp base.CompareRangeSuffixes, keys []keyspan.Key, dst *[]keyspan.Key) { + // TODO(jackson): Currently, Coalesce doesn't actually perform the sequence + // number promotion described in the comment above. + *dst = CoalesceInto(suffixCmp, (*dst)[:0], math.MaxUint64, keys) + // Update the span with the (potentially reduced) keys slice. coalesce left + // the keys in *dst sorted by suffix. Re-sort them by trailer. + keyspan.SortKeysByTrailer(*dst) +} + +// CoalesceInto is a variant of Coalesce which outputs the results into dst +// without sorting them. +func CoalesceInto( + suffixCmp base.CompareRangeSuffixes, dst []keyspan.Key, snapshot base.SeqNum, keys []keyspan.Key, +) []keyspan.Key { + dst = dst[:0] + // First, enforce visibility and RangeKeyDelete mechanics. We only need to + // consider the prefix of keys before and including the first + // RangeKeyDelete. We also must skip any keys that aren't visible at the + // provided snapshot sequence number. + // + // NB: Within a given sequence number, keys are ordered as: + // RangeKeySet > RangeKeyUnset > RangeKeyDelete + // This is significant, because this ensures that a Set or Unset sharing a + // sequence number with a Delete do not shadow each other. + deleteIdx := -1 + for i := range keys { + if invariants.Enabled && i > 0 && keys[i].Trailer > keys[i-1].Trailer { + panic("pebble: invariant violation: span keys unordered") + } + if !keys[i].VisibleAt(snapshot) { + continue + } + // Once a RangeKeyDelete is observed, we know it shadows all subsequent + // keys and we can break early. 
We don't add the RangeKeyDelete key to + // keysBySuffix.keys yet, because we don't want a suffix-less key + // that appeared earlier in the slice to elide it. It'll be added back + // in at the end. + if keys[i].Kind() == base.InternalKeyKindRangeKeyDelete { + deleteIdx = i + break + } + dst = append(dst, keys[i]) + } + + // Sort the accumulated keys by suffix. There may be duplicates within a + // suffix, in which case the one with a larger trailer survives. + // + // We use a stable sort so that the first key with a given suffix is the one + // that with the highest InternalKeyTrailer (because the input `keys` was sorted by + // trailer descending). + slices.SortStableFunc(dst, func(a, b keyspan.Key) int { + return suffixCmp(a.Suffix, b.Suffix) + }) + + // Grab a handle of the full sorted slice, before reslicing + // dst to accumulate the final coalesced keys. + sorted := dst + dst = dst[:0] + + var ( + // prevSuffix is updated on each iteration of the below loop, and + // compared by the subsequent iteration to determine whether adjacent + // keys are defined at the same suffix. + prevSuffix []byte + // shadowing is set to true once any Key is shadowed by another key. + // When it's set to true—or after the loop if no keys are shadowed—the + // keysBySuffix.keys slice is resliced to contain the prefix of + // unshadowed keys. This avoids copying them incrementally in the common + // case of no shadowing. + shadowing bool + ) + for i := range sorted { + if i > 0 && suffixCmp(prevSuffix, sorted[i].Suffix) == 0 { + // Skip; this key is shadowed by the predecessor that had a larger + // InternalKeyTrailer. If this is the first shadowed key, set shadowing=true + // and reslice keysBySuffix.keys to hold the entire unshadowed + // prefix. + if !shadowing { + dst = dst[:i] + shadowing = true + } + continue + } + prevSuffix = sorted[i].Suffix + if shadowing { + dst = append(dst, sorted[i]) + } + } + // If there was no shadowing, dst.keys is untouched. 
We can simply set it to + // the existing `sorted` slice (also backed by dst). + if !shadowing { + dst = sorted + } + // If the original input `keys` slice contained a RangeKeyDelete, add it. + if deleteIdx >= 0 { + dst = append(dst, keys[deleteIdx]) + } + return dst +} + +// ForeignSSTTransformer implements a keyspan.Transformer for range keys in +// shared ingested sstables. It is largely similar to the Transform function +// implemented in UserIteratorConfig in that it calls coalesce to remove range +// keys shadowed by other range keys, but also retains the range key that does +// the shadowing. In addition, it elides RangeKey unsets/dels in L6 as they are +// inapplicable when reading from a different Pebble instance. Finally, it +// returns keys sorted in trailer order, not suffix order, as that's what the +// rest of the iterator stack expects. +type ForeignSSTTransformer struct { + Equal base.Equal + SeqNum base.SeqNum + sortBuf []keyspan.Key +} + +// Transform implements the Transformer interface. +func (f *ForeignSSTTransformer) Transform( + suffixCmp base.CompareRangeSuffixes, s keyspan.Span, dst *keyspan.Span, +) error { + // Apply shadowing of keys. + dst.Start = s.Start + dst.End = s.End + f.sortBuf = f.sortBuf[:0] + f.sortBuf = CoalesceInto(suffixCmp, f.sortBuf, math.MaxUint64, s.Keys) + keys := f.sortBuf + dst.Keys = dst.Keys[:0] + for i := range keys { + switch keys[i].Kind() { + case base.InternalKeyKindRangeKeySet: + if invariants.Enabled && len(dst.Keys) > 0 && suffixCmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 { + panic("pebble: keys unexpectedly not in ascending suffix order") + } + case base.InternalKeyKindRangeKeyUnset: + if invariants.Enabled && len(dst.Keys) > 0 && suffixCmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 { + panic("pebble: keys unexpectedly not in ascending suffix order") + } + case base.InternalKeyKindRangeKeyDelete: + // Nothing to do. 
+ default: + return base.CorruptionErrorf("pebble: unrecognized range key kind %s", keys[i].Kind()) + } + dst.Keys = append(dst.Keys, keyspan.Key{ + Trailer: base.MakeTrailer(f.SeqNum, keys[i].Kind()), + Suffix: keys[i].Suffix, + Value: keys[i].Value, + }) + } + // coalesce results in dst.Keys being sorted by Suffix. Change it back to + // ByTrailerDesc, as that's what the iterator stack will expect. + keyspan.SortKeysByTrailer(dst.Keys) + dst.KeysOrder = keyspan.ByTrailerDesc + return nil +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/rangekey/rangekey.go b/vendor/github.com/cockroachdb/pebble/v2/internal/rangekey/rangekey.go similarity index 83% rename from vendor/github.com/cockroachdb/pebble/internal/rangekey/rangekey.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/rangekey/rangekey.go index 2a99834..20d3d89 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/rangekey/rangekey.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/rangekey/rangekey.go @@ -52,16 +52,16 @@ package rangekey import ( "encoding/binary" - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" ) // Encode takes a Span containing only range keys. It invokes the provided // closure with the encoded internal keys that represent the Span's state. The // keys and values passed to emit are only valid until the closure returns. // If emit returns an error, Encode stops and returns the error. 
-func Encode(s *keyspan.Span, emit func(k base.InternalKey, v []byte) error) error { +func Encode(s keyspan.Span, emit func(k base.InternalKey, v []byte) error) error { enc := Encoder{Emit: emit} return enc.Encode(s) } @@ -82,7 +82,7 @@ type Encoder struct { // // The encoded key-value pair passed to Emit is only valid until the closure // completes. -func (e *Encoder) Encode(s *keyspan.Span) error { +func (e *Encoder) Encode(s keyspan.Span) error { if s.Empty() { return nil } @@ -91,7 +91,7 @@ func (e *Encoder) Encode(s *keyspan.Span) error { // sequence number descending, grouping them into sequence numbers. All keys // with identical sequence numbers are flushed together. var del bool - var seqNum uint64 + var seqNum base.SeqNum for i := range s.Keys { if i == 0 || s.Keys[i].SeqNum() != seqNum { if i > 0 { @@ -127,7 +127,7 @@ func (e *Encoder) Encode(s *keyspan.Span) error { // flush constructs internal keys for accumulated key state, and emits the // internal keys. -func (e *Encoder) flush(s *keyspan.Span, seqNum uint64, del bool) error { +func (e *Encoder) flush(s keyspan.Span, seqNum base.SeqNum, del bool) error { if len(e.sets) > 0 { ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeySet) l := EncodedSetValueLen(s.End, e.sets) @@ -161,30 +161,59 @@ func (e *Encoder) flush(s *keyspan.Span, seqNum uint64, del bool) error { } // Decode takes an internal key pair encoding range key(s) and returns a decoded -// keyspan containing the keys. If keysDst is provided, keys will be appended to -// keysDst. -func Decode(ik base.InternalKey, v []byte, keysDst []keyspan.Key) (keyspan.Span, error) { +// keyspan containing the keys. If keysBuf is provided, keys will be appended to +// it. 
+func Decode(ik base.InternalKey, v []byte, keysBuf []keyspan.Key) (keyspan.Span, error) { var s keyspan.Span + s.Start = ik.UserKey + var err error + s.End, v, err = DecodeEndKey(ik.Kind(), v) + if err != nil { + return keyspan.Span{}, err + } + s.Keys, err = appendKeys(keysBuf, ik, v) + if err != nil { + return keyspan.Span{}, err + } + return s, nil +} +// DecodeIntoSpan decodes an internal key pair encoding range key(s) and appends +// them to the given span. The start and end keys must match those in the span. +func DecodeIntoSpan(cmp base.Compare, ik base.InternalKey, v []byte, s *keyspan.Span) error { // Hydrate the user key bounds. - s.Start = ik.UserKey - var ok bool - s.End, v, ok = DecodeEndKey(ik.Kind(), v) - if !ok { - return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key end from %s", ik.Kind()) + startKey := ik.UserKey + endKey, v, err := DecodeEndKey(ik.Kind(), v) + if err != nil { + return err } - s.Keys = keysDst + // This function should only be called when ik.UserKey matches the Start of + // the span we already have. If this is not the case, it is a bug in the + // calling code. + if invariants.Enabled && cmp(s.Start, startKey) != 0 { + return base.AssertionFailedf("DecodeIntoSpan called with different start key") + } + // The value can come from disk or from the user, so we want to check the end + // key in all builds. + if cmp(s.End, endKey) != 0 { + return base.CorruptionErrorf("pebble: corrupt range key fragmentation") + } + s.Keys, err = appendKeys(s.Keys, ik, v) + return err +} +func appendKeys(buf []keyspan.Key, ik base.InternalKey, v []byte) ([]keyspan.Key, error) { // Hydrate the contents of the range key(s). 
switch ik.Kind() { case base.InternalKeyKindRangeKeySet: for len(v) > 0 { var sv SuffixValue + var ok bool sv, v, ok = decodeSuffixValue(v) if !ok { - return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key suffix-value tuple") + return nil, base.CorruptionErrorf("pebble: unable to decode range key suffix-value tuple") } - s.Keys = append(s.Keys, keyspan.Key{ + buf = append(buf, keyspan.Key{ Trailer: ik.Trailer, Suffix: sv.Suffix, Value: sv.Value, @@ -193,24 +222,25 @@ func Decode(ik base.InternalKey, v []byte, keysDst []keyspan.Key) (keyspan.Span, case base.InternalKeyKindRangeKeyUnset: for len(v) > 0 { var suffix []byte + var ok bool suffix, v, ok = decodeSuffix(v) if !ok { - return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key unset suffix") + return nil, base.CorruptionErrorf("pebble: unable to decode range key unset suffix") } - s.Keys = append(s.Keys, keyspan.Key{ + buf = append(buf, keyspan.Key{ Trailer: ik.Trailer, Suffix: suffix, }) } case base.InternalKeyKindRangeKeyDelete: if len(v) > 0 { - return keyspan.Span{}, base.CorruptionErrorf("pebble: RANGEKEYDELs must not contain additional data") + return nil, base.CorruptionErrorf("pebble: RANGEKEYDELs must not contain additional data") } - s.Keys = append(s.Keys, keyspan.Key{Trailer: ik.Trailer}) + buf = append(buf, keyspan.Key{Trailer: ik.Trailer}) default: - return keyspan.Span{}, base.CorruptionErrorf("pebble: %s is not a range key", ik.Kind()) + return nil, base.CorruptionErrorf("pebble: %s is not a range key", ik.Kind()) } - return s, nil + return buf, nil } // SuffixValue represents a tuple of a suffix and a corresponding value. A @@ -284,21 +314,23 @@ func EncodeSetValue(dst []byte, endKey []byte, suffixValues []SuffixValue) int { // DecodeEndKey reads the end key from the beginning of a range key (RANGEKEYSET, // RANGEKEYUNSET or RANGEKEYDEL)'s physical encoded value. Both sets and unsets // encode the range key, plus additional data in the value. 
-func DecodeEndKey(kind base.InternalKeyKind, data []byte) (endKey, value []byte, ok bool) { +func DecodeEndKey(kind base.InternalKeyKind, data []byte) (endKey, value []byte, _ error) { switch kind { case base.InternalKeyKindRangeKeyDelete: // No splitting is necessary for range key deletes. The value is the end // key, and there is no additional associated value. - return data, nil, true + return data, nil, nil + case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset: v, n := binary.Uvarint(data) if n <= 0 || uint64(n)+v >= uint64(len(data)) { - return nil, nil, false + return nil, nil, base.CorruptionErrorf("pebble: unable to decode range key end from %s", kind) } endKey, value = data[n:n+int(v)], data[n+int(v):] - return endKey, value, true + return endKey, value, nil + default: - panic(errors.Newf("key kind %s is not a range key kind", kind)) + return nil, nil, base.AssertionFailedf("key kind %s is not a range key kind", kind) } } diff --git a/vendor/github.com/cockroachdb/pebble/internal/rangekey/coalesce.go b/vendor/github.com/cockroachdb/pebble/v2/internal/rangekeystack/user_iterator.go similarity index 50% rename from vendor/github.com/cockroachdb/pebble/internal/rangekey/coalesce.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/rangekeystack/user_iterator.go index c0456bb..039e0e4 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/rangekey/coalesce.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/rangekeystack/user_iterator.go @@ -1,23 +1,23 @@ -// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use // of this source code is governed by a BSD-style license that can be found in // the LICENSE file. 
-package rangekey +package rangekeystack import ( "bytes" - "math" - "sort" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/rangekey" ) // UserIteratorConfig holds state for constructing the range key iterator stack // for user iteration. The range key iterator must merge range key spans across -// the levels of the LSM. This merging is performed by a keyspan.MergingIter +// the levels of the LSM. This merging is performed by a keyspanimpl.MergingIter // on-the-fly. The UserIteratorConfig implements keyspan.Transformer, evaluating // range-key semantics and shadowing, so the spans returned by a MergingIter are // fully resolved. @@ -52,12 +52,12 @@ import ( // │ // ╰── .Next type UserIteratorConfig struct { - snapshot uint64 + snapshot base.SeqNum comparer *base.Comparer - miter keyspan.MergingIter + miter keyspanimpl.MergingIter biter keyspan.BoundedIter diter keyspan.DefragmentingIter - liters [manifest.NumLevels]keyspan.LevelIter + liters [manifest.NumLevels]keyspanimpl.LevelIter litersUsed int internalKeys bool bufs *Buffers @@ -66,9 +66,9 @@ type UserIteratorConfig struct { // Buffers holds various buffers used for range key iteration. They're exposed // so that they may be pooled and reused between iterators. type Buffers struct { - merging keyspan.MergingBuffers + merging keyspanimpl.MergingBuffers defragmenting keyspan.DefragmentingBuffers - sortBuf keyspan.KeysBySuffix + sortBuf []keyspan.Key } // PrepareForReuse discards any excessively large buffers. 
@@ -88,7 +88,7 @@ func (bufs *Buffers) PrepareForReuse() { // keys not visible at the provided snapshot are ignored. func (ui *UserIteratorConfig) Init( comparer *base.Comparer, - snapshot uint64, + snapshot base.SeqNum, lower, upper []byte, hasPrefix *bool, prefix *[]byte, @@ -99,7 +99,7 @@ func (ui *UserIteratorConfig) Init( ui.snapshot = snapshot ui.comparer = comparer ui.internalKeys = internalKeys - ui.miter.Init(comparer.Compare, ui, &bufs.merging, iters...) + ui.miter.Init(comparer, ui, &bufs.merging, iters...) ui.biter.Init(comparer.Compare, comparer.Split, &ui.miter, lower, upper, hasPrefix, prefix) if internalKeys { ui.diter.Init(comparer, &ui.biter, keyspan.DefragmentInternal, keyspan.StaticDefragmentReducer, &bufs.defragmenting) @@ -118,11 +118,11 @@ func (ui *UserIteratorConfig) AddLevel(iter keyspan.FragmentIterator) { } // NewLevelIter returns a pointer to a newly allocated or reused -// keyspan.LevelIter. The caller is responsible for calling Init() on this +// keyspanimpl.LevelIter. The caller is responsible for calling Init() on this // instance. -func (ui *UserIteratorConfig) NewLevelIter() *keyspan.LevelIter { +func (ui *UserIteratorConfig) NewLevelIter() *keyspanimpl.LevelIter { if ui.litersUsed >= len(ui.liters) { - return &keyspan.LevelIter{} + return &keyspanimpl.LevelIter{} } ui.litersUsed++ return &ui.liters[ui.litersUsed-1] @@ -136,45 +136,41 @@ func (ui *UserIteratorConfig) SetBounds(lower, upper []byte) { } // Transform implements the keyspan.Transformer interface for use with a -// keyspan.MergingIter. It transforms spans by resolving range keys at the +// keyspanimpl.MergingIter. It transforms spans by resolving range keys at the // provided snapshot sequence number. Shadowing of keys is resolved (eg, removal // of unset keys, removal of keys overwritten by a set at the same suffix, etc) // and then non-RangeKeySet keys are removed. 
The resulting transformed spans // only contain RangeKeySets describing the state visible at the provided // sequence number, and hold their Keys sorted by Suffix (except if internalKeys // is true, then keys remain sorted by trailer. -func (ui *UserIteratorConfig) Transform(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error { +func (ui *UserIteratorConfig) Transform( + suffixCmp base.CompareRangeSuffixes, s keyspan.Span, dst *keyspan.Span, +) error { // Apply shadowing of keys. dst.Start = s.Start dst.End = s.End - ui.bufs.sortBuf = keyspan.KeysBySuffix{ - Cmp: cmp, - Keys: ui.bufs.sortBuf.Keys[:0], - } - if err := coalesce(ui.comparer.Equal, &ui.bufs.sortBuf, ui.snapshot, s.Keys); err != nil { - return err - } + ui.bufs.sortBuf = rangekey.CoalesceInto(suffixCmp, ui.bufs.sortBuf[:0], ui.snapshot, s.Keys) if ui.internalKeys { if s.KeysOrder != keyspan.ByTrailerDesc { panic("unexpected key ordering in UserIteratorTransform with internalKeys = true") } - dst.Keys = ui.bufs.sortBuf.Keys - keyspan.SortKeysByTrailer(&dst.Keys) + dst.Keys = ui.bufs.sortBuf + keyspan.SortKeysByTrailer(dst.Keys) return nil } // During user iteration over range keys, unsets and deletes don't matter. This // step helps logical defragmentation during iteration. 
- keys := ui.bufs.sortBuf.Keys + keys := ui.bufs.sortBuf dst.Keys = dst.Keys[:0] for i := range keys { switch keys[i].Kind() { case base.InternalKeyKindRangeKeySet: - if invariants.Enabled && len(dst.Keys) > 0 && cmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 { + if invariants.Enabled && len(dst.Keys) > 0 && suffixCmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 { panic("pebble: keys unexpectedly not in ascending suffix order") } dst.Keys = append(dst.Keys, keys[i]) case base.InternalKeyKindRangeKeyUnset: - if invariants.Enabled && len(dst.Keys) > 0 && cmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 { + if invariants.Enabled && len(dst.Keys) > 0 && suffixCmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 { panic("pebble: keys unexpectedly not in ascending suffix order") } // Skip. @@ -199,7 +195,9 @@ func (ui *UserIteratorConfig) Transform(cmp base.Compare, s keyspan.Span, dst *k // defragmenter checks for equality between set suffixes and values (ignoring // sequence numbers). It's intended for use during user iteration, when the // wrapped keyspan iterator is merging spans across all levels of the LSM. -func (ui *UserIteratorConfig) ShouldDefragment(equal base.Equal, a, b *keyspan.Span) bool { +func (ui *UserIteratorConfig) ShouldDefragment( + suffixCmp base.CompareRangeSuffixes, a, b *keyspan.Span, +) bool { // This method is not called with internalKeys = true. 
if ui.internalKeys { panic("unexpected call to ShouldDefragment with internalKeys = true") @@ -223,12 +221,12 @@ func (ui *UserIteratorConfig) ShouldDefragment(equal base.Equal, a, b *keyspan.S b.Keys[i].Kind() != base.InternalKeyKindRangeKeySet { panic("pebble: unexpected non-RangeKeySet during defragmentation") } - if i > 0 && (ui.comparer.Compare(a.Keys[i].Suffix, a.Keys[i-1].Suffix) < 0 || - ui.comparer.Compare(b.Keys[i].Suffix, b.Keys[i-1].Suffix) < 0) { + if i > 0 && (suffixCmp(a.Keys[i].Suffix, a.Keys[i-1].Suffix) < 0 || + suffixCmp(b.Keys[i].Suffix, b.Keys[i-1].Suffix) < 0) { panic("pebble: range keys not ordered by suffix during defragmentation") } } - if !equal(a.Keys[i].Suffix, b.Keys[i].Suffix) { + if suffixCmp(a.Keys[i].Suffix, b.Keys[i].Suffix) != 0 { ret = false break } @@ -239,142 +237,3 @@ func (ui *UserIteratorConfig) ShouldDefragment(equal base.Equal, a, b *keyspan.S } return ret } - -// Coalesce imposes range key semantics and coalesces range keys with the same -// bounds. Coalesce drops any keys shadowed by more recent sets, unsets or -// deletes. Coalesce modifies the provided span's Keys slice, reslicing the -// slice to remove dropped keys. -// -// Coalescence has subtle behavior with respect to sequence numbers. Coalesce -// depends on a keyspan.Span's Keys being sorted in sequence number descending -// order. The first key has the largest sequence number. The returned coalesced -// span includes only the largest sequence number. All other sequence numbers -// are forgotten. When a compaction constructs output range keys from a -// coalesced span, it produces at most one RANGEKEYSET, one RANGEKEYUNSET and -// one RANGEKEYDEL. Each one of these keys adopt the largest sequence number. -// -// This has the potentially surprising effect of 'promoting' a key to a higher -// sequence number. 
This is okay, because: -// - There are no other overlapping keys within the coalesced span of -// sequence numbers (otherwise they would be in the compaction, due to -// the LSM invariant). -// - Range key sequence numbers are never compared to point key sequence -// numbers. Range keys and point keys have parallel existences. -// - Compactions only coalesce within snapshot stripes. -// -// Additionally, internal range keys at the same sequence number have subtle -// mechanics: -// - RANGEKEYSETs shadow RANGEKEYUNSETs of the same suffix. -// - RANGEKEYDELs only apply to keys at lower sequence numbers. -// -// This is required for ingestion. Ingested sstables are assigned a single -// sequence number for the file, at which all of the file's keys are visible. -// The RANGEKEYSET, RANGEKEYUNSET and RANGEKEYDEL key kinds are ordered such -// that among keys with equal sequence numbers (thus ordered by their kinds) the -// keys do not affect one another. Ingested sstables are expected to be -// consistent with respect to the set/unset suffixes: A given suffix should be -// set or unset but not both. -// -// The resulting dst Keys slice is sorted by Trailer. -func Coalesce(cmp base.Compare, eq base.Equal, keys []keyspan.Key, dst *[]keyspan.Key) error { - // TODO(jackson): Currently, Coalesce doesn't actually perform the sequence - // number promotion described in the comment above. - keysBySuffix := keyspan.KeysBySuffix{ - Cmp: cmp, - Keys: (*dst)[:0], - } - if err := coalesce(eq, &keysBySuffix, math.MaxUint64, keys); err != nil { - return err - } - // Update the span with the (potentially reduced) keys slice. coalesce left - // the keys in *dst sorted by suffix. Re-sort them by trailer. - *dst = keysBySuffix.Keys - keyspan.SortKeysByTrailer(dst) - return nil -} - -func coalesce( - equal base.Equal, keysBySuffix *keyspan.KeysBySuffix, snapshot uint64, keys []keyspan.Key, -) error { - // First, enforce visibility and RangeKeyDelete mechanics. 
We only need to - // consider the prefix of keys before and including the first - // RangeKeyDelete. We also must skip any keys that aren't visible at the - // provided snapshot sequence number. - // - // NB: Within a given sequence number, keys are ordered as: - // RangeKeySet > RangeKeyUnset > RangeKeyDelete - // This is significant, because this ensures that a Set or Unset sharing a - // sequence number with a Delete do not shadow each other. - deleteIdx := -1 - for i := range keys { - if invariants.Enabled && i > 0 && keys[i].Trailer > keys[i-1].Trailer { - panic("pebble: invariant violation: span keys unordered") - } - if !keys[i].VisibleAt(snapshot) { - continue - } - // Once a RangeKeyDelete is observed, we know it shadows all subsequent - // keys and we can break early. We don't add the RangeKeyDelete key to - // keysBySuffix.keys yet, because we don't want a suffix-less key - // that appeared earlier in the slice to elide it. It'll be added back - // in at the end. - if keys[i].Kind() == base.InternalKeyKindRangeKeyDelete { - deleteIdx = i - break - } - keysBySuffix.Keys = append(keysBySuffix.Keys, keys[i]) - } - - // Sort the accumulated keys by suffix. There may be duplicates within a - // suffix, in which case the one with a larger trailer survives. - // - // We use a stable sort so that the first key with a given suffix is the one - // that with the highest Trailer (because the input `keys` was sorted by - // trailer descending). - sort.Stable(keysBySuffix) - - // Grab a handle of the full sorted slice, before reslicing - // keysBySuffix.keys to accumulate the final coalesced keys. - sorted := keysBySuffix.Keys - keysBySuffix.Keys = keysBySuffix.Keys[:0] - - var ( - // prevSuffix is updated on each iteration of the below loop, and - // compared by the subsequent iteration to determine whether adjacent - // keys are defined at the same suffix. - prevSuffix []byte - // shadowing is set to true once any Key is shadowed by another key. 
- // When it's set to true—or after the loop if no keys are shadowed—the - // keysBySuffix.keys slice is resliced to contain the prefix of - // unshadowed keys. This avoids copying them incrementally in the common - // case of no shadowing. - shadowing bool - ) - for i := range sorted { - if i > 0 && equal(prevSuffix, sorted[i].Suffix) { - // Skip; this key is shadowed by the predecessor that had a larger - // Trailer. If this is the first shadowed key, set shadowing=true - // and reslice keysBySuffix.keys to hold the entire unshadowed - // prefix. - if !shadowing { - keysBySuffix.Keys = keysBySuffix.Keys[:i] - shadowing = true - } - continue - } - prevSuffix = sorted[i].Suffix - if shadowing { - keysBySuffix.Keys = append(keysBySuffix.Keys, sorted[i]) - } - } - // If there was no shadowing, keysBySuffix.keys is untouched. We can simply - // set it to the existing `sorted` slice (also backed by keysBySuffix.keys). - if !shadowing { - keysBySuffix.Keys = sorted - } - // If the original input `keys` slice contained a RangeKeyDelete, add it. 
- if deleteIdx >= 0 { - keysBySuffix.Keys = append(keysBySuffix.Keys, keys[deleteIdx]) - } - return nil -} diff --git a/vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc.go b/vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc.go similarity index 100% rename from vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc.go diff --git a/vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_32bit.go b/vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_32bit.go similarity index 94% rename from vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_32bit.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_32bit.go index 3112cc9..1046d39 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_32bit.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_32bit.go @@ -13,7 +13,6 @@ // permissions and limitations under the License. //go:build 386 || amd64p32 || arm || armbe || ppc || sparc -// +build 386 amd64p32 arm armbe ppc sparc package rawalloc diff --git a/vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_64bit.go b/vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_64bit.go similarity index 89% rename from vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_64bit.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_64bit.go index 6660462..f07420d 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_64bit.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_64bit.go @@ -13,7 +13,6 @@ // permissions and limitations under the License. 
//go:build amd64 || arm64 || arm64be || ppc64 || ppc64le || mips64 || mips64le || s390x || sparc64 || riscv64 || loong64 -// +build amd64 arm64 arm64be ppc64 ppc64le mips64 mips64le s390x sparc64 riscv64 loong64 package rawalloc diff --git a/vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_gccgo.go b/vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_gccgo.go similarity index 97% rename from vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_gccgo.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_gccgo.go index f2db79c..716fe6c 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_gccgo.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_gccgo.go @@ -13,7 +13,6 @@ // permissions and limitations under the License. //go:build gccgo -// +build gccgo package rawalloc diff --git a/vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_go1.9.go b/vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_go1.9.go similarity index 98% rename from vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_go1.9.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_go1.9.go index 65da436..055f671 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_go1.9.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_go1.9.go @@ -13,7 +13,6 @@ // permissions and limitations under the License. 
//go:build gc && go1.9 -// +build gc,go1.9 package rawalloc diff --git a/vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_mips.go b/vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_mipsall.go similarity index 94% rename from vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_mips.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_mipsall.go index 55b45eb..badb036 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/rawalloc/rawalloc_mips.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/rawalloc/rawalloc_mipsall.go @@ -13,7 +13,6 @@ // permissions and limitations under the License. //go:build mips || mipsle || mips64p32 || mips64p32le -// +build mips mipsle mips64p32 mips64p32le package rawalloc diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/sstableinternal/options.go b/vendor/github.com/cockroachdb/pebble/v2/internal/sstableinternal/options.go new file mode 100644 index 0000000..5d8018d --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/sstableinternal/options.go @@ -0,0 +1,36 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package sstableinternal + +import ( + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/cache" +) + +// CacheOptions contains the information needed to interact with the block +// cache. +type CacheOptions struct { + // CacheHandle can be nil, in which case no cache is used. When non-nil, the + // other fields must be set accordingly. + CacheHandle *cache.Handle + FileNum base.DiskFileNum +} + +// ReaderOptions are fields of sstable.ReaderOptions that can only be set from +// within the pebble package. 
+type ReaderOptions struct { + CacheOpts CacheOptions +} + +// WriterOptions are fields of sstable.WriterOptions that can only be set from +// within the pebble package. +type WriterOptions struct { + CacheOpts CacheOptions + + // DisableKeyOrderChecks disables the checks that keys are added to an sstable + // in order. It is intended for use only in the construction of invalid + // sstables for testing. See tool/make_test_sstables.go. + DisableKeyOrderChecks bool +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/strparse/strparse.go b/vendor/github.com/cockroachdb/pebble/v2/internal/strparse/strparse.go new file mode 100644 index 0000000..9da41e2 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/strparse/strparse.go @@ -0,0 +1,186 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package strparse provides facilities for parsing strings, intended for use in +// tests and debug input. +package strparse + +import ( + "fmt" + "regexp" + "strconv" + "strings" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" +) + +// Parser is a helper used to implement parsing of strings, like +// manifest.ParseFileMetadataDebug. +// +// It takes a string and splits it into tokens. Tokens are separated by +// whitespace; in addition user-specified separators are also always separate +// tokens. For example, when passed the separators `:-[]();` the string +// `000001:[a - b]` results in tokens `000001`, `:`, `[`, `a`, `-`, `b`, `]`, . +// +// All Parser methods throw panics instead of returning errors. The code +// that uses a Parser can recover them and convert them to errors.
+type Parser struct { + original string + tokens []string + lastToken string +} + +// MakeParser constructs a new Parser that converts any instance of the runes +// contained in [separators] into separate tokens, and consumes the provided +// input string. +func MakeParser(separators string, input string) Parser { + p := Parser{ + original: input, + } + for _, f := range strings.Fields(input) { + for f != "" { + pos := strings.IndexAny(f, separators) + if pos == -1 { + p.tokens = append(p.tokens, f) + break + } + if pos > 0 { + p.tokens = append(p.tokens, f[:pos]) + } + p.tokens = append(p.tokens, f[pos:pos+1]) + f = f[pos+1:] + } + } + return p +} + +// Done returns true if there are no more tokens. +func (p *Parser) Done() bool { + return len(p.tokens) == 0 +} + +// Peek returns the next token, without consuming the token. Returns "" if there +// are no more tokens. +func (p *Parser) Peek() string { + if p.Done() { + p.lastToken = "" + return "" + } + p.lastToken = p.tokens[0] + return p.tokens[0] +} + +// Next returns the next token, or "" if there are no more tokens. +func (p *Parser) Next() string { + res := p.Peek() + if res != "" { + p.tokens = p.tokens[1:] + } + return res +} + +// Remaining returns all the remaining tokens, separated by spaces. +func (p *Parser) Remaining() string { + res := strings.Join(p.tokens, " ") + p.tokens = nil + return res +} + +// Expect consumes the next tokens, verifying that they exactly match the +// arguments. +func (p *Parser) Expect(tokens ...string) { + for _, tok := range tokens { + if res := p.Next(); res != tok { + p.Errf("expected %q, got %q", tok, res) + } + } +} + +// TryLevel tries to parse a token as a level (e.g. L1, L0.2). If successful, +// the token is consumed. +func (p *Parser) TryLevel() (level int, ok bool) { + t := p.Peek() + if regexp.MustCompile(`^L[0-9](|\.[0-9]+)$`).MatchString(t) { + p.Next() + return int(t[1] - '0'), true + } + return 0, false +} + +// Level parses the next token as a level. 
+func (p *Parser) Level() int { + level, ok := p.TryLevel() + if !ok { + p.Errf("cannot parse level") + } + return level +} + +// Int parses the next token as an integer. +func (p *Parser) Int() int { + x, err := strconv.Atoi(p.Next()) + if err != nil { + p.Errf("cannot parse number: %v", err) + } + return x +} + +// Uint64 parses the next token as an uint64. +func (p *Parser) Uint64() uint64 { + x, err := strconv.ParseUint(p.Next(), 10, 64) + if err != nil { + p.Errf("cannot parse number: %v", err) + } + return x +} + +// Uint32 parses the next token as an uint32. +func (p *Parser) Uint32() uint32 { + x, err := strconv.ParseUint(p.Next(), 10, 32) + if err != nil { + p.Errf("cannot parse number: %v", err) + } + return uint32(x) +} + +// SeqNum parses the next token as a sequence number. +func (p *Parser) SeqNum() base.SeqNum { + return base.ParseSeqNum(p.Next()) +} + +// BlobFileID parses the next token as a BlobFileID. +func (p *Parser) BlobFileID() base.BlobFileID { + s := p.Next() + if !strings.HasPrefix(s, "B") { + p.Errf("expected blob file ID, got %q", s) + } + v, err := strconv.ParseUint(s[1:], 10, 64) + if err != nil { + p.Errf("cannot parse blob file ID: %v", err) + } + return base.BlobFileID(v) +} + +// FileNum parses the next token as a FileNum. +func (p *Parser) FileNum() base.FileNum { + return base.FileNum(p.Int()) +} + +// DiskFileNum parses the next token as a DiskFileNum. +func (p *Parser) DiskFileNum() base.DiskFileNum { + return base.DiskFileNum(p.Int()) +} + +// InternalKey parses the next token as an internal key. +func (p *Parser) InternalKey() base.InternalKey { + return base.ParseInternalKey(p.Next()) +} + +// Errf panics with an error which includes the original string and the last +// token. +func (p *Parser) Errf(format string, args ...any) { + msg := fmt.Sprintf(format, args...)
+ panic(errors.Errorf("error parsing %q at token %q: %s", p.original, p.lastToken, msg)) +} diff --git a/vendor/github.com/cockroachdb/pebble/internal/testkeys/strconv.go b/vendor/github.com/cockroachdb/pebble/v2/internal/testkeys/strconv.go similarity index 100% rename from vendor/github.com/cockroachdb/pebble/internal/testkeys/strconv.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/testkeys/strconv.go diff --git a/vendor/github.com/cockroachdb/pebble/internal/testkeys/testkeys.go b/vendor/github.com/cockroachdb/pebble/v2/internal/testkeys/testkeys.go similarity index 68% rename from vendor/github.com/cockroachdb/pebble/internal/testkeys/testkeys.go rename to vendor/github.com/cockroachdb/pebble/v2/internal/testkeys/testkeys.go index fa7b10c..c966b1a 100644 --- a/vendor/github.com/cockroachdb/pebble/internal/testkeys/testkeys.go +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/testkeys/testkeys.go @@ -14,14 +14,16 @@ package testkeys import ( "bytes" + "cmp" "fmt" "math" + "math/rand/v2" + "regexp" "strconv" "strings" - "github.com/cockroachdb/pebble/internal/base" - "golang.org/x/exp/constraints" - "golang.org/x/exp/rand" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" ) const alpha = "abcdefghijklmnopqrstuvwxyz" @@ -30,6 +32,13 @@ const suffixDelim = '@' var inverseAlphabet = make(map[byte]int64, len(alpha)) +var prefixRE = regexp.MustCompile("[" + alpha + "]+") + +// ignoreTimestampSuffix is a suffix that is ignored when comparing keys, but +// not when comparing suffixes. It simulates the CRDB synthetic bit situation +// (see https://github.com/cockroachdb/cockroach/issues/130533). +var ignoreTimestampSuffix = []byte("_synthetic") + func init() { for i := range alpha { inverseAlphabet[alpha[i]] = int64(i) @@ -41,8 +50,10 @@ var MaxSuffixLen = 1 + len(fmt.Sprintf("%d", int64(math.MaxInt64))) // Comparer is the comparer for test keys generated by this package. 
var Comparer = &base.Comparer{ - Compare: compare, - Equal: func(a, b []byte) bool { return compare(a, b) == 0 }, + ComparePointSuffixes: compareSuffixes, + CompareRangeSuffixes: compareSuffixes, + Compare: compare, + Equal: func(a, b []byte) bool { return compare(a, b) == 0 }, AbbreviatedKey: func(k []byte) uint64 { return base.DefaultComparer.AbbreviatedKey(k[:split(k)]) }, @@ -96,23 +107,32 @@ var Comparer = &base.Comparer{ return append(append(dst, a...), 0x00) }, Split: split, - Name: "pebble.internal.testkeys", + ValidateKey: func(k []byte) error { + // Ensure that if the key has a suffix, it's a valid integer + // (potentially modulo a faux synthetic bit suffix). + k = bytes.TrimSuffix(k, ignoreTimestampSuffix) + i := split(k) + if i == len(k) { + return nil + } + if _, err := parseUintBytes(k[i+1:], 10, 64); err != nil { + return errors.Wrapf(err, "invalid key %q", k) + } + return nil + }, + Name: "pebble.internal.testkeys", } +// The comparator is similar to the one in Cockroach; when the prefixes are +// equal: +// - a key without a suffix is smaller than one with a suffix; +// - when both keys have a suffix, the key with the larger (decoded) suffix +// value is smaller. func compare(a, b []byte) int { ai, bi := split(a), split(b) if v := bytes.Compare(a[:ai], b[:bi]); v != 0 { return v } - - if len(a[ai:]) == 0 { - if len(b[bi:]) == 0 { - return 0 - } - return -1 - } else if len(b[bi:]) == 0 { - return +1 - } return compareTimestamps(a[ai:], b[bi:]) } @@ -125,22 +145,39 @@ func split(a []byte) int { } func compareTimestamps(a, b []byte) int { - ai, err := parseUintBytes(bytes.TrimPrefix(a, []byte{suffixDelim}), 10, 64) + a = bytes.TrimSuffix(a, ignoreTimestampSuffix) + b = bytes.TrimSuffix(b, ignoreTimestampSuffix) + if len(a) == 0 || len(b) == 0 { + // The empty suffix sorts first. 
+ return cmp.Compare(len(a), len(b)) + } + if a[0] != suffixDelim || b[0] != suffixDelim { + panic(fmt.Sprintf("invalid suffixes %q %q", a, b)) + } + ai, err := parseUintBytes(a[1:], 10, 64) if err != nil { panic(fmt.Sprintf("invalid test mvcc timestamp %q", a)) } - bi, err := parseUintBytes(bytes.TrimPrefix(b, []byte{suffixDelim}), 10, 64) + bi, err := parseUintBytes(b[1:], 10, 64) if err != nil { panic(fmt.Sprintf("invalid test mvcc timestamp %q", b)) } - switch { - case ai < bi: - return +1 - case ai > bi: - return -1 - default: - return 0 + return cmp.Compare(bi, ai) +} + +func compareSuffixes(a, b []byte) int { + cmp := compareTimestamps(a, b) + if cmp == 0 { + aHasIgnorableSuffix := bytes.HasSuffix(a, ignoreTimestampSuffix) + bHasIgnorableSuffix := bytes.HasSuffix(b, ignoreTimestampSuffix) + if aHasIgnorableSuffix && !bHasIgnorableSuffix { + return 1 + } + if !aHasIgnorableSuffix && bHasIgnorableSuffix { + return -1 + } } + return cmp } // Keyspace describes a finite keyspace of unsuffixed test keys. @@ -220,6 +257,7 @@ func SuffixLen(t int64) int { // ParseSuffix returns the integer representation of the encoded suffix. func ParseSuffix(s []byte) (int64, error) { + s = bytes.TrimSuffix(s, ignoreTimestampSuffix) return strconv.ParseInt(strings.TrimPrefix(string(s), string(suffixDelim)), 10, 64) } @@ -288,21 +326,22 @@ func (a alphabet) EveryN(n int64) Keyspace { } func keyCount(n, l int) int64 { - if n == 0 { - return 0 - } else if n == 1 { - return int64(l) - } // The number of representable keys in the keyspace is a function of the - // length of the alphabet n and the max key length l. Consider how the - // number of representable keys grows as l increases: - // - // l = 1: n - // l = 2: n + n^2 - // l = 3: n + n^2 + n^3 - // ... - // Σ i=(1...l) n^i = n*(n^l - 1)/(n-1) - return (int64(n) * (int64(math.Pow(float64(n), float64(l))) - 1)) / int64(n-1) + // length of the alphabet n and the max key length l: + // n + n^2 + ... 
+ n^l + x := int64(1) + res := int64(0) + for i := 1; i <= l; i++ { + if x >= math.MaxInt64/int64(n) { + panic("overflow") + } + x *= int64(n) + res += x + if res < 0 { + panic("overflow") + } + } + return res } func (a alphabet) key(buf []byte, idx int64) int { @@ -384,135 +423,71 @@ func computeAlphabetKeyIndex(key []byte, alphabet map[byte]int64, n int) int64 { return ret } -func abs(a int64) int64 { - if a < 0 { - return -a +// RandomPrefixInRange returns a random prefix in the range [a, b), where a and +// b are prefixes. +func RandomPrefixInRange(a, b []byte, rng *rand.Rand) []byte { + assertValidPrefix(a) + assertValidPrefix(b) + assertLess(a, b) + commonPrefix := 0 + for commonPrefix < len(a)-1 && commonPrefix < len(b)-1 && a[commonPrefix] == b[commonPrefix] { + commonPrefix++ } - return a -} -// RandomSeparator returns a random alphabetic key k such that a < k < b, -// pulling randomness from the provided random number generator. If dst is -// provided and the generated key fits within dst's capacity, the returned slice -// will use dst's memory. -// -// If a prefix P exists such that Prefix(a) < P < Prefix(b), the generated key -// will consist of the prefix P appended with the provided suffix. A zero suffix -// generates an unsuffixed key. If no such prefix P exists, RandomSeparator will -// try to find a key k with either Prefix(a) or Prefix(b) such that a < k < b, -// but the generated key will not use the provided suffix. Note that it's -// possible that no separator key exists (eg, a='a@2', b='a@1'), in which case -// RandomSeparator returns nil. -// -// If RandomSeparator generates a new prefix, the generated prefix will have -// length at most MAX(maxLength, len(Prefix(a)), len(Prefix(b))). -// -// RandomSeparator panics if a or b fails to decode. 
-func RandomSeparator(dst, a, b []byte, suffix int64, maxLength int, rng *rand.Rand) []byte { - if Comparer.Compare(a, b) >= 0 { - return nil - } + // We will generate a piece of a key from the Alpha(maxLength) keyspace. Note + // that maxLength cannot be higher than ~13 or we will encounter overflows. + maxLength := 4 + rng.IntN(8) - // Determine both keys' logical prefixes and suffixes. - ai := Comparer.Split(a) - bi := Comparer.Split(b) - ap := a[:ai] - bp := b[:bi] - maxLength = max[int](maxLength, max[int](len(ap), len(bp))) - var as, bs int64 - var err error - if ai != len(a) { - as, err = ParseSuffix(a[ai:]) - if err != nil { - panic(fmt.Sprintf("failed to parse suffix of %q", a)) - } + // Skip any common prefix (but leave at least one character in each key). + skipPrefix := 0 + for skipPrefix+1 < min(len(a), len(b)) && a[skipPrefix] == b[skipPrefix] { + skipPrefix++ } - if bi != len(b) { - bs, err = ParseSuffix(b[bi:]) - if err != nil { - panic(fmt.Sprintf("failed to parse suffix of %q", b)) - } + aPiece := a[skipPrefix:] + bPiece := b[skipPrefix:] + if len(aPiece) > maxLength { + // The trimmed prefix is smaller than a; we must be careful below to not + // return a key smaller than a. + aPiece = aPiece[:maxLength] } - - apIdx := computeAlphabetKeyIndex(ap, inverseAlphabet, maxLength) - bpIdx := computeAlphabetKeyIndex(bp, inverseAlphabet, maxLength) - diff := bpIdx - apIdx - generatedIdx := bpIdx - if diff > 0 { - var add int64 = diff + 1 - var start int64 = apIdx - if as == 1 { - // There's no expressible key with prefix a greater than a@1. So, - // exclude ap. - start = apIdx + 1 - add = diff - } - if bs == 0 { - // No key with prefix b can sort before b@0. We don't want to pick b. - add-- - } - // We're allowing generated id to be in the range [start, start + add - 1]. - if start > start+add-1 { - return nil - } - // If we can generate a key which is actually in the middle of apIdx - // and bpIdx use it so that we don't have to bother about timestamps. 
- generatedIdx = rng.Int63n(add) + start - for diff > 1 && generatedIdx == apIdx || generatedIdx == bpIdx { - generatedIdx = rng.Int63n(add) + start - } + if len(bPiece) > maxLength { + // The trimmed prefix is smaller than b, so we will still respect the bound. + bPiece = bPiece[:maxLength] } - - switch { - case generatedIdx == apIdx && generatedIdx == bpIdx: - if abs(bs-as) <= 1 { - // There's no expressible suffix between the two, and there's no - // possible separator key. - return nil - } - // The key b is >= key a, but has the same prefix, so b must have the - // smaller timestamp, unless a has timestamp of 0. - // - // NB: The zero suffix (suffix-less) sorts before all other suffixes, so - // any suffix we generate will be greater than it. - if as == 0 { - // bs > as - suffix = bs + rng.Int63n(10) + 1 - } else { - // bs < as. - // Generate suffix in range [bs + 1, as - 1] - suffix = bs + 1 + rng.Int63n(as-bs-1) - } - case generatedIdx == apIdx: - // NB: The zero suffix (suffix-less) sorts before all other suffixes, so - // any suffix we generate will be greater than it. - if as == 0 && suffix == 0 { - suffix++ - } else if as != 0 && suffix >= as { - suffix = rng.Int63n(as) - } - case generatedIdx == bpIdx: - if suffix <= bs { - suffix = bs + rng.Int63n(10) + 1 - } + assertLess(aPiece, bPiece) + apIdx := computeAlphabetKeyIndex(aPiece, inverseAlphabet, maxLength) + bpIdx := computeAlphabetKeyIndex(bPiece, inverseAlphabet, maxLength) + if bpIdx <= apIdx { + panic("unreachable") } - if sz := maxLength + SuffixLen(suffix); cap(dst) < sz { - dst = make([]byte, sz) - } else { - dst = dst[:cap(dst)] + generatedIdx := apIdx + rng.Int64N(bpIdx-apIdx) + if generatedIdx == apIdx { + // Return key a. We handle this separately in case we trimmed aPiece above. + return append([]byte(nil), a...) 
} - var w int - if suffix == 0 { - w = WriteKey(dst, Alpha(maxLength), generatedIdx) - } else { - w = WriteKeyAt(dst, Alpha(maxLength), generatedIdx, suffix) + dst := make([]byte, skipPrefix+maxLength) + copy(dst, a[:skipPrefix]) + pieceLen := WriteKey(dst[skipPrefix:], Alpha(maxLength), generatedIdx) + dst = dst[:skipPrefix+pieceLen] + assertLE(a, dst) + assertLess(dst, b) + return dst +} + +func assertValidPrefix(p []byte) { + if !prefixRE.Match(p) { + panic(fmt.Sprintf("invalid prefix %q", p)) + } +} + +func assertLess(a, b []byte) { + if Comparer.Compare(a, b) >= 0 { + panic(fmt.Sprintf("invalid key ordering: %q >= %q", a, b)) } - return dst[:w] } -func max[I constraints.Ordered](a, b I) I { - if b > a { - return b +func assertLE(a, b []byte) { + if Comparer.Compare(a, b) > 0 { + panic(fmt.Sprintf("invalid key ordering: %q > %q", a, b)) } - return a } diff --git a/vendor/github.com/cockroachdb/pebble/v2/internal/treeprinter/tree_printer.go b/vendor/github.com/cockroachdb/pebble/v2/internal/treeprinter/tree_printer.go new file mode 100644 index 0000000..128b784 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/internal/treeprinter/tree_printer.go @@ -0,0 +1,344 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package treeprinter + +import ( + "bytes" + "fmt" + "strings" +) + +var ( + edgeLinkChr = rune('│') + edgeMidChr = rune('├') + edgeLastChr = rune('└') + horLineChr = rune('─') + bulletChr = rune('•') +) + +// Node is a handle associated with a specific depth in a tree. See below for +// sample usage. +type Node struct { + tree *tree + level int +} + +// New creates a tree printer and returns a sentinel node reference which +// should be used to add the root. 
Sample usage: +// +// tp := New() +// root := tp.Child("root") +// root.Child("child-1") +// root.Child("child-2").Child("grandchild\ngrandchild-more-info") +// root.Child("child-3") +// +// fmt.Print(tp.String()) +// +// Output: +// +// root +// ├── child-1 +// ├── child-2 +// │ └── grandchild +// │ grandchild-more-info +// └── child-3 +// +// Note that the Child calls can't be rearranged arbitrarily; they have +// to be in the order they need to be displayed (depth-first pre-order). +func New() Node { + return NewWithStyle(DefaultStyle) +} + +// NewWithStyle creates a tree printer like New, permitting customization of +// the style of the resulting tree. +func NewWithStyle(style Style) Node { + t := &tree{style: style} + + switch style { + case CompactStyle: + t.edgeLink = []rune{edgeLinkChr} + t.edgeMid = []rune{edgeMidChr, ' '} + t.edgeLast = []rune{edgeLastChr, ' '} + + case BulletStyle: + t.edgeLink = []rune{edgeLinkChr} + t.edgeMid = []rune{edgeMidChr, horLineChr, horLineChr, ' '} + t.edgeLast = []rune{edgeLastChr, horLineChr, horLineChr, ' '} + + default: + t.edgeLink = []rune{' ', edgeLinkChr} + t.edgeMid = []rune{' ', edgeMidChr, horLineChr, horLineChr, ' '} + t.edgeLast = []rune{' ', edgeLastChr, horLineChr, horLineChr, ' '} + } + + return Node{ + tree: t, + level: 0, + } +} + +// Style is one of the predefined treeprinter styles. +type Style int + +const ( + // DefaultStyle is the default style. Example: + // + // foo + // ├── bar1 + // │ bar2 + // │ └── baz + // └── qux + // + DefaultStyle Style = iota + + // CompactStyle is a compact style, for deeper trees. Example: + // + // foo + // ├ bar1 + // │ bar2 + // │ └ baz + // └ qux + // + CompactStyle + + // BulletStyle is a style that shows a bullet for each node, and groups any + // other lines under that bullet. Example: + // + // • foo + // │ + // ├── • bar1 + // │ │ bar2 + // │ │ + // │ └── • baz + // │ + // └── • qux + // + BulletStyle +) + +// tree implements the tree printing machinery. 
+// +// All Nodes hold a reference to the tree and Node calls result in modification +// of the tree. At any point in time, tree.rows contains the formatted tree that +// was described by the Node calls performed so far. +// +// When new nodes are added, some of the characters of the previous formatted +// tree need to be updated. Here is an example stepping through the state: +// +// API call Rows +// +// +// tp := New() +// +// +// root := tp.Child("root") root +// +// +// root.Child("child-1") root +// └── child-1 +// +// +// c2 := root.Child("child-2") root +// ├── child-1 +// └── child-2 +// +// Note: here we had to go back up and change └─ into ├─ for child-1. +// +// +// c2.Child("grandchild") root +// ├── child-1 +// └── child-2 +// └── grandchild +// +// +// root.Child("child-3" root +// ├── child-1 +// ├── child-2 +// │ └── grandchild +// └── child-3 +// +// Note: here we had to go back up and change └─ into ├─ for child-2, and +// add a │ on the grandchild row. In general, we may need to add an +// arbitrary number of vertical bars. +// +// In order to perform these character changes, we maintain information about +// the nodes on the bottom-most path. +type tree struct { + style Style + + // rows maintains the rows accumulated so far, as rune arrays. + rows [][]rune + + // stack contains information pertaining to the nodes on the bottom-most path + // of the tree. + stack []nodeInfo + + edgeLink []rune + edgeMid []rune + edgeLast []rune +} + +type nodeInfo struct { + // firstChildConnectRow is the index (in tree.rows) of the row up to which we + // have to connect the first child of this node. + firstChildConnectRow int + + // nextSiblingConnectRow is the index (in tree.rows) of the row up to which we + // have to connect the next sibling of this node. Typically this is the same + // with firstChildConnectRow, except when the node has multiple rows. 
For + // example: + // + // foo + // └── bar1 <---- nextSiblingConnectRow + // bar2 <---- firstChildConnectRow + // + // firstChildConnectRow is used when adding "baz", nextSiblingConnectRow + // is used when adding "qux": + // foo + // ├── bar1 + // │ bar2 + // │ └── baz + // └── qux + // + nextSiblingConnectRow int +} + +// set copies the string of runes into a given row, at a specific position. The +// row is extended with spaces if needed. +func (t *tree) set(rowIdx int, colIdx int, what []rune) { + // Extend the line if necessary. + for len(t.rows[rowIdx]) < colIdx+len(what) { + t.rows[rowIdx] = append(t.rows[rowIdx], ' ') + } + copy(t.rows[rowIdx][colIdx:], what) +} + +// addRow adds a row with a given text, with the proper indentation for the +// given level. +func (t *tree) addRow(level int, text string) (rowIdx int) { + runes := []rune(text) + // Each level indents by this much. + k := len(t.edgeLast) + indent := level * k + row := make([]rune, indent+len(runes)) + for i := 0; i < indent; i++ { + row[i] = ' ' + } + copy(row[indent:], runes) + t.rows = append(t.rows, row) + return len(t.rows) - 1 +} + +// Childf adds a node as a child of the given node. +func (n Node) Childf(format string, args ...interface{}) Node { + return n.Child(fmt.Sprintf(format, args...)) +} + +// Child adds a node as a child of the given node. Multi-line strings are +// supported with appropriate indentation. +func (n Node) Child(text string) Node { + if strings.ContainsRune(text, '\n') { + splitLines := strings.Split(text, "\n") + node := n.childLine(splitLines[0]) + for _, l := range splitLines[1:] { + node.AddLine(l) + } + return node + } + return n.childLine(text) +} + +// AddLine adds a new line to a node without an edge. 
+func (n Node) AddLine(text string) { + t := n.tree + if t.style == BulletStyle { + text = " " + text + } + rowIdx := t.addRow(n.level-1, text) + if t.style != BulletStyle { + t.stack[n.level-1].firstChildConnectRow = rowIdx + } +} + +// childLine adds a node as a child of the given node. +func (n Node) childLine(text string) Node { + t := n.tree + if t.style == BulletStyle { + text = fmt.Sprintf("%c %s", bulletChr, text) + if n.level > 0 { + n.AddEmptyLine() + } + } + rowIdx := t.addRow(n.level, text) + edgePos := (n.level - 1) * len(t.edgeLast) + if n.level == 0 { + // Case 1: root. + if len(t.stack) != 0 { + panic("multiple root nodes") + } + } else if len(t.stack) <= n.level { + // Case 2: first child. Connect to parent. + if len(t.stack) != n.level { + panic("misuse of node") + } + parentRow := t.stack[n.level-1].firstChildConnectRow + for i := parentRow + 1; i < rowIdx; i++ { + t.set(i, edgePos, t.edgeLink) + } + t.set(rowIdx, edgePos, t.edgeLast) + } else { + // Case 3: non-first child. Connect to sibling. + siblingRow := t.stack[n.level].nextSiblingConnectRow + t.set(siblingRow, edgePos, t.edgeMid) + for i := siblingRow + 1; i < rowIdx; i++ { + t.set(i, edgePos, t.edgeLink) + } + t.set(rowIdx, edgePos, t.edgeLast) + // Update the nextSiblingConnectRow. + t.stack = t.stack[:n.level] + } + + t.stack = append(t.stack, nodeInfo{ + firstChildConnectRow: rowIdx, + nextSiblingConnectRow: rowIdx, + }) + + // Return a TreePrinter that can be used for children of this node. + return Node{ + tree: t, + level: n.level + 1, + } +} + +// AddEmptyLine adds an empty line to the output; used to introduce vertical +// spacing as needed. +func (n Node) AddEmptyLine() { + n.tree.rows = append(n.tree.rows, []rune{}) +} + +// FormattedRows returns the formatted rows. Can only be called on the result of +// treeprinter.New. 
+func (n Node) FormattedRows() []string { + if n.level != 0 { + panic("Only the root can be stringified") + } + res := make([]string, len(n.tree.rows)) + for i, r := range n.tree.rows { + res[i] = string(r) + } + return res +} + +func (n Node) String() string { + if n.level != 0 { + panic("Only the root can be stringified") + } + var buf bytes.Buffer + for _, r := range n.tree.rows { + buf.WriteString(string(r)) + buf.WriteByte('\n') + } + return buf.String() +} diff --git a/vendor/github.com/cockroachdb/pebble/iterator.go b/vendor/github.com/cockroachdb/pebble/v2/iterator.go similarity index 87% rename from vendor/github.com/cockroachdb/pebble/iterator.go rename to vendor/github.com/cockroachdb/pebble/v2/iterator.go index f19df53..aa7a470 100644 --- a/vendor/github.com/cockroachdb/pebble/iterator.go +++ b/vendor/github.com/cockroachdb/pebble/v2/iterator.go @@ -8,19 +8,21 @@ import ( "bytes" "context" "io" + "math/rand/v2" "sync" "unsafe" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/bytealloc" - "github.com/cockroachdb/pebble/internal/fastrand" - "github.com/cockroachdb/pebble/internal/humanize" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" - "github.com/cockroachdb/pebble/internal/rangekey" - "github.com/cockroachdb/pebble/sstable" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/bytealloc" + "github.com/cockroachdb/pebble/v2/internal/humanize" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/rangekeystack" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" + 
"github.com/cockroachdb/pebble/v2/sstable/blob" "github.com/cockroachdb/redact" ) @@ -81,7 +83,7 @@ const ( // Approximate gap in bytes between samples of data read during iteration. // This is multiplied with a default ReadSamplingMultiplier of 1 << 4 to yield // 1 << 20 (1MB). The 1MB factor comes from: -// https://github.com/cockroachdb/pebble/issues/29#issuecomment-494477985 +// https://github.com/cockroachdb/pebble/v2/issues/29#issuecomment-494477985 const readBytesPeriod uint64 = 1 << 16 var errReversePrefixIteration = errors.New("pebble: unsupported reverse prefix iteration") @@ -198,10 +200,10 @@ type Iterator struct { merge Merge comparer base.Comparer iter internalIterator - pointIter internalIterator + pointIter topLevelIterator // Either readState or version is set, but not both. readState *readState - version *version + version *manifest.Version // rangeKey holds iteration state specific to iteration over range keys. // The range key field may be nil if the Iterator has never been configured // to iterate over range keys. Its non-nilness cannot be used to determine @@ -216,43 +218,45 @@ type Iterator struct { // is backed by keyBuf. key []byte keyBuf []byte - value LazyValue + value base.InternalValue // For use in LazyValue.Clone. valueBuf []byte fetcher base.LazyFetcher // For use in LazyValue.Value. lazyValueBuf []byte valueCloser io.Closer + // blobValueFetcher is the ValueFetcher to use when retrieving values stored + // externally in blob files. + blobValueFetcher blob.ValueFetcher // boundsBuf holds two buffers used to store the lower and upper bounds. // Whenever the Iterator's bounds change, the new bounds are copied into // boundsBuf[boundsBufIdx]. The two bounds share a slice to reduce // allocations. opts.LowerBound and opts.UpperBound point into this slice. boundsBuf [2][]byte boundsBufIdx int - // iterKey, iterValue reflect the latest position of iter, except when - // SetBounds is called. In that case, these are explicitly set to nil. 
- iterKey *InternalKey - iterValue LazyValue + // iterKV reflects the latest position of iter, except when SetBounds is + // called. In that case, it is explicitly set to nil. + iterKV *base.InternalKV alloc *iterAlloc getIterAlloc *getIterAlloc prefixOrFullSeekKey []byte readSampling readSampling stats IteratorStats - externalReaders [][]*sstable.Reader - + externalIter *externalIterState // Following fields used when constructing an iterator stack, eg, in Clone // and SetOptions or when re-fragmenting a batch's range keys/range dels. // Non-nil if this Iterator includes a Batch. batch *Batch + fc *fileCacheHandle newIters tableNewIters - newIterRangeKey keyspan.TableNewSpanIter + newIterRangeKey keyspanimpl.TableNewSpanIter lazyCombinedIter lazyCombinedIter - seqNum uint64 + seqNum base.SeqNum // batchSeqNum is used by Iterators over indexed batches to detect when the // underlying batch has been mutated. The batch beneath an indexed batch may // be mutated while the Iterator is open, but new keys are not surfaced // until the next call to SetOptions. - batchSeqNum uint64 + batchSeqNum base.SeqNum // batch{PointIter,RangeDelIter,RangeKeyIter} are used when the Iterator is // configured to read through an indexed batch. If a batch is set, these // iterators will be included within the iterator stack regardless of @@ -311,19 +315,8 @@ type Iterator struct { // batchIter, Seek[Prefix]GE set flags.BatchJustRefreshed()=true if this // bit is enabled. batchJustRefreshed bool - // Used for an optimization in external iterators to reduce the number of - // merging levels. - forwardOnly bool - // closePointIterOnce is set to true if this point iter can only be Close()d - // once, _and_ closing i.iter and then i.pointIter would close i.pointIter - // twice. This is necessary to track if the point iter is an internal iterator - // that could release its resources to a pool on Close(), making it harder for - // that iterator to make its own closes idempotent. 
- // - // TODO(bilal): Update SetOptions to always close out point key iterators when - // they won't be used, so that Close() doesn't need to default to closing - // point iterators twice. - closePointIterOnce bool + // batchOnlyIter is set to true for Batch.NewBatchOnlyIter. + batchOnlyIter bool // Used in some tests to disable the random disabling of seek optimizations. forceEnableSeekOpt bool // Set to true if NextPrefix is not currently permitted. Defaults to false @@ -336,11 +329,6 @@ func (i *Iterator) cmp(a, b []byte) int { return i.comparer.Compare(a, b) } -// split is a convenience shorthand for the i.comparer.Split function. -func (i *Iterator) split(a []byte) int { - return i.comparer.Split(a) -} - // equal is a convenience shorthand for the i.comparer.Equal function. func (i *Iterator) equal(a, b []byte) bool { return i.comparer.Equal(a, b) @@ -348,9 +336,6 @@ func (i *Iterator) equal(a, b []byte) bool { // iteratorRangeKeyState holds an iterator's range key iteration state. type iteratorRangeKeyState struct { - opts *IterOptions - cmp base.Compare - split base.Split // rangeKeyIter holds the range key iterator stack that iterates over the // merged spans across the entirety of the LSM. rangeKeyIter keyspan.FragmentIterator @@ -401,7 +386,7 @@ type iteratorRangeKeyState struct { // iterator stack, but do not need to be directly accessed during iteration. // This struct is bundled within the iteratorRangeKeyState struct to reduce // allocations. - iterConfig rangekey.UserIteratorConfig + iterConfig rangekeystack.UserIteratorConfig } type rangeKeyBuffers struct { @@ -411,7 +396,7 @@ type rangeKeyBuffers struct { // Start and end boundaries, suffixes and values are all copied into buf. buf bytealloc.A // internal holds buffers used by the range key internal iterators. 
- internal rangekey.Buffers + internal rangekeystack.Buffers } func (b *rangeKeyBuffers) PrepareForReuse() { @@ -429,12 +414,6 @@ func (b *rangeKeyBuffers) PrepareForReuse() { b.internal.PrepareForReuse() } -func (i *iteratorRangeKeyState) init(cmp base.Compare, split base.Split, opts *IterOptions) { - i.cmp = cmp - i.split = split - i.opts = opts -} - var iterRangeKeyStateAllocPool = sync.Pool{ New: func() interface{} { return &iteratorRangeKeyState{} @@ -464,14 +443,6 @@ const ( // around calling CanDeterministicallySingleDelete at most once per external // iterator position. internalNextOp - // invalidatedLastPositionOp is similar to unknownLastPositionOp and the - // only reason to distinguish this is for the wider set of SeekGE - // optimizations we permit for the external iterator Iterator.forwardOnly - // case. Most code predicates should be doing equality comparisons with one - // of the seek* enum values, so this duplication should not result in code - // of the form: - // if unknownLastPositionOp || invalidLastPositionOp - invalidatedLastPositionOp ) // Limited iteration mode. Not for use with prefix iteration. @@ -546,14 +517,22 @@ func (i *Iterator) findNextEntry(limit []byte) { return } - for i.iterKey != nil { - key := *i.iterKey + for i.iterKV != nil { + key := i.iterKV.K - if i.hasPrefix { - if n := i.split(key.UserKey); !i.equal(i.prefixOrFullSeekKey, key.UserKey[:n]) { - return + // The topLevelIterator.StrictSeekPrefixGE contract requires that in + // prefix mode [i.hasPrefix=t], every point key returned by the internal + // iterator must have the current iteration prefix. + if invariants.Enabled && i.hasPrefix { + // Range keys are an exception to the contract and may return a different + // prefix. This case is explicitly handled in the switch statement below. 
+ if key.Kind() != base.InternalKeyKindRangeKeySet { + if p := i.comparer.Split.Prefix(key.UserKey); !i.equal(i.prefixOrFullSeekKey, p) { + i.opts.logger.Fatalf("pebble: prefix violation: key %q does not have prefix %q\n", key.UserKey, i.prefixOrFullSeekKey) + } } } + // Compare with limit every time we start at a different user key. // Note that given the best-effort contract of limit, we could avoid a // comparison in the common case by doing this only after @@ -561,7 +540,7 @@ func (i *Iterator) findNextEntry(limit []byte) { // the behavior non-deterministic (since the behavior will vary based // on what has been compacted), which makes it hard to test with the // metamorphic test. So we forego that performance optimization. - if limit != nil && i.cmp(limit, i.iterKey.UserKey) <= 0 { + if limit != nil && i.cmp(limit, i.iterKV.K.UserKey) <= 0 { i.iterValidityState = IterAtLimit i.pos = iterPosCurForwardPaused return @@ -569,7 +548,7 @@ func (i *Iterator) findNextEntry(limit []byte) { // If the user has configured a SkipPoint function, invoke it to see // whether we should skip over the current user key. - if i.opts.SkipPoint != nil && key.Kind() != InternalKeyKindRangeKeySet && i.opts.SkipPoint(i.iterKey.UserKey) { + if i.opts.SkipPoint != nil && key.Kind() != InternalKeyKindRangeKeySet && i.opts.SkipPoint(i.iterKV.K.UserKey) { // NB: We could call nextUserKey, but in some cases the SkipPoint // predicate function might be cheaper than nextUserKey's key copy // and key comparison. This should be the case for MVCC suffix @@ -578,16 +557,21 @@ func (i *Iterator) findNextEntry(limit []byte) { // whether we skip over just the internal key, the user key, or even // the key prefix. 
i.stats.ForwardStepCount[InternalIterCall]++ - i.iterKey, i.iterValue = i.iter.Next() + i.iterKV = i.iter.Next() continue } switch key.Kind() { case InternalKeyKindRangeKeySet: + if i.hasPrefix { + if p := i.comparer.Split.Prefix(key.UserKey); !i.equal(i.prefixOrFullSeekKey, p) { + return + } + } // Save the current key. i.keyBuf = append(i.keyBuf[:0], key.UserKey...) i.key = i.keyBuf - i.value = LazyValue{} + i.value = base.InternalValue{} // There may also be a live point key at this userkey that we have // not yet read. We need to find the next entry with this user key // to find it. Save the range key so we don't lose it when we Next @@ -612,7 +596,7 @@ func (i *Iterator) findNextEntry(limit []byte) { case InternalKeyKindSet, InternalKeyKindSetWithDelete: i.keyBuf = append(i.keyBuf[:0], key.UserKey...) i.key = i.keyBuf - i.value = i.iterValue + i.value = i.iterKV.V i.iterValidityState = IterValid i.saveRangeKey() return @@ -648,6 +632,12 @@ func (i *Iterator) findNextEntry(limit []byte) { return } } + + // Is iterKey nil due to an error? 
+ if err := i.iter.Error(); err != nil { + i.err = err + i.iterValidityState = IterExhausted + } } func (i *Iterator) nextPointCurrentUserKey() bool { @@ -660,14 +650,22 @@ func (i *Iterator) nextPointCurrentUserKey() bool { i.pos = iterPosCurForward - i.iterKey, i.iterValue = i.iter.Next() + i.iterKV = i.iter.Next() i.stats.ForwardStepCount[InternalIterCall]++ - if i.iterKey == nil || !i.equal(i.key, i.iterKey.UserKey) { + if i.iterKV == nil { + if err := i.iter.Error(); err != nil { + i.err = err + } else { + i.pos = iterPosNext + } + return false + } + if !i.equal(i.key, i.iterKV.K.UserKey) { i.pos = iterPosNext return false } - key := *i.iterKey + key := i.iterKV.K switch key.Kind() { case InternalKeyKindRangeKeySet: // RangeKeySets must always be interleaved as the first internal key @@ -682,7 +680,7 @@ func (i *Iterator) nextPointCurrentUserKey() bool { return false case InternalKeyKindSet, InternalKeyKindSetWithDelete: - i.value = i.iterValue + i.value = i.iterKV.V return true case InternalKeyKindMerge: @@ -703,7 +701,7 @@ func (i *Iterator) nextPointCurrentUserKey() bool { // mergeForward does not update iterValidityState. func (i *Iterator) mergeForward(key base.InternalKey) (valid bool) { var iterValue []byte - iterValue, _, i.err = i.iterValue.Value(nil) + iterValue, _, i.err = i.iterKV.Value(nil) if i.err != nil { return false } @@ -742,35 +740,41 @@ func (i *Iterator) closeValueCloser() error { } func (i *Iterator) nextUserKey() { - if i.iterKey == nil { + if i.iterKV == nil { return } - trailer := i.iterKey.Trailer - done := i.iterKey.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer + trailer := i.iterKV.K.Trailer + done := i.iterKV.K.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer if i.iterValidityState != IterValid { - i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...) + i.keyBuf = append(i.keyBuf[:0], i.iterKV.K.UserKey...) 
i.key = i.keyBuf } for { - i.iterKey, i.iterValue = i.iter.Next() i.stats.ForwardStepCount[InternalIterCall]++ + i.iterKV = i.iter.Next() + if i.iterKV == nil { + if err := i.iter.Error(); err != nil { + i.err = err + return + } + } // NB: We're guaranteed to be on the next user key if the previous key // had a zero sequence number (`done`), or the new key has a trailer // greater or equal to the previous key's trailer. This is true because - // internal keys with the same user key are sorted by Trailer in + // internal keys with the same user key are sorted by InternalKeyTrailer in // strictly monotonically descending order. We expect the trailer // optimization to trigger around 50% of the time with randomly // distributed writes. We expect it to trigger very frequently when // iterating through ingested sstables, which contain keys that all have // the same sequence number. - if done || i.iterKey == nil || i.iterKey.Trailer >= trailer { + if done || i.iterKV == nil || i.iterKV.K.Trailer >= trailer { break } - if !i.equal(i.key, i.iterKey.UserKey) { + if !i.equal(i.key, i.iterKV.K.UserKey) { break } - done = i.iterKey.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer - trailer = i.iterKey.Trailer + done = i.iterKV.K.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer + trailer = i.iterKV.K.Trailer } } @@ -800,15 +804,17 @@ func (i *Iterator) maybeSampleRead() { } bytesRead := uint64(len(i.key) + i.value.Len()) for i.readSampling.bytesUntilReadSampling < bytesRead { - i.readSampling.bytesUntilReadSampling += uint64(fastrand.Uint32n(2 * uint32(samplingPeriod))) + i.readSampling.bytesUntilReadSampling += uint64(rand.Uint32N(2 * uint32(samplingPeriod))) // The block below tries to adjust for the case where this is the // first read in a newly-opened iterator. As bytesUntilReadSampling // starts off at zero, we don't want to sample the first read of // every newly-opened iterator, but we do want to sample some of them. 
if !i.readSampling.initialSamplePassed { i.readSampling.initialSamplePassed = true - if fastrand.Uint32n(uint32(i.readSampling.bytesUntilReadSampling)) > uint32(bytesRead) { - continue + if i.readSampling.bytesUntilReadSampling > bytesRead { + if rand.Uint64N(i.readSampling.bytesUntilReadSampling) > bytesRead { + continue + } } } i.sampleRead() @@ -817,23 +823,26 @@ func (i *Iterator) maybeSampleRead() { } func (i *Iterator) sampleRead() { - var topFile *manifest.FileMetadata + var topFile *manifest.TableMetadata topLevel, numOverlappingLevels := numLevels, 0 mi := i.merging if mi == nil { return } if len(mi.levels) > 1 { - mi.ForEachLevelIter(func(li *levelIter) bool { - l := manifest.LevelToInt(li.level) + mi.ForEachLevelIter(func(li *levelIter) (done bool) { + if li.layer.IsFlushableIngests() { + return false + } + l := li.layer.Level() if f := li.iterFile; f != nil { var containsKey bool if i.pos == iterPosNext || i.pos == iterPosCurForward || i.pos == iterPosCurForwardPaused { - containsKey = i.cmp(f.SmallestPointKey.UserKey, i.key) <= 0 + containsKey = i.cmp(f.PointKeyBounds.SmallestUserKey(), i.key) <= 0 } else if i.pos == iterPosPrev || i.pos == iterPosCurReverse || i.pos == iterPosCurReversePaused { - containsKey = i.cmp(f.LargestPointKey.UserKey, i.key) >= 0 + containsKey = i.cmp(f.PointKeyBounds.LargestUserKey(), i.key) >= 0 } // Do nothing if the current key is not contained in f's // bounds. We could seek the LevelIterator at this level @@ -841,7 +850,7 @@ func (i *Iterator) sampleRead() { // doing that are significant enough to negate the benefits // of read sampling in the first place. 
See the discussion // at: - // https://github.com/cockroachdb/pebble/pull/1041#issuecomment-763226492 + // https://github.com/cockroachdb/pebble/v2/pull/1041#issuecomment-763226492 if containsKey { numOverlappingLevels++ if numOverlappingLevels >= 2 { @@ -869,10 +878,10 @@ func (i *Iterator) sampleRead() { topFile.AllowedSeeks.Add(topFile.InitAllowedSeeks) read := readCompaction{ - start: topFile.SmallestPointKey.UserKey, - end: topFile.LargestPointKey.UserKey, - level: topLevel, - fileNum: topFile.FileNum, + start: topFile.PointKeyBounds.SmallestUserKey(), + end: topFile.PointKeyBounds.LargestUserKey(), + level: topLevel, + tableNum: topFile.TableNum, } i.readSampling.pendingCompactions.add(&read, i.cmp) } @@ -903,15 +912,15 @@ func (i *Iterator) findPrevEntry(limit []byte) { // findNextEntry, this is being done to make the behavior of limit // deterministic to allow for metamorphic testing. It is not required by // the best-effort contract of limit. - for i.iterKey != nil { - key := *i.iterKey + for i.iterKV != nil { + key := i.iterKV.K // NB: We cannot pause if the current key is covered by a range key. // Otherwise, the user might not ever learn of a range key that covers // the key space being iterated over in which there are no point keys. // Since limits are best effort, ignoring the limit in this case is // allowed by the contract of limit. - if firstLoopIter && limit != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { + if firstLoopIter && limit != nil && i.cmp(limit, i.iterKV.K.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { i.iterValidityState = IterAtLimit i.pos = iterPosCurReversePaused return @@ -932,7 +941,7 @@ func (i *Iterator) findPrevEntry(limit []byte) { // a range key boundary at this key, we still want to // return. Otherwise, we need to continue looking for // a live key. 
- i.value = LazyValue{} + i.value = base.InternalValue{} if rangeKeyBoundary { i.rangeKey.rangeKeyOnly = true } else { @@ -961,8 +970,15 @@ func (i *Iterator) findPrevEntry(limit []byte) { // whether we skip over just the internal key, the user key, or even // the key prefix. i.stats.ReverseStepCount[InternalIterCall]++ - i.iterKey, i.iterValue = i.iter.Prev() - if limit != nil && i.iterKey != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { + i.iterKV = i.iter.Prev() + if i.iterKV == nil { + if err := i.iter.Error(); err != nil { + i.err = err + i.iterValidityState = IterExhausted + return + } + } + if limit != nil && i.iterKV != nil && i.cmp(limit, i.iterKV.K.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { i.iterValidityState = IterAtLimit i.pos = iterPosCurReversePaused return @@ -977,6 +993,10 @@ func (i *Iterator) findPrevEntry(limit []byte) { // must've already iterated over it. // This is the final entry at this user key, so we may return i.rangeKey.rangeKeyOnly = i.iterValidityState != IterValid + if i.rangeKey.rangeKeyOnly { + // The point iterator is now invalid, so clear the point value. + i.value = base.InternalValue{} + } i.keyBuf = append(i.keyBuf[:0], key.UserKey...) i.key = i.keyBuf i.iterValidityState = IterValid @@ -992,18 +1012,18 @@ func (i *Iterator) findPrevEntry(limit []byte) { // that we can maintain the invariant during backward iteration that // i.iterPos = iterPosPrev. i.stats.ReverseStepCount[InternalIterCall]++ - i.iterKey, i.iterValue = i.iter.Prev() + i.iterKV = i.iter.Prev() // Set rangeKeyBoundary so that on the next iteration, we know to // return the key even if the MERGE point key is deleted. 
rangeKeyBoundary = true case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: - i.value = LazyValue{} + i.value = base.InternalValue{} i.iterValidityState = IterExhausted valueMerger = nil - i.iterKey, i.iterValue = i.iter.Prev() i.stats.ReverseStepCount[InternalIterCall]++ + i.iterKV = i.iter.Prev() // Compare with the limit. We could optimize by only checking when // we step to the previous user key, but detecting that requires a // comparison too. Note that this position may already passed a @@ -1013,7 +1033,7 @@ func (i *Iterator) findPrevEntry(limit []byte) { // other than the firstLoopIter and SkipPoint cases above, where we // could step to a different user key and start processing it for // returning to the caller. - if limit != nil && i.iterKey != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { + if limit != nil && i.iterKV != nil && i.cmp(limit, i.iterKV.K.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { i.iterValidityState = IterAtLimit i.pos = iterPosCurReversePaused return @@ -1027,10 +1047,10 @@ func (i *Iterator) findPrevEntry(limit []byte) { // call, so use valueBuf instead. Note that valueBuf is only used // in this one instance; everywhere else (eg. in findNextEntry), // we just point i.value to the unsafe i.iter-owned value buffer. 
- i.value, i.valueBuf = i.iterValue.Clone(i.valueBuf[:0], &i.fetcher) + i.value, i.valueBuf = i.iterKV.V.Clone(i.valueBuf[:0], &i.fetcher) i.saveRangeKey() i.iterValidityState = IterValid - i.iterKey, i.iterValue = i.iter.Prev() + i.iterKV = i.iter.Prev() i.stats.ReverseStepCount[InternalIterCall]++ valueMerger = nil continue @@ -1041,7 +1061,7 @@ func (i *Iterator) findPrevEntry(limit []byte) { i.key = i.keyBuf i.saveRangeKey() var iterValue []byte - iterValue, _, i.err = i.iterValue.Value(nil) + iterValue, _, i.err = i.iterKV.Value(nil) if i.err != nil { return } @@ -1061,12 +1081,14 @@ func (i *Iterator) findPrevEntry(limit []byte) { i.lazyValueBuf = value[:0] } if i.err != nil { + i.iterValidityState = IterExhausted return } valueMerger, i.err = i.merge(i.key, value) var iterValue []byte - iterValue, _, i.err = i.iterValue.Value(nil) + iterValue, _, i.err = i.iterKV.Value(nil) if i.err != nil { + i.iterValidityState = IterExhausted return } if i.err == nil { @@ -1078,8 +1100,9 @@ func (i *Iterator) findPrevEntry(limit []byte) { } } else { var iterValue []byte - iterValue, _, i.err = i.iterValue.Value(nil) + iterValue, _, i.err = i.iterKV.Value(nil) if i.err != nil { + i.iterValidityState = IterExhausted return } i.err = valueMerger.MergeNewer(iterValue) @@ -1088,7 +1111,7 @@ func (i *Iterator) findPrevEntry(limit []byte) { return } } - i.iterKey, i.iterValue = i.iter.Prev() + i.iterKV = i.iter.Prev() i.stats.ReverseStepCount[InternalIterCall]++ continue @@ -1098,8 +1121,14 @@ func (i *Iterator) findPrevEntry(limit []byte) { return } } + // i.iterKV == nil, so broke out of the preceding loop. + + // Is iterKey nil due to an error? + if i.err = i.iter.Error(); i.err != nil { + i.iterValidityState = IterExhausted + return + } - // i.iterKey == nil, so broke out of the preceding loop. 
if i.iterValidityState == IterValid { i.pos = iterPosPrev if valueMerger != nil { @@ -1109,7 +1138,7 @@ func (i *Iterator) findPrevEntry(limit []byte) { i.value = base.MakeInPlaceValue(value) if i.err == nil && needDelete { i.key = nil - i.value = LazyValue{} + i.value = base.InternalValue{} i.iterValidityState = IterExhausted } } @@ -1120,22 +1149,26 @@ func (i *Iterator) findPrevEntry(limit []byte) { } func (i *Iterator) prevUserKey() { - if i.iterKey == nil { + if i.iterKV == nil { return } if i.iterValidityState != IterValid { // If we're going to compare against the prev key, we need to save the // current key. - i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...) + i.keyBuf = append(i.keyBuf[:0], i.iterKV.K.UserKey...) i.key = i.keyBuf } for { - i.iterKey, i.iterValue = i.iter.Prev() + i.iterKV = i.iter.Prev() i.stats.ReverseStepCount[InternalIterCall]++ - if i.iterKey == nil { + if i.iterKV == nil { + if err := i.iter.Error(); err != nil { + i.err = err + i.iterValidityState = IterExhausted + } break } - if !i.equal(i.key, i.iterKey.UserKey) { + if !i.equal(i.key, i.iterKV.K.UserKey) { break } } @@ -1148,13 +1181,16 @@ func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) { // Loop looking for older values for this key and merging them. for { - i.iterKey, i.iterValue = i.iter.Next() + i.iterKV = i.iter.Next() i.stats.ForwardStepCount[InternalIterCall]++ - if i.iterKey == nil { + if i.iterKV == nil { + if i.err = i.iter.Error(); i.err != nil { + return + } i.pos = iterPosNext return } - key = *i.iterKey + key = i.iterKV.K if !i.equal(i.key, key.UserKey) { // We've advanced to the next key. i.pos = iterPosNext @@ -1173,7 +1209,7 @@ func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) { case InternalKeyKindSet, InternalKeyKindSetWithDelete: // We've hit a Set value. Merge with the existing value and return. 
var iterValue []byte - iterValue, _, i.err = i.iterValue.Value(nil) + iterValue, _, i.err = i.iterKV.Value(nil) if i.err != nil { return } @@ -1184,7 +1220,7 @@ func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) { // We've hit another Merge value. Merge with the existing value and // continue looping. var iterValue []byte - iterValue, _, i.err = i.iterValue.Value(nil) + iterValue, _, i.err = i.iterKV.Value(nil) if i.err != nil { return } @@ -1242,7 +1278,6 @@ func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState { i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys() } lastPositioningOp := i.lastPositioningOp - hasPrefix := i.hasPrefix // Set it to unknown, since this operation may not succeed, in which case // the SeekGE following this should not make any assumption about iterator // position. @@ -1275,7 +1310,7 @@ func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState { (i.iterValidityState == IterValid && i.cmp(key, i.key) <= 0 && (limit == nil || i.cmp(i.key, limit) < 0))) { // Noop - if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) || i.forceEnableSeekOpt { + if i.forceEnableSeekOpt || !testingDisableSeekOpt(key, uintptr(unsafe.Pointer(i))) { i.lastPositioningOp = seekGELastPositioningOp return i.iterValidityState } @@ -1296,38 +1331,24 @@ func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState { if cmp < 0 && i.iterValidityState != IterAtLimit && limit == nil { flags = flags.EnableTrySeekUsingNext() } - if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) { + if testingDisableSeekOpt(key, uintptr(unsafe.Pointer(i))) && !i.forceEnableSeekOpt { flags = flags.DisableTrySeekUsingNext() } - if !flags.BatchJustRefreshed() && i.pos == iterPosCurForwardPaused && i.cmp(key, i.iterKey.UserKey) <= 0 { + if !flags.BatchJustRefreshed() && i.pos == 
iterPosCurForwardPaused && i.cmp(key, i.iterKV.K.UserKey) <= 0 { // Have some work to do, but don't need to seek, and we can // start doing findNextEntry from i.iterKey. seekInternalIter = false } } } - // Check for another TrySeekUsingNext optimization opportunity, currently - // specifically tailored to external iterators. This case is intended to - // trigger in instances of Seek-ing with monotonically increasing keys with - // Nexts interspersed. At the time of writing, this is the case for - // CockroachDB scans. This optimization is important for external iterators - // to avoid re-seeking within an already-exhausted sstable. It is not always - // a performance win more generally, so we restrict it to external iterators - // that are configured to only use forward positioning operations. - // - // TODO(jackson): This optimization should be obsolete once we introduce and - // use the NextPrefix iterator positioning operation. - if seekInternalIter && i.forwardOnly && lastPositioningOp != invalidatedLastPositionOp && - i.pos == iterPosCurForward && !hasPrefix && i.iterValidityState == IterValid && - i.cmp(key, i.iterKey.UserKey) > 0 { - flags = flags.EnableTrySeekUsingNext() - if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) { - flags = flags.DisableTrySeekUsingNext() - } - } if seekInternalIter { - i.iterKey, i.iterValue = i.iter.SeekGE(key, flags) + i.iterKV = i.iter.SeekGE(key, flags) i.stats.ForwardSeekCount[InternalIterCall]++ + if err := i.iter.Error(); err != nil { + i.err = err + i.iterValidityState = IterExhausted + return i.iterValidityState + } } i.findNextEntry(limit) i.maybeSampleRead() @@ -1413,13 +1434,10 @@ func (i *Iterator) SeekPrefixGE(key []byte) bool { i.requiresReposition = false i.err = nil // clear cached iteration error i.stats.ForwardSeekCount[InterfaceCall]++ - if i.comparer.Split == nil { - panic("pebble: split must be provided for SeekPrefixGE") - } if 
i.comparer.ImmediateSuccessor == nil && i.opts.KeyTypes != IterKeyTypePointsOnly { panic("pebble: ImmediateSuccessor must be provided for SeekPrefixGE with range keys") } - prefixLen := i.split(key) + prefixLen := i.comparer.Split(key) keyPrefix := key[:prefixLen] var flags base.SeekGEFlags if i.batchJustRefreshed { @@ -1452,7 +1470,7 @@ func (i *Iterator) SeekPrefixGE(key []byte) bool { if cmp < 0 { flags = flags.EnableTrySeekUsingNext() } - if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) { + if testingDisableSeekOpt(key, uintptr(unsafe.Pointer(i))) && !i.forceEnableSeekOpt { flags = flags.DisableTrySeekUsingNext() } } @@ -1467,21 +1485,21 @@ func (i *Iterator) SeekPrefixGE(key []byte) bool { copy(i.prefixOrFullSeekKey, keyPrefix) if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 { - if n := i.split(lowerBound); !bytes.Equal(i.prefixOrFullSeekKey, lowerBound[:n]) { + if p := i.comparer.Split.Prefix(lowerBound); !bytes.Equal(i.prefixOrFullSeekKey, p) { i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of lower bound") i.iterValidityState = IterExhausted return false } key = lowerBound } else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 { - if n := i.split(upperBound); !bytes.Equal(i.prefixOrFullSeekKey, upperBound[:n]) { + if p := i.comparer.Split.Prefix(upperBound); !bytes.Equal(i.prefixOrFullSeekKey, p) { i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of upper bound") i.iterValidityState = IterExhausted return false } key = upperBound } - i.iterKey, i.iterValue = i.iter.SeekPrefixGE(i.prefixOrFullSeekKey, key, flags) + i.iterKV = i.iter.SeekPrefixGE(i.prefixOrFullSeekKey, key, flags) i.stats.ForwardSeekCount[InternalIterCall]++ i.findNextEntry(nil) i.maybeSampleRead() @@ -1491,10 +1509,13 @@ func (i *Iterator) SeekPrefixGE(key []byte) bool { return i.iterValidityState 
== IterValid } -// Deterministic disabling of the seek optimizations. It uses the iterator -// pointer, since we want diversity in iterator behavior for the same key. Used -// for tests. -func disableSeekOpt(key []byte, ptr uintptr) bool { +// Deterministic disabling (in testing mode) of the seek optimizations. It uses +// the iterator pointer, since we want diversity in iterator behavior for the +// same key. Used for tests. +func testingDisableSeekOpt(key []byte, ptr uintptr) bool { + if !invariants.Enabled { + return false + } // Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ simpleHash := (11400714819323198485 * uint64(ptr)) >> 63 return key != nil && key[0]&byte(1) == 0 && simpleHash == 0 @@ -1567,12 +1588,12 @@ func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState { if i.iterValidityState == IterExhausted || (i.iterValidityState == IterValid && i.cmp(i.key, key) < 0 && (limit == nil || i.cmp(limit, i.key) <= 0)) { - if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) { + if !testingDisableSeekOpt(key, uintptr(unsafe.Pointer(i))) { i.lastPositioningOp = seekLTLastPositioningOp return i.iterValidityState } } - if i.pos == iterPosCurReversePaused && i.cmp(i.iterKey.UserKey, key) < 0 { + if i.pos == iterPosCurReversePaused && i.cmp(i.iterKV.K.UserKey, key) < 0 { // Have some work to do, but don't need to seek, and we can // start doing findPrevEntry from i.iterKey. 
seekInternalIter = false @@ -1580,8 +1601,13 @@ func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState { } } if seekInternalIter { - i.iterKey, i.iterValue = i.iter.SeekLT(key, base.SeekLTFlagsNone) + i.iterKV = i.iter.SeekLT(key, base.SeekLTFlagsNone) i.stats.ReverseSeekCount[InternalIterCall]++ + if err := i.iter.Error(); err != nil { + i.err = err + i.iterValidityState = IterExhausted + return i.iterValidityState + } } i.findPrevEntry(limit) i.maybeSampleRead() @@ -1593,7 +1619,7 @@ func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState { return i.iterValidityState } -// First moves the iterator the the first key/value pair. Returns true if the +// First moves the iterator the first key/value pair. Returns true if the // iterator is pointing at a valid entry and false otherwise. func (i *Iterator) First() bool { if i.rangeKey != nil { @@ -1619,13 +1645,17 @@ func (i *Iterator) First() bool { i.requiresReposition = false i.stats.ForwardSeekCount[InterfaceCall]++ - i.iterFirstWithinBounds() + i.err = i.iterFirstWithinBounds() + if i.err != nil { + i.iterValidityState = IterExhausted + return false + } i.findNextEntry(nil) i.maybeSampleRead() return i.iterValidityState == IterValid } -// Last moves the iterator the the last key/value pair. Returns true if the +// Last moves the iterator the last key/value pair. Returns true if the // iterator is pointing at a valid entry and false otherwise. 
func (i *Iterator) Last() bool { if i.rangeKey != nil { @@ -1651,7 +1681,10 @@ func (i *Iterator) Last() bool { i.requiresReposition = false i.stats.ReverseSeekCount[InterfaceCall]++ - i.iterLastWithinBounds() + if i.err = i.iterLastWithinBounds(); i.err != nil { + i.iterValidityState = IterExhausted + return false + } i.findPrevEntry(nil) i.maybeSampleRead() return i.iterValidityState == IterValid @@ -1701,6 +1734,9 @@ func (i *Iterator) NextPrefix() bool { i.iterValidityState = IterExhausted return false } + if i.Error() != nil { + return false + } return i.nextPrefix() == IterValid } @@ -1735,7 +1771,7 @@ func (i *Iterator) nextPrefix() IterValidityState { switch i.pos { case iterPosCurForward: // Positioned on the current key. Advance to the next prefix. - i.internalNextPrefix(i.split(i.key)) + i.internalNextPrefix(i.comparer.Split(i.key)) case iterPosCurForwardPaused: // Positioned at a limit. Implement as a prefix-agnostic Next. See TODO // up above. The iterator is already positioned at the next key. @@ -1743,21 +1779,24 @@ func (i *Iterator) nextPrefix() IterValidityState { // Switching directions. // Unless the iterator was exhausted, reverse iteration needs to // position the iterator at iterPosPrev. - if i.iterKey != nil { + if i.iterKV != nil { i.err = errors.New("switching from reverse to forward but iter is not at prev") i.iterValidityState = IterExhausted return i.iterValidityState } // The Iterator is exhausted and i.iter is positioned before the first // key. Reposition to point to the first internal key. - i.iterFirstWithinBounds() + if i.err = i.iterFirstWithinBounds(); i.err != nil { + i.iterValidityState = IterExhausted + return i.iterValidityState + } case iterPosCurReversePaused: // Positioned at a limit. Implement as a prefix-agnostic Next. See TODO // up above. // // Switching directions; The iterator must not be exhausted since it // paused. 
- if i.iterKey == nil { + if i.iterKV == nil { i.err = errors.New("switching paused from reverse to forward but iter is exhausted") i.iterValidityState = IterExhausted return i.iterValidityState @@ -1766,31 +1805,48 @@ func (i *Iterator) nextPrefix() IterValidityState { case iterPosPrev: // The underlying iterator is pointed to the previous key (this can // only happen when switching iteration directions). - if i.iterKey == nil { + if i.iterKV == nil { // We're positioned before the first key. Need to reposition to point to // the first key. - i.iterFirstWithinBounds() + i.err = i.iterFirstWithinBounds() + if i.iterKV == nil { + i.iterValidityState = IterExhausted + return i.iterValidityState + } + if invariants.Enabled && !i.equal(i.iterKV.K.UserKey, i.key) { + i.opts.getLogger().Fatalf("pebble: invariant violation: First internal iterator from iterPosPrev landed on %q, not %q", + i.iterKV.K.UserKey, i.key) + } } else { // Move the internal iterator back onto the user key stored in // i.key. iterPosPrev guarantees that it's positioned at the last // key with the user key less than i.key, so we're guaranteed to // land on the correct key with a single Next. - i.iterKey, i.iterValue = i.iter.Next() - if invariants.Enabled && !i.equal(i.iterKey.UserKey, i.key) { - i.opts.logger.Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q", - i.iterKey.UserKey, i.key) + i.iterKV = i.iter.Next() + if i.iterKV == nil { + // This should only be possible if i.iter.Next() encountered an + // error. + if i.iter.Error() == nil { + i.opts.getLogger().Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev found nothing") + } + // NB: Iterator.Error() will return i.iter.Error(). 
+ i.iterValidityState = IterExhausted + return i.iterValidityState + } + if invariants.Enabled && !i.equal(i.iterKV.K.UserKey, i.key) { + i.opts.getLogger().Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q", + i.iterKV.K.UserKey, i.key) } } // The internal iterator is now positioned at i.key. Advance to the next // prefix. - i.internalNextPrefix(i.split(i.key)) + i.internalNextPrefix(i.comparer.Split(i.key)) case iterPosNext: // Already positioned on the next key. Only call nextPrefixKey if the // next key shares the same prefix. - if i.iterKey != nil { - currKeyPrefixLen := i.split(i.key) - iterKeyPrefixLen := i.split(i.iterKey.UserKey) - if bytes.Equal(i.iterKey.UserKey[:iterKeyPrefixLen], i.key[:currKeyPrefixLen]) { + if i.iterKV != nil { + currKeyPrefixLen := i.comparer.Split(i.key) + if bytes.Equal(i.comparer.Split.Prefix(i.iterKV.K.UserKey), i.key[:currKeyPrefixLen]) { i.internalNextPrefix(currKeyPrefixLen) } } @@ -1803,7 +1859,7 @@ func (i *Iterator) nextPrefix() IterValidityState { } func (i *Iterator) internalNextPrefix(currKeyPrefixLen int) { - if i.iterKey == nil { + if i.iterKV == nil { return } // The Next "fast-path" is not really a fast-path when there is more than @@ -1811,20 +1867,23 @@ func (i *Iterator) internalNextPrefix(currKeyPrefixLen int) { // slowdown (~10%) for one version if we remove it and only call NextPrefix. // When there are two versions, only calling NextPrefix is ~30% faster. 
i.stats.ForwardStepCount[InternalIterCall]++ - if i.iterKey, i.iterValue = i.iter.Next(); i.iterKey == nil { + if i.iterKV = i.iter.Next(); i.iterKV == nil { return } - iterKeyPrefixLen := i.split(i.iterKey.UserKey) - if !bytes.Equal(i.iterKey.UserKey[:iterKeyPrefixLen], i.key[:currKeyPrefixLen]) { + if !bytes.Equal(i.comparer.Split.Prefix(i.iterKV.K.UserKey), i.key[:currKeyPrefixLen]) { return } i.stats.ForwardStepCount[InternalIterCall]++ i.prefixOrFullSeekKey = i.comparer.ImmediateSuccessor(i.prefixOrFullSeekKey[:0], i.key[:currKeyPrefixLen]) - i.iterKey, i.iterValue = i.iter.NextPrefix(i.prefixOrFullSeekKey) - if invariants.Enabled && i.iterKey != nil { - if iterKeyPrefixLen := i.split(i.iterKey.UserKey); i.cmp(i.iterKey.UserKey[:iterKeyPrefixLen], i.prefixOrFullSeekKey) < 0 { + if i.iterKV.K.IsExclusiveSentinel() { + panic(errors.AssertionFailedf("pebble: unexpected exclusive sentinel key: %q", i.iterKV.K)) + } + + i.iterKV = i.iter.NextPrefix(i.prefixOrFullSeekKey) + if invariants.Enabled && i.iterKV != nil { + if p := i.comparer.Split.Prefix(i.iterKV.K.UserKey); i.cmp(p, i.prefixOrFullSeekKey) < 0 { panic(errors.AssertionFailedf("pebble: iter.NextPrefix did not advance beyond the current prefix: now at %q; expected to be geq %q", - i.iterKey, i.prefixOrFullSeekKey)) + i.iterKV.K, i.prefixOrFullSeekKey)) } } } @@ -1837,7 +1896,7 @@ func (i *Iterator) nextWithLimit(limit []byte) IterValidityState { i.iterValidityState = IterExhausted return i.iterValidityState } else if i.iterValidityState == IterExhausted { - // No-op, already exhasuted. We avoid executing the Next because it + // No-op, already exhausted. We avoid executing the Next because it // can break invariants: Specifically, a file that fails the bloom // filter test may result in its level being removed from the // merging iterator. The level's removal can cause a lazy combined @@ -1876,18 +1935,21 @@ func (i *Iterator) nextWithLimit(limit []byte) IterValidityState { // Switching directions. 
// Unless the iterator was exhausted, reverse iteration needs to // position the iterator at iterPosPrev. - if i.iterKey != nil { + if i.iterKV != nil { i.err = errors.New("switching from reverse to forward but iter is not at prev") i.iterValidityState = IterExhausted return i.iterValidityState } // We're positioned before the first key. Need to reposition to point to // the first key. - i.iterFirstWithinBounds() + if i.err = i.iterFirstWithinBounds(); i.err != nil { + i.iterValidityState = IterExhausted + return i.iterValidityState + } case iterPosCurReversePaused: // Switching directions. // The iterator must not be exhausted since it paused. - if i.iterKey == nil { + if i.iterKV == nil { i.err = errors.New("switching paused from reverse to forward but iter is exhausted") i.iterValidityState = IterExhausted return i.iterValidityState @@ -1900,13 +1962,17 @@ func (i *Iterator) nextWithLimit(limit []byte) IterValidityState { // nextUserKey to save the current key i.iter is pointing at in order // to determine when the next user-key is reached. i.iterValidityState = IterExhausted - if i.iterKey == nil { + if i.iterKV == nil { // We're positioned before the first key. Need to reposition to point to // the first key. - i.iterFirstWithinBounds() + i.err = i.iterFirstWithinBounds() } else { i.nextUserKey() } + if i.err != nil { + i.iterValidityState = IterExhausted + return i.iterValidityState + } i.nextUserKey() case iterPosNext: // Already at the right place. @@ -1998,15 +2064,21 @@ func (i *Iterator) PrevWithLimit(limit []byte) IterValidityState { // to prevUserKey to save the current key i.iter is pointing at in // order to determine when the prev user-key is reached. i.iterValidityState = IterExhausted - if i.iterKey == nil { + if i.iterKV == nil { // We're positioned after the last key. Need to reposition to point to // the last key. 
- i.iterLastWithinBounds() + i.err = i.iterLastWithinBounds() } else { i.prevUserKey() } + if i.err != nil { + return i.iterValidityState + } if stepAgain { i.prevUserKey() + if i.err != nil { + return i.iterValidityState + } } } i.findPrevEntry(limit) @@ -2016,24 +2088,32 @@ func (i *Iterator) PrevWithLimit(limit []byte) IterValidityState { // iterFirstWithinBounds moves the internal iterator to the first key, // respecting bounds. -func (i *Iterator) iterFirstWithinBounds() { +func (i *Iterator) iterFirstWithinBounds() error { i.stats.ForwardSeekCount[InternalIterCall]++ if lowerBound := i.opts.GetLowerBound(); lowerBound != nil { - i.iterKey, i.iterValue = i.iter.SeekGE(lowerBound, base.SeekGEFlagsNone) + i.iterKV = i.iter.SeekGE(lowerBound, base.SeekGEFlagsNone) } else { - i.iterKey, i.iterValue = i.iter.First() + i.iterKV = i.iter.First() } + if i.iterKV == nil { + return i.iter.Error() + } + return nil } // iterLastWithinBounds moves the internal iterator to the last key, respecting // bounds. -func (i *Iterator) iterLastWithinBounds() { +func (i *Iterator) iterLastWithinBounds() error { i.stats.ReverseSeekCount[InternalIterCall]++ if upperBound := i.opts.GetUpperBound(); upperBound != nil { - i.iterKey, i.iterValue = i.iter.SeekLT(upperBound, base.SeekLTFlagsNone) + i.iterKV = i.iter.SeekLT(upperBound, base.SeekLTFlagsNone) } else { - i.iterKey, i.iterValue = i.iter.Last() + i.iterKV = i.iter.Last() + } + if i.iterKV == nil { + return i.iter.Error() } + return nil } // RangeKeyData describes a range key's data, set through RangeKeySet. 
The key @@ -2119,7 +2199,7 @@ func (i *Iterator) saveRangeKey() { if invariants.Enabled { if s.Keys[j].Kind() != base.InternalKeyKindRangeKeySet { panic("pebble: user iteration encountered non-RangeKeySet key kind") - } else if j > 0 && i.cmp(s.Keys[j].Suffix, s.Keys[j-1].Suffix) < 0 { + } else if j > 0 && i.comparer.CompareRangeSuffixes(s.Keys[j].Suffix, s.Keys[j-1].Suffix) < 0 { panic("pebble: user iteration encountered range keys not in suffix order") } } @@ -2200,6 +2280,7 @@ func (i *Iterator) ValueAndErr() ([]byte, error) { val, callerOwned, err := i.value.Value(i.lazyValueBuf) if err != nil { i.err = err + i.iterValidityState = IterExhausted } if callerOwned { i.lazyValueBuf = val[:0] @@ -2210,7 +2291,7 @@ func (i *Iterator) ValueAndErr() ([]byte, error) { // LazyValue returns the LazyValue. Only for advanced use cases. // REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint. func (i *Iterator) LazyValue() LazyValue { - return i.value + return i.value.LazyValue() } // RangeKeys returns the range key values and their suffixes covering the @@ -2229,7 +2310,7 @@ func (i *Iterator) Valid() bool { valid := i.iterValidityState == IterValid && !i.requiresReposition if invariants.Enabled { if err := i.Error(); valid && err != nil { - panic(errors.WithSecondaryError(errors.AssertionFailedf("pebble: iterator is valid with non-nil Error"), err)) + panic(errors.AssertionFailedf("pebble: iterator is valid with non-nil Error: %+v", err)) } } return valid @@ -2237,10 +2318,13 @@ func (i *Iterator) Valid() bool { // Error returns any accumulated error. 
func (i *Iterator) Error() error { + if i.err != nil { + return i.err + } if i.iter != nil { - return firstError(i.err, i.iter.Error()) + return i.iter.Error() } - return i.err + return nil } const maxKeyBufCacheSize = 4 << 10 // 4 KB @@ -2266,12 +2350,13 @@ func (i *Iterator) Close() error { // NB: If the iterators were still connected to i.iter, they may be // closed, but calling Close on a closed internal iterator or fragment // iterator is allowed. - if i.pointIter != nil && !i.closePointIterOnce { + if i.pointIter != nil { i.err = firstError(i.err, i.pointIter.Close()) } if i.rangeKey != nil && i.rangeKey.rangeKeyIter != nil { - i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close()) + i.rangeKey.rangeKeyIter.Close() } + i.err = firstError(i.err, i.blobValueFetcher.Close()) } err := i.err @@ -2300,11 +2385,8 @@ func (i *Iterator) Close() error { if i.version != nil { i.version.Unref() } - - for _, readers := range i.externalReaders { - for _, r := range readers { - err = firstError(err, r.Close()) - } + if i.externalIter != nil { + err = firstError(err, i.externalIter.Close()) } // Close the closer for the current value if one was open. @@ -2314,7 +2396,6 @@ func (i *Iterator) Close() error { } if i.rangeKey != nil { - i.rangeKey.rangeKeyBuffers.PrepareForReuse() *i.rangeKey = iteratorRangeKeyState{ rangeKeyBuffers: i.rangeKey.rangeKeyBuffers, @@ -2323,30 +2404,50 @@ func (i *Iterator) Close() error { i.rangeKey = nil } if alloc := i.alloc; alloc != nil { + var ( + keyBuf []byte + boundsBuf [2][]byte + prefixOrFullSeekKey []byte + mergingIterHeapItems []mergingIterHeapItem + ) + // Avoid caching the key buf if it is overly large. The constant is fairly // arbitrary. 
- if cap(i.keyBuf) >= maxKeyBufCacheSize { - alloc.keyBuf = nil - } else { - alloc.keyBuf = i.keyBuf + if cap(i.keyBuf) < maxKeyBufCacheSize { + keyBuf = i.keyBuf } - if cap(i.prefixOrFullSeekKey) >= maxKeyBufCacheSize { - alloc.prefixOrFullSeekKey = nil - } else { - alloc.prefixOrFullSeekKey = i.prefixOrFullSeekKey + if cap(i.prefixOrFullSeekKey) < maxKeyBufCacheSize { + prefixOrFullSeekKey = i.prefixOrFullSeekKey } for j := range i.boundsBuf { - if cap(i.boundsBuf[j]) >= maxKeyBufCacheSize { - alloc.boundsBuf[j] = nil - } else { - alloc.boundsBuf[j] = i.boundsBuf[j] + if cap(i.boundsBuf[j]) < maxKeyBufCacheSize { + boundsBuf[j] = i.boundsBuf[j] } } - *alloc = iterAlloc{ - keyBuf: alloc.keyBuf, - boundsBuf: alloc.boundsBuf, - prefixOrFullSeekKey: alloc.prefixOrFullSeekKey, - } + mergingIterHeapItems = alloc.merging.heap.items + + // Reset the alloc struct, re-assign the fields that are being recycled, and + // then return it to the pool. Splitting the first two steps performs better + // than doing them in a single step (e.g. *alloc = iterAlloc{...}) because + // the compiler can avoid the use of a stack allocated autotmp iterAlloc + // variable (~12KB, as of Dec 2024), which must first be zeroed out, then + // assigned into, then copied over into the heap-allocated alloc. Instead, + // the two-step process allows the compiler to quickly zero out the heap + // allocated object and then assign the few fields we want to preserve. + // + // TODO(nvanbenschoten): even with this optimization, zeroing out the alloc + // struct still shows up in profiles because it is such a large struct. Can + // we do something better here? We are hanging 22 separated iterators off of + // the alloc struct (or more, depending on how you count), many of which are + // only used in a few cases. Can those iterators be responsible for zeroing + // out their own memory on Close, allowing us to assume that most of the + // alloc struct is already zeroed out by this point? 
+ *alloc = iterAlloc{} + alloc.keyBuf = keyBuf + alloc.boundsBuf = boundsBuf + alloc.prefixOrFullSeekKey = prefixOrFullSeekKey + alloc.merging.heap.items = mergingIterHeapItems + iterAllocPool.Put(alloc) } else if alloc := i.getIterAlloc; alloc != nil { if cap(i.keyBuf) >= maxKeyBufCacheSize { @@ -2407,6 +2508,22 @@ func (i *Iterator) SetBounds(lower, upper []byte) { i.invalidate() } +// SetContext replaces the context provided at iterator creation, or the last +// one provided by SetContext. Even though iterators are expected to be +// short-lived, there are some cases where either (a) iterators are used far +// from the code that created them, (b) iterators are reused (while being +// short-lived) for processing different requests. For such scenarios, we +// allow the caller to replace the context. +func (i *Iterator) SetContext(ctx context.Context) { + i.ctx = ctx + i.iter.SetContext(ctx) + // If the iterator has an open point iterator that's not currently being + // used, propagate the new context to it. + if i.pointIter != nil && !i.opts.pointKeys() { + i.pointIter.SetContext(i.ctx) + } +} + // Initialization and changing of the bounds must call processBounds. // processBounds saves the bounds and computes derived state from those // bounds. @@ -2426,14 +2543,12 @@ func (i *Iterator) processBounds(lower, upper []byte) { if upper != nil { buf = append(buf, upper...) i.opts.UpperBound = buf[len(buf)-len(upper):] - if i.comparer.Split != nil { - if i.comparer.Split(i.opts.UpperBound) != len(i.opts.UpperBound) { - // Setting an upper bound that is a versioned MVCC key. This means - // that a key can have some MVCC versions before the upper bound and - // some after. This causes significant complications for NextPrefix, - // so we bar the user of NextPrefix. - i.nextPrefixNotPermittedByUpperBound = true - } + if i.comparer.Split(i.opts.UpperBound) != len(i.opts.UpperBound) { + // Setting an upper bound that is a versioned MVCC key. 
This means + // that a key can have some MVCC versions before the upper bound and + // some after. This causes significant complications for NextPrefix, + // so we bar the user of NextPrefix. + i.nextPrefixNotPermittedByUpperBound = true } } else { i.opts.UpperBound = nil @@ -2460,7 +2575,7 @@ func (i *Iterator) processBounds(lower, upper []byte) { // // If only lower and upper bounds need to be modified, prefer SetBounds. func (i *Iterator) SetOptions(o *IterOptions) { - if i.externalReaders != nil { + if i.externalIter != nil { if err := validateExternalIterOpts(o); err != nil { panic(err) } @@ -2481,14 +2596,8 @@ func (i *Iterator) SetOptions(o *IterOptions) { // If OnlyReadGuaranteedDurable changed, the iterator stacks are incorrect, // improperly including or excluding memtables. Invalidate them so that // finishInitializingIter will reconstruct them. - // - // If either the original options or the new options specify a table filter, - // we need to reconstruct the iterator stacks. If they both supply a table - // filter, we can't be certain that it's the same filter since we have no - // mechanism to compare the filter closures. closeBoth := i.err != nil || - o.OnlyReadGuaranteedDurable != i.opts.OnlyReadGuaranteedDurable || - o.TableFilter != nil || i.opts.TableFilter != nil + o.OnlyReadGuaranteedDurable != i.opts.OnlyReadGuaranteedDurable // If either options specify block property filters for an iterator stack, // reconstruct it. 
@@ -2500,7 +2609,7 @@ func (i *Iterator) SetOptions(o *IterOptions) { } if i.rangeKey != nil { if closeBoth || len(o.RangeKeyFilters) > 0 || len(i.opts.RangeKeyFilters) > 0 { - i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close()) + i.rangeKey.rangeKeyIter.Close() i.rangeKey = nil } else { // If there's still a range key iterator stack, invalidate the @@ -2517,7 +2626,7 @@ func (i *Iterator) SetOptions(o *IterOptions) { // iterator or range-key iterator but we require one, it'll be created in // the slow path that reconstructs the iterator in finishInitializingIter. if i.batch != nil { - nextBatchSeqNum := (uint64(len(i.batch.data)) | base.InternalKeySeqNumBatch) + nextBatchSeqNum := (base.SeqNum(len(i.batch.data)) | base.SeqNumBatchBit) if nextBatchSeqNum != i.batchSeqNum { i.batchSeqNum = nextBatchSeqNum if i.merging != nil { @@ -2558,7 +2667,7 @@ func (i *Iterator) SetOptions(o *IterOptions) { // iterator stack. We need to reconstruct the range key // iterator to add i.batchRangeKeyIter into the iterator // stack. - i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close()) + i.rangeKey.rangeKeyIter.Close() i.rangeKey = nil } else { // There are range keys in the batch and we already @@ -2593,7 +2702,7 @@ func (i *Iterator) SetOptions(o *IterOptions) { if boundsEqual && o.KeyTypes == i.opts.KeyTypes && (i.pointIter != nil || !i.opts.pointKeys()) && (i.rangeKey != nil || !i.opts.rangeKeys() || i.opts.KeyTypes == IterKeyTypePointsAndRanges) && - i.equal(o.RangeKeyMasking.Suffix, i.opts.RangeKeyMasking.Suffix) && + i.comparer.CompareRangeSuffixes(o.RangeKeyMasking.Suffix, i.opts.RangeKeyMasking.Suffix) == 0 && o.UseL6Filters == i.opts.UseL6Filters { // The options are identical, so we can likely use the fast path. In // addition to all the above constraints, we cannot use the fast path if @@ -2638,18 +2747,17 @@ func (i *Iterator) SetOptions(o *IterOptions) { // Iterators created through NewExternalIter have a different iterator // initialization process. 
- if i.externalReaders != nil { - finishInitializingExternal(i.ctx, i) + if i.externalIter != nil { + _ = finishInitializingExternal(i.ctx, i) return } finishInitializingIter(i.ctx, i.alloc) } func (i *Iterator) invalidate() { - i.lastPositioningOp = invalidatedLastPositionOp + i.lastPositioningOp = unknownLastPositionOp i.hasPrefix = false - i.iterKey = nil - i.iterValue = LazyValue{} + i.iterKV = nil i.err = nil // This switch statement isn't necessary for correctness since callers // should call a repositioning method. We could have arbitrarily set i.pos @@ -2731,7 +2839,9 @@ func (i *Iterator) CloneWithContext(ctx context.Context, opts CloneOptions) (*It if opts.IterOptions == nil { opts.IterOptions = &i.opts } - + if i.batchOnlyIter { + return nil, errors.Errorf("cannot Clone a batch-only Iterator") + } readState := i.readState vers := i.version if readState == nil && vers == nil { @@ -2766,6 +2876,7 @@ func (i *Iterator) CloneWithContext(ctx context.Context, opts CloneOptions) (*It boundsBuf: buf.boundsBuf, batch: i.batch, batchSeqNum: i.batchSeqNum, + fc: i.fc, newIters: i.newIters, newIterRangeKey: i.newIterRangeKey, seqNum: i.seqNum, @@ -2775,7 +2886,7 @@ func (i *Iterator) CloneWithContext(ctx context.Context, opts CloneOptions) (*It // If the caller requested the clone have a current view of the indexed // batch, set the clone's batch sequence number appropriately. 
if i.batch != nil && opts.RefreshBatchView { - dbi.batchSeqNum = (uint64(len(i.batch.data)) | base.InternalKeySeqNumBatch) + dbi.batchSeqNum = (base.SeqNum(len(i.batch.data)) | base.SeqNumBatchBit) } return finishInitializingIter(ctx, buf), nil @@ -2974,8 +3085,8 @@ func (i *Iterator) internalNext() (internalNextValidity, base.InternalKeyKind) { switch i.pos { case iterPosCurForward: - i.iterKey, i.iterValue = i.iter.Next() - if i.iterKey == nil { + i.iterKV = i.iter.Next() + if i.iterKV == nil { // We check i.iter.Error() here and return an internalNextError enum // variant so that the caller does not need to check i.iter.Error() // in the common case that the next internal key has a new user key. @@ -2984,8 +3095,8 @@ func (i *Iterator) internalNext() (internalNextValidity, base.InternalKeyKind) { } i.pos = iterPosNext return internalNextExhausted, base.InternalKeyKindInvalid - } else if i.comparer.Equal(i.iterKey.UserKey, i.key) { - return internalNextValid, i.iterKey.Kind() + } else if i.comparer.Equal(i.iterKV.K.UserKey, i.key) { + return internalNextValid, i.iterKV.Kind() } i.pos = iterPosNext return internalNextExhausted, base.InternalKeyKindInvalid @@ -3006,3 +3117,16 @@ func (i *Iterator) internalNext() (internalNextValidity, base.InternalKeyKind) { panic("unreachable") } } + +var _ base.IteratorDebug = (*Iterator)(nil) + +// DebugTree implements the base.IteratorDebug interface. 
+func (i *Iterator) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", i, i) + if i.iter != nil { + i.iter.DebugTree(n) + } + if i.pointIter != nil { + i.pointIter.DebugTree(n) + } +} diff --git a/vendor/github.com/cockroachdb/pebble/level_checker.go b/vendor/github.com/cockroachdb/pebble/v2/level_checker.go similarity index 62% rename from vendor/github.com/cockroachdb/pebble/level_checker.go rename to vendor/github.com/cockroachdb/pebble/v2/level_checker.go index 2901c45..31e4580 100644 --- a/vendor/github.com/cockroachdb/pebble/level_checker.go +++ b/vendor/github.com/cockroachdb/pebble/v2/level_checker.go @@ -5,15 +5,21 @@ package pebble import ( + stdcmp "cmp" "context" "fmt" "io" + "iter" + "slices" "sort" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/sstable" + "github.com/cockroachdb/pebble/v2/sstable/blob" + "github.com/cockroachdb/pebble/v2/sstable/block" ) // This file implements DB.CheckLevels() which checks that every entry in the @@ -49,16 +55,22 @@ import ( type simpleMergingIterLevel struct { iter internalIterator rangeDelIter keyspan.FragmentIterator - levelIterBoundaryContext - iterKey *InternalKey - iterValue base.LazyValue + iterKV *base.InternalKV tombstone *keyspan.Span } +func (ml *simpleMergingIterLevel) setRangeDelIter(iter keyspan.FragmentIterator) { + ml.tombstone = nil + if ml.rangeDelIter != nil { + ml.rangeDelIter.Close() + } + ml.rangeDelIter = iter +} + type simpleMergingIter struct { levels []simpleMergingIterLevel - snapshot uint64 + snapshot base.SeqNum heap simpleMergingIterHeap // The last point's key and level. For validation. 
lastKey InternalKey @@ -76,7 +88,7 @@ type simpleMergingIter struct { func (m *simpleMergingIter) init( merge Merge, cmp Compare, - snapshot uint64, + snapshot base.SeqNum, formatKey base.FormatKey, levels ...simpleMergingIterLevel, ) { @@ -89,14 +101,13 @@ func (m *simpleMergingIter) init( m.heap.items = make([]simpleMergingIterItem, 0, len(levels)) for i := range m.levels { l := &m.levels[i] - l.iterKey, l.iterValue = l.iter.First() - if l.iterKey != nil { + l.iterKV = l.iter.First() + if l.iterKV != nil { item := simpleMergingIterItem{ index: i, - value: l.iterValue, + kv: *l.iterKV, } - item.key.Trailer = l.iterKey.Trailer - item.key.UserKey = append(item.key.UserKey[:0], l.iterKey.UserKey...) + item.kv.K = l.iterKV.K.Clone() m.heap.items = append(m.heap.items, item) } } @@ -117,7 +128,9 @@ func (m *simpleMergingIter) positionRangeDels() { if l.rangeDelIter == nil { continue } - l.tombstone = l.rangeDelIter.SeekGE(item.key.UserKey) + t, err := l.rangeDelIter.SeekGE(item.kv.K.UserKey) + m.err = firstError(m.err, err) + l.tombstone = t } } @@ -129,97 +142,11 @@ func (m *simpleMergingIter) step() bool { item := &m.heap.items[0] l := &m.levels[item.index] // Sentinels are not relevant for this point checking. - if !item.key.IsExclusiveSentinel() && item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) { - m.numPoints++ - keyChanged := m.heap.cmp(item.key.UserKey, m.lastKey.UserKey) != 0 - if !keyChanged { - // At the same user key. We will see them in decreasing seqnum - // order so the lastLevel must not be lower. - if m.lastLevel > item.index { - m.err = errors.Errorf("found InternalKey %s in %s and InternalKey %s in %s", - item.key.Pretty(m.formatKey), l.iter, m.lastKey.Pretty(m.formatKey), - m.lastIterMsg) - return false - } - m.lastLevel = item.index - } else { - // The user key has changed. - m.lastKey.Trailer = item.key.Trailer - m.lastKey.UserKey = append(m.lastKey.UserKey[:0], item.key.UserKey...) 
- m.lastLevel = item.index - } - // Ongoing series of MERGE records ends with a MERGE record. - if keyChanged && m.valueMerger != nil { - var closer io.Closer - _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) - if m.err == nil && closer != nil { - m.err = closer.Close() - } - m.valueMerger = nil - } - itemValue, _, err := item.value.Value(nil) - if err != nil { - m.err = err + if !item.kv.K.IsExclusiveSentinel() && item.kv.K.Visible(m.snapshot, base.SeqNumMax) { + // This is a visible point key. + if !m.handleVisiblePoint(item, l) { return false } - if m.valueMerger != nil { - // Ongoing series of MERGE records. - switch item.key.Kind() { - case InternalKeyKindSingleDelete, InternalKeyKindDelete, InternalKeyKindDeleteSized: - var closer io.Closer - _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) - if m.err == nil && closer != nil { - m.err = closer.Close() - } - m.valueMerger = nil - case InternalKeyKindSet, InternalKeyKindSetWithDelete: - m.err = m.valueMerger.MergeOlder(itemValue) - if m.err == nil { - var closer io.Closer - _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) - if m.err == nil && closer != nil { - m.err = closer.Close() - } - } - m.valueMerger = nil - case InternalKeyKindMerge: - m.err = m.valueMerger.MergeOlder(itemValue) - default: - m.err = errors.Errorf("pebble: invalid internal key kind %s in %s", - item.key.Pretty(m.formatKey), - l.iter) - return false - } - } else if item.key.Kind() == InternalKeyKindMerge && m.err == nil { - // New series of MERGE records. - m.valueMerger, m.err = m.merge(item.key.UserKey, itemValue) - } - if m.err != nil { - m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s", - item.key.Pretty(m.formatKey), l.iter) - return false - } - // Is this point covered by a tombstone at a lower level? Note that all these - // iterators must be positioned at a key > item.key. So the Largest key bound - // of the sstable containing the tombstone >= item.key. 
So the upper limit of - // the tombstone cannot be file-bounds-constrained to < item.key. But it is - // possible that item.key < smallest key bound of the sstable, in which case - // this tombstone should be ignored. - for level := item.index + 1; level < len(m.levels); level++ { - lvl := &m.levels[level] - if lvl.rangeDelIter == nil || lvl.tombstone.Empty() { - continue - } - if (lvl.smallestUserKey == nil || m.heap.cmp(lvl.smallestUserKey, item.key.UserKey) <= 0) && - lvl.tombstone.Contains(m.heap.cmp, item.key.UserKey) { - if lvl.tombstone.CoversAt(m.snapshot, item.key.SeqNum()) { - m.err = errors.Errorf("tombstone %s in %s deletes key %s in %s", - lvl.tombstone.Pretty(m.formatKey), lvl.iter, item.key.Pretty(m.formatKey), - l.iter) - return false - } - } - } } // The iterator for the current level may be closed in the following call to @@ -228,67 +155,162 @@ func (m *simpleMergingIter) step() bool { m.lastIterMsg = l.iter.String() // Step to the next point. - if l.iterKey, l.iterValue = l.iter.Next(); l.iterKey != nil { + l.iterKV = l.iter.Next() + if l.iterKV == nil { + m.err = errors.CombineErrors(l.iter.Error(), l.iter.Close()) + l.iter = nil + m.heap.pop() + } else { // Check point keys in an sstable are ordered. Although not required, we check // for memtables as well. A subtle check here is that successive sstables of // L1 and higher levels are ordered. This happens when levelIter moves to the // next sstable in the level, in which case item.key is previous sstable's // last point key. 
- if base.InternalCompare(m.heap.cmp, item.key, *l.iterKey) >= 0 { + if !l.iterKV.K.IsExclusiveSentinel() && base.InternalCompare(m.heap.cmp, item.kv.K, l.iterKV.K) >= 0 { m.err = errors.Errorf("out of order keys %s >= %s in %s", - item.key.Pretty(m.formatKey), l.iterKey.Pretty(m.formatKey), l.iter) + item.kv.K.Pretty(m.formatKey), l.iterKV.K.Pretty(m.formatKey), l.iter) return false } - item.key.Trailer = l.iterKey.Trailer - item.key.UserKey = append(item.key.UserKey[:0], l.iterKey.UserKey...) - item.value = l.iterValue + userKeyBuf := item.kv.K.UserKey[:0] + item.kv = *l.iterKV + item.kv.K.UserKey = append(userKeyBuf, l.iterKV.K.UserKey...) if m.heap.len() > 1 { m.heap.fix(0) } - } else { - m.err = l.iter.Close() - l.iter = nil - m.heap.pop() } if m.err != nil { return false } if m.heap.len() == 0 { - // Last record was a MERGE record. + // If m.valueMerger != nil, the last record was a MERGE record. if m.valueMerger != nil { + var closer io.Closer + var err error + _, closer, err = m.valueMerger.Finish(true /* includesBase */) + if closer != nil { + err = errors.CombineErrors(err, closer.Close()) + } + if err != nil { + m.err = errors.CombineErrors(m.err, + errors.Wrapf(err, "merge processing error on key %s in %s", + item.kv.K.Pretty(m.formatKey), m.lastIterMsg)) + } + m.valueMerger = nil + } + return false + } + m.positionRangeDels() + return true +} + +// handleVisiblePoint returns true if validation succeeded and level checking +// can continue. +func (m *simpleMergingIter) handleVisiblePoint( + item *simpleMergingIterItem, l *simpleMergingIterLevel, +) (ok bool) { + m.numPoints++ + keyChanged := m.heap.cmp(item.kv.K.UserKey, m.lastKey.UserKey) != 0 + if !keyChanged { + // At the same user key. We will see them in decreasing seqnum + // order so the lastLevel must not be lower. 
+ if m.lastLevel > item.index { + m.err = errors.Errorf("found InternalKey %s in %s and InternalKey %s in %s", + item.kv.K.Pretty(m.formatKey), l.iter, m.lastKey.Pretty(m.formatKey), + m.lastIterMsg) + return false + } + m.lastLevel = item.index + } else { + // The user key has changed. + m.lastKey.Trailer = item.kv.K.Trailer + m.lastKey.UserKey = append(m.lastKey.UserKey[:0], item.kv.K.UserKey...) + m.lastLevel = item.index + } + // Ongoing series of MERGE records ends with a MERGE record. + if keyChanged && m.valueMerger != nil { + var closer io.Closer + _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) + if m.err == nil && closer != nil { + m.err = closer.Close() + } + m.valueMerger = nil + } + itemValue, _, err := item.kv.Value(nil) + if err != nil { + m.err = err + return false + } + if m.valueMerger != nil { + // Ongoing series of MERGE records. + switch item.kv.K.Kind() { + case InternalKeyKindSingleDelete, InternalKeyKindDelete, InternalKeyKindDeleteSized: var closer io.Closer _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) if m.err == nil && closer != nil { m.err = closer.Close() } - if m.err != nil { - m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s", - item.key.Pretty(m.formatKey), m.lastIterMsg) + m.valueMerger = nil + case InternalKeyKindSet, InternalKeyKindSetWithDelete: + m.err = m.valueMerger.MergeOlder(itemValue) + if m.err == nil { + var closer io.Closer + _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) + if m.err == nil && closer != nil { + m.err = closer.Close() + } } m.valueMerger = nil + case InternalKeyKindMerge: + m.err = m.valueMerger.MergeOlder(itemValue) + default: + m.err = errors.Errorf("pebble: invalid internal key kind %s in %s", + item.kv.K.Pretty(m.formatKey), + l.iter) + return false } + } else if item.kv.K.Kind() == InternalKeyKindMerge && m.err == nil { + // New series of MERGE records. 
+ m.valueMerger, m.err = m.merge(item.kv.K.UserKey, itemValue) + } + if m.err != nil { + m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s", + item.kv.K.Pretty(m.formatKey), l.iter) return false } - m.positionRangeDels() + // Is this point covered by a tombstone at a lower level? Note that all these + // iterators must be positioned at a key > item.key. + for level := item.index + 1; level < len(m.levels); level++ { + lvl := &m.levels[level] + if lvl.rangeDelIter == nil || lvl.tombstone.Empty() { + continue + } + if lvl.tombstone.Contains(m.heap.cmp, item.kv.K.UserKey) && lvl.tombstone.CoversAt(m.snapshot, item.kv.K.SeqNum()) { + m.err = errors.Errorf("tombstone %s in %s deletes key %s in %s", + lvl.tombstone.Pretty(m.formatKey), lvl.iter, item.kv.K.Pretty(m.formatKey), + l.iter) + return false + } + } return true } -// Checking that range tombstones are mutually consistent is performed by checkRangeTombstones(). -// See the overview comment at the top of the file. +// Checking that range tombstones are mutually consistent is performed by +// checkRangeTombstones(). See the overview comment at the top of the file. // // We do this check as follows: -// - For each level that can have untruncated tombstones, compute the atomic compaction -// bounds (getAtomicUnitBounds()) and use them to truncate tombstones. -// - Now that we have a set of truncated tombstones for each level, put them into one -// pool of tombstones along with their level information (addTombstonesFromIter()). -// - Collect the start and end user keys from all these tombstones (collectAllUserKey()) and use -// them to fragment all the tombstones (fragmentUsingUserKey()). -// - Sort tombstones by start key and decreasing seqnum (tombstonesByStartKeyAndSeqnum) -- all -// tombstones that have the same start key will have the same end key because they have been -// fragmented. 
+// - Collect the tombstones for each level, put them into one pool of tombstones +// along with their level information (addTombstonesFromIter()). +// - Collect the start and end user keys from all these tombstones +// (collectAllUserKey()) and use them to fragment all the tombstones +// (fragmentUsingUserKey()). +// - Sort tombstones by start key and decreasing seqnum (all tombstones that +// have the same start key will have the same end key because they have been +// fragmented) // - Iterate and check (iterateAndCheckTombstones()). -// Note that this simple approach requires holding all the tombstones across all levels in-memory. -// A more sophisticated incremental approach could be devised, if necessary. +// +// Note that this simple approach requires holding all the tombstones across all +// levels in-memory. A more sophisticated incremental approach could be devised, +// if necessary. // A tombstone and the corresponding level it was found in. type tombstoneWithLevel struct { @@ -296,36 +318,18 @@ type tombstoneWithLevel struct { level int // The level in LSM. A -1 means it's a memtable. lsmLevel int - fileNum FileNum -} - -// For sorting tombstoneWithLevels in increasing order of start UserKey and -// for the same start UserKey in decreasing order of seqnum. 
-type tombstonesByStartKeyAndSeqnum struct { - cmp Compare - buf []tombstoneWithLevel -} - -func (v *tombstonesByStartKeyAndSeqnum) Len() int { return len(v.buf) } -func (v *tombstonesByStartKeyAndSeqnum) Less(i, j int) bool { - less := v.cmp(v.buf[i].Start, v.buf[j].Start) - if less == 0 { - return v.buf[i].LargestSeqNum() > v.buf[j].LargestSeqNum() - } - return less < 0 -} -func (v *tombstonesByStartKeyAndSeqnum) Swap(i, j int) { - v.buf[i], v.buf[j] = v.buf[j], v.buf[i] + tableNum base.TableNum } func iterateAndCheckTombstones( cmp Compare, formatKey base.FormatKey, tombstones []tombstoneWithLevel, ) error { - sortBuf := tombstonesByStartKeyAndSeqnum{ - cmp: cmp, - buf: tombstones, - } - sort.Sort(&sortBuf) + slices.SortFunc(tombstones, func(a, b tombstoneWithLevel) int { + if v := cmp(a.Start, b.Start); v != 0 { + return v + } + return stdcmp.Compare(b.LargestSeqNum(), a.LargestSeqNum()) + }) // For a sequence of tombstones that share the same start UserKey, we will // encounter them in non-increasing seqnum order and so should encounter them @@ -335,8 +339,8 @@ func iterateAndCheckTombstones( if cmp(lastTombstone.Start, t.Start) == 0 && lastTombstone.level > t.level { return errors.Errorf("encountered tombstone %s in %s"+ " that has a lower seqnum than the same tombstone in %s", - t.Span.Pretty(formatKey), levelOrMemtable(t.lsmLevel, t.fileNum), - levelOrMemtable(lastTombstone.lsmLevel, lastTombstone.fileNum)) + t.Span.Pretty(formatKey), levelOrMemtable(t.lsmLevel, t.tableNum), + levelOrMemtable(lastTombstone.lsmLevel, lastTombstone.tableNum)) } lastTombstone = t } @@ -348,10 +352,14 @@ type checkConfig struct { comparer *Comparer readState *readState newIters tableNewIters - seqNum uint64 + seqNum base.SeqNum stats *CheckLevelsStats merge Merge formatKey base.FormatKey + readEnv block.ReadEnv + // blobValueFetcher is the ValueFetcher to use when retrieving values stored + // externally in blob files. 
+ blobValueFetcher blob.ValueFetcher } // cmp is shorthand for comparer.Compare. @@ -368,44 +376,30 @@ func checkRangeTombstones(c *checkConfig) error { if iter == nil { continue } - if tombstones, err = addTombstonesFromIter(iter, level, -1, 0, tombstones, - c.seqNum, c.cmp, c.formatKey, nil); err != nil { + tombstones, err = addTombstonesFromIter( + iter, level, -1, 0, tombstones, c.seqNum, c.cmp, c.formatKey, + ) + iter.Close() + if err != nil { return err } level++ } current := c.readState.current - addTombstonesFromLevel := func(files manifest.LevelIterator, lsmLevel int) error { - for f := files.First(); f != nil; f = files.Next() { - lf := files.Take() - atomicUnit, _ := expandToAtomicUnit(c.cmp, lf.Slice(), true /* disableIsCompacting */) - lower, upper := manifest.KeyRange(c.cmp, atomicUnit.Iter()) - iterToClose, iter, err := c.newIters( - context.Background(), lf.FileMetadata, &IterOptions{level: manifest.Level(lsmLevel)}, internalIterOpts{}) + addTombstonesFromLevel := func(files iter.Seq[*manifest.TableMetadata], lsmLevel int) error { + for f := range files { + iters, err := c.newIters( + context.Background(), f, &IterOptions{layer: manifest.Level(lsmLevel)}, + internalIterOpts{}, iterRangeDeletions) if err != nil { return err } - iterToClose.Close() - if iter == nil { - continue - } - truncate := func(t keyspan.Span) keyspan.Span { - // Same checks as in keyspan.Truncate. - if c.cmp(t.Start, lower.UserKey) < 0 { - t.Start = lower.UserKey - } - if c.cmp(t.End, upper.UserKey) > 0 { - t.End = upper.UserKey - } - if c.cmp(t.Start, t.End) >= 0 { - // Remove the keys. 
- t.Keys = t.Keys[:0] - } - return t - } - if tombstones, err = addTombstonesFromIter(iter, level, lsmLevel, f.FileNum, - tombstones, c.seqNum, c.cmp, c.formatKey, truncate); err != nil { + tombstones, err = addTombstonesFromIter(iters.RangeDeletion(), level, lsmLevel, f.TableNum, + tombstones, c.seqNum, c.cmp, c.formatKey) + _ = iters.CloseAll() + + if err != nil { return err } } @@ -416,14 +410,14 @@ func checkRangeTombstones(c *checkConfig) error { if current.L0SublevelFiles[i].Empty() { continue } - err := addTombstonesFromLevel(current.L0SublevelFiles[i].Iter(), 0) + err := addTombstonesFromLevel(current.L0SublevelFiles[i].All(), 0) if err != nil { return err } level++ } for i := 1; i < len(current.Levels); i++ { - if err := addTombstonesFromLevel(current.Levels[i].Iter(), i); err != nil { + if err := addTombstonesFromLevel(current.Levels[i].All(), i); err != nil { return err } level++ @@ -438,94 +432,64 @@ func checkRangeTombstones(c *checkConfig) error { return iterateAndCheckTombstones(c.cmp, c.formatKey, tombstones) } -func levelOrMemtable(lsmLevel int, fileNum FileNum) string { +func levelOrMemtable(lsmLevel int, tableNum base.TableNum) string { if lsmLevel == -1 { return "memtable" } - return fmt.Sprintf("L%d: fileNum=%s", lsmLevel, fileNum) + return fmt.Sprintf("L%d: fileNum=%s", lsmLevel, tableNum) } func addTombstonesFromIter( iter keyspan.FragmentIterator, level int, lsmLevel int, - fileNum FileNum, + tableNum base.TableNum, tombstones []tombstoneWithLevel, - seqNum uint64, + seqNum base.SeqNum, cmp Compare, formatKey base.FormatKey, - truncate func(tombstone keyspan.Span) keyspan.Span, ) (_ []tombstoneWithLevel, err error) { - defer func() { - err = firstError(err, iter.Close()) - }() - var prevTombstone keyspan.Span - for tomb := iter.First(); tomb != nil; tomb = iter.Next() { + tomb, err := iter.First() + for ; tomb != nil; tomb, err = iter.Next() { t := tomb.Visible(seqNum) if t.Empty() { continue } - t = t.DeepClone() + t = t.Clone() // This is 
mainly a test for rangeDelV2 formatted blocks which are expected to // be ordered and fragmented on disk. But we anyways check for memtables, // rangeDelV1 as well. if cmp(prevTombstone.End, t.Start) > 0 { return nil, errors.Errorf("unordered or unfragmented range delete tombstones %s, %s in %s", - prevTombstone.Pretty(formatKey), t.Pretty(formatKey), levelOrMemtable(lsmLevel, fileNum)) + prevTombstone.Pretty(formatKey), t.Pretty(formatKey), levelOrMemtable(lsmLevel, tableNum)) } prevTombstone = t - // Truncation of a tombstone must happen after checking its ordering, - // fragmentation wrt previous tombstone. Since it is possible that after - // truncation the tombstone is ordered, fragmented when it originally wasn't. - if truncate != nil { - t = truncate(t) - } if !t.Empty() { tombstones = append(tombstones, tombstoneWithLevel{ Span: t, level: level, lsmLevel: lsmLevel, - fileNum: fileNum, + tableNum: tableNum, }) } } + if err != nil { + return nil, err + } return tombstones, nil } -type userKeysSort struct { - cmp Compare - buf [][]byte -} - -func (v *userKeysSort) Len() int { return len(v.buf) } -func (v *userKeysSort) Less(i, j int) bool { - return v.cmp(v.buf[i], v.buf[j]) < 0 -} -func (v *userKeysSort) Swap(i, j int) { - v.buf[i], v.buf[j] = v.buf[j], v.buf[i] -} func collectAllUserKeys(cmp Compare, tombstones []tombstoneWithLevel) [][]byte { keys := make([][]byte, 0, len(tombstones)*2) for _, t := range tombstones { - keys = append(keys, t.Start) - keys = append(keys, t.End) - } - sorter := userKeysSort{ - cmp: cmp, - buf: keys, - } - sort.Sort(&sorter) - var last, curr int - for last, curr = -1, 0; curr < len(keys); curr++ { - if last < 0 || cmp(keys[last], keys[curr]) != 0 { - last++ - keys[last] = keys[curr] - } + keys = append(keys, t.Start, t.End) } - keys = keys[:last+1] - return keys + slices.SortFunc(keys, cmp) + return slices.CompactFunc(keys, func(a, b []byte) bool { + return cmp(a, b) == 0 + }) } func fragmentUsingUserKeys( @@ -581,11 +545,21 @@ 
func (d *DB) CheckLevels(stats *CheckLevelsStats) error { stats: stats, merge: d.merge, formatKey: d.opts.Comparer.FormatKey, + readEnv: block.ReadEnv{ + // TODO(jackson): Add categorized stats. + }, } + checkConfig.blobValueFetcher.Init(&readState.current.BlobFiles, d.fileCache, checkConfig.readEnv) + defer func() { _ = checkConfig.blobValueFetcher.Close() }() return checkLevelsInternal(checkConfig) } func checkLevelsInternal(c *checkConfig) (err error) { + internalOpts := internalIterOpts{ + readEnv: sstable.ReadEnv{Block: c.readEnv}, + blobValueFetcher: &c.blobValueFetcher, + } + // Phase 1: Use a simpleMergingIter to step through all the points and ensure // that points with the same user key at different levels are not inverted // wrt sequence numbers and the same holds for tombstones that cover points. @@ -602,7 +576,7 @@ func checkLevelsInternal(c *checkConfig) (err error) { l.iter = nil } if l.rangeDelIter != nil { - err = firstError(err, l.rangeDelIter.Close()) + l.rangeDelIter.Close() l.rangeDelIter = nil } } @@ -643,9 +617,8 @@ func checkLevelsInternal(c *checkConfig) (err error) { iterOpts := IterOptions{logger: c.logger} li := &levelIter{} li.init(context.Background(), iterOpts, c.comparer, c.newIters, manifestIter, - manifest.L0Sublevel(sublevel), internalIterOpts{}) - li.initRangeDel(&mlevelAlloc[0].rangeDelIter) - li.initBoundaryContext(&mlevelAlloc[0].levelIterBoundaryContext) + manifest.L0Sublevel(sublevel), internalOpts) + li.initRangeDel(&mlevelAlloc[0]) mlevelAlloc[0].iter = li mlevelAlloc = mlevelAlloc[1:] } @@ -657,9 +630,8 @@ func checkLevelsInternal(c *checkConfig) (err error) { iterOpts := IterOptions{logger: c.logger} li := &levelIter{} li.init(context.Background(), iterOpts, c.comparer, c.newIters, - current.Levels[level].Iter(), manifest.Level(level), internalIterOpts{}) - li.initRangeDel(&mlevelAlloc[0].rangeDelIter) - li.initBoundaryContext(&mlevelAlloc[0].levelIterBoundaryContext) + current.Levels[level].Iter(), 
manifest.Level(level), internalOpts) + li.initRangeDel(&mlevelAlloc[0]) mlevelAlloc[0].iter = li mlevelAlloc = mlevelAlloc[1:] } @@ -681,8 +653,7 @@ func checkLevelsInternal(c *checkConfig) (err error) { type simpleMergingIterItem struct { index int - key InternalKey - value base.LazyValue + kv base.InternalKV } type simpleMergingIterHeap struct { @@ -696,7 +667,7 @@ func (h *simpleMergingIterHeap) len() int { } func (h *simpleMergingIterHeap) less(i, j int) bool { - ikey, jkey := h.items[i].key, h.items[j].key + ikey, jkey := h.items[i].kv.K, h.items[j].kv.K if c := h.cmp(ikey.UserKey, jkey.UserKey); c != 0 { if h.reverse { return c > 0 diff --git a/vendor/github.com/cockroachdb/pebble/v2/level_iter.go b/vendor/github.com/cockroachdb/pebble/v2/level_iter.go new file mode 100644 index 0000000..d51c44a --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/level_iter.go @@ -0,0 +1,958 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "context" + "fmt" + "runtime/debug" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +// levelIter provides a merged view of the sstables in a level. +// +// levelIter is used during compaction and as part of the Iterator +// implementation. When used as part of the Iterator implementation, level +// iteration needs to "pause" at range deletion boundaries if file contains +// range deletions. In this case, the levelIter uses a keyspan.InterleavingIter +// to materialize InternalKVs at start and end boundaries of range deletions. 
+// This prevents mergingIter from advancing past the sstable until the sstable +// contains the smallest (or largest for reverse iteration) key in the merged +// heap. Note that mergingIter treats a range deletion tombstone returned by the +// point iterator as a no-op. +type levelIter struct { + // The context is stored here since (a) iterators are expected to be + // short-lived (since they pin sstables), (b) plumbing a context into every + // method is very painful, (c) they do not (yet) respect context + // cancellation and are only used for tracing. + ctx context.Context + logger Logger + comparer *Comparer + cmp Compare + split Split + // The lower/upper bounds for iteration as specified at creation or the most + // recent call to SetBounds. + lower []byte + upper []byte + // prefix holds the iteration prefix when the most recent absolute + // positioning method was a SeekPrefixGE. + prefix []byte + // The iterator options for the currently open table. If + // tableOpts.{Lower,Upper}Bound are nil, the corresponding iteration boundary + // does not lie within the table bounds. + tableOpts IterOptions + // The layer this levelIter is initialized for. This can be either + // a level L1+, an L0 sublevel, or a flushable ingests layer. + layer manifest.Layer + // combinedIterState may be set when a levelIter is used during user + // iteration. Although levelIter only iterates over point keys, it's also + // responsible for lazily constructing the combined range & point iterator + // when it observes a file containing range keys. If the combined iter + // state's initialized field is true, the iterator is already using combined + // iterator, OR the iterator is not configured to use combined iteration. If + // it's false, the levelIter must set the `triggered` and `key` fields when + // the levelIter passes over a file containing range keys. See the + // lazyCombinedIter for more details. + combinedIterState *combinedIterState + // The iter for the current file. 
It is nil under any of the following conditions: + // - files.Current() == nil + // - err != nil + // - some other constraint, like the bounds in opts, caused the file at index to not + // be relevant to the iteration. + iter internalIterator + // iterFile holds the current file. It is always equal to l.files.Current(). + iterFile *manifest.TableMetadata + newIters tableNewIters + files manifest.LevelIterator + err error + + // When rangeDelIterSetter != nil, the caller requires that this function + // gets called with a range deletion iterator whenever the current file + // changes. The iterator is relinquished to the caller which is responsible + // for closing it. + // + // When rangeDelIterSetter != nil, the levelIter will also interleave the + // boundaries of range deletions among point keys. + rangeDelIterSetter rangeDelIterSetter + + // interleaving is used when rangeDelIterFn != nil to interleave the + // boundaries of range deletions among point keys. When the leve iterator is + // used by a merging iterator, this ensures that we don't advance to a new + // file until the range deletions are no longer needed by other levels. + interleaving keyspan.InterleavingIter + + // internalOpts holds the internal iterator options to pass to the table + // cache when constructing new table iterators. + internalOpts internalIterOpts + + // Scratch space for the obsolete keys filter, when there are no other block + // property filters specified. See the performance note where + // IterOptions.PointKeyFilters is declared. + filtersBuf [1]BlockPropertyFilter + + // exhaustedDir is set to +1 or -1 when the levelIter has been exhausted in + // the forward or backward direction respectively. It is set when the + // underlying data is exhausted or when iteration has reached the upper or + // lower boundary and interleaved a synthetic iterator bound key. 
When the + // iterator is exhausted and Next or Prev is called, the levelIter uses + // exhaustedDir to determine whether the iterator should step on to the + // first or last key within iteration bounds. + exhaustedDir int8 + + // Disable invariant checks even if they are otherwise enabled. Used by tests + // which construct "impossible" situations (e.g. seeking to a key before the + // lower bound). + disableInvariants bool +} + +type rangeDelIterSetter interface { + setRangeDelIter(rangeDelIter keyspan.FragmentIterator) +} + +// levelIter implements the base.InternalIterator interface. +var _ base.InternalIterator = (*levelIter)(nil) + +// newLevelIter returns a levelIter. It is permissible to pass a nil split +// parameter if the caller is never going to call SeekPrefixGE. +func newLevelIter( + ctx context.Context, + opts IterOptions, + comparer *Comparer, + newIters tableNewIters, + files manifest.LevelIterator, + layer manifest.Layer, + internalOpts internalIterOpts, +) *levelIter { + l := &levelIter{} + l.init(ctx, opts, comparer, newIters, files, layer, internalOpts) + return l +} + +func (l *levelIter) init( + ctx context.Context, + opts IterOptions, + comparer *Comparer, + newIters tableNewIters, + files manifest.LevelIterator, + layer manifest.Layer, + internalOpts internalIterOpts, +) { + l.ctx = ctx + l.err = nil + l.layer = layer + l.logger = opts.getLogger() + l.prefix = nil + l.lower = opts.LowerBound + l.upper = opts.UpperBound + l.tableOpts.PointKeyFilters = opts.PointKeyFilters + if len(opts.PointKeyFilters) == 0 { + l.tableOpts.PointKeyFilters = l.filtersBuf[:0:1] + } + l.tableOpts.UseL6Filters = opts.UseL6Filters + l.tableOpts.Category = opts.Category + l.tableOpts.layer = l.layer + l.tableOpts.snapshotForHideObsoletePoints = opts.snapshotForHideObsoletePoints + l.comparer = comparer + l.cmp = comparer.Compare + l.split = comparer.Split + l.iterFile = nil + l.newIters = newIters + l.files = files + l.exhaustedDir = 0 + l.internalOpts = 
internalOpts +} + +// initRangeDel puts the level iterator into a mode where it interleaves range +// deletion boundaries with point keys and provides a range deletion iterator +// (through rangeDelIterFn) whenever the current file changes. +// +// The range deletion iterator passed to rangeDelIterFn is relinquished to the +// implementor who is responsible for closing it. +func (l *levelIter) initRangeDel(rangeDelSetter rangeDelIterSetter) { + l.rangeDelIterSetter = rangeDelSetter +} + +func (l *levelIter) initCombinedIterState(state *combinedIterState) { + l.combinedIterState = state +} + +func (l *levelIter) maybeTriggerCombinedIteration(file *manifest.TableMetadata, dir int) { + // If we encounter a file that contains range keys, we may need to + // trigger a switch to combined range-key and point-key iteration, + // if the *pebble.Iterator is configured for it. This switch is done + // lazily because range keys are intended to be rare, and + // constructing the range-key iterator substantially adds to the + // cost of iterator construction and seeking. + // + // If l.combinedIterState.initialized is already true, either the + // iterator is already using combined iteration or the iterator is not + // configured to observe range keys. Either way, there's nothing to do. + // If false, trigger the switch to combined iteration, using the the + // file's bounds to seek the range-key iterator appropriately. + // + // We only need to trigger combined iteration if the file contains + // RangeKeySets: if there are only Unsets and Dels, the user will observe no + // range keys regardless. If this file has table stats available, they'll + // tell us whether the file has any RangeKeySets. Otherwise, we must + // fallback to assuming it does if HasRangeKeys=true. 
+ if file != nil && file.HasRangeKeys && l.combinedIterState != nil && !l.combinedIterState.initialized && + (l.upper == nil || l.cmp(file.RangeKeyBounds.SmallestUserKey(), l.upper) < 0) && + (l.lower == nil || l.cmp(file.RangeKeyBounds.LargestUserKey(), l.lower) > 0) && + (!file.StatsValid() || file.Stats.NumRangeKeySets > 0) { + // The file contains range keys, and we're not using combined iteration yet. + // Trigger a switch to combined iteration. It's possible that a switch has + // already been triggered if multiple levels encounter files containing + // range keys while executing a single mergingIter operation. In this case, + // we need to compare the existing key recorded to l.combinedIterState.key, + // adjusting it if our key is smaller (forward iteration) or larger + // (backward iteration) than the existing key. + // + // These key comparisons are only required during a single high-level + // iterator operation. When the high-level iter op completes, + // iinitialized will be true, and future calls to this function will be + // no-ops. + switch dir { + case +1: + if !l.combinedIterState.triggered { + l.combinedIterState.triggered = true + l.combinedIterState.key = file.RangeKeyBounds.SmallestUserKey() + } else if l.cmp(l.combinedIterState.key, file.RangeKeyBounds.SmallestUserKey()) > 0 { + l.combinedIterState.key = file.RangeKeyBounds.SmallestUserKey() + } + case -1: + if !l.combinedIterState.triggered { + l.combinedIterState.triggered = true + l.combinedIterState.key = file.RangeKeyBounds.LargestUserKey() + } else if l.cmp(l.combinedIterState.key, file.RangeKeyBounds.LargestUserKey()) < 0 { + l.combinedIterState.key = file.RangeKeyBounds.LargestUserKey() + } + } + } +} + +func (l *levelIter) findFileGE(key []byte, flags base.SeekGEFlags) *manifest.TableMetadata { + // Find the earliest file whose largest key is >= key. + + // NB: if flags.TrySeekUsingNext()=true, the levelIter must respect it. 
If + // the levelIter is positioned at the key P, it must return a key ≥ P. If + // used within a merging iterator, the merging iterator will depend on the + // levelIter only moving forward to maintain heap invariants. + + // Ordinarily we seek the LevelIterator using SeekGE. In some instances, we + // Next instead. In other instances, we try Next-ing first, falling back to + // seek: + // a) flags.TrySeekUsingNext(): The top-level Iterator knows we're seeking + // to a key later than the current iterator position. We don't know how + // much later the seek key is, so it's possible there are many sstables + // between the current position and the seek key. However in most real- + // world use cases, the seek key is likely to be nearby. Rather than + // performing a log(N) seek through the table metadata, we next a few + // times from our existing location. If we don't find a file whose + // largest is >= key within a few nexts, we fall back to seeking. + // + // Note that in this case, the file returned by findFileGE may be + // different than the file returned by a raw binary search (eg, when + // TrySeekUsingNext=false). This is possible because the most recent + // positioning operation may have already determined that previous + // files' keys that are ≥ key are all deleted. This information is + // encoded within the iterator's current iterator position and is + // unavailable to a fresh binary search. + // + // b) flags.RelativeSeek(): The merging iterator decided to re-seek this + // level according to a range tombstone. When lazy combined iteration + // is enabled, the level iterator is responsible for watching for + // files containing range keys and triggering the switch to combined + // iteration when such a file is observed. 
If a range deletion was + // observed in a higher level causing the merging iterator to seek the + // level to the range deletion's end key, we need to check whether all + // of the files between the old position and the new position contain + // any range keys. + // + // In this scenario, we don't seek the LevelIterator and instead we + // Next it, one file at a time, checking each for range keys. The + // merging iterator sets this flag to inform us that we're moving + // forward relative to the existing position and that we must examine + // each intermediate sstable's metadata for lazy-combined iteration. + // In this case, we only Next and never Seek. We set nextsUntilSeek=-1 + // to signal this intention. + // + // NB: At most one of flags.RelativeSeek() and flags.TrySeekUsingNext() may + // be set, because the merging iterator re-seeks relative seeks with + // explicitly only the RelativeSeek flag set. + var nextsUntilSeek int + var nextInsteadOfSeek bool + if flags.TrySeekUsingNext() { + nextInsteadOfSeek = true + nextsUntilSeek = 4 // arbitrary + } + if flags.RelativeSeek() && l.combinedIterState != nil && !l.combinedIterState.initialized { + nextInsteadOfSeek = true + nextsUntilSeek = -1 + } + + var m *manifest.TableMetadata + if nextInsteadOfSeek { + m = l.iterFile + } else { + m = l.files.SeekGE(l.cmp, key) + } + // The below loop has a bit of an unusual organization. There are several + // conditions under which we need to Next to a later file. If none of those + // conditions are met, the file in `m` is okay to return. The loop body is + // structured with a series of if statements, each of which may continue the + // loop to the next file. If none of the statements are met, the end of the + // loop body is a break. + for m != nil { + if m.HasRangeKeys { + l.maybeTriggerCombinedIteration(m, +1) + + // Some files may only contain range keys, which we can skip. 
+ // NB: HasPointKeys=true if the file contains any points or range + // deletions (which delete points). + if !m.HasPointKeys { + m = l.files.Next() + continue + } + } + + // This file has point keys. + // + // However, there are a couple reasons why `m` may not be positioned ≥ + // `key` yet: + // + // 1. If SeekGE(key) landed on a file containing range keys, the file + // may contain range keys ≥ `key` but no point keys ≥ `key`. + // 2. When nexting instead of seeking, we must check to see whether + // we've nexted sufficiently far, or we need to next again. + // + // If the file does not contain point keys ≥ `key`, next to continue + // looking for a file that does. + if (m.HasRangeKeys || nextInsteadOfSeek) && l.cmp(m.PointKeyBounds.LargestUserKey(), key) < 0 { + // If nextInsteadOfSeek is set and nextsUntilSeek is non-negative, + // the iterator has been nexting hoping to discover the relevant + // file without seeking. It's exhausted the allotted nextsUntilSeek + // and should seek to the sought key. + if nextInsteadOfSeek && nextsUntilSeek == 0 { + nextInsteadOfSeek = false + m = l.files.SeekGE(l.cmp, key) + continue + } else if nextsUntilSeek > 0 { + nextsUntilSeek-- + } + m = l.files.Next() + continue + } + + // This file has a point key bound ≥ `key`. But the largest point key + // bound may still be a range deletion sentinel, which is exclusive. In + // this case, the file doesn't actually contain any point keys equal to + // `key`. We next to keep searching for a file that actually contains + // point keys ≥ key. + // + // Additionally, this prevents loading untruncated range deletions from + // a table which can't possibly contain the target key and is required + // for correctness by mergingIter.SeekGE (see the comment in that + // function). + if m.PointKeyBounds.Largest().IsExclusiveSentinel() && l.cmp(m.PointKeyBounds.LargestUserKey(), key) == 0 { + m = l.files.Next() + continue + } + + // This file contains point keys ≥ `key`. 
Break and return it. + break + } + return m +} + +func (l *levelIter) findFileLT(key []byte, flags base.SeekLTFlags) *manifest.TableMetadata { + // Find the last file whose smallest key is < ikey. + + // Ordinarily we seek the LevelIterator using SeekLT. + // + // When lazy combined iteration is enabled, there's a complication. The + // level iterator is responsible for watching for files containing range + // keys and triggering the switch to combined iteration when such a file is + // observed. If a range deletion was observed in a higher level causing the + // merging iterator to seek the level to the range deletion's start key, we + // need to check whether all of the files between the old position and the + // new position contain any range keys. + // + // In this scenario, we don't seek the LevelIterator and instead we Prev it, + // one file at a time, checking each for range keys. + prevInsteadOfSeek := flags.RelativeSeek() && l.combinedIterState != nil && !l.combinedIterState.initialized + + var m *manifest.TableMetadata + if prevInsteadOfSeek { + m = l.iterFile + } else { + m = l.files.SeekLT(l.cmp, key) + } + // The below loop has a bit of an unusual organization. There are several + // conditions under which we need to Prev to a previous file. If none of + // those conditions are met, the file in `m` is okay to return. The loop + // body is structured with a series of if statements, each of which may + // continue the loop to the previous file. If none of the statements are + // met, the end of the loop body is a break. + for m != nil { + if m.HasRangeKeys { + l.maybeTriggerCombinedIteration(m, -1) + + // Some files may only contain range keys, which we can skip. + // NB: HasPointKeys=true if the file contains any points or range + // deletions (which delete points). + if !m.HasPointKeys { + m = l.files.Prev() + continue + } + } + + // This file has point keys. 
+ // + // However, there are a couple reasons why `m` may not be positioned < + // `key` yet: + // + // 1. If SeekLT(key) landed on a file containing range keys, the file + // may contain range keys < `key` but no point keys < `key`. + // 2. When preving instead of seeking, we must check to see whether + // we've preved sufficiently far, or we need to prev again. + // + // If the file does not contain point keys < `key`, prev to continue + // looking for a file that does. + if (m.HasRangeKeys || prevInsteadOfSeek) && l.cmp(m.PointKeyBounds.SmallestUserKey(), key) >= 0 { + m = l.files.Prev() + continue + } + + // This file contains point keys < `key`. Break and return it. + break + } + return m +} + +// Init the iteration bounds for the current table. Returns -1 if the table +// lies fully before the lower bound, +1 if the table lies fully after the +// upper bound, and 0 if the table overlaps the iteration bounds. +func (l *levelIter) initTableBounds(f *manifest.TableMetadata) int { + l.tableOpts.LowerBound = l.lower + if l.tableOpts.LowerBound != nil { + if l.cmp(f.PointKeyBounds.LargestUserKey(), l.tableOpts.LowerBound) < 0 { + // The largest key in the sstable is smaller than the lower bound. + return -1 + } + if l.cmp(l.tableOpts.LowerBound, f.PointKeyBounds.SmallestUserKey()) <= 0 { + // The lower bound is smaller or equal to the smallest key in the + // table. Iteration within the table does not need to check the lower + // bound. + l.tableOpts.LowerBound = nil + } + } + l.tableOpts.UpperBound = l.upper + if l.tableOpts.UpperBound != nil { + if l.cmp(f.PointKeyBounds.SmallestUserKey(), l.tableOpts.UpperBound) >= 0 { + // The smallest key in the sstable is greater than or equal to the upper + // bound. + return 1 + } + if l.cmp(l.tableOpts.UpperBound, f.PointKeyBounds.LargestUserKey()) > 0 { + // The upper bound is greater than the largest key in the + // table. Iteration within the table does not need to check the upper + // bound. 
NB: tableOpts.UpperBound is exclusive and f.PointKeyBounds.Largest() is + // inclusive. + l.tableOpts.UpperBound = nil + } + } + return 0 +} + +type loadFileReturnIndicator int8 + +const ( + noFileLoaded loadFileReturnIndicator = iota + fileAlreadyLoaded + newFileLoaded +) + +func (l *levelIter) loadFile(file *manifest.TableMetadata, dir int) loadFileReturnIndicator { + if l.iterFile == file { + if l.err != nil { + return noFileLoaded + } + if l.iter != nil { + // We don't bother comparing the file bounds with the iteration bounds when we have + // an already open iterator. It is possible that the iter may not be relevant given the + // current iteration bounds, but it knows those bounds, so it will enforce them. + + // There are a few reasons we might not have triggered combined + // iteration yet, even though we already had `file` open. + // 1. If the bounds changed, we might have previously avoided + // switching to combined iteration because the bounds excluded + // the range keys contained in this file. + // 2. If an existing iterator was reconfigured to iterate over range + // keys (eg, using SetOptions), then we wouldn't have triggered + // the switch to combined iteration yet. + l.maybeTriggerCombinedIteration(file, dir) + return fileAlreadyLoaded + } + // We were already at file, but don't have an iterator, probably because the file was + // beyond the iteration bounds. It may still be, but it is also possible that the bounds + // have changed. We handle that below. + } + + // Close iter and send a nil iterator through rangeDelIterFn.rangeDelIterFn. 
+ if err := l.Close(); err != nil { + return noFileLoaded + } + + for { + l.iterFile = file + if file == nil { + return noFileLoaded + } + + l.maybeTriggerCombinedIteration(file, dir) + if !file.HasPointKeys { + switch dir { + case +1: + file = l.files.Next() + continue + case -1: + file = l.files.Prev() + continue + } + } + + switch l.initTableBounds(file) { + case -1: + // The largest key in the sstable is smaller than the lower bound. + if dir < 0 { + return noFileLoaded + } + file = l.files.Next() + continue + case +1: + // The smallest key in the sstable is greater than or equal to the upper + // bound. + if dir > 0 { + return noFileLoaded + } + file = l.files.Prev() + continue + } + // If we're in prefix iteration, it's possible this file's smallest + // boundary is large enough to prove the file cannot possibly contain + // any keys within the iteration prefix. Loading the next file is + // unnecessary. This has been observed in practice on slow shared + // storage. See #3575. + if l.prefix != nil && l.cmp(l.split.Prefix(file.PointKeyBounds.SmallestUserKey()), l.prefix) > 0 { + // Note that because l.iter is nil, a subsequent call to + // SeekPrefixGE with TrySeekUsingNext()=true will load the file + // (returning newFileLoaded) and disable TrySeekUsingNext before + // performing a seek in the file. + return noFileLoaded + } + + iterKinds := iterPointKeys + if l.rangeDelIterSetter != nil { + iterKinds |= iterRangeDeletions + } + + var iters iterSet + iters, l.err = l.newIters(l.ctx, l.iterFile, &l.tableOpts, l.internalOpts, iterKinds) + if l.err != nil { + if l.rangeDelIterSetter != nil { + l.rangeDelIterSetter.setRangeDelIter(nil) + } + return noFileLoaded + } + l.iter = iters.Point() + if l.rangeDelIterSetter != nil && iters.rangeDeletion != nil { + // If this file has range deletions, interleave the bounds of the + // range deletions among the point keys. 
When used with a + // mergingIter, this ensures we don't move beyond a file with range + // deletions until its range deletions are no longer relevant. + // + // For now, we open a second range deletion iterator. Future work + // will avoid the need to open a second range deletion iterator, and + // avoid surfacing the file's range deletion iterator via rangeDelIterFn. + itersForBounds, err := l.newIters(l.ctx, l.iterFile, &l.tableOpts, l.internalOpts, iterRangeDeletions) + if err != nil { + l.iter = nil + l.err = errors.CombineErrors(err, iters.CloseAll()) + return noFileLoaded + } + l.interleaving.Init(l.comparer, l.iter, itersForBounds.RangeDeletion(), keyspan.InterleavingIterOpts{ + LowerBound: l.tableOpts.LowerBound, + UpperBound: l.tableOpts.UpperBound, + InterleaveEndKeys: true, + }) + l.iter = &l.interleaving + + // Relinquish iters.rangeDeletion to the caller. + l.rangeDelIterSetter.setRangeDelIter(iters.rangeDeletion) + } + return newFileLoaded + } +} + +// In race builds we verify that the keys returned by levelIter lie within +// [lower,upper). +func (l *levelIter) verify(kv *base.InternalKV) *base.InternalKV { + // Note that invariants.Enabled is a compile time constant, which means the + // block of code will be compiled out of normal builds making this method + // eligible for inlining. Do not change this to use a variable. + if invariants.Enabled && !l.disableInvariants && kv != nil { + // We allow returning a boundary key that is outside of the lower/upper + // bounds as such keys are always range tombstones which will be skipped + // by the Iterator. 
+ if l.lower != nil && kv != nil && !kv.K.IsExclusiveSentinel() && l.cmp(kv.K.UserKey, l.lower) < 0 { + l.logger.Fatalf("levelIter %s: lower bound violation: %s < %s\n%s", l.layer, kv, l.lower, debug.Stack()) + } + if l.upper != nil && kv != nil && !kv.K.IsExclusiveSentinel() && l.cmp(kv.K.UserKey, l.upper) > 0 { + l.logger.Fatalf("levelIter %s: upper bound violation: %s > %s\n%s", l.layer, kv, l.upper, debug.Stack()) + } + } + return kv +} + +func (l *levelIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { + if invariants.Enabled && l.lower != nil && l.cmp(key, l.lower) < 0 { + panic(errors.AssertionFailedf("levelIter SeekGE to key %q violates lower bound %q", key, l.lower)) + } + + l.err = nil // clear cached iteration error + l.exhaustedDir = 0 + l.prefix = nil + // NB: the top-level Iterator has already adjusted key based on + // IterOptions.LowerBound. + loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1) + if loadFileIndicator == noFileLoaded { + l.exhaustedForward() + return nil + } + if loadFileIndicator == newFileLoaded { + // File changed, so l.iter has changed, and that iterator is not + // positioned appropriately. + flags = flags.DisableTrySeekUsingNext() + } + if kv := l.iter.SeekGE(key, flags); kv != nil { + return l.verify(kv) + } + return l.verify(l.skipEmptyFileForward()) +} + +func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV { + if invariants.Enabled && l.lower != nil && l.cmp(key, l.lower) < 0 { + panic(errors.AssertionFailedf("levelIter SeekGE to key %q violates lower bound %q", key, l.lower)) + } + l.err = nil // clear cached iteration error + l.exhaustedDir = 0 + l.prefix = prefix + + // NB: the top-level Iterator has already adjusted key based on + // IterOptions.LowerBound. 
+ loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1) + if loadFileIndicator == noFileLoaded { + l.exhaustedForward() + return nil + } + if loadFileIndicator == newFileLoaded { + // File changed, so l.iter has changed, and that iterator is not + // positioned appropriately. + flags = flags.DisableTrySeekUsingNext() + } + if kv := l.iter.SeekPrefixGE(prefix, key, flags); kv != nil { + return l.verify(kv) + } + if err := l.iter.Error(); err != nil { + return nil + } + return l.verify(l.skipEmptyFileForward()) +} + +func (l *levelIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { + if invariants.Enabled && l.upper != nil && l.cmp(key, l.upper) > 0 { + panic(errors.AssertionFailedf("levelIter SeekLT to key %q violates upper bound %q", key, l.upper)) + } + + l.err = nil // clear cached iteration error + l.exhaustedDir = 0 + l.prefix = nil + + // NB: the top-level Iterator has already adjusted key based on + // IterOptions.UpperBound. + if l.loadFile(l.findFileLT(key, flags), -1) == noFileLoaded { + l.exhaustedBackward() + return nil + } + if kv := l.iter.SeekLT(key, flags); kv != nil { + return l.verify(kv) + } + return l.verify(l.skipEmptyFileBackward()) +} + +func (l *levelIter) First() *base.InternalKV { + if invariants.Enabled && l.lower != nil { + panic(errors.AssertionFailedf("levelIter First called while lower bound %q is set", l.lower)) + } + + l.err = nil // clear cached iteration error + l.exhaustedDir = 0 + l.prefix = nil + + // NB: the top-level Iterator will call SeekGE if IterOptions.LowerBound is + // set. 
+ if l.loadFile(l.files.First(), +1) == noFileLoaded { + l.exhaustedForward() + return nil + } + if kv := l.iter.First(); kv != nil { + return l.verify(kv) + } + return l.verify(l.skipEmptyFileForward()) +} + +func (l *levelIter) Last() *base.InternalKV { + if invariants.Enabled && l.upper != nil { + panic(errors.AssertionFailedf("levelIter Last called while upper bound %q is set", l.upper)) + } + + l.err = nil // clear cached iteration error + l.exhaustedDir = 0 + l.prefix = nil + + // NB: the top-level Iterator will call SeekLT if IterOptions.UpperBound is + // set. + if l.loadFile(l.files.Last(), -1) == noFileLoaded { + l.exhaustedBackward() + return nil + } + if kv := l.iter.Last(); kv != nil { + return l.verify(kv) + } + return l.verify(l.skipEmptyFileBackward()) +} + +func (l *levelIter) Next() *base.InternalKV { + if l.exhaustedDir == -1 { + if l.lower != nil { + return l.SeekGE(l.lower, base.SeekGEFlagsNone) + } + return l.First() + } + if l.err != nil || l.iter == nil { + return nil + } + if kv := l.iter.Next(); kv != nil { + return l.verify(kv) + } + return l.verify(l.skipEmptyFileForward()) +} + +func (l *levelIter) NextPrefix(succKey []byte) *base.InternalKV { + if l.err != nil || l.iter == nil { + return nil + } + + if kv := l.iter.NextPrefix(succKey); kv != nil { + return l.verify(kv) + } + if l.iter.Error() != nil { + return nil + } + if l.tableOpts.UpperBound != nil { + // The UpperBound was within this file, so don't load the next file. + l.exhaustedForward() + return nil + } + + // Seek the manifest level iterator using TrySeekUsingNext=true and + // RelativeSeek=true so that we take advantage of the knowledge that + // `succKey` can only be contained in later files. + metadataSeekFlags := base.SeekGEFlagsNone.EnableTrySeekUsingNext().EnableRelativeSeek() + if l.loadFile(l.findFileGE(succKey, metadataSeekFlags), +1) != noFileLoaded { + // NB: The SeekGE on the file's iterator must not set TrySeekUsingNext, + // because l.iter is unpositioned. 
+ if kv := l.iter.SeekGE(succKey, base.SeekGEFlagsNone); kv != nil { + return l.verify(kv) + } + return l.verify(l.skipEmptyFileForward()) + } + l.exhaustedForward() + return nil +} + +func (l *levelIter) Prev() *base.InternalKV { + if l.exhaustedDir == +1 { + if l.upper != nil { + return l.SeekLT(l.upper, base.SeekLTFlagsNone) + } + return l.Last() + } + if l.err != nil || l.iter == nil { + return nil + } + if kv := l.iter.Prev(); kv != nil { + return l.verify(kv) + } + return l.verify(l.skipEmptyFileBackward()) +} + +func (l *levelIter) skipEmptyFileForward() *base.InternalKV { + var kv *base.InternalKV + // The first iteration of this loop starts with an already exhausted l.iter. + // The reason for the exhaustion is either that we iterated to the end of + // the sstable, or our iteration was terminated early due to the presence of + // an upper-bound or the use of SeekPrefixGE. + // + // Subsequent iterations will examine consecutive files such that the first + // file that does not have an exhausted iterator causes the code to return + // that key. + for ; kv == nil; kv = l.iter.First() { + if l.iter.Error() != nil { + return nil + } + // If an upper bound is present and the upper bound lies within the + // current sstable, then we will have reached the upper bound rather + // than the end of the sstable. + if l.tableOpts.UpperBound != nil { + l.exhaustedForward() + return nil + } + + // If the iterator is in prefix iteration mode, it's possible that we + // are here because bloom filter matching failed. In that case it is + // likely that all keys matching the prefix are wholly within the + // current file and cannot be in a subsequent file. In that case we + // don't want to go to the next file, since loading and seeking in there + // has some cost. + // + // This is not just an optimization. We must not advance to the next + // file if the current file might possibly contain keys relevant to any + // prefix greater than our current iteration prefix. 
If we did, a + // subsequent SeekPrefixGE with TrySeekUsingNext could mistakenly skip + // the file's relevant keys. + if l.prefix != nil { + if l.cmp(l.split.Prefix(l.iterFile.PointKeyBounds.LargestUserKey()), l.prefix) > 0 { + l.exhaustedForward() + return nil + } + } + + // Current file was exhausted. Move to the next file. + if l.loadFile(l.files.Next(), +1) == noFileLoaded { + l.exhaustedForward() + return nil + } + } + return kv +} + +func (l *levelIter) skipEmptyFileBackward() *base.InternalKV { + var kv *base.InternalKV + // The first iteration of this loop starts with an already exhausted + // l.iter. The reason for the exhaustion is either that we iterated to the + // end of the sstable, or our iteration was terminated early due to the + // presence of a lower-bound. + // + // Subsequent iterations will examine consecutive files such that the first + // file that does not have an exhausted iterator causes the code to return + // that key. + for ; kv == nil; kv = l.iter.Last() { + if l.iter.Error() != nil { + return nil + } + // If a lower bound is present and the lower bound lies within the + // current sstable, then we will have reached the lowerr bound rather + // than the end of the sstable. + if l.tableOpts.LowerBound != nil { + l.exhaustedBackward() + return nil + } + // Current file was exhausted. Move to the previous file. 
+ if l.loadFile(l.files.Prev(), -1) == noFileLoaded { + l.exhaustedBackward() + return nil + } + } + return kv +} + +func (l *levelIter) exhaustedForward() { + l.exhaustedDir = +1 +} + +func (l *levelIter) exhaustedBackward() { + l.exhaustedDir = -1 +} + +func (l *levelIter) Error() error { + if l.err != nil || l.iter == nil { + return l.err + } + return l.iter.Error() +} + +func (l *levelIter) Close() error { + if l.iter != nil { + l.err = l.iter.Close() + l.iter = nil + } + if l.rangeDelIterSetter != nil { + l.rangeDelIterSetter.setRangeDelIter(nil) + } + return l.err +} + +func (l *levelIter) SetBounds(lower, upper []byte) { + l.lower = lower + l.upper = upper + + if l.iter == nil { + return + } + + // Update tableOpts.{Lower,Upper}Bound in case the new boundaries fall within + // the boundaries of the current table. + if l.initTableBounds(l.iterFile) != 0 { + // The table does not overlap the bounds. Close() will set levelIter.err if + // an error occurs. + _ = l.Close() + return + } + + l.iter.SetBounds(l.tableOpts.LowerBound, l.tableOpts.UpperBound) +} + +func (l *levelIter) SetContext(ctx context.Context) { + l.ctx = ctx + if l.iter != nil { + // TODO(sumeer): this is losing the ctx = objiotracing.WithLevel(ctx, + // manifest.LevelToInt(opts.level)) that happens in table_cache.go. + l.iter.SetContext(ctx) + } +} + +// DebugTree is part of the InternalIterator interface. 
+func (l *levelIter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p) %s", l, l, l.String()) + if l.iter != nil { + l.iter.DebugTree(n) + } +} + +func (l *levelIter) String() string { + if l.iterFile != nil { + return fmt.Sprintf("%s: fileNum=%s", l.layer, l.iterFile.TableNum.String()) + } + return fmt.Sprintf("%s: fileNum=", l.layer) +} + +var _ internalIterator = &levelIter{} diff --git a/vendor/github.com/cockroachdb/pebble/logger.go b/vendor/github.com/cockroachdb/pebble/v2/logger.go similarity index 89% rename from vendor/github.com/cockroachdb/pebble/logger.go rename to vendor/github.com/cockroachdb/pebble/v2/logger.go index 5e3e92e..e14684d 100644 --- a/vendor/github.com/cockroachdb/pebble/logger.go +++ b/vendor/github.com/cockroachdb/pebble/v2/logger.go @@ -4,7 +4,7 @@ package pebble -import "github.com/cockroachdb/pebble/internal/base" +import "github.com/cockroachdb/pebble/v2/internal/base" // Logger defines an interface for writing log messages. type Logger = base.Logger diff --git a/vendor/github.com/cockroachdb/pebble/v2/lsm_view.go b/vendor/github.com/cockroachdb/pebble/v2/lsm_view.go new file mode 100644 index 0000000..fa2f415 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/lsm_view.go @@ -0,0 +1,278 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "context" + "fmt" + "slices" + "strings" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/humanize" + "github.com/cockroachdb/pebble/v2/internal/lsmview" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/objstorage" +) + +// LSMViewURL returns an URL which shows a diagram of the LSM. 
+func (d *DB) LSMViewURL() string { + v := func() *manifest.Version { + d.mu.Lock() + defer d.mu.Unlock() + + v := d.mu.versions.currentVersion() + v.Ref() + return v + }() + defer v.Unref() + + b := lsmViewBuilder{ + cmp: d.opts.Comparer.Compare, + fmtKey: d.opts.Comparer.FormatKey, + } + if b.fmtKey == nil { + b.fmtKey = DefaultComparer.FormatKey + } + b.InitLevels(v) + b.PopulateKeys() + data := b.Build(d.objProvider, d.newIters) + url, err := lsmview.GenerateURL(data) + if err != nil { + return fmt.Sprintf("error: %s", err) + } + return url.String() +} + +type lsmViewBuilder struct { + cmp base.Compare + fmtKey base.FormatKey + + levelNames []string + levels [][]*manifest.TableMetadata + + // The keys that appear as Smallest/Largest, sorted and formatted. + sortedKeys []string + // keys[k] is the position of key k in the sortedKeys list. + keys map[string]int + + // scanTables is set during Build. If we don't have too many tables, we will + // create iterators and show some of the keys. + scanTables bool +} + +// InitLevels gets the metadata for the tables in the LSM and populates +// levelNames and levels. 
+func (b *lsmViewBuilder) InitLevels(v *manifest.Version) { + var levelNames []string + var levels [][]*manifest.TableMetadata + for sublevel := len(v.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- { + var files []*manifest.TableMetadata + for f := range v.L0SublevelFiles[sublevel].All() { + files = append(files, f) + } + + levelNames = append(levelNames, fmt.Sprintf("L0.%d", sublevel)) + levels = append(levels, files) + } + if len(levels) == 0 { + levelNames = append(levelNames, "L0") + levels = append(levels, nil) + } + for level := 1; level < len(v.Levels); level++ { + var files []*manifest.TableMetadata + for f := range v.Levels[level].All() { + files = append(files, f) + } + levelNames = append(levelNames, fmt.Sprintf("L%d", level)) + levels = append(levels, files) + } + b.levelNames = levelNames + b.levels = levels +} + +// PopulateKeys initializes the sortedKeys and keys fields. +func (b *lsmViewBuilder) PopulateKeys() { + // keys[k] will hold the position of k into sortedKeys. + keys := make(map[string]int) + for _, l := range b.levels { + for _, f := range l { + keys[string(f.Smallest().UserKey)] = -1 + keys[string(f.Largest().UserKey)] = -1 + } + } + + sortedKeys := make([]string, 0, len(keys)) + for s := range keys { + sortedKeys = append(sortedKeys, s) + } + slices.SortFunc(sortedKeys, func(k1, k2 string) int { + return b.cmp([]byte(k1), []byte(k2)) + }) + sortedKeys = slices.CompactFunc(sortedKeys, func(k1, k2 string) bool { + return b.cmp([]byte(k1), []byte(k2)) == 0 + }) + for i, k := range sortedKeys { + keys[k] = i + } + for i := range sortedKeys { + sortedKeys[i] = fmt.Sprintf("%v", b.fmtKey([]byte(sortedKeys[i]))) + } + b.sortedKeys = sortedKeys + b.keys = keys +} + +func (b *lsmViewBuilder) Build( + objProvider objstorage.Provider, newIters tableNewIters, +) lsmview.Data { + n := 0 + for _, l := range b.levels { + n += len(l) + } + const scanTablesThreshold = 100 + b.scanTables = n <= scanTablesThreshold + + var data lsmview.Data + data.Keys = 
b.sortedKeys + data.Levels = make([]lsmview.Level, len(b.levels)) + for i, files := range b.levels { + l := &data.Levels[i] + l.Name = b.levelNames[i] + l.Tables = make([]lsmview.Table, len(files)) + for j, f := range files { + t := &l.Tables[j] + if !f.Virtual { + t.Label = fmt.Sprintf("%d", f.TableNum) + } else { + t.Label = fmt.Sprintf("%d (%d)", f.TableNum, f.TableBacking.DiskFileNum) + } + + t.Size = f.Size + t.SmallestKey = b.keys[string(f.Smallest().UserKey)] + t.LargestKey = b.keys[string(f.Largest().UserKey)] + t.Details = b.tableDetails(f, objProvider, newIters) + } + } + return data +} + +func (b *lsmViewBuilder) tableDetails( + m *manifest.TableMetadata, objProvider objstorage.Provider, newIters tableNewIters, +) []string { + res := make([]string, 0, 10) + outf := func(format string, args ...any) { + res = append(res, fmt.Sprintf(format, args...)) + } + + outf("%s: %s - %s", m.TableNum, m.Smallest().Pretty(b.fmtKey), m.Largest().Pretty(b.fmtKey)) + outf("size: %s", humanize.Bytes.Uint64(m.Size)) + if m.Virtual { + meta, err := objProvider.Lookup(base.FileTypeTable, m.TableBacking.DiskFileNum) + var backingInfo string + switch { + case err != nil: + backingInfo = fmt.Sprintf(" (error looking up object: %v)", err) + case meta.IsShared(): + backingInfo = "shared; " + case meta.IsExternal(): + backingInfo = "external; " + } + outf("virtual; backed by %s (%ssize: %s)", m.TableBacking.DiskFileNum, backingInfo, humanize.Bytes.Uint64(m.TableBacking.Size)) + } + outf("seqnums: %d - %d", m.SmallestSeqNum, m.LargestSeqNum) + if m.SyntheticPrefixAndSuffix.HasPrefix() { + // Note: we are abusing the key formatter by passing just the prefix. + outf("synthetic prefix: %s", b.fmtKey(m.SyntheticPrefixAndSuffix.Prefix())) + } + if m.SyntheticPrefixAndSuffix.HasSuffix() { + // Note: we are abusing the key formatter by passing just the suffix. 
+ outf("synthetic suffix: %s", b.fmtKey(m.SyntheticPrefixAndSuffix.Suffix())) + } + var iters iterSet + if b.scanTables { + var err error + iters, err = newIters(context.Background(), m, nil /* opts */, internalIterOpts{}, iterPointKeys|iterRangeDeletions|iterRangeKeys) + if err != nil { + outf("error opening table: %v", err) + } else { + defer func() { _ = iters.CloseAll() }() + } + } + const maxPoints = 14 + const maxRangeDels = 10 + const maxRangeKeys = 10 + if m.HasPointKeys { + outf("points: %s - %s", m.PointKeyBounds.Smallest().Pretty(b.fmtKey), m.PointKeyBounds.Largest().Pretty(b.fmtKey)) + if b.scanTables { + n := 0 + if it := iters.point; it != nil { + for kv := it.First(); kv != nil; kv = it.Next() { + if n == maxPoints { + outf(" ...") + break + } + outf(" %s", kv.K.Pretty(b.fmtKey)) + n++ + } + if err := it.Error(); err != nil { + outf(" error scanning points: %v", err) + } + } + if n == 0 { + outf(" no points") + } + + n = 0 + if it := iters.rangeDeletion; it != nil { + span, err := it.First() + for ; span != nil; span, err = it.Next() { + if n == maxRangeDels { + outf(" ...") + break + } + seqNums := make([]string, len(span.Keys)) + for i, k := range span.Keys { + seqNums[i] = fmt.Sprintf("#%d", k.SeqNum()) + } + outf(" [%s - %s): %s", b.fmtKey(span.Start), b.fmtKey(span.End), strings.Join(seqNums, ",")) + n++ + } + if err != nil { + outf("error scanning range dels: %v", err) + } + } + if n == 0 { + outf(" no range dels") + } + } + } + if m.HasRangeKeys { + outf("range keys: %s - %s", m.RangeKeyBounds.Smallest().Pretty(b.fmtKey), m.RangeKeyBounds.Largest().Pretty(b.fmtKey)) + n := 0 + if it := iters.rangeKey; it != nil { + span, err := it.First() + for ; span != nil; span, err = it.Next() { + if n == maxRangeKeys { + outf(" ...") + break + } + keys := make([]string, len(span.Keys)) + for i, k := range span.Keys { + keys[i] = k.String() + } + outf(" [%s, %s): {%s}", b.fmtKey(span.Start), b.fmtKey(span.End), strings.Join(keys, " ")) + n++ + } + if err 
!= nil { + outf("error scanning range keys: %v", err) + } + } + if n == 0 { + outf(" no range keys") + } + } + + return res +} diff --git a/vendor/github.com/cockroachdb/pebble/mem_table.go b/vendor/github.com/cockroachdb/pebble/v2/mem_table.go similarity index 86% rename from vendor/github.com/cockroachdb/pebble/mem_table.go rename to vendor/github.com/cockroachdb/pebble/v2/mem_table.go index e728e94..d787f26 100644 --- a/vendor/github.com/cockroachdb/pebble/mem_table.go +++ b/vendor/github.com/cockroachdb/pebble/v2/mem_table.go @@ -12,12 +12,12 @@ import ( "sync/atomic" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/arenaskl" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manual" - "github.com/cockroachdb/pebble/internal/rangedel" - "github.com/cockroachdb/pebble/internal/rangekey" + "github.com/cockroachdb/pebble/v2/internal/arenaskl" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/manual" + "github.com/cockroachdb/pebble/v2/internal/rangedel" + "github.com/cockroachdb/pebble/v2/internal/rangekey" ) func memTableEntrySize(keyBytes, valueBytes int) uint64 { @@ -66,7 +66,7 @@ type memTable struct { cmp Compare formatKey base.FormatKey equal Equal - arenaBuf []byte + arenaBuf manual.Buf skl arenaskl.Skiplist rangeDelSkl arenaskl.Skiplist rangeKeySkl arenaskl.Skiplist @@ -85,15 +85,15 @@ type memTable struct { rangeKeys keySpanCache // The current logSeqNum at the time the memtable was created. This is // guaranteed to be less than or equal to any seqnum stored in the memtable. 
- logSeqNum uint64 + logSeqNum base.SeqNum releaseAccountingReservation func() } func (m *memTable) free() { if m != nil { m.releaseAccountingReservation() - manual.Free(m.arenaBuf) - m.arenaBuf = nil + manual.Free(manual.MemTable, m.arenaBuf) + m.arenaBuf = manual.Buf{} } } @@ -102,16 +102,16 @@ func (m *memTable) free() { // which is used by tests. type memTableOptions struct { *Options - arenaBuf []byte + arenaBuf manual.Buf size int - logSeqNum uint64 + logSeqNum base.SeqNum releaseAccountingReservation func() } func checkMemTable(obj interface{}) { m := obj.(*memTable) - if m.arenaBuf != nil { - fmt.Fprintf(os.Stderr, "%p: memTable buffer was not freed\n", m.arenaBuf) + if m.arenaBuf.Data() != nil { + fmt.Fprintf(os.Stderr, "%v: memTable buffer was not freed\n", m.arenaBuf) os.Exit(1) } } @@ -119,7 +119,10 @@ func checkMemTable(obj interface{}) { // newMemTable returns a new MemTable of the specified size. If size is zero, // Options.MemTableSize is used instead. func newMemTable(opts memTableOptions) *memTable { - opts.Options = opts.Options.EnsureDefaults() + if opts.Options == nil { + opts.Options = &Options{} + } + opts.Options.EnsureDefaults() m := new(memTable) m.init(opts) return m @@ -151,11 +154,11 @@ func (m *memTable) init(opts memTableOptions) { constructSpan: rangekey.Decode, } - if m.arenaBuf == nil { - m.arenaBuf = make([]byte, opts.size) + if m.arenaBuf.Data() == nil { + m.arenaBuf = manual.New(manual.MemTable, uintptr(opts.size)) } - arena := arenaskl.NewArena(m.arenaBuf) + arena := arenaskl.NewArena(m.arenaBuf.Slice()) m.skl.Reset(arena, m.cmp) m.rangeDelSkl.Reset(arena, m.cmp) m.rangeKeySkl.Reset(arena, m.cmp) @@ -201,7 +204,7 @@ func (m *memTable) prepare(batch *Batch) error { return nil } -func (m *memTable) apply(batch *Batch, seqNum uint64) error { +func (m *memTable) apply(batch *Batch, seqNum base.SeqNum) error { if seqNum < m.logSeqNum { return base.CorruptionErrorf("pebble: batch seqnum %d is less than memtable creation seqnum %d", 
errors.Safe(seqNum), errors.Safe(m.logSeqNum)) @@ -230,8 +233,8 @@ func (m *memTable) apply(batch *Batch, seqNum uint64) error { // Don't increment seqNum for LogData, since these are not applied // to the memtable. seqNum-- - case InternalKeyKindIngestSST: - panic("pebble: cannot apply ingested sstable key kind to memtable") + case InternalKeyKindIngestSST, InternalKeyKindExcise: + panic("pebble: cannot apply ingested sstable or excise kind keys to memtable") default: err = ins.Add(&m.skl, ikey, value) } @@ -239,9 +242,9 @@ func (m *memTable) apply(batch *Batch, seqNum uint64) error { return err } } - if seqNum != startSeqNum+uint64(batch.Count()) { + if seqNum != startSeqNum+base.SeqNum(batch.Count()) { return base.CorruptionErrorf("pebble: inconsistent batch count: %d vs %d", - errors.Safe(seqNum), errors.Safe(startSeqNum+uint64(batch.Count()))) + errors.Safe(seqNum), errors.Safe(startSeqNum+base.SeqNum(batch.Count()))) } if tombstoneCount != 0 { m.tombstones.invalidate(tombstoneCount) @@ -260,8 +263,8 @@ func (m *memTable) newIter(o *IterOptions) internalIterator { } // newFlushIter is part of the flushable interface. -func (m *memTable) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator { - return m.skl.NewFlushIter(bytesFlushed) +func (m *memTable) newFlushIter(o *IterOptions) internalIterator { + return m.skl.NewFlushIter() } // newRangeDelIter is part of the flushable interface. @@ -290,6 +293,12 @@ func (m *memTable) containsRangeKeys() bool { func (m *memTable) availBytes() uint32 { a := m.skl.Arena() if m.writerRefs.Load() == 1 { + // Note that one ref is maintained as long as the memtable is the + // current mutable memtable, so when evaluating whether the current + // mutable memtable has sufficient space for committing a batch, it is + // guaranteed that m.writerRefs() >= 1. This means a writerRefs() of 1 + // indicates there are no other concurrent apply operations. 
+ // // If there are no other concurrent apply operations, we can update the // reserved bytes setting to accurately reflect how many bytes of been // allocated vs the over-estimation present in memTableEntrySize. @@ -313,6 +322,11 @@ func (m *memTable) empty() bool { return m.skl.Size() == memTableEmptySize } +// computePossibleOverlaps is part of the flushable interface. +func (m *memTable) computePossibleOverlaps(fn func(bounded) shouldContinue, bounded ...bounded) { + computePossibleOverlapsGenericImpl[*memTable](m, m.cmp, fn, bounded) +} + // A keySpanFrags holds a set of fragmented keyspan.Spans with a particular key // kind at a particular moment for a memtable. // @@ -363,8 +377,8 @@ func (f *keySpanFrags) get( } it := skl.NewIter(nil, nil) var keysDst []keyspan.Key - for key, val := it.First(); key != nil; key, val = it.Next() { - s, err := constructSpan(*key, val.InPlaceValue(), keysDst) + for kv := it.First(); kv != nil; kv = it.Next() { + s, err := constructSpan(kv.K, kv.InPlaceValue(), keysDst) if err != nil { panic(err) } diff --git a/vendor/github.com/cockroachdb/pebble/merger.go b/vendor/github.com/cockroachdb/pebble/v2/merger.go similarity index 95% rename from vendor/github.com/cockroachdb/pebble/merger.go rename to vendor/github.com/cockroachdb/pebble/v2/merger.go index 26f6ee6..e4d27f2 100644 --- a/vendor/github.com/cockroachdb/pebble/merger.go +++ b/vendor/github.com/cockroachdb/pebble/v2/merger.go @@ -7,7 +7,7 @@ package pebble import ( "io" - "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/v2/internal/base" ) // Merge exports the base.Merge type. 
diff --git a/vendor/github.com/cockroachdb/pebble/merging_iter.go b/vendor/github.com/cockroachdb/pebble/v2/merging_iter.go similarity index 58% rename from vendor/github.com/cockroachdb/pebble/merging_iter.go rename to vendor/github.com/cockroachdb/pebble/v2/merging_iter.go index a076686..2928053 100644 --- a/vendor/github.com/cockroachdb/pebble/merging_iter.go +++ b/vendor/github.com/cockroachdb/pebble/v2/merging_iter.go @@ -6,14 +6,16 @@ package pebble import ( "bytes" + "context" "fmt" "runtime/debug" "unsafe" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" ) type mergingIterLevel struct { @@ -24,19 +26,15 @@ type mergingIterLevel struct { // are crossed. See levelIter.initRangeDel and the Range Deletions comment // below. rangeDelIter keyspan.FragmentIterator - // iterKey and iterValue cache the current key and value iter are pointed at. - iterKey *InternalKey - iterValue base.LazyValue + // rangeDelIterGeneration is incremented whenever rangeDelIter changes. + rangeDelIterGeneration int + // iterKV caches the current key-value pair iter points to. + iterKV *base.InternalKV // levelIter is non-nil if this level's iter is ultimately backed by a // *levelIter. The handle in iter may have wrapped the levelIter with // intermediary internalIterator implementations. levelIter *levelIter - // levelIterBoundaryContext's fields are set when using levelIter, in order - // to surface sstable boundary keys and file-level context. See levelIter - // comment and the Range Deletions comment below. - levelIterBoundaryContext - // tombstone caches the tombstone rangeDelIter is currently pointed at. 
If // tombstone is nil, there are no further tombstones within the // current sstable in the current iterator direction. The cached tombstone is @@ -46,30 +44,16 @@ type mergingIterLevel struct { tombstone *keyspan.Span } -type levelIterBoundaryContext struct { - // smallestUserKey and largestUserKey are populated with the smallest and - // largest boundaries of the current file. - smallestUserKey, largestUserKey []byte - // isLargestUserKeyExclusive is set to true when a file's largest boundary - // is an exclusive key, (eg, a range deletion sentinel). If true, the file - // does not contain any keys with the provided user key, and the - // largestUserKey bound is exclusive. - isLargestUserKeyExclusive bool - // isSyntheticIterBoundsKey is set to true iff the key returned by the level - // iterator is a synthetic key derived from the iterator bounds. This is used - // to prevent the mergingIter from being stuck at such a synthetic key if it - // becomes the top element of the heap. When used with a user-facing Iterator, - // the only range deletions exposed by this mergingIter should be those with - // `isSyntheticIterBoundsKey || isIgnorableBoundaryKey`. - isSyntheticIterBoundsKey bool - // isIgnorableBoundaryKey is set to true iff the key returned by the level - // iterator is a file boundary key that should be ignored when returning to - // the parent iterator. File boundary keys are used by the level iter to - // keep a levelIter file's range deletion iterator open as long as other - // levels within the merging iterator require it. When used with a user-facing - // Iterator, the only range deletions exposed by this mergingIter should be - // those with `isSyntheticIterBoundsKey || isIgnorableBoundaryKey`. - isIgnorableBoundaryKey bool +// Assert that *mergingIterLevel implements rangeDelIterSetter. 
+var _ rangeDelIterSetter = (*mergingIterLevel)(nil) + +func (ml *mergingIterLevel) setRangeDelIter(iter keyspan.FragmentIterator) { + ml.tombstone = nil + if ml.rangeDelIter != nil { + ml.rangeDelIter.Close() + } + ml.rangeDelIter = iter + ml.rangeDelIterGeneration++ } // mergingIter provides a merged view of multiple iterators from different @@ -237,6 +221,21 @@ type levelIterBoundaryContext struct { // to "n" which is covered by the range tombstone [m,q) causing the iterator to // advance to "o" which is visible. // +// # Error handling +// +// Any iterator operation may fail. The InternalIterator contract dictates that +// an iterator must return a nil internal key when an error occurs, and a +// subsequent call to Error() should return the error value. The exported +// merging iterator positioning methods must adhere to this contract by setting +// m.err to hold any error encountered by the individual level iterators and +// returning a nil internal key. Some internal helpers (eg, +// find[Next|Prev]Entry) also adhere to this contract, setting m.err directly). +// Other internal functions return an explicit error return value and DO NOT set +// m.err, relying on the caller to set m.err appropriately. +// +// TODO(jackson): Update the InternalIterator interface to return explicit error +// return values (and an *InternalKV pointer). +// // TODO(peter,rangedel): For testing, advance the iterator through various // scenarios and have each step display the current state (i.e. the current // heap and range-del iterator positioning). @@ -244,8 +243,8 @@ type mergingIter struct { logger Logger split Split dir int - snapshot uint64 - batchSnapshot uint64 + snapshot base.SeqNum + batchSnapshot base.SeqNum levels []mergingIterLevel heap mergingIterHeap err error @@ -253,6 +252,7 @@ type mergingIter struct { lower []byte upper []byte stats *InternalIteratorStats + seekKeyBuf []byte // levelsPositioned, if non-nil, is a slice of the same length as levels. 
// It's used by NextPrefix to record which levels have already been @@ -307,14 +307,14 @@ func (m *mergingIter) init( m.lower = opts.LowerBound m.upper = opts.UpperBound } - m.snapshot = InternalKeySeqNumMax - m.batchSnapshot = InternalKeySeqNumMax + m.snapshot = base.SeqNumMax + m.batchSnapshot = base.SeqNumMax m.levels = levels m.heap.cmp = cmp m.split = split m.stats = stats if cap(m.heap.items) < len(levels) { - m.heap.items = make([]*mergingIterLevel, 0, len(levels)) + m.heap.items = make([]mergingIterHeapItem, 0, len(levels)) } else { m.heap.items = m.heap.items[:0] } @@ -326,23 +326,18 @@ func (m *mergingIter) init( func (m *mergingIter) initHeap() { m.heap.items = m.heap.items[:0] for i := range m.levels { - if l := &m.levels[i]; l.iterKey != nil { - m.heap.items = append(m.heap.items, l) - } else { - m.err = firstError(m.err, l.iter.Error()) - if m.err != nil { - return - } + if l := &m.levels[i]; l.iterKV != nil { + m.heap.items = append(m.heap.items, mergingIterHeapItem{mergingIterLevel: l}) } } m.heap.init() } -func (m *mergingIter) initMinHeap() { +func (m *mergingIter) initMinHeap() error { m.dir = 1 m.heap.reverse = false m.initHeap() - m.initMinRangeDelIters(-1) + return m.initMinRangeDelIters(-1) } // The level of the previous top element was oldTopLevel. Note that all range delete @@ -350,9 +345,9 @@ func (m *mergingIter) initMinHeap() { // the range delete iterator == oldTopLevel is positioned at or past the key of the // previous top element. We need to position the range delete iterators from oldTopLevel + 1 // to the level of the current top element. -func (m *mergingIter) initMinRangeDelIters(oldTopLevel int) { +func (m *mergingIter) initMinRangeDelIters(oldTopLevel int) error { if m.heap.len() == 0 { - return + return nil } // Position the range-del iterators at levels <= m.heap.items[0].index. 
@@ -362,15 +357,20 @@ func (m *mergingIter) initMinRangeDelIters(oldTopLevel int) { if l.rangeDelIter == nil { continue } - l.tombstone = l.rangeDelIter.SeekGE(item.iterKey.UserKey) + var err error + l.tombstone, err = l.rangeDelIter.SeekGE(item.iterKV.K.UserKey) + if err != nil { + return err + } } + return nil } -func (m *mergingIter) initMaxHeap() { +func (m *mergingIter) initMaxHeap() error { m.dir = -1 m.heap.reverse = true m.initHeap() - m.initMaxRangeDelIters(-1) + return m.initMaxRangeDelIters(-1) } // The level of the previous top element was oldTopLevel. Note that all range delete @@ -378,9 +378,9 @@ func (m *mergingIter) initMaxHeap() { // the range delete iterator == oldTopLevel is positioned at or before the key of the // previous top element. We need to position the range delete iterators from oldTopLevel + 1 // to the level of the current top element. -func (m *mergingIter) initMaxRangeDelIters(oldTopLevel int) { +func (m *mergingIter) initMaxRangeDelIters(oldTopLevel int) error { if m.heap.len() == 0 { - return + return nil } // Position the range-del iterators at levels <= m.heap.items[0].index. item := m.heap.items[0] @@ -389,18 +389,23 @@ func (m *mergingIter) initMaxRangeDelIters(oldTopLevel int) { if l.rangeDelIter == nil { continue } - l.tombstone = keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKey.UserKey) + tomb, err := keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKV.K.UserKey) + if err != nil { + return err + } + l.tombstone = tomb } + return nil } -func (m *mergingIter) switchToMinHeap() { +func (m *mergingIter) switchToMinHeap() error { if m.heap.len() == 0 { if m.lower != nil { m.SeekGE(m.lower, base.SeekGEFlagsNone) } else { m.First() } - return + return m.err } // We're switching from using a max heap to a min heap. We need to advance @@ -413,78 +418,47 @@ func (m *mergingIter) switchToMinHeap() { // The current key is a:2 and i2 is pointed at a:1. 
When we switch to forward // iteration, we want to return a key that is greater than a:2. - key := m.heap.items[0].iterKey - cur := m.heap.items[0] + key := m.heap.items[0].iterKV.K + cur := m.heap.items[0].mergingIterLevel for i := range m.levels { l := &m.levels[i] if l == cur { continue } - - // If the iterator is exhausted, it may be out of bounds if range - // deletions modified our search key as we descended. we need to - // reposition it within the search bounds. If the current key is a - // range tombstone, the iterator might still be exhausted but at a - // sstable boundary sentinel. It would be okay to reposition an - // interator like this only through successive Next calls, except that - // it would violate the levelIter's invariants by causing it to return - // a key before the lower bound. - // - // bounds = [ f, _ ) - // L0: [ b ] [ f* z ] - // L1: [ a |----| k y ] - // L2: [ c (d) ] [ e g m ] - // L3: [ x ] - // - // * - current key [] - table bounds () - heap item - // - // In the above diagram, the L2 iterator is positioned at a sstable - // boundary (d) outside the lower bound (f). It arrived here from a - // seek whose seek-key was modified by a range tombstone. If we called - // Next on the L2 iterator, it would return e, violating its lower - // bound. Instead, we seek it to >= f and Next from there. 
- - if l.iterKey == nil || (m.lower != nil && l.isSyntheticIterBoundsKey && - l.iterKey.IsExclusiveSentinel() && - m.heap.cmp(l.iterKey.UserKey, m.lower) <= 0) { - if m.lower != nil { - l.iterKey, l.iterValue = l.iter.SeekGE(m.lower, base.SeekGEFlagsNone) - } else { - l.iterKey, l.iterValue = l.iter.First() - } - } - for ; l.iterKey != nil; l.iterKey, l.iterValue = l.iter.Next() { - if base.InternalCompare(m.heap.cmp, *key, *l.iterKey) < 0 { + for l.iterKV = l.iter.Next(); l.iterKV != nil; l.iterKV = l.iter.Next() { + if base.InternalCompare(m.heap.cmp, key, l.iterKV.K) < 0 { // key < iter-key break } // key >= iter-key } + if l.iterKV == nil { + if err := l.iter.Error(); err != nil { + return err + } + } } // Special handling for the current iterator because we were using its key - // above. The iterator cur.iter may still be exhausted at a sstable boundary - // sentinel. Similar to the logic applied to the other levels, in these - // cases we seek the iterator to the first key in order to avoid violating - // levelIter's invariants. See the example in the for loop above. - if m.lower != nil && cur.isSyntheticIterBoundsKey && cur.iterKey.IsExclusiveSentinel() && - m.heap.cmp(cur.iterKey.UserKey, m.lower) <= 0 { - cur.iterKey, cur.iterValue = cur.iter.SeekGE(m.lower, base.SeekGEFlagsNone) - } else { - cur.iterKey, cur.iterValue = cur.iter.Next() + // above. + cur.iterKV = cur.iter.Next() + if cur.iterKV == nil { + if err := cur.iter.Error(); err != nil { + return err + } } - m.initMinHeap() + return m.initMinHeap() } -func (m *mergingIter) switchToMaxHeap() { +func (m *mergingIter) switchToMaxHeap() error { if m.heap.len() == 0 { if m.upper != nil { m.SeekLT(m.upper, base.SeekLTFlagsNone) } else { m.Last() } - return + return m.err } // We're switching from using a min heap to a max heap. We need to backup any @@ -496,8 +470,8 @@ func (m *mergingIter) switchToMaxHeap() { // // The current key is b:2 and i2 is pointing at b:1. 
When we switch to // reverse iteration, we want to return a key that is less than b:2. - key := m.heap.items[0].iterKey - cur := m.heap.items[0] + key := m.heap.items[0].iterKV.K + cur := m.heap.items[0].mergingIterLevel for i := range m.levels { l := &m.levels[i] @@ -505,87 +479,34 @@ func (m *mergingIter) switchToMaxHeap() { continue } - // If the iterator is exhausted, it may be out of bounds if range - // deletions modified our search key as we descended. we need to - // reposition it within the search bounds. If the current key is a - // range tombstone, the iterator might still be exhausted but at a - // sstable boundary sentinel. It would be okay to reposition an - // interator like this only through successive Prev calls, except that - // it would violate the levelIter's invariants by causing it to return - // a key beyond the upper bound. - // - // bounds = [ _, g ) - // L0: [ b ] [ f* z ] - // L1: [ a |-------| k y ] - // L2: [ c d ] h [(i) m ] - // L3: [ e x ] - // - // * - current key [] - table bounds () - heap item - // - // In the above diagram, the L2 iterator is positioned at a sstable - // boundary (i) outside the upper bound (g). It arrived here from a - // seek whose seek-key was modified by a range tombstone. If we called - // Prev on the L2 iterator, it would return h, violating its upper - // bound. Instead, we seek it to < g, and Prev from there. 
- - if l.iterKey == nil || (m.upper != nil && l.isSyntheticIterBoundsKey && - l.iterKey.IsExclusiveSentinel() && m.heap.cmp(l.iterKey.UserKey, m.upper) >= 0) { - if m.upper != nil { - l.iterKey, l.iterValue = l.iter.SeekLT(m.upper, base.SeekLTFlagsNone) - } else { - l.iterKey, l.iterValue = l.iter.Last() - } - } - for ; l.iterKey != nil; l.iterKey, l.iterValue = l.iter.Prev() { - if base.InternalCompare(m.heap.cmp, *key, *l.iterKey) > 0 { + for l.iterKV = l.iter.Prev(); l.iterKV != nil; l.iterKV = l.iter.Prev() { + if base.InternalCompare(m.heap.cmp, key, l.iterKV.K) > 0 { // key > iter-key break } // key <= iter-key } + if l.iterKV == nil { + if err := l.iter.Error(); err != nil { + return err + } + } } // Special handling for the current iterator because we were using its key - // above. The iterator cur.iter may still be exhausted at a sstable boundary - // sentinel. Similar to the logic applied to the other levels, in these - // cases we seek the iterator to in order to avoid violating levelIter's - // invariants by Prev-ing through files. See the example in the for loop // above. - if m.upper != nil && cur.isSyntheticIterBoundsKey && cur.iterKey.IsExclusiveSentinel() && - m.heap.cmp(cur.iterKey.UserKey, m.upper) >= 0 { - cur.iterKey, cur.iterValue = cur.iter.SeekLT(m.upper, base.SeekLTFlagsNone) - } else { - cur.iterKey, cur.iterValue = cur.iter.Prev() - } - m.initMaxHeap() -} - -// maybeNextEntryWithinPrefix steps to the next entry, as long as the iteration -// prefix has not already been exceeded. If it has, it exhausts the iterator by -// resetting the heap to empty. -func (m *mergingIter) maybeNextEntryWithinPrefix(l *mergingIterLevel) { - if s := m.split(l.iterKey.UserKey); !bytes.Equal(m.prefix, l.iterKey.UserKey[:s]) { - // The item at the root of the heap already exceeds the iteration - // prefix. We should not advance any more. Clear the heap to reflect - // that the iterator is now exhausted (within this prefix, at - // least). 
- m.heap.items = m.heap.items[:0] - return + cur.iterKV = cur.iter.Prev() + if cur.iterKV == nil { + if err := cur.iter.Error(); err != nil { + return err + } } - m.nextEntry(l, nil /* succKey */) + return m.initMaxHeap() } // nextEntry unconditionally steps to the next entry. item is the current top // item in the heap. -// -// nextEntry should be called directly when not in prefix-iteration mode, or by -// Next. During prefix iteration mode, all other callers should use -// maybeNextEntryWithinPrefix which will avoid advancing the iterator if the -// current iteration prefix has been exhausted. See the comment within -// nextEntry's body for an explanation of why other callers should call -// maybeNextEntryWithinPrefix, which will ensure the documented invariant is -// preserved. -func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) { +func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) error { // INVARIANT: If in prefix iteration mode, item.iterKey must have a prefix equal // to m.prefix. This invariant is important for ensuring TrySeekUsingNext // optimizations behave correctly. @@ -607,52 +528,58 @@ func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) { // prefix. If nextEntry is ever invoked while we're already beyond the // current prefix, we're violating the invariant. 
if invariants.Enabled && m.prefix != nil { - if s := m.split(l.iterKey.UserKey); !bytes.Equal(m.prefix, l.iterKey.UserKey[:s]) { + if p := m.split.Prefix(l.iterKV.K.UserKey); !bytes.Equal(m.prefix, p) { m.logger.Fatalf("mergingIter: prefix violation: nexting beyond prefix %q; existing heap root %q\n%s", - m.prefix, l.iterKey, debug.Stack()) + m.prefix, l.iterKV, debug.Stack()) } } oldTopLevel := l.index - oldRangeDelIter := l.rangeDelIter + oldRangeDelIterGeneration := l.rangeDelIterGeneration if succKey == nil { - l.iterKey, l.iterValue = l.iter.Next() + l.iterKV = l.iter.Next() } else { - l.iterKey, l.iterValue = l.iter.NextPrefix(succKey) + l.iterKV = l.iter.NextPrefix(succKey) } - if l.iterKey != nil { - if m.heap.len() > 1 { - m.heap.fix(0) + if l.iterKV == nil { + if err := l.iter.Error(); err != nil { + return err } - if l.rangeDelIter != oldRangeDelIter { + m.heap.pop() + } else { + if m.prefix != nil && !bytes.Equal(m.prefix, m.split.Prefix(l.iterKV.K.UserKey)) { + // Set keys without a matching prefix to their zero values when in prefix + // iteration mode and remove iterated level from heap. + l.iterKV = nil + m.heap.pop() + } else if m.heap.len() > 1 { + m.heap.fixTop() + } + if l.rangeDelIterGeneration != oldRangeDelIterGeneration { // The rangeDelIter changed which indicates that the l.iter moved to the // next sstable. We have to update the tombstone for oldTopLevel as well. oldTopLevel-- } - } else { - m.err = l.iter.Error() - if m.err == nil { - m.heap.pop() - } } // The cached tombstones are only valid for the levels // [0,oldTopLevel]. Updated the cached tombstones for any levels in the range // [oldTopLevel+1,heap[0].index]. - m.initMinRangeDelIters(oldTopLevel) + return m.initMinRangeDelIters(oldTopLevel) } // isNextEntryDeleted starts from the current entry (as the next entry) and if // it is deleted, moves the iterators forward as needed and returns true, else -// it returns false. item is the top item in the heap. +// it returns false. 
item is the top item in the heap. If any of the required +// iterator operations error, the error is returned without updating m.err. // // During prefix iteration mode, isNextEntryDeleted will exhaust the iterator by // clearing the heap if the deleted key(s) extend beyond the iteration prefix // during prefix-iteration mode. -func (m *mergingIter) isNextEntryDeleted(item *mergingIterLevel) bool { - // Look for a range deletion tombstone containing item.iterKey at higher +func (m *mergingIter) isNextEntryDeleted(item *mergingIterLevel) (bool, error) { + // Look for a range deletion tombstone containing item.iterKV at higher // levels (level < item.index). If we find such a range tombstone we know // it deletes the key in the current level. Also look for a range // deletion at the current level (level == item.index). If we find such a @@ -666,68 +593,44 @@ func (m *mergingIter) isNextEntryDeleted(item *mergingIterLevel) bool { // direction. continue } - if m.heap.cmp(l.tombstone.End, item.iterKey.UserKey) <= 0 { + if m.heap.cmp(l.tombstone.End, item.iterKV.K.UserKey) <= 0 { // The current key is at or past the tombstone end key. // // NB: for the case that this l.rangeDelIter is provided by a levelIter we know that - // the levelIter must be positioned at a key >= item.iterKey. So it is sufficient to seek the + // the levelIter must be positioned at a key >= item.iterKV. So it is sufficient to seek the // current l.rangeDelIter (since any range del iterators that will be provided by the - // levelIter in the future cannot contain item.iterKey). Also, it is possible that we + // levelIter in the future cannot contain item.iterKV). Also, it is possible that we // will encounter parts of the range delete that should be ignored -- we handle that // below. 
- l.tombstone = l.rangeDelIter.SeekGE(item.iterKey.UserKey) + var err error + l.tombstone, err = l.rangeDelIter.SeekGE(item.iterKV.K.UserKey) + if err != nil { + return false, err + } } if l.tombstone == nil { continue } - // Reasoning for correctness of untruncated tombstone handling when the untruncated - // tombstone is at a higher level: - // The iterator corresponding to this tombstone is still in the heap so it must be - // positioned >= item.iterKey. Which means the Largest key bound of the sstable containing this - // tombstone is >= item.iterKey. So the upper limit of this tombstone cannot be file-bounds-constrained - // to < item.iterKey. But it is possible that item.key < smallestUserKey, in which - // case this tombstone should be ignored. - // - // Example 1: - // sstable bounds [c#8, g#12] containing a tombstone [b, i)#7, and key is c#6. The - // smallestUserKey is c, so we know the key is within the file bounds and the tombstone - // [b, i) covers it. - // - // Example 2: - // Same sstable bounds but key is b#10. The smallestUserKey is c, so the tombstone [b, i) - // does not cover this key. - // - // For a tombstone at the same level as the key, the file bounds are trivially satisfied. - if (l.smallestUserKey == nil || m.heap.cmp(l.smallestUserKey, item.iterKey.UserKey) <= 0) && - l.tombstone.VisibleAt(m.snapshot) && l.tombstone.Contains(m.heap.cmp, item.iterKey.UserKey) { + if l.tombstone.VisibleAt(m.snapshot) && m.heap.cmp(l.tombstone.Start, item.iterKV.K.UserKey) <= 0 { if level < item.index { // We could also do m.seekGE(..., level + 1). The levels from - // [level + 1, item.index) are already after item.iterKey so seeking them may be + // [level + 1, item.index) are already after item.iterKV so seeking them may be // wasteful. - // We can seek up to the min of largestUserKey and tombstone.End. - // - // Using example 1 above, we can seek to the smaller of g and i, which is g. + // We can seek up to tombstone.End. 
// - // Another example, where the sstable bounds are [c#8, i#InternalRangeDelSentinel], - // and the tombstone is [b, i)#8. Seeking to i is correct since it is seeking up to - // the exclusive bound of the tombstone. We do not need to look at - // isLargestKeyRangeDelSentinel. - // - // Progress argument: Since this file is at a higher level than item.iterKey we know + // Progress argument: Since this file is at a higher level than item.iterKV we know // that the iterator in this file must be positioned within its bounds and at a key - // X > item.iterKey (otherwise it would be the min of the heap). It is not - // possible for X.UserKey == item.iterKey.UserKey, since it is incompatible with - // X > item.iterKey (a lower version cannot be in a higher sstable), so it must be that - // X.UserKey > item.iterKey.UserKey. Which means l.largestUserKey > item.key.UserKey. - // We also know that l.tombstone.End > item.iterKey.UserKey. So the min of these, - // seekKey, computed below, is > item.iterKey.UserKey, so the call to seekGE() will + // X > item.iterKV (otherwise it would be the min of the heap). It is not + // possible for X.UserKey == item.iterKV.UserKey, since it is incompatible with + // X > item.iterKV (a lower version cannot be in a higher sstable), so it must be that + // X.UserKey > item.iterKV.UserKey. Which means l.largestUserKey > item.key.UserKey. + // We also know that l.tombstone.End > item.iterKV.UserKey. So the min of these, + // seekKey, computed below, is > item.iterKV.UserKey, so the call to seekGE() will // make forward progress. - seekKey := l.tombstone.End - if l.largestUserKey != nil && m.heap.cmp(l.largestUserKey, seekKey) < 0 { - seekKey = l.largestUserKey - } + m.seekKeyBuf = append(m.seekKeyBuf[:0], l.tombstone.End...) + seekKey := m.seekKeyBuf // This seek is not directly due to a SeekGE call, so we don't know // enough about the underlying iterator positions, and so we keep the // try-seek-using-next optimization disabled. 
Additionally, if we're in @@ -747,18 +650,18 @@ func (m *mergingIter) isNextEntryDeleted(item *mergingIterLevel) bool { // file the seek will land on, we need to detect it in order to // trigger construction of the combined iterator. if m.prefix != nil { - if n := m.split(seekKey); !bytes.Equal(m.prefix, seekKey[:n]) { + if !bytes.Equal(m.prefix, m.split.Prefix(seekKey)) { for i := item.index; i < len(m.levels); i++ { - // Remove this level from the heap. Setting iterKey and iterValue - // to their zero values should be sufficient for initMinHeap to not - // re-initialize the heap with them in it. Other fields in - // mergingIterLevel can remain as-is; the iter/rangeDelIter needs - // to stay intact for future trySeekUsingNexts to work, the level - // iter boundary context is owned by the levelIter which is not - // being repositioned, and any tombstones in these levels will be - // irrelevant for us anyway. - m.levels[i].iterKey = nil - m.levels[i].iterValue = base.LazyValue{} + // Remove this level from the heap. Setting iterKV + // to nil should be sufficient for initMinHeap to + // not re-initialize the heap with them in it. Other + // fields in mergingIterLevel can remain as-is; the + // iter/rangeDelIter needs to stay intact for future + // trySeekUsingNexts to work, the level iter + // boundary context is owned by the levelIter which + // is not being repositioned, and any tombstones in + // these levels will be irrelevant for us anyway. + m.levels[i].iterKV = nil } // TODO(bilal): Consider a more efficient way of removing levels from // the heap without reinitializing all of it. This would likely @@ -766,105 +669,123 @@ func (m *mergingIter) isNextEntryDeleted(item *mergingIterLevel) bool { // item in the mergingIterLevel, and then swapping that item in the // heap with the last-positioned heap item, and shrinking the heap by // one. 
- m.initMinHeap() - return true + if err := m.initMinHeap(); err != nil { + return false, err + } + return true, nil } } - m.seekGE(seekKey, item.index, base.SeekGEFlagsNone.EnableRelativeSeek()) - return true + if err := m.seekGE(seekKey, item.index, base.SeekGEFlagsNone.EnableRelativeSeek()); err != nil { + return false, err + } + return true, nil } - if l.tombstone.CoversAt(m.snapshot, item.iterKey.SeqNum()) { - if m.prefix == nil { - m.nextEntry(item, nil /* succKey */) - } else { - m.maybeNextEntryWithinPrefix(item) + if l.tombstone.CoversAt(m.snapshot, item.iterKV.SeqNum()) { + if err := m.nextEntry(item, nil /* succKey */); err != nil { + return false, err } - return true + return true, nil } } } - return false + return false, nil } // Starting from the current entry, finds the first (next) entry that can be returned. -func (m *mergingIter) findNextEntry() (*InternalKey, base.LazyValue) { +// +// If an error occurs, m.err is updated to hold the error and findNextentry +// returns a nil internal key. +func (m *mergingIter) findNextEntry() *base.InternalKV { for m.heap.len() > 0 && m.err == nil { - item := m.heap.items[0] - if m.levels[item.index].isSyntheticIterBoundsKey { - break - } - - m.addItemStats(item) - - // Skip ignorable boundary keys. These are not real keys and exist to - // keep sstables open until we've surpassed their end boundaries so that - // their range deletions are visible. - if m.levels[item.index].isIgnorableBoundaryKey { - if m.prefix == nil { - m.nextEntry(item, nil /* succKey */) - } else { - m.maybeNextEntryWithinPrefix(item) + item := m.heap.items[0].mergingIterLevel + + // The levelIter internal iterator will interleave exclusive sentinel + // keys to keep files open until their range deletions are no longer + // necessary. Sometimes these are interleaved with the user key of a + // file's largest key, in which case they may simply be stepped over to + // move to the next file in the forward direction. 
Other times they're + // interleaved at the user key of the user-iteration boundary, if that + // falls within the bounds of a file. In the latter case, there are no + // more keys < m.upper, and we can stop iterating. + // + // We perform a key comparison to differentiate between these two cases. + // This key comparison is considered okay because it only happens for + // sentinel keys. It may be eliminated after #2863. + if m.levels[item.index].iterKV.K.IsExclusiveSentinel() { + if m.upper != nil && m.heap.cmp(m.levels[item.index].iterKV.K.UserKey, m.upper) >= 0 { + break + } + // This key is the largest boundary of a file and can be skipped now + // that the file's range deletions are no longer relevant. + m.err = m.nextEntry(item, nil /* succKey */) + if m.err != nil { + return nil } continue } + m.addItemStats(item) + // Check if the heap root key is deleted by a range tombstone in a // higher level. If it is, isNextEntryDeleted will advance the iterator // to a later key (through seeking or nexting). - if m.isNextEntryDeleted(item) { + isDeleted, err := m.isNextEntryDeleted(item) + if err != nil { + m.err = err + return nil + } else if isDeleted { m.stats.PointsCoveredByRangeTombstones++ continue } // Check if the key is visible at the iterator sequence numbers. - if !item.iterKey.Visible(m.snapshot, m.batchSnapshot) { - if m.prefix == nil { - m.nextEntry(item, nil /* succKey */) - } else { - m.maybeNextEntryWithinPrefix(item) + if !item.iterKV.Visible(m.snapshot, m.batchSnapshot) { + m.err = m.nextEntry(item, nil /* succKey */) + if m.err != nil { + return nil } continue } // The heap root is visible and not deleted by any range tombstones. // Return it. - return item.iterKey, item.iterValue + return item.iterKV } - return nil, base.LazyValue{} + return nil } // Steps to the prev entry. item is the current top item in the heap. 
-func (m *mergingIter) prevEntry(l *mergingIterLevel) { +func (m *mergingIter) prevEntry(l *mergingIterLevel) error { oldTopLevel := l.index - oldRangeDelIter := l.rangeDelIter - if l.iterKey, l.iterValue = l.iter.Prev(); l.iterKey != nil { + oldRangeDelIterGeneration := l.rangeDelIterGeneration + if l.iterKV = l.iter.Prev(); l.iterKV != nil { if m.heap.len() > 1 { - m.heap.fix(0) + m.heap.fixTop() } - if l.rangeDelIter != oldRangeDelIter && l.rangeDelIter != nil { + if l.rangeDelIterGeneration != oldRangeDelIterGeneration && l.rangeDelIter != nil { // The rangeDelIter changed which indicates that the l.iter moved to the // previous sstable. We have to update the tombstone for oldTopLevel as // well. oldTopLevel-- } } else { - m.err = l.iter.Error() - if m.err == nil { - m.heap.pop() + if err := l.iter.Error(); err != nil { + return err } + m.heap.pop() } // The cached tombstones are only valid for the levels // [0,oldTopLevel]. Updated the cached tombstones for any levels in the range // [oldTopLevel+1,heap[0].index]. - m.initMaxRangeDelIters(oldTopLevel) + return m.initMaxRangeDelIters(oldTopLevel) } // isPrevEntryDeleted() starts from the current entry (as the prev entry) and if it is deleted, // moves the iterators backward as needed and returns true, else it returns false. item is the top // item in the heap. -func (m *mergingIter) isPrevEntryDeleted(item *mergingIterLevel) bool { - // Look for a range deletion tombstone containing item.iterKey at higher +func (m *mergingIter) isPrevEntryDeleted(item *mergingIterLevel) (bool, error) { + // Look for a range deletion tombstone containing item.iterKV at higher // levels (level < item.index). If we find such a range tombstone we know // it deletes the key in the current level. Also look for a range // deletion at the current level (level == item.index). If we find such a @@ -878,114 +799,115 @@ func (m *mergingIter) isPrevEntryDeleted(item *mergingIterLevel) bool { // direction. 
continue } - if m.heap.cmp(item.iterKey.UserKey, l.tombstone.Start) < 0 { + if m.heap.cmp(item.iterKV.K.UserKey, l.tombstone.Start) < 0 { // The current key is before the tombstone start key. // // NB: for the case that this l.rangeDelIter is provided by a levelIter we know that - // the levelIter must be positioned at a key < item.iterKey. So it is sufficient to seek the + // the levelIter must be positioned at a key < item.iterKV. So it is sufficient to seek the // current l.rangeDelIter (since any range del iterators that will be provided by the - // levelIter in the future cannot contain item.iterKey). Also, it is it is possible that we + // levelIter in the future cannot contain item.iterKV). Also, it is it is possible that we // will encounter parts of the range delete that should be ignored -- we handle that // below. - l.tombstone = keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKey.UserKey) + + tomb, err := keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKV.K.UserKey) + if err != nil { + return false, err + } + l.tombstone = tomb } if l.tombstone == nil { continue } - - // Reasoning for correctness of untruncated tombstone handling when the untruncated - // tombstone is at a higher level: - // - // The iterator corresponding to this tombstone is still in the heap so it must be - // positioned <= item.iterKey. Which means the Smallest key bound of the sstable containing this - // tombstone is <= item.iterKey. So the lower limit of this tombstone cannot have been - // file-bounds-constrained to > item.iterKey. But it is possible that item.key >= Largest - // key bound of this sstable, in which case this tombstone should be ignored. - // - // Example 1: - // sstable bounds [c#8, g#12] containing a tombstone [b, i)#7, and key is f#6. The - // largestUserKey is g, so we know the key is within the file bounds and the tombstone - // [b, i) covers it. - // - // Example 2: - // Same sstable but the key is g#6. 
This cannot happen since the [b, i)#7 untruncated - // tombstone was involved in a compaction which must have had a file to the right of this - // sstable that is part of the same atomic compaction group for future compactions. That - // file must have bounds that cover g#6 and this levelIter must be at that file. - // - // Example 3: - // sstable bounds [c#8, g#RangeDelSentinel] containing [b, i)#7 and the key is g#10. - // This key is not deleted by this tombstone. We need to look at - // isLargestUserKeyExclusive. - // - // For a tombstone at the same level as the key, the file bounds are trivially satisfied. - - // Default to within bounds. - withinLargestSSTableBound := true - if l.largestUserKey != nil { - cmpResult := m.heap.cmp(l.largestUserKey, item.iterKey.UserKey) - withinLargestSSTableBound = cmpResult > 0 || (cmpResult == 0 && !l.isLargestUserKeyExclusive) - } - if withinLargestSSTableBound && l.tombstone.Contains(m.heap.cmp, item.iterKey.UserKey) && l.tombstone.VisibleAt(m.snapshot) { + if l.tombstone.VisibleAt(m.snapshot) && m.heap.cmp(l.tombstone.End, item.iterKV.K.UserKey) > 0 { if level < item.index { // We could also do m.seekLT(..., level + 1). The levels from - // [level + 1, item.index) are already before item.iterKey so seeking them may be + // [level + 1, item.index) are already before item.iterKV so seeking them may be // wasteful. - // We can seek up to the max of smallestUserKey and tombstone.Start.UserKey. - // - // Using example 1 above, we can seek to the larger of c and b, which is c. + // We can seek up to tombstone.Start.UserKey. // // Progress argument: We know that the iterator in this file is positioned within - // its bounds and at a key X < item.iterKey (otherwise it would be the max of the heap). - // So smallestUserKey <= item.iterKey.UserKey and we already know that - // l.tombstone.Start.UserKey <= item.iterKey.UserKey. 
So the seekKey computed below - // is <= item.iterKey.UserKey, and since we do a seekLT() we will make backwards + // its bounds and at a key X < item.iterKV (otherwise it would be the max of the heap). + // So smallestUserKey <= item.iterKV.UserKey and we already know that + // l.tombstone.Start.UserKey <= item.iterKV.UserKey. So the seekKey computed below + // is <= item.iterKV.UserKey, and since we do a seekLT() we will make backwards // progress. - seekKey := l.tombstone.Start - if l.smallestUserKey != nil && m.heap.cmp(l.smallestUserKey, seekKey) > 0 { - seekKey = l.smallestUserKey - } + m.seekKeyBuf = append(m.seekKeyBuf[:0], l.tombstone.Start...) + seekKey := m.seekKeyBuf // We set the relative-seek flag. This is important when // iterating with lazy combined iteration. If there's a range // key between this level's current file and the file the seek // will land on, we need to detect it in order to trigger // construction of the combined iterator. - m.seekLT(seekKey, item.index, base.SeekLTFlagsNone.EnableRelativeSeek()) - return true + if err := m.seekLT(seekKey, item.index, base.SeekLTFlagsNone.EnableRelativeSeek()); err != nil { + return false, err + } + return true, nil } - if l.tombstone.CoversAt(m.snapshot, item.iterKey.SeqNum()) { - m.prevEntry(item) - return true + if l.tombstone.CoversAt(m.snapshot, item.iterKV.SeqNum()) { + if err := m.prevEntry(item); err != nil { + return false, err + } + return true, nil } } } - return false + return false, nil } // Starting from the current entry, finds the first (prev) entry that can be returned. -func (m *mergingIter) findPrevEntry() (*InternalKey, base.LazyValue) { +// +// If an error occurs, m.err is updated to hold the error and findNextentry +// returns a nil internal key. 
+func (m *mergingIter) findPrevEntry() *base.InternalKV { for m.heap.len() > 0 && m.err == nil { - item := m.heap.items[0] - if m.levels[item.index].isSyntheticIterBoundsKey { - break + item := m.heap.items[0].mergingIterLevel + + // The levelIter internal iterator will interleave exclusive sentinel + // keys to keep files open until their range deletions are no longer + // necessary. Sometimes these are interleaved with the user key of a + // file's smallest key, in which case they may simply be stepped over to + // move to the next file in the backward direction. Other times they're + // interleaved at the user key of the user-iteration boundary, if that + // falls within the bounds of a file. In the latter case, there are no + // more keys ≥ m.lower, and we can stop iterating. + // + // We perform a key comparison to differentiate between these two cases. + // This key comparison is considered okay because it only happens for + // sentinel keys. It may be eliminated after #2863. + if m.levels[item.index].iterKV.K.IsExclusiveSentinel() { + if m.lower != nil && m.heap.cmp(m.levels[item.index].iterKV.K.UserKey, m.lower) <= 0 { + break + } + // This key is the smallest boundary of a file and can be skipped + // now that the file's range deletions are no longer relevant. + m.err = m.prevEntry(item) + if m.err != nil { + return nil + } + continue } + m.addItemStats(item) - if m.isPrevEntryDeleted(item) { + if isDeleted, err := m.isPrevEntryDeleted(item); err != nil { + m.err = err + return nil + } else if isDeleted { m.stats.PointsCoveredByRangeTombstones++ continue } - if item.iterKey.Visible(m.snapshot, m.batchSnapshot) && - (!m.levels[item.index].isIgnorableBoundaryKey) { - return item.iterKey, item.iterValue + if item.iterKV.Visible(m.snapshot, m.batchSnapshot) { + return item.iterKV } - m.prevEntry(item) + m.err = m.prevEntry(item) } - return nil, base.LazyValue{} + return nil } // Seeks levels >= level to >= key. 
Additionally uses range tombstones to extend the seeks. -func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) { +// +// If an error occurs, seekGE returns the error without setting m.err. +func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) error { // When seeking, we can use tombstones to adjust the key we seek to on each // level. Consider the series of range tombstones: // @@ -1033,8 +955,7 @@ func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) { // Because the L5 iterator has already advanced to the next sstable, the // merging iterator cannot observe the [b-c) range tombstone and will // mistakenly return L6's deleted point key 'b'. - if invariants.Enabled && flags.TrySeekUsingNext() && !m.forceEnableSeekOpt && - disableSeekOpt(key, uintptr(unsafe.Pointer(m))) { + if testingDisableSeekOpt(key, uintptr(unsafe.Pointer(m))) && !m.forceEnableSeekOpt { flags = flags.DisableTrySeekUsingNext() } @@ -1045,9 +966,21 @@ func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) { l := &m.levels[level] if m.prefix != nil { - l.iterKey, l.iterValue = l.iter.SeekPrefixGE(m.prefix, key, flags) + l.iterKV = l.iter.SeekPrefixGE(m.prefix, key, flags) + if l.iterKV != nil { + if !bytes.Equal(m.prefix, m.split.Prefix(l.iterKV.K.UserKey)) { + // Prevent keys without a matching prefix from being added to the heap by setting + // iterKey and iterValue to their zero values before calling initMinHeap. + l.iterKV = nil + } + } } else { - l.iterKey, l.iterValue = l.iter.SeekGE(key, flags) + l.iterKV = l.iter.SeekGE(key, flags) + } + if l.iterKV == nil { + if err := l.iter.Error(); err != nil { + return err + } } // If this level contains overlapping range tombstones, alter the seek @@ -1060,43 +993,25 @@ func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) { (m.combinedIterState == nil || m.combinedIterState.initialized) { // The level has a range-del iterator. 
Find the tombstone containing // the search key. - // - // For untruncated tombstones that are possibly file-bounds-constrained, we are using a - // levelIter which will set smallestUserKey and largestUserKey. Since the levelIter - // is at this file we know that largestUserKey >= key, so we know that the - // tombstone we find cannot be file-bounds-constrained in its upper bound to something < key. - // We do need to compare with smallestUserKey to ensure that the tombstone is not - // file-bounds-constrained in its lower bound. - // - // See the detailed comments in isNextEntryDeleted() on why similar containment and - // seeking logic is correct. The subtle difference here is that key is a user key, - // so we can have a sstable with bounds [c#8, i#InternalRangeDelSentinel], and the - // tombstone is [b, k)#8 and the seek key is i: levelIter.SeekGE(i) will move past - // this sstable since it realizes the largest key is a InternalRangeDelSentinel. - l.tombstone = rangeDelIter.SeekGE(key) - if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) && l.tombstone.Contains(m.heap.cmp, key) && - (l.smallestUserKey == nil || m.heap.cmp(l.smallestUserKey, key) <= 0) { - // NB: Based on the comment above l.largestUserKey >= key, and based on the - // containment condition tombstone.End > key, so the assignment to key results - // in a monotonically non-decreasing key across iterations of this loop. + var err error + l.tombstone, err = rangeDelIter.SeekGE(key) + if err != nil { + return err + } + if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) && m.heap.cmp(l.tombstone.Start, key) <= 0 { + // Based on the containment condition tombstone.End > key, so + // the assignment to key results in a monotonically + // non-decreasing key across iterations of this loop. // - // The adjustment of key here can only move it to a larger key. 
Since - // the caller of seekGE guaranteed that the original key was greater - // than or equal to m.lower, the new key will continue to be greater - // than or equal to m.lower. - if l.largestUserKey != nil && - m.heap.cmp(l.largestUserKey, l.tombstone.End) < 0 { - // Truncate the tombstone for seeking purposes. Note that this can over-truncate - // but that is harmless for this seek optimization. - key = l.largestUserKey - } else { - key = l.tombstone.End - } + // The adjustment of key here can only move it to a larger key. + // Since the caller of seekGE guaranteed that the original key + // was greater than or equal to m.lower, the new key will + // continue to be greater than or equal to m.lower. + key = l.tombstone.End } } } - - m.initMinHeap() + return m.initMinHeap() } func (m *mergingIter) String() string { @@ -1106,27 +1021,42 @@ func (m *mergingIter) String() string { // SeekGE implements base.InternalIterator.SeekGE. Note that SeekGE only checks // the upper bound. It is up to the caller to ensure that key is greater than // or equal to the lower bound. -func (m *mergingIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { - m.err = nil // clear cached iteration error +func (m *mergingIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { m.prefix = nil - m.seekGE(key, 0 /* start level */, flags) + m.err = m.seekGE(key, 0 /* start level */, flags) + if m.err != nil { + return nil + } return m.findNextEntry() } -// SeekPrefixGE implements base.InternalIterator.SeekPrefixGE. Note that -// SeekPrefixGE only checks the upper bound. It is up to the caller to ensure -// that key is greater than or equal to the lower bound. -func (m *mergingIter) SeekPrefixGE( +// SeekPrefixGE implements base.InternalIterator.SeekPrefixGE. 
+func (m *mergingIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV { + return m.SeekPrefixGEStrict(prefix, key, flags) +} + +// SeekPrefixGEStrict implements topLevelIterator.SeekPrefixGEStrict. Note that +// SeekPrefixGEStrict explicitly checks that the key has a matching prefix. +func (m *mergingIter) SeekPrefixGEStrict( prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - m.err = nil // clear cached iteration error +) *base.InternalKV { m.prefix = prefix - m.seekGE(key, 0 /* start level */, flags) - return m.findNextEntry() + m.err = m.seekGE(key, 0 /* start level */, flags) + if m.err != nil { + return nil + } + + iterKV := m.findNextEntry() + if invariants.Enabled && iterKV != nil { + if !bytes.Equal(m.prefix, m.split.Prefix(iterKV.K.UserKey)) { + m.logger.Fatalf("mergingIter: prefix violation: returning key %q without prefix %q\n", iterKV, m.prefix) + } + } + return iterKV } // Seeks levels >= level to < key. Additionally uses range tombstones to extend the seeks. -func (m *mergingIter) seekLT(key []byte, level int, flags base.SeekLTFlags) { +func (m *mergingIter) seekLT(key []byte, level int, flags base.SeekLTFlags) error { // See the comment in seekGE regarding using tombstones to adjust the seek // target per level. m.prefix = nil @@ -1136,7 +1066,12 @@ func (m *mergingIter) seekLT(key []byte, level int, flags base.SeekLTFlags) { } l := &m.levels[level] - l.iterKey, l.iterValue = l.iter.SeekLT(key, flags) + l.iterKV = l.iter.SeekLT(key, flags) + if l.iterKV == nil { + if err := l.iter.Error(); err != nil { + return err + } + } // If this level contains overlapping range tombstones, alter the seek // key accordingly. Caveat: If we're performing lazy-combined iteration, @@ -1148,119 +1083,128 @@ func (m *mergingIter) seekLT(key []byte, level int, flags base.SeekLTFlags) { (m.combinedIterState == nil || m.combinedIterState.initialized) { // The level has a range-del iterator. 
Find the tombstone containing // the search key. - // - // For untruncated tombstones that are possibly file-bounds-constrained we are using a - // levelIter which will set smallestUserKey and largestUserKey. Since the levelIter - // is at this file we know that smallestUserKey <= key, so we know that the - // tombstone we find cannot be file-bounds-constrained in its lower bound to something > key. - // We do need to compare with largestUserKey to ensure that the tombstone is not - // file-bounds-constrained in its upper bound. - // - // See the detailed comments in isPrevEntryDeleted() on why similar containment and - // seeking logic is correct. - - // Default to within bounds. - withinLargestSSTableBound := true - if l.largestUserKey != nil { - cmpResult := m.heap.cmp(l.largestUserKey, key) - withinLargestSSTableBound = cmpResult > 0 || (cmpResult == 0 && !l.isLargestUserKeyExclusive) + tomb, err := keyspan.SeekLE(m.heap.cmp, rangeDelIter, key) + if err != nil { + return err } - - l.tombstone = keyspan.SeekLE(m.heap.cmp, rangeDelIter, key) + l.tombstone = tomb + // Since SeekLT is exclusive on `key` and a tombstone's end key is + // also exclusive, a seek key equal to a tombstone's end key still + // enables the seek optimization (Note this is different than the + // check performed by (*keyspan.Span).Contains). if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) && - l.tombstone.Contains(m.heap.cmp, key) && withinLargestSSTableBound { - // NB: Based on the comment above l.smallestUserKey <= key, and based - // on the containment condition tombstone.Start.UserKey <= key, so the - // assignment to key results in a monotonically non-increasing key - // across iterations of this loop. + m.heap.cmp(key, l.tombstone.End) <= 0 { + // NB: Based on the containment condition + // tombstone.Start.UserKey <= key, so the assignment to key + // results in a monotonically non-increasing key across + // iterations of this loop. 
// - // The adjustment of key here can only move it to a smaller key. Since - // the caller of seekLT guaranteed that the original key was less than - // or equal to m.upper, the new key will continue to be less than or - // equal to m.upper. - if l.smallestUserKey != nil && - m.heap.cmp(l.smallestUserKey, l.tombstone.Start) >= 0 { - // Truncate the tombstone for seeking purposes. Note that this can over-truncate - // but that is harmless for this seek optimization. - key = l.smallestUserKey - } else { - key = l.tombstone.Start - } + // The adjustment of key here can only move it to a smaller key. + // Since the caller of seekLT guaranteed that the original key + // was less than or equal to m.upper, the new key will continue + // to be less than or equal to m.upper. + key = l.tombstone.Start } } } - m.initMaxHeap() + return m.initMaxHeap() } // SeekLT implements base.InternalIterator.SeekLT. Note that SeekLT only checks // the lower bound. It is up to the caller to ensure that key is less than the // upper bound. -func (m *mergingIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { - m.err = nil // clear cached iteration error +func (m *mergingIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { m.prefix = nil - m.seekLT(key, 0 /* start level */, flags) + m.err = m.seekLT(key, 0 /* start level */, flags) + if m.err != nil { + return nil + } return m.findPrevEntry() } // First implements base.InternalIterator.First. Note that First only checks // the upper bound. It is up to the caller to ensure that key is greater than // or equal to the lower bound (e.g. via a call to SeekGE(lower)). 
-func (m *mergingIter) First() (*InternalKey, base.LazyValue) { +func (m *mergingIter) First() *base.InternalKV { m.err = nil // clear cached iteration error m.prefix = nil m.heap.items = m.heap.items[:0] for i := range m.levels { l := &m.levels[i] - l.iterKey, l.iterValue = l.iter.First() + l.iterKV = l.iter.First() + if l.iterKV == nil { + if m.err = l.iter.Error(); m.err != nil { + return nil + } + } + } + if m.err = m.initMinHeap(); m.err != nil { + return nil } - m.initMinHeap() return m.findNextEntry() } // Last implements base.InternalIterator.Last. Note that Last only checks the // lower bound. It is up to the caller to ensure that key is less than the // upper bound (e.g. via a call to SeekLT(upper)) -func (m *mergingIter) Last() (*InternalKey, base.LazyValue) { +func (m *mergingIter) Last() *base.InternalKV { m.err = nil // clear cached iteration error m.prefix = nil for i := range m.levels { l := &m.levels[i] - l.iterKey, l.iterValue = l.iter.Last() + l.iterKV = l.iter.Last() + if l.iterKV == nil { + if m.err = l.iter.Error(); m.err != nil { + return nil + } + } + } + if m.err = m.initMaxHeap(); m.err != nil { + return nil } - m.initMaxHeap() return m.findPrevEntry() } -func (m *mergingIter) Next() (*InternalKey, base.LazyValue) { +func (m *mergingIter) Next() *base.InternalKV { if m.err != nil { - return nil, base.LazyValue{} + return nil } if m.dir != 1 { - m.switchToMinHeap() + if m.err = m.switchToMinHeap(); m.err != nil { + return nil + } return m.findNextEntry() } if m.heap.len() == 0 { - return nil, base.LazyValue{} + return nil } // NB: It's okay to call nextEntry directly even during prefix iteration - // mode (as opposed to indirectly through maybeNextEntryWithinPrefix). - // During prefix iteration mode, we rely on the caller to not call Next if - // the iterator has already advanced beyond the iteration prefix. See the - // comment above the base.InternalIterator interface. 
- m.nextEntry(m.heap.items[0], nil /* succKey */) - return m.findNextEntry() + // mode. During prefix iteration mode, we rely on the caller to not call + // Next if the iterator has already advanced beyond the iteration prefix. + // See the comment above the base.InternalIterator interface. + if m.err = m.nextEntry(m.heap.items[0].mergingIterLevel, nil /* succKey */); m.err != nil { + return nil + } + + iterKV := m.findNextEntry() + if invariants.Enabled && m.prefix != nil && iterKV != nil { + if !bytes.Equal(m.prefix, m.split.Prefix(iterKV.K.UserKey)) { + m.logger.Fatalf("mergingIter: prefix violation: returning key %q without prefix %q\n", iterKV, m.prefix) + } + } + return iterKV } -func (m *mergingIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) { +func (m *mergingIter) NextPrefix(succKey []byte) *base.InternalKV { if m.dir != 1 { panic("pebble: cannot switch directions with NextPrefix") } if m.err != nil || m.heap.len() == 0 { - return nil, LazyValue{} + return nil } if m.levelsPositioned == nil { m.levelsPositioned = make([]bool, len(m.levels)) @@ -1272,55 +1216,81 @@ func (m *mergingIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) { // The heap root necessarily must be positioned at a key < succKey, because // NextPrefix was invoked. - root := &m.heap.items[0] - m.levelsPositioned[(*root).index] = true - if invariants.Enabled && m.heap.cmp((*root).iterKey.UserKey, succKey) >= 0 { + root := m.heap.items[0].mergingIterLevel + if invariants.Enabled && m.heap.cmp((*root).iterKV.K.UserKey, succKey) >= 0 { m.logger.Fatalf("pebble: invariant violation: NextPrefix(%q) called on merging iterator already positioned at %q", - succKey, (*root).iterKey) + succKey, (*root).iterKV) } - m.nextEntry(*root, succKey) - // NB: root is a pointer to the heap root. nextEntry may have changed - // the heap root, so we must not expect root to still point to the same - // level (or to even be valid, if the heap is now exhaused). 
+ // NB: root is the heap root before we call nextEntry; nextEntry may change + // the heap root, so we must not expect `root` to still be the root of the heap, or + // even to be in the heap if the level's iterator becomes exhausted. + if m.err = m.nextEntry(root, succKey); m.err != nil { + return nil + } + // We only consider the level to be conclusively positioned at the next + // prefix if our call to nextEntry did not advance the level onto a range + // deletion's boundary. Range deletions may have bounds within the prefix + // that are still surfaced by NextPrefix. + m.levelsPositioned[root.index] = root.iterKV == nil || !root.iterKV.K.IsExclusiveSentinel() for m.heap.len() > 0 { - if m.levelsPositioned[(*root).index] { + root := m.heap.items[0].mergingIterLevel + if m.levelsPositioned[root.index] { // A level we've previously positioned is at the top of the heap, so // there are no other levels positioned at keys < succKey. We've // advanced as far as we need to. break } + // If the current heap root is a sentinel key, we need to skip it. + // Calling NextPrefix while positioned at a sentinel key is not + // supported. + if root.iterKV.K.IsExclusiveSentinel() { + if m.err = m.nextEntry(root, nil); m.err != nil { + return nil + } + continue + } + // Since this level was not the original heap root when NextPrefix was // called, we don't know whether this level's current key has the // previous prefix or a new one. - if m.heap.cmp((*root).iterKey.UserKey, succKey) >= 0 { + if m.heap.cmp(root.iterKV.K.UserKey, succKey) >= 0 { break } - m.levelsPositioned[(*root).index] = true - m.nextEntry(*root, succKey) + if m.err = m.nextEntry(root, succKey); m.err != nil { + return nil + } + // We only consider the level to be conclusively positioned at the next + // prefix if our call to nextEntry did not land onto a range deletion's + // boundary. Range deletions may have bounds within the prefix that are + // still surfaced by NextPrefix. 
+ m.levelsPositioned[root.index] = root.iterKV == nil || !root.iterKV.K.IsExclusiveSentinel() } return m.findNextEntry() } -func (m *mergingIter) Prev() (*InternalKey, base.LazyValue) { +func (m *mergingIter) Prev() *base.InternalKV { if m.err != nil { - return nil, base.LazyValue{} + return nil } if m.dir != -1 { if m.prefix != nil { m.err = errors.New("pebble: unsupported reverse prefix iteration") - return nil, base.LazyValue{} + return nil + } + if m.err = m.switchToMaxHeap(); m.err != nil { + return nil } - m.switchToMaxHeap() return m.findPrevEntry() } if m.heap.len() == 0 { - return nil, base.LazyValue{} + return nil + } + if m.err = m.prevEntry(m.heap.items[0].mergingIterLevel); m.err != nil { + return nil } - - m.prevEntry(m.heap.items[0]) return m.findPrevEntry() } @@ -1337,11 +1307,7 @@ func (m *mergingIter) Close() error { if err := iter.Close(); err != nil && m.err == nil { m.err = err } - if rangeDelIter := m.levels[i].rangeDelIter; rangeDelIter != nil { - if err := rangeDelIter.Close(); err != nil && m.err == nil { - m.err = err - } - } + m.levels[i].setRangeDelIter(nil) } m.levels = nil m.heap.items = m.heap.items[:0] @@ -1358,18 +1324,38 @@ func (m *mergingIter) SetBounds(lower, upper []byte) { m.heap.clear() } +func (m *mergingIter) SetContext(ctx context.Context) { + for i := range m.levels { + m.levels[i].iter.SetContext(ctx) + } +} + +// DebugTree is part of the InternalIterator interface. 
+func (m *mergingIter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", m, m) + for i := range m.levels { + if iter := m.levels[i].iter; iter != nil { + iter.DebugTree(n) + } + } +} + func (m *mergingIter) DebugString() string { var buf bytes.Buffer sep := "" for m.heap.len() > 0 { item := m.heap.pop() - fmt.Fprintf(&buf, "%s%s", sep, item.iterKey) + fmt.Fprintf(&buf, "%s%s", sep, item.iterKV.K) sep = " " } + var err error if m.dir == 1 { - m.initMinHeap() + err = m.initMinHeap() } else { - m.initMaxHeap() + err = m.initMaxHeap() + } + if err != nil { + fmt.Fprintf(&buf, "err=<%s>", err) } return buf.String() } @@ -1386,8 +1372,8 @@ func (m *mergingIter) ForEachLevelIter(fn func(li *levelIter) bool) { func (m *mergingIter) addItemStats(l *mergingIterLevel) { m.stats.PointCount++ - m.stats.KeyBytes += uint64(len(l.iterKey.UserKey)) - m.stats.ValueBytes += uint64(len(l.iterValue.ValueOrHandle)) + m.stats.KeyBytes += uint64(len(l.iterKV.K.UserKey)) + m.stats.ValueBytes += uint64(l.iterKV.V.InternalLen()) } var _ internalIterator = &mergingIter{} diff --git a/vendor/github.com/cockroachdb/pebble/v2/merging_iter_heap.go b/vendor/github.com/cockroachdb/pebble/v2/merging_iter_heap.go new file mode 100644 index 0000000..b1bab8e --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/merging_iter_heap.go @@ -0,0 +1,140 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import "github.com/cockroachdb/pebble/v2/internal/invariants" + +// mergingIterHeap is a heap of mergingIterLevels. It only reads +// mergingIterLevel.iterKV.K. +// +// REQUIRES: Every mergingIterLevel.iterKV is non-nil. +// +// TODO(sumeer): consider using golang generics. 
+type mergingIterHeap struct { + cmp Compare + reverse bool + items []mergingIterHeapItem +} + +type mergingIterHeapItem struct { + *mergingIterLevel + winnerChild winnerChild +} + +// winnerChild represents the child that is less than the other child, i.e., +// would get promoted up the heap before the other child if the parent was +// removed, or the parent's value "increased". +// +// It can be unknown, represented by winnerChildUnknown. If both children are +// equal, or if there is only one child, any of the three values are +// permitted. +type winnerChild uint8 + +const ( + winnerChildUnknown winnerChild = iota + winnerChildLeft + winnerChildRight +) + +// len returns the number of elements in the heap. +func (h *mergingIterHeap) len() int { + return len(h.items) +} + +// clear empties the heap. +func (h *mergingIterHeap) clear() { + h.items = h.items[:0] +} + +// less is an internal method, to compare the elements at i and j. +func (h *mergingIterHeap) less(i, j int) bool { + ikv, jkv := h.items[i].iterKV, h.items[j].iterKV + if c := h.cmp(ikv.K.UserKey, jkv.K.UserKey); c != 0 { + if h.reverse { + return c > 0 + } + return c < 0 + } + if h.reverse { + return ikv.K.Trailer < jkv.K.Trailer + } + return ikv.K.Trailer > jkv.K.Trailer +} + +// swap is an internal method, used to swap the elements at i and j. +func (h *mergingIterHeap) swap(i, j int) { + h.items[i].mergingIterLevel, h.items[j].mergingIterLevel = + h.items[j].mergingIterLevel, h.items[i].mergingIterLevel +} + +// init initializes the heap. +func (h *mergingIterHeap) init() { + // heapify + n := h.len() + for i := n/2 - 1; i >= 0; i-- { + h.down(i, n) + } +} + +// fixTop restores the heap property after the top of the heap has been +// modified. +func (h *mergingIterHeap) fixTop() { + h.down(0, h.len()) +} + +// pop removes the top of the heap. +func (h *mergingIterHeap) pop() *mergingIterLevel { + n := h.len() - 1 + h.swap(0, n) + // Parent of n does not know which child is the winner. 
But since index n is + // removed, the parent of n will have at most one child, and so the value of + // winnerChild is irrelevant, and we don't need to do: + // h.items[(n-1)/2].winnerChild = winnerChildUnknown + h.down(0, n) + item := h.items[n] + h.items = h.items[:n] + return item.mergingIterLevel +} + +// down is an internal method. It moves i down the heap, which has length n, +// until the heap property is restored. +func (h *mergingIterHeap) down(i, n int) { + for { + j1 := 2*i + 1 + if j1 >= n || j1 < 0 { // j1 < 0 after int overflow + break + } + j := j1 // left child + if j2 := j1 + 1; j2 < n { + if h.items[i].winnerChild == winnerChildUnknown { + if h.less(j2, j1) { + h.items[i].winnerChild = winnerChildRight + } else { + h.items[i].winnerChild = winnerChildLeft + } + } else if invariants.Enabled { + wc := winnerChildUnknown + if h.less(j1, j2) { + wc = winnerChildLeft + } else if h.less(j2, j1) { + wc = winnerChildRight + } + if wc != winnerChildUnknown && wc != h.items[i].winnerChild { + panic("winnerChild mismatch") + } + } + if h.items[i].winnerChild == winnerChildRight { + j = j2 // = 2*i + 2 // right child + } + } + if !h.less(j, i) { + break + } + // NB: j is a child of i. + h.swap(i, j) + h.items[i].winnerChild = winnerChildUnknown + i = j + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/metrics.go b/vendor/github.com/cockroachdb/pebble/v2/metrics.go new file mode 100644 index 0000000..96788af --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/metrics.go @@ -0,0 +1,951 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "fmt" + "math" + "time" + "unsafe" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/cache" + "github.com/cockroachdb/pebble/v2/internal/humanize" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/manual" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/sharedcache" + "github.com/cockroachdb/pebble/v2/record" + "github.com/cockroachdb/pebble/v2/sstable" + "github.com/cockroachdb/pebble/v2/sstable/blob" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/wal" + "github.com/cockroachdb/redact" + "github.com/prometheus/client_golang/prometheus" +) + +// CacheMetrics holds metrics for the block and file cache. +type CacheMetrics = cache.Metrics + +// FilterMetrics holds metrics for the filter policy +type FilterMetrics = sstable.FilterMetrics + +// ThroughputMetric is a cumulative throughput metric. See the detailed +// comment in base. +type ThroughputMetric = base.ThroughputMetric + +// SecondaryCacheMetrics holds metrics for the persistent secondary cache +// that caches commonly accessed blocks from blob storage on a local +// file system. +type SecondaryCacheMetrics = sharedcache.Metrics + +// LevelMetrics holds per-level metrics such as the number of files and total +// size of the files, and compaction related metrics. +type LevelMetrics struct { + // The number of sublevels within the level. The sublevel count corresponds + // to the read amplification for the level. An empty level will have a + // sublevel count of 0, implying no read amplification. Only L0 will have + // a sublevel count other than 0 or 1. + Sublevels int32 + // The total count of sstables in the level. + TablesCount int64 + // The total size in bytes of the sstables in the level. 
Note that if tables + // contain references to blob files, this quantity does not include the + // size of the blob files or the referenced values. + TablesSize int64 + // The total number of virtual sstables in the level. + VirtualTablesCount uint64 + // The total size of the virtual sstables in the level. + VirtualTablesSize uint64 + // The estimated total physical size of all blob references across all + // sstables in the level. The physical size is estimated based on the size + // of referenced values and the values' blob file's compression ratios. + EstimatedReferencesSize uint64 + // The level's compaction score, used to rank levels (0 if the level doesn't + // need compaction). See candidateLevelInfo. + Score float64 + // The level's fill factor (the ratio between the size of the level and the + // ideal size). See candidateLevelInfo. + FillFactor float64 + // The level's compensated fill factor. See candidateLevelInfo. + CompensatedFillFactor float64 + // The number of incoming bytes from other levels' sstables read during + // compactions. This excludes bytes moved and bytes ingested. For L0 this is + // the bytes written to the WAL. + TableBytesIn uint64 + // The number of sstable bytes ingested. The sibling metric for tables is + // TablesIngested. + TableBytesIngested uint64 + // The number of sstable bytes moved into the level by a "move" compaction. + // The sibling metric for tables is TablesMoved. + TableBytesMoved uint64 + // The number of bytes read for compactions at the level. This includes bytes + // read from other levels (BytesIn), as well as bytes read for the level. + TableBytesRead uint64 + // The number of bytes written to sstables during compactions. The sibling + // metric for tables is TablesCompacted. This metric may be summed with + // BytesFlushed to compute the total bytes written for the level. + TableBytesCompacted uint64 + // The number of bytes written to sstables during flushes. 
The sibling + // metrics for tables is TablesFlushed. This metric is always zero for all + // levels other than L0. + TableBytesFlushed uint64 + // The number of sstables compacted to this level. + TablesCompacted uint64 + // The number of sstables flushed to this level. + TablesFlushed uint64 + // The number of sstables ingested into the level. + TablesIngested uint64 + // The number of sstables moved to this level by a "move" compaction. + TablesMoved uint64 + // The number of sstables deleted in a level by a delete-only compaction. + TablesDeleted uint64 + // The number of sstables excised in a level by a delete-only compaction. + TablesExcised uint64 + // BlobBytesReadEstimate is an estimate of the physical bytes corresponding + // to values referenced by sstables that were inputs into compactions + // outputting into this level. + BlobBytesReadEstimate uint64 + // BlobBytesCompacted is the number of bytes written to blob files while + // compacting sstables in this level. + BlobBytesCompacted uint64 + // BlobBytesFlushed is the number of bytes written to blob files while + // flushing sstables. This metric is always zero for all levels other than + // L0. + BlobBytesFlushed uint64 + + MultiLevel struct { + // TableBytesInTop are the total bytes in a multilevel compaction coming + // from the top level. + TableBytesInTop uint64 + + // TableBytesIn, exclusively for multiLevel compactions. + TableBytesIn uint64 + + // TableBytesRead, exclusively for multilevel compactions. + TableBytesRead uint64 + } + + // Additional contains misc additional metrics that are not always printed. + Additional struct { + // The sum of Properties.ValueBlocksSize for all the sstables in this + // level. Printed by LevelMetrics.format iff there is at least one level + // with a non-zero value. + ValueBlocksSize uint64 + // Cumulative metrics about bytes written to data blocks and value blocks, + // via compactions (except move compactions) or flushes. 
Not printed by + // LevelMetrics.format, but are available to sophisticated clients. + BytesWrittenDataBlocks uint64 + BytesWrittenValueBlocks uint64 + } +} + +// AggregateSize returns an estimated physical size of the level's sstables and +// their referenced values stored in blob files. The size of physical sstables +// is exactly known. Virtual sstables' sizes are estimated, and the size of +// values stored in blob files is estimated based on the volume of referenced +// data and the blob file's compression ratio. +func (m *LevelMetrics) AggregateSize() int64 { + return m.TablesSize + int64(m.EstimatedReferencesSize) +} + +// Add updates the counter metrics for the level. +func (m *LevelMetrics) Add(u *LevelMetrics) { + m.TablesCount += u.TablesCount + m.TablesSize += u.TablesSize + m.VirtualTablesCount += u.VirtualTablesCount + m.VirtualTablesSize += u.VirtualTablesSize + m.EstimatedReferencesSize += u.EstimatedReferencesSize + m.TableBytesIn += u.TableBytesIn + m.TableBytesIngested += u.TableBytesIngested + m.TableBytesMoved += u.TableBytesMoved + m.TableBytesRead += u.TableBytesRead + m.TableBytesCompacted += u.TableBytesCompacted + m.TableBytesFlushed += u.TableBytesFlushed + m.TablesCompacted += u.TablesCompacted + m.TablesFlushed += u.TablesFlushed + m.TablesIngested += u.TablesIngested + m.TablesMoved += u.TablesMoved + m.BlobBytesCompacted += u.BlobBytesCompacted + m.BlobBytesFlushed += u.BlobBytesFlushed + m.BlobBytesReadEstimate += u.BlobBytesReadEstimate + m.MultiLevel.TableBytesInTop += u.MultiLevel.TableBytesInTop + m.MultiLevel.TableBytesRead += u.MultiLevel.TableBytesRead + m.MultiLevel.TableBytesIn += u.MultiLevel.TableBytesIn + m.Additional.BytesWrittenDataBlocks += u.Additional.BytesWrittenDataBlocks + m.Additional.BytesWrittenValueBlocks += u.Additional.BytesWrittenValueBlocks + m.Additional.ValueBlocksSize += u.Additional.ValueBlocksSize +} + +// WriteAmp computes the write amplification for compactions at this +// level. 
+// +// The write amplification is computed as the quantity of physical bytes written +// divided by the quantity of logical bytes written. +// +// Concretely, it's computed as: +// +// TableBytesFlushed + TableBytesCompacted + BlobBytesFlushed + BlobBytesCompacted +// ------------------------------------------------------------------------------- +// TableBytesIn +func (m *LevelMetrics) WriteAmp() float64 { + if m.TableBytesIn == 0 { + return 0 + } + return float64(m.TableBytesFlushed+m.TableBytesCompacted+m.BlobBytesFlushed+m.BlobBytesCompacted) / + float64(m.TableBytesIn) +} + +var categoryCompaction = block.RegisterCategory("pebble-compaction", block.NonLatencySensitiveQoSLevel) +var categoryIngest = block.RegisterCategory("pebble-ingest", block.LatencySensitiveQoSLevel) +var categoryGet = block.RegisterCategory("pebble-get", block.LatencySensitiveQoSLevel) + +// Metrics holds metrics for various subsystems of the DB such as the Cache, +// Compactions, WAL, and per-Level metrics. +// +// TODO(peter): The testing of these metrics is relatively weak. There should +// be testing that performs various operations on a DB and verifies that the +// metrics reflect those operations. +type Metrics struct { + BlockCache CacheMetrics + + Compact struct { + // The total number of compactions, and per-compaction type counts. + Count int64 + DefaultCount int64 + DeleteOnlyCount int64 + ElisionOnlyCount int64 + CopyCount int64 + MoveCount int64 + ReadCount int64 + TombstoneDensityCount int64 + RewriteCount int64 + MultiLevelCount int64 + BlobFileRewriteCount int64 + CounterLevelCount int64 + // An estimate of the number of bytes that need to be compacted for the LSM + // to reach a stable state. + EstimatedDebt uint64 + // Number of bytes present in sstables being written by in-progress + // compactions. This value will be zero if there are no in-progress + // compactions. + InProgressBytes int64 + // Number of compactions that are in-progress. 
+ NumInProgress int64 + // Number of compactions that were cancelled. + CancelledCount int64 + // CancelledBytes the number of bytes written by compactions that were + // cancelled. + CancelledBytes int64 + // Total number of compactions that hit an error. + FailedCount int64 + // NumProblemSpans is the current (instantaneous) count of "problem spans" + // which temporarily block compactions. + NumProblemSpans int + // MarkedFiles is a count of files that are marked for + // compaction. Such files are compacted in a rewrite compaction + // when no other compactions are picked. + MarkedFiles int + // Duration records the cumulative duration of all compactions since the + // database was opened. + Duration time.Duration + } + + Ingest struct { + // The total number of ingestions + Count uint64 + } + + Flush struct { + // The total number of flushes. + Count int64 + // TODO(sumeer): the IdleDuration in this metric is flawed. It only + // measures idle duration when a flush finishes, representing the idleness + // before the start of a flush. So computing deltas over this metric over + // some time interval D may observe the sum of IdleDuration+WorkDuration + // to be either much smaller or much larger than D. + WriteThroughput ThroughputMetric + // Number of flushes that are in-progress. In the current implementation + // this will always be zero or one. + NumInProgress int64 + // AsIngestCount is a monotonically increasing counter of flush operations + // handling ingested tables. + AsIngestCount uint64 + // AsIngestTableCount is a monotonically increasing counter of tables ingested as + // flushables. + AsIngestTableCount uint64 + // AsIngestBytes is a monotonically increasing counter of the bytes flushed + // for flushables that originated as ingestion operations. + AsIngestBytes uint64 + } + + Filter FilterMetrics + + Levels [numLevels]LevelMetrics + + MemTable struct { + // The number of bytes allocated by memtables and large (flushable) + // batches. 
+ Size uint64 + // The count of memtables. + Count int64 + // The number of bytes present in zombie memtables which are no longer + // referenced by the current DB state. An unbounded number of memtables + // may be zombie if they're still in use by an iterator. One additional + // memtable may be zombie if it's no longer in use and waiting to be + // recycled. + ZombieSize uint64 + // The count of zombie memtables. + ZombieCount int64 + } + + Keys struct { + // The approximate count of internal range key set keys in the database. + RangeKeySetsCount uint64 + // The approximate count of internal tombstones (DEL, SINGLEDEL and + // RANGEDEL key kinds) within the database. + TombstoneCount uint64 + // A cumulative total number of missized DELSIZED keys encountered by + // compactions since the database was opened. + MissizedTombstonesCount uint64 + } + + Snapshots struct { + // The number of currently open snapshots. + Count int + // The sequence number of the earliest, currently open snapshot. + EarliestSeqNum base.SeqNum + // A running tally of keys written to sstables during flushes or + // compactions that would've been elided if it weren't for open + // snapshots. + PinnedKeys uint64 + // A running cumulative sum of the size of keys and values written to + // sstables during flushes or compactions that would've been elided if + // it weren't for open snapshots. + PinnedSize uint64 + } + + Table struct { + // The number of bytes present in obsolete tables which are no longer + // referenced by the current DB state or any open iterators. + ObsoleteSize uint64 + // The count of obsolete tables. + ObsoleteCount int64 + // The number of bytes present in zombie tables which are no longer + // referenced by the current DB state but are still in use by an iterator. + ZombieSize uint64 + // The count of zombie tables. + ZombieCount int64 + // The count of sstables backing virtual tables. 
+ BackingTableCount uint64 + // The sum of the sizes of the BackingTableCount sstables that are backing virtual tables. + BackingTableSize uint64 + // The number of sstables that are compressed with an unknown compression + // algorithm. + CompressedCountUnknown int64 + // The number of sstables that are compressed with the default compression + // algorithm, snappy. + CompressedCountSnappy int64 + // The number of sstables that are compressed with zstd. + CompressedCountZstd int64 + // The number of sstables that are compressed with minlz. + CompressedCountMinLZ int64 + // The number of sstables that are uncompressed. + CompressedCountNone int64 + + // Local file sizes. + Local struct { + // LiveSize is the number of bytes in live tables. + LiveSize uint64 + // LiveCount is the number of live tables. + LiveCount uint64 + // ObsoleteSize is the number of bytes in obsolete tables. + ObsoleteSize uint64 + // ObsoleteCount is the number of obsolete tables. + ObsoleteCount uint64 + // ZombieSize is the number of bytes in zombie tables. + ZombieSize uint64 + // ZombieCount is the number of zombie tables. + ZombieCount uint64 + } + + // Garbage bytes. + Garbage struct { + // PointDeletionsBytesEstimate is the estimated file bytes that will be + // saved by compacting all point deletions. This is dependent on table + // stats collection, so can be very incomplete until + // InitialStatsCollectionComplete becomes true. + PointDeletionsBytesEstimate uint64 + // RangeDeletionsBytesEstimate is the estimated file bytes that will be + // saved by compacting all range deletions. This is dependent on table + // stats collection, so can be very incomplete until + // InitialStatsCollectionComplete becomes true. + RangeDeletionsBytesEstimate uint64 + } + + // Whether the initial stats collection (for existing tables on Open) is + // complete. + InitialStatsCollectionComplete bool + // The count of recently created sstables that need stats collection. 
This + // does not include sstables that existed when the DB was opened, so the + // value is only useful when InitialStatsCollectionComplete is true. + PendingStatsCollectionCount int64 + } + + BlobFiles struct { + // The count of all live blob files. + LiveCount uint64 + // The physical file size of all live blob files. + LiveSize uint64 + // ValueSize is the sum of the length of the uncompressed values in all + // live (referenced by some sstable(s) within the current version) blob + // files. ValueSize may be greater than LiveSize when compression is + // effective. ValueSize includes bytes in live blob files that are not + // actually reachable by any sstable key. If any value within the blob + // file is reachable by a key in a live sstable, then the entirety of + // the blob file's values are included within ValueSize. + ValueSize uint64 + // ReferencedValueSize is the sum of the length of the uncompressed + // values (in all live blob files) that are still referenced by keys + // within live tables. Over the lifetime of a blob file, a blob file's + // references are removed as some compactions choose to write new blob + // files containing the same values or keys referencing the file's + // values are deleted. ReferencedValueSize accounts the volume of bytes + // that are actually reachable by some key in a live table. + // + // The difference between ValueSize and ReferencedValueSize is + // (uncompressed) space amplification that could be reclaimed if all + // blob files were rewritten, discarding values that are no longer + // referenced by any keys in any sstables within the current version. + ReferencedValueSize uint64 + // The count of all obsolete blob files. + ObsoleteCount uint64 + // The physical size of all obsolete blob files. + ObsoleteSize uint64 + // The count of all zombie blob files. + ZombieCount uint64 + // The physical size of all zombie blob files. + ZombieSize uint64 + // Local file sizes. 
+ Local struct { + // LiveSize is the physical size of local live blob files. + LiveSize uint64 + // LiveCount is the number of local live blob files. + LiveCount uint64 + // ObsoleteSize is the physical size of local obsolete blob files. + ObsoleteSize uint64 + // ObsoleteCount is the number of local obsolete blob files. + ObsoleteCount uint64 + // ZombieSize is the physical size of local zombie blob files. + ZombieSize uint64 + // ZombieCount is the number of local zombie blob files. + ZombieCount uint64 + } + } + + FileCache FileCacheMetrics + + // Count of the number of open sstable iterators. + TableIters int64 + // Uptime is the total time since this DB was opened. + Uptime time.Duration + + WAL struct { + // Number of live WAL files. + Files int64 + // Number of obsolete WAL files. + ObsoleteFiles int64 + // Physical size of the obsolete WAL files. + ObsoletePhysicalSize uint64 + // Size of the live data in the WAL files. Note that with WAL file + // recycling this is less than the actual on-disk size of the WAL files. + Size uint64 + // Physical size of the WAL files on-disk. With WAL file recycling, + // this is greater than the live data in WAL files. + // + // TODO(sumeer): it seems this does not include ObsoletePhysicalSize. + // Should the comment be updated? + PhysicalSize uint64 + // Number of logical bytes written to the WAL. + BytesIn uint64 + // Number of bytes written to the WAL. + BytesWritten uint64 + // Failover contains failover stats. Empty if failover is not enabled. + Failover wal.FailoverStats + } + + LogWriter struct { + FsyncLatency prometheus.Histogram + record.LogWriterMetrics + } + + CategoryStats []block.CategoryStatsAggregate + + SecondaryCacheMetrics SecondaryCacheMetrics + + private struct { + optionsFileSize uint64 + manifestFileSize uint64 + } + + manualMemory manual.Metrics +} + +var ( + // FsyncLatencyBuckets are prometheus histogram buckets suitable for a histogram + // that records latencies for fsyncs. 
+ FsyncLatencyBuckets = append( + prometheus.LinearBuckets(0.0, float64(time.Microsecond*100), 50), + prometheus.ExponentialBucketsRange(float64(time.Millisecond*5), float64(10*time.Second), 50)..., + ) + + // SecondaryCacheIOBuckets exported to enable exporting from package pebble to + // enable exporting metrics with below buckets in CRDB. + SecondaryCacheIOBuckets = sharedcache.IOBuckets + // SecondaryCacheChannelWriteBuckets exported to enable exporting from package + // pebble to enable exporting metrics with below buckets in CRDB. + SecondaryCacheChannelWriteBuckets = sharedcache.ChannelWriteBuckets +) + +// DiskSpaceUsage returns the total disk space used by the database in bytes, +// including live and obsolete files. This only includes local files, i.e., +// remote files (as known to objstorage.Provider) are not included. +func (m *Metrics) DiskSpaceUsage() uint64 { + var usageBytes uint64 + usageBytes += m.WAL.PhysicalSize + usageBytes += m.WAL.ObsoletePhysicalSize + usageBytes += m.Table.Local.LiveSize + usageBytes += m.Table.Local.ObsoleteSize + usageBytes += m.Table.Local.ZombieSize + usageBytes += m.BlobFiles.Local.LiveSize + usageBytes += m.BlobFiles.Local.ObsoleteSize + usageBytes += m.BlobFiles.Local.ZombieSize + usageBytes += m.private.optionsFileSize + usageBytes += m.private.manifestFileSize + // TODO(sumeer): InProgressBytes does not distinguish between local and + // remote files. This causes a small error. Fix. + usageBytes += uint64(m.Compact.InProgressBytes) + return usageBytes +} + +// NumVirtual is the number of virtual sstables in the latest version +// summed over every level in the lsm. +func (m *Metrics) NumVirtual() uint64 { + var n uint64 + for _, level := range m.Levels { + n += level.VirtualTablesCount + } + return n +} + +// VirtualSize is the sum of the sizes of the virtual sstables in the +// latest version. 
BackingTableSize - VirtualSize gives an estimate for +// the space amplification caused by not compacting virtual sstables. +func (m *Metrics) VirtualSize() uint64 { + var size uint64 + for _, level := range m.Levels { + size += level.VirtualTablesSize + } + return size +} + +// ReadAmp returns the current read amplification of the database. +// It's computed as the number of sublevels in L0 + the number of non-empty +// levels below L0. +func (m *Metrics) ReadAmp() int { + var ramp int32 + for _, l := range m.Levels { + ramp += l.Sublevels + } + return int(ramp) +} + +// Total returns the sum of the per-level metrics and WAL metrics. +func (m *Metrics) Total() LevelMetrics { + var total LevelMetrics + for level := 0; level < numLevels; level++ { + l := &m.Levels[level] + total.Add(l) + total.Sublevels += l.Sublevels + } + // Compute total bytes-in as the bytes written to the WAL + bytes ingested. + total.TableBytesIn = m.WAL.BytesWritten + total.TableBytesIngested + // Add the total bytes-in to the total bytes-flushed. This is to account for + // the bytes written to the log and bytes written externally and then + // ingested. + total.TableBytesFlushed += total.TableBytesIn + return total +} + +// RemoteTablesTotal returns the total number of remote tables and their total +// size. Remote tables are computed as the difference between total tables +// (live + obsolete + zombie) and local tables. 
+func (m *Metrics) RemoteTablesTotal() (count uint64, size uint64) { + var liveTables, liveTableBytes int64 + for level := 0; level < numLevels; level++ { + liveTables += m.Levels[level].TablesCount + liveTableBytes += m.Levels[level].TablesSize + } + totalCount := liveTables + m.Table.ObsoleteCount + m.Table.ZombieCount + localCount := m.Table.Local.LiveCount + m.Table.Local.ObsoleteCount + m.Table.Local.ZombieCount + remoteCount := uint64(totalCount) - localCount + + totalSize := uint64(liveTableBytes) + m.Table.ObsoleteSize + m.Table.ZombieSize + localSize := m.Table.Local.LiveSize + m.Table.Local.ObsoleteSize + m.Table.Local.ZombieSize + remoteSize := totalSize - localSize + + return remoteCount, remoteSize +} + +// String pretty-prints the metrics as below (semi-adjusted visually to avoid +// the crlfmt from auto-reformatting): +// +// | | | | ingested | moved | written | | amp | val sep | multilevel +// level | tables size val-bl vtables | score ff cff | in | tables size | tables size |tables size| read | r w | refsz valblk| top in read +// ------+-----------------------------+----------------+-------+--------------+--------------+-----------+-------+---------+--------------+------------------ +// 0 | 101 102B 0B 101 | 1.10 2.10 0.30 | 104B | 112 104B | 113 106B | 221 217B| 107B | 1 2.09 | 114B 0B| 104B 104B 104B +// 1 | 201 202B 0B 201 | 1.20 2.20 0.60 | 204B | 212 204B | 213 206B | 421 417B| 207B | 2 2.04 | 214B 0B| 204B 204B 204B +// 2 | 301 302B 0B 301 | 1.30 2.30 0.90 | 304B | 312 304B | 313 306B | 621 617B| 307B | 3 2.03 | 314B 0B| 304B 304B 304B +// 3 | 401 402B 0B 401 | 1.40 2.40 1.20 | 404B | 412 404B | 413 406B | 821 817B| 407B | 4 2.02 | 414B 0B| 404B 404B 404B +// 4 | 501 502B 0B 501 | 1.50 2.50 1.50 | 504B | 512 504B | 513 506B |1.0K 1017B| 507B | 5 2.02 | 514B 0B| 504B 504B 504B +// 5 | 601 602B 0B 601 | 1.60 2.60 1.80 | 604B | 612 604B | 613 606B |1.2K 1.2KB| 607B | 6 2.01 | 614B 0B| 604B 604B 604B +// 6 | 701 702B 0B 701 | - 2.70 2.10 | 704B | 
712 704B | 713 706B |1.4K 1.4KB| 707B | 7 2.01 | 714B 0B| 704B 704B 704B +// total | 2.8K 2.7KB 0B 2.8K | - - - | 2.8KB | 2.9K 2.8KB | 2.9K 2.8KB |5.7K 8.4KB| 2.8KB | 28 3.00 |2.8KB 0B| 2.8KB 2.8KB 2.8KB +// +// WAL: 22 files (24B) in: 25B written: 26B (4% overhead) +// Flushes: 8 +// Compactions: 5 estimated debt: 6B in progress: 2 (7B) +// default: 27 delete: 28 elision: 29 move: 30 read: 31 tombstone-density: 16 rewrite: 32 copy: 33 multi-level: 34 +// MemTables: 12 (11B) zombie: 14 (13B) +// Zombie tables: 16 (15B, local: 30B) +// Backing tables: 1 (2.0MB) +// Virtual tables: 2807 (2.8KB) +// Local tables size: 28B +// Compression types: +// Table stats: 31 +// Block cache: 2 entries (1B) hit rate: 42.9% +// Table cache: 18 entries (17B) hit rate: 48.7% +// Range key sets: 123 Tombstones: 456 Total missized tombstones encountered: 789 +// Snapshots: 4 earliest seq num: 1024 +// Table iters: 21 +// Filter utility: 47.4% +// Ingestions: 27 as flushable: 36 (34B in 35 tables) +// Cgo memory usage: 15KB block cache: 9.0KB (data: 4.0KB, maps: 2.0KB, entries: 3.0KB) memtables: 5.0KB +func (m *Metrics) String() string { + return redact.StringWithoutMarkers(m) +} + +var _ redact.SafeFormatter = &Metrics{} + +// SafeFormat implements redact.SafeFormatter. +func (m *Metrics) SafeFormat(w redact.SafePrinter, _ rune) { + // NB: Pebble does not make any assumptions as to which Go primitive types + // have been registered as safe with redact.RegisterSafeType and does not + // register any types itself. Some of the calls to `redact.Safe`, etc are + // superfluous in the context of CockroachDB, which registers all the Go + // numeric types as safe. 
+ + multiExists := m.Compact.MultiLevelCount > 0 + appendIfMulti := func(line redact.SafeString) { + if multiExists { + w.SafeString(line) + } + } + newline := func() { + w.SafeString("\n") + } + + w.SafeString(" | | | | ingested | moved | written | | amp | val sep") + appendIfMulti(" | multilevel") + newline() + w.SafeString("level | tables size val-bl vtables | score ff cff | in | tables size | tables size | tables size | read | r w | refsz valblk") + appendIfMulti(" | top in read") + newline() + w.SafeString("------+-----------------------------+----------------+-------+--------------+--------------+--------------+-------+----------+--------------") + appendIfMulti("-+------------------") + newline() + + // formatRow prints out a row of the table. + formatRow := func(m *LevelMetrics) { + score := m.Score + if score == 0 { + // Format a zero level score as a dash. + score = math.NaN() + } + w.Printf("| %5s %6s %6s %7s | %4s %4s %4s | %5s | %5s %6s | %5s %6s | %5s %6s | %5s | %3d %4s | %5s %7s", + humanize.Count.Int64(m.TablesCount), + humanize.Bytes.Int64(m.TablesSize), + humanize.Bytes.Uint64(m.Additional.ValueBlocksSize), + humanize.Count.Uint64(m.VirtualTablesCount), + humanizeFloat(score, 4), + humanizeFloat(m.FillFactor, 4), + humanizeFloat(m.CompensatedFillFactor, 4), + humanize.Bytes.Uint64(m.TableBytesIn), + humanize.Count.Uint64(m.TablesIngested), + humanize.Bytes.Uint64(m.TableBytesIngested), + humanize.Count.Uint64(m.TablesMoved), + humanize.Bytes.Uint64(m.TableBytesMoved), + humanize.Count.Uint64(m.TablesFlushed+m.TablesCompacted), + humanize.Bytes.Uint64(m.TableBytesFlushed+m.TableBytesCompacted), + humanize.Bytes.Uint64(m.TableBytesRead), + redact.Safe(m.Sublevels), + humanizeFloat(m.WriteAmp(), 4), + humanize.Bytes.Uint64(m.EstimatedReferencesSize), + humanize.Bytes.Uint64(m.Additional.ValueBlocksSize), + ) + + if multiExists { + w.Printf(" | %5s %5s %5s", + humanize.Bytes.Uint64(m.MultiLevel.TableBytesInTop), + 
humanize.Bytes.Uint64(m.MultiLevel.TableBytesIn), + humanize.Bytes.Uint64(m.MultiLevel.TableBytesRead)) + } + newline() + } + + var total LevelMetrics + for level := 0; level < numLevels; level++ { + l := &m.Levels[level] + w.Printf("%5d ", redact.Safe(level)) + formatRow(l) + total.Add(l) + total.Sublevels += l.Sublevels + } + // Compute total bytes-in as the bytes written to the WAL + bytes ingested. + total.TableBytesIn = m.WAL.BytesWritten + total.TableBytesIngested + // Add the total bytes-in to the total bytes-flushed. This is to account for + // the bytes written to the log and bytes written externally and then + // ingested. + total.TableBytesFlushed += total.TableBytesIn + total.Score = math.NaN() + total.FillFactor = math.NaN() + total.CompensatedFillFactor = math.NaN() + w.SafeString("total ") + formatRow(&total) + + w.SafeString("--------------------------------------------------------------------------------------------------------------------------------------------") + appendIfMulti("--------------------") + newline() + w.Printf("WAL: %d files (%s) in: %s written: %s (%.0f%% overhead)", + redact.Safe(m.WAL.Files), + humanize.Bytes.Uint64(m.WAL.Size), + humanize.Bytes.Uint64(m.WAL.BytesIn), + humanize.Bytes.Uint64(m.WAL.BytesWritten), + redact.Safe(percent(int64(m.WAL.BytesWritten)-int64(m.WAL.BytesIn), int64(m.WAL.BytesIn)))) + failoverStats := m.WAL.Failover + failoverStats.FailoverWriteAndSyncLatency = nil + if failoverStats == (wal.FailoverStats{}) { + w.Printf("\n") + } else { + w.Printf(" failover: (switches: %d, primary: %s, secondary: %s)\n", m.WAL.Failover.DirSwitchCount, + m.WAL.Failover.PrimaryWriteDuration.String(), m.WAL.Failover.SecondaryWriteDuration.String()) + } + + w.Printf("Flushes: %d\n", redact.Safe(m.Flush.Count)) + + w.Printf("Compactions: %d estimated debt: %s in progress: %d (%s) canceled: %d (%s) failed: %d problem spans: %d\n", + redact.Safe(m.Compact.Count), + humanize.Bytes.Uint64(m.Compact.EstimatedDebt), + 
redact.Safe(m.Compact.NumInProgress), + humanize.Bytes.Int64(m.Compact.InProgressBytes), + redact.Safe(m.Compact.CancelledCount), + humanize.Bytes.Int64(m.Compact.CancelledBytes), + redact.Safe(m.Compact.FailedCount), + redact.Safe(m.Compact.NumProblemSpans), + ) + + w.Printf(" default: %d delete: %d elision: %d move: %d read: %d tombstone-density: %d rewrite: %d copy: %d multi-level: %d blob-file-rewrite: %d\n", + redact.Safe(m.Compact.DefaultCount), + redact.Safe(m.Compact.DeleteOnlyCount), + redact.Safe(m.Compact.ElisionOnlyCount), + redact.Safe(m.Compact.MoveCount), + redact.Safe(m.Compact.ReadCount), + redact.Safe(m.Compact.TombstoneDensityCount), + redact.Safe(m.Compact.RewriteCount), + redact.Safe(m.Compact.CopyCount), + redact.Safe(m.Compact.MultiLevelCount), + redact.Safe(m.Compact.BlobFileRewriteCount), + ) + + w.Printf("MemTables: %d (%s) zombie: %d (%s)\n", + redact.Safe(m.MemTable.Count), + humanize.Bytes.Uint64(m.MemTable.Size), + redact.Safe(m.MemTable.ZombieCount), + humanize.Bytes.Uint64(m.MemTable.ZombieSize)) + + w.Printf("Zombie tables: %d (%s, local: %s)\n", + redact.Safe(m.Table.ZombieCount), + humanize.Bytes.Uint64(m.Table.ZombieSize), + humanize.Bytes.Uint64(m.Table.Local.ZombieSize)) + + w.Printf("Backing tables: %d (%s)\n", + redact.Safe(m.Table.BackingTableCount), + humanize.Bytes.Uint64(m.Table.BackingTableSize)) + w.Printf("Virtual tables: %d (%s)\n", + redact.Safe(m.NumVirtual()), + humanize.Bytes.Uint64(m.VirtualSize())) + w.Printf("Local tables size: %s\n", humanize.Bytes.Uint64(m.Table.Local.LiveSize)) + w.SafeString("Compression types:") + if count := m.Table.CompressedCountSnappy; count > 0 { + w.Printf(" snappy: %d", redact.Safe(count)) + } + if count := m.Table.CompressedCountZstd; count > 0 { + w.Printf(" zstd: %d", redact.Safe(count)) + } + if count := m.Table.CompressedCountMinLZ; count > 0 { + w.Printf(" minlz: %d", redact.Safe(count)) + } + if count := m.Table.CompressedCountNone; count > 0 { + w.Printf(" none: %d", 
redact.Safe(count)) + } + if count := m.Table.CompressedCountUnknown; count > 0 { + w.Printf(" unknown: %d", redact.Safe(count)) + } + w.Printf("\n") + if m.Table.Garbage.PointDeletionsBytesEstimate > 0 || m.Table.Garbage.RangeDeletionsBytesEstimate > 0 { + w.Printf("Garbage: point-deletions %s range-deletions %s\n", + humanize.Bytes.Uint64(m.Table.Garbage.PointDeletionsBytesEstimate), + humanize.Bytes.Uint64(m.Table.Garbage.RangeDeletionsBytesEstimate)) + } + w.Printf("Table stats: ") + if !m.Table.InitialStatsCollectionComplete { + w.Printf("initial load in progress") + } else if m.Table.PendingStatsCollectionCount == 0 { + w.Printf("all loaded") + } else { + w.Printf("%s", humanize.Count.Int64(m.Table.PendingStatsCollectionCount)) + } + w.Printf("\n") + + w.Printf("Block cache: %s entries (%s) hit rate: %.1f%%\n", + humanize.Count.Int64(m.BlockCache.Count), + humanize.Bytes.Int64(m.BlockCache.Size), + redact.Safe(hitRate(m.BlockCache.Hits, m.BlockCache.Misses))) + + w.Printf("File cache: %s tables, %s blobfiles (%s) hit rate: %.1f%%\n", + humanize.Count.Int64(m.FileCache.TableCount), + humanize.Count.Int64(m.FileCache.BlobFileCount), + humanize.Bytes.Int64(m.FileCache.Size), + redact.Safe(hitRate(m.FileCache.Hits, m.FileCache.Misses))) + + formatSharedCacheMetrics := func(w redact.SafePrinter, m *SecondaryCacheMetrics, name redact.SafeString) { + w.Printf("%s: %s entries (%s) hit rate: %.1f%%\n", + name, + humanize.Count.Int64(m.Count), + humanize.Bytes.Int64(m.Size), + redact.Safe(hitRate(m.ReadsWithFullHit, m.ReadsWithPartialHit+m.ReadsWithNoHit))) + } + if m.SecondaryCacheMetrics.Size > 0 || m.SecondaryCacheMetrics.ReadsWithFullHit > 0 { + formatSharedCacheMetrics(w, &m.SecondaryCacheMetrics, "Secondary cache") + } + + w.Printf("Range key sets: %s Tombstones: %s Total missized tombstones encountered: %s\n", + humanize.Count.Uint64(m.Keys.RangeKeySetsCount), + humanize.Count.Uint64(m.Keys.TombstoneCount), + 
humanize.Count.Uint64(m.Keys.MissizedTombstonesCount), + ) + + w.Printf("Snapshots: %d earliest seq num: %d\n", + redact.Safe(m.Snapshots.Count), + redact.Safe(m.Snapshots.EarliestSeqNum)) + + w.Printf("Table iters: %d\n", redact.Safe(m.TableIters)) + w.Printf("Filter utility: %.1f%%\n", redact.Safe(hitRate(m.Filter.Hits, m.Filter.Misses))) + w.Printf("Ingestions: %d as flushable: %d (%s in %d tables)\n", + redact.Safe(m.Ingest.Count), + redact.Safe(m.Flush.AsIngestCount), + humanize.Bytes.Uint64(m.Flush.AsIngestBytes), + redact.Safe(m.Flush.AsIngestTableCount)) + + var inUseTotal uint64 + for i := range m.manualMemory { + inUseTotal += m.manualMemory[i].InUseBytes + } + inUse := func(purpose manual.Purpose) uint64 { + return m.manualMemory[purpose].InUseBytes + } + w.Printf("Cgo memory usage: %s block cache: %s (data: %s, maps: %s, entries: %s) memtables: %s\n", + humanize.Bytes.Uint64(inUseTotal), + humanize.Bytes.Uint64(inUse(manual.BlockCacheData)+inUse(manual.BlockCacheMap)+inUse(manual.BlockCacheEntry)), + humanize.Bytes.Uint64(inUse(manual.BlockCacheData)), + humanize.Bytes.Uint64(inUse(manual.BlockCacheMap)), + humanize.Bytes.Uint64(inUse(manual.BlockCacheEntry)), + humanize.Bytes.Uint64(inUse(manual.MemTable)), + ) +} + +func hitRate(hits, misses int64) float64 { + return percent(hits, hits+misses) +} + +func percent(numerator, denominator int64) float64 { + if denominator == 0 { + return 0 + } + return 100 * float64(numerator) / float64(denominator) +} + +// StringForTests is identical to m.String() on 64-bit platforms. It is used to +// provide a platform-independent result for tests. +func (m *Metrics) StringForTests() string { + mCopy := *m + + // We recalculate the file cache size using the 64-bit sizes, and we ignore + // the genericcache metadata size which is harder to adjust. 
+ const sstableReaderSize64bit = 280 + const blobFileReaderSize64bit = 96 + mCopy.FileCache.Size = mCopy.FileCache.TableCount*sstableReaderSize64bit + mCopy.FileCache.BlobFileCount*blobFileReaderSize64bit + if math.MaxInt == math.MaxInt64 { + // Verify the 64-bit sizes, so they are kept updated. + if sstableReaderSize64bit != unsafe.Sizeof(sstable.Reader{}) { + panic(fmt.Sprintf("sstableReaderSize64bit should be updated to %d", unsafe.Sizeof(sstable.Reader{}))) + } + if blobFileReaderSize64bit != unsafe.Sizeof(blob.FileReader{}) { + panic(fmt.Sprintf("blobFileReaderSize64bit should be updated to %d", unsafe.Sizeof(blob.FileReader{}))) + } + } + // Don't show cgo memory statistics as they can vary based on architecture, + // invariants tag, etc. + mCopy.manualMemory = manual.Metrics{} + return redact.StringWithoutMarkers(&mCopy) +} + +// levelMetricsDelta accumulates incremental ("delta") level metric updates +// (e.g. from compactions or flushes). +type levelMetricsDelta [manifest.NumLevels]*LevelMetrics + +func (m *levelMetricsDelta) level(level int) *LevelMetrics { + if m[level] == nil { + m[level] = &LevelMetrics{} + } + return m[level] +} + +func (m *Metrics) updateLevelMetrics(updates levelMetricsDelta) { + for i, u := range updates { + if u != nil { + m.Levels[i].Add(u) + } + } +} + +// humanizeFloat formats a float64 value as a string. It shows up to two +// decimals, depending on the target length. NaN is shown as "-". +func humanizeFloat(v float64, targetLength int) redact.SafeString { + if math.IsNaN(v) { + return "-" + } + // We treat 0 specially. Values near zero will show up as 0.00. 
+ if v == 0 { + return "0" + } + res := fmt.Sprintf("%.2f", v) + if len(res) <= targetLength { + return redact.SafeString(res) + } + if len(res) == targetLength+1 { + return redact.SafeString(fmt.Sprintf("%.1f", v)) + } + return redact.SafeString(fmt.Sprintf("%.0f", v)) +} diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/noop_readahead.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/noop_readahead.go similarity index 100% rename from vendor/github.com/cockroachdb/pebble/objstorage/noop_readahead.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/noop_readahead.go diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorage.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorage.go similarity index 74% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorage.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorage.go index bbb5b6e..ec428d9 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorage.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorage.go @@ -9,10 +9,11 @@ import ( "fmt" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache" - "github.com/cockroachdb/pebble/objstorage/remote" - "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/sharedcache" + "github.com/cockroachdb/pebble/v2/objstorage/remote" + "github.com/cockroachdb/pebble/v2/vfs" + "github.com/cockroachdb/redact" ) // Readable is the handle for an object that is open for reading. @@ -37,9 +38,38 @@ type Readable interface { // The ReadHandle must be closed before the Readable is closed. // // Multiple separate ReadHandles can be used. 
- NewReadHandle(ctx context.Context) ReadHandle + NewReadHandle(readBeforeSize ReadBeforeSize) ReadHandle } +// ReadBeforeSize specifies whether the first read should read additional +// bytes before the offset, and how big the overall read should be. This is +// just a suggestion that the callee can ignore (and does ignore in +// fileReadable). +// +// When 0, the first read will only read what it is asked to read, say n +// bytes. When it is a value b > 0, if b > n, then the read will be padded by +// an additional b-n bytes to the left, resulting in an overall read size of +// b. This behavior is akin to what the read-ahead implementation does -- when +// the n bytes are not buffered, and there is read-ahead of b > n, the read +// length is b bytes. +type ReadBeforeSize int64 + +const ( + // NoReadBefore specifies no read-before. + NoReadBefore ReadBeforeSize = 0 + // ReadBeforeForNewReader is used for a new Reader reading the footer, + // metaindex, properties. 32KB is unnecessarily large, but it is still small + // when considering remote object storage. + ReadBeforeForNewReader = 32 * 1024 + // ReadBeforeForIndexAndFilter is used for an iterator reading the top-level + // index, filter and second-level index blocks. + // + // Consider a 128MB sstable with 32KB blocks, so 4K blocks. Say keys are + // ~100 bytes, then the size of the index blocks is ~400KB. 512KB is a bit + // bigger, and not too large to be a memory concern. + ReadBeforeForIndexAndFilter = 512 * 1024 +) + // ReadHandle is used to perform reads that are related and might benefit from // optimizations like read-ahead. 
type ReadHandle interface { @@ -140,11 +170,11 @@ func (meta *ObjectMetadata) AssertValid() { panic(errors.AssertionFailedf("meta.Remote not empty: %#v", meta.Remote)) } } else { - if meta.Remote.CustomObjectName != "" { + if meta.Remote.CustomObjectName == "" { if meta.Remote.CreatorID == 0 { panic(errors.AssertionFailedf("CreatorID not set")) } - if meta.Remote.CreatorFileNum == base.FileNum(0).DiskFileNum() { + if meta.Remote.CreatorFileNum == 0 { panic(errors.AssertionFailedf("CreatorFileNum not set")) } } @@ -167,6 +197,11 @@ func (c CreatorID) IsSet() bool { return c != 0 } func (c CreatorID) String() string { return fmt.Sprintf("%d", c) } +// SafeFormat implements redact.SafeFormatter. +func (c CreatorID) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("%d", redact.SafeUint(c)) +} + // SharedCleanupMethod indicates the method for cleaning up unused shared objects. type SharedCleanupMethod uint8 @@ -182,8 +217,8 @@ const ( // OpenOptions contains optional arguments for OpenForReading. type OpenOptions struct { - // MustExist triggers a fatal error if the file does not exist. The fatal - // error message contains extra information helpful for debugging. + // MustExist converts a not-exist error into a corruption error, and adds + // extra information helpful for debugging. MustExist bool } @@ -196,6 +231,10 @@ type CreateOptions struct { // SharedCleanupMethod is used for the object when it is created on shared storage. // The default (zero) value is SharedRefTracking. SharedCleanupMethod SharedCleanupMethod + + // WriteCategory is used for the object when it is created on local storage + // to collect aggregated write metrics for each write source. + WriteCategory vfs.DiskWriteCategory } // Provider is a singleton object used to access and manage objects. @@ -280,7 +319,14 @@ type Provider interface { // Pebble and will never be removed by Pebble. 
CreateExternalObjectBacking(locator remote.Locator, objName string) (RemoteObjectBacking, error) + // GetExternalObjects returns a list of DiskFileNums corresponding to all + // objects that are backed by the given external object. + GetExternalObjects(locator remote.Locator, objName string) []base.DiskFileNum + // AttachRemoteObjects registers existing remote objects with this provider. + // + // The objects are not guaranteed to be durable (accessible in case of + // crashes) until Sync is called. AttachRemoteObjects(objs []RemoteObjectToAttach) ([]ObjectMetadata, error) Close() error @@ -289,6 +335,11 @@ type Provider interface { // directory does not exist. IsNotExistError(err error) bool + // CheckpointState saves any saved state on local disk to the specified + // directory on the specified VFS. A new Pebble instance instantiated at that + // path should be able to resolve references to the specified files. + CheckpointState(fs vfs.FS, dir string, fileNums []base.DiskFileNum) error + // Metrics returns metrics about objstorage. Currently, it only returns metrics // about the shared cache. Metrics() sharedcache.Metrics @@ -322,3 +373,45 @@ type RemoteObjectToAttach struct { // implementation). Backing RemoteObjectBacking } + +// Copy copies the specified range from the input to the output. +func Copy(ctx context.Context, r ReadHandle, out Writable, offset, length uint64) error { + buf := make([]byte, 256<<10) + end := offset + length + for offset < end { + n := min(end-offset, uint64(len(buf))) + if n == 0 { + break + } + readErr := r.ReadAt(ctx, buf[:n], int64(offset)) + if readErr != nil { + return readErr + } + offset += n + if err := out.Write(buf[:n]); err != nil { + return err + } + } + return nil +} + +// IsLocalBlobFile returns true if a blob file with the given fileNum exists and is +// local. 
+func IsLocalBlobFile(provider Provider, fileNum base.DiskFileNum) bool { + meta, err := provider.Lookup(base.FileTypeBlob, fileNum) + return err == nil && !meta.IsRemote() +} + +// IsLocalTable returns true if a table with the given fileNum exists and is +// local. +func IsLocalTable(provider Provider, fileNum base.DiskFileNum) bool { + meta, err := provider.Lookup(base.FileTypeTable, fileNum) + return err == nil && !meta.IsRemote() +} + +// IsExternalTable returns true if a table with the given fileNum exists and is +// external. +func IsExternalTable(provider Provider, fileNum base.DiskFileNum) bool { + meta, err := provider.Lookup(base.FileTypeTable, fileNum) + return err == nil && meta.IsExternal() +} diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing/obj_io_tracing.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/objiotracing/obj_io_tracing.go similarity index 84% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing/obj_io_tracing.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/objiotracing/obj_io_tracing.go index 2672636..6cac5b6 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing/obj_io_tracing.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/objiotracing/obj_io_tracing.go @@ -4,7 +4,10 @@ package objiotracing -import "github.com/cockroachdb/pebble/internal/base" +import ( + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/sstable/block/blockkind" +) // OpType indicates the type of operation. type OpType uint8 @@ -33,18 +36,6 @@ const ( // TODO(radu): add ForUserFacing. ) -// BlockType indicates the type of data block relevant to an operation. -type BlockType uint8 - -// BlockType values. 
-const ( - UnknownBlock BlockType = iota - DataBlock - ValueBlock - FilterBlock - MetadataBlock -) - // Event is the on-disk format of a tracing event. It is exported here so that // trace processing tools can use it by importing this package. type Event struct { @@ -54,12 +45,12 @@ type Event struct { StartUnixNano int64 Op OpType Reason Reason - BlockType BlockType + BlockKind blockkind.Kind // LSM level plus one (with 0 indicating unknown level). LevelPlusOne uint8 // Hardcoded padding so that struct layout doesn't depend on architecture. _ uint32 - FileNum base.FileNum + FileNum base.DiskFileNum // HandleID is a unique identifier corresponding to an objstorage.ReadHandle; // only set for read operations performed through a ReadHandle. HandleID uint64 diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing/obj_io_tracing_off.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/objiotracing/obj_io_tracing_off.go similarity index 83% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing/obj_io_tracing_off.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/objiotracing/obj_io_tracing_off.go index a4923ab..3896298 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing/obj_io_tracing_off.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/objiotracing/obj_io_tracing_off.go @@ -3,16 +3,16 @@ // the LICENSE file. 
//go:build !pebble_obj_io_tracing -// +build !pebble_obj_io_tracing package objiotracing import ( "context" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/sstable/block/blockkind" + "github.com/cockroachdb/pebble/v2/vfs" ) // Enabled is used to short circuit tracing-related code in regular builds. @@ -50,9 +50,9 @@ func (t *Tracer) WrapWritable( // traces created under that context). func WithReason(ctx context.Context, reason Reason) context.Context { return ctx } -// WithBlockType creates a context that has an associated BlockType (which ends up in +// WithBlockKind creates a context that has an associated BlockType (which ends up in // traces created under that context). -func WithBlockType(ctx context.Context, blockType BlockType) context.Context { return ctx } +func WithBlockKind(ctx context.Context, kind blockkind.Kind) context.Context { return ctx } // WithLevel creates a context that has an associated level (which ends up in // traces created under that context). diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing/obj_io_tracing_on.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/objiotracing/obj_io_tracing_on.go similarity index 86% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing/obj_io_tracing_on.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/objiotracing/obj_io_tracing_on.go index 0680b34..d7a9b41 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing/obj_io_tracing_on.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/objiotracing/obj_io_tracing_on.go @@ -3,7 +3,6 @@ // the LICENSE file. 
//go:build pebble_obj_io_tracing -// +build pebble_obj_io_tracing package objiotracing @@ -11,15 +10,16 @@ import ( "bufio" "context" "fmt" - "math/rand" + "math/rand/v2" "sync" "sync/atomic" "time" "unsafe" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/sstable/block/blockkind" + "github.com/cockroachdb/pebble/v2/vfs" ) // Enabled is used to short circuit tracing-related code in regular builds. @@ -55,7 +55,7 @@ func Open(fs vfs.FS, fsDir string) *Tracer { workerDataCh: make(chan eventBuf, channelBufSize), } - t.handleID.Store(uint64(rand.NewSource(time.Now().UnixNano()).Int63())) + t.handleID.Store(rand.Uint64()) t.workerWait.Add(1) go t.workerLoop() @@ -76,18 +76,18 @@ func (t *Tracer) Close() { // WrapWritable wraps an objstorage.Writable with one that generates tracing // events. func (t *Tracer) WrapWritable( - ctx context.Context, w objstorage.Writable, fileNum base.FileNum, + ctx context.Context, w objstorage.Writable, fileNum base.DiskFileNum, ) objstorage.Writable { return &writable{ w: w, fileNum: fileNum, - g: makeEventGenerator(ctx, t), + g: makeEventGenerator(infoFromCtx(ctx), t), } } type writable struct { w objstorage.Writable - fileNum base.FileNum + fileNum base.DiskFileNum curOffset int64 g eventGenerator } @@ -125,20 +125,22 @@ func (w *writable) Abort() { // WrapReadable wraps an objstorage.Readable with one that generates tracing // events. 
func (t *Tracer) WrapReadable( - ctx context.Context, r objstorage.Readable, fileNum base.FileNum, + ctx context.Context, r objstorage.Readable, fileNum base.DiskFileNum, ) objstorage.Readable { res := &readable{ - r: r, - fileNum: fileNum, + r: r, + fileNum: fileNum, + baseCtxInfo: infoFromCtx(ctx), } - res.mu.g = makeEventGenerator(ctx, t) + res.mu.g = makeEventGenerator(res.baseCtxInfo, t) return res } type readable struct { - r objstorage.Readable - fileNum base.FileNum - mu struct { + r objstorage.Readable + fileNum base.DiskFileNum + baseCtxInfo ctxInfo + mu struct { sync.Mutex g eventGenerator } @@ -147,7 +149,7 @@ type readable struct { var _ objstorage.Readable = (*readable)(nil) // ReadAt is part of the objstorage.Readable interface. -func (r *readable) ReadAt(ctx context.Context, v []byte, off int64) (n int, err error) { +func (r *readable) ReadAt(ctx context.Context, v []byte, off int64) error { r.mu.Lock() r.mu.g.add(ctx, Event{ Op: ReadOp, @@ -171,20 +173,20 @@ func (r *readable) Size() int64 { } // NewReadHandle is part of the objstorage.Readable interface. -func (r *readable) NewReadHandle(ctx context.Context) objstorage.ReadHandle { +func (r *readable) NewReadHandle(readBeforeSize objstorage.ReadBeforeSize) objstorage.ReadHandle { // It's safe to get the tracer from the generator without the mutex since it never changes. t := r.mu.g.t return &readHandle{ - rh: r.r.NewReadHandle(ctx), + rh: r.r.NewReadHandle(readBeforeSize), fileNum: r.fileNum, handleID: t.handleID.Add(1), - g: makeEventGenerator(ctx, t), + g: makeEventGenerator(r.baseCtxInfo, t), } } type readHandle struct { rh objstorage.ReadHandle - fileNum base.FileNum + fileNum base.DiskFileNum handleID uint64 g eventGenerator } @@ -192,7 +194,7 @@ type readHandle struct { var _ objstorage.ReadHandle = (*readHandle)(nil) // ReadAt is part of the objstorage.ReadHandle interface. 
-func (rh *readHandle) ReadAt(ctx context.Context, p []byte, off int64) (n int, err error) { +func (rh *readHandle) ReadAt(ctx context.Context, p []byte, off int64) error { rh.g.add(ctx, Event{ Op: ReadOp, FileNum: rh.fileNum, @@ -233,7 +235,7 @@ func (rh *readHandle) RecordCacheHit(ctx context.Context, offset, size int64) { type ctxInfo struct { reason Reason - blockType BlockType + blockKind blockkind.Kind levelPlusOne uint8 } @@ -242,8 +244,8 @@ func mergeCtxInfo(base, other ctxInfo) ctxInfo { if res.reason == 0 { res.reason = base.reason } - if res.blockType == 0 { - res.blockType = base.blockType + if res.blockKind == 0 { + res.blockKind = base.blockKind } if res.levelPlusOne == 0 { res.levelPlusOne = base.levelPlusOne @@ -273,11 +275,11 @@ func WithReason(ctx context.Context, reason Reason) context.Context { return withInfo(ctx, info) } -// WithBlockType creates a context that has an associated BlockType (which ends up in +// WithBlockKind creates a context that has an associated BlockType (which ends up in // traces created under that context). 
-func WithBlockType(ctx context.Context, blockType BlockType) context.Context { +func WithBlockKind(ctx context.Context, kind blockkind.Kind) context.Context { info := infoFromCtx(ctx) - info.blockType = blockType + info.blockKind = kind return withInfo(ctx, info) } @@ -308,10 +310,10 @@ type eventGenerator struct { buf eventBuf } -func makeEventGenerator(ctx context.Context, t *Tracer) eventGenerator { +func makeEventGenerator(baseCtxInfo ctxInfo, t *Tracer) eventGenerator { return eventGenerator{ t: t, - baseCtxInfo: infoFromCtx(ctx), + baseCtxInfo: baseCtxInfo, } } @@ -327,7 +329,7 @@ func (g *eventGenerator) add(ctx context.Context, e Event) { info := infoFromCtx(ctx) info = mergeCtxInfo(g.baseCtxInfo, info) e.Reason = info.reason - e.BlockType = info.blockType + e.BlockKind = info.blockKind e.LevelPlusOne = info.levelPlusOne if g.buf.num == eventsPerBuf { g.flush() @@ -381,7 +383,7 @@ func (t *Tracer) workerWriteTraces(state *workerState, data eventBuf) { func (t *Tracer) workerNewFile(state *workerState) { filename := fmt.Sprintf("IOTRACES-%s", time.Now().UTC().Format(time.RFC3339Nano)) - file, err := t.fs.Create(t.fs.PathJoin(t.fsDir, filename)) + file, err := t.fs.Create(t.fs.PathJoin(t.fsDir, filename), vfs.WriteCategoryUnspecified) if err != nil { panic(err) } diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/provider.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/provider.go similarity index 73% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/provider.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/provider.go index 1652b7b..b22ee92 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/provider.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/provider.go @@ -5,22 +5,24 @@ package objstorageprovider import ( + "cmp" "context" "io" "os" - "sort" + "slices" "sync" + "sync/atomic" 
"github.com/cockroachdb/errors" "github.com/cockroachdb/errors/oserror" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/remoteobjcat" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache" - "github.com/cockroachdb/pebble/objstorage/remote" - "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/objiotracing" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remoteobjcat" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/sharedcache" + "github.com/cockroachdb/pebble/v2/objstorage/remote" + "github.com/cockroachdb/pebble/v2/vfs" ) // provider is the implementation of objstorage.Provider. @@ -36,17 +38,16 @@ type provider struct { mu struct { sync.RWMutex - remote struct { - // catalogBatch accumulates remote object creations and deletions until - // Sync is called. - catalogBatch remoteobjcat.Batch + remote remoteLockedState - storageObjects map[remote.Locator]remote.Storage - } - - // localObjectsChanged is set if non-remote objects were created or deleted - // but Sync was not yet called. - localObjectsChanged bool + // TODO(radu): move these fields to a localLockedState struct. + // localObjectsChanged is incremented whenever non-remote objects are created. + // The purpose of this counter is to avoid syncing the local filesystem when + // only remote objects are changed. + localObjectsChangeCounter uint64 + // localObjectsChangeCounterSynced is the value of localObjectsChangeCounter + // value at the time the last completed sync was launched. 
+ localObjectsChangeCounterSynced uint64 // knownObjects maintains information about objects that are known to the provider. // It is initialized with the list of files in the manifest when we open a DB. @@ -97,10 +98,9 @@ type Settings struct { Local struct { // TODO(radu): move FSCleaner, NoSyncOnClose, BytesPerSync here. - // ReadaheadConfigFn is a function used to retrieve the current readahead - // mode. This function is run whenever a local object is open for reading. - // If it is nil, DefaultReadaheadConfig is used. - ReadaheadConfigFn func() ReadaheadConfig + // ReadaheadConfig is used to retrieve the current readahead mode; it is + // consulted whenever a read handle is initialized. + ReadaheadConfig *ReadaheadConfig } // Fields here are set only if the provider is to support remote objects @@ -140,22 +140,49 @@ type Settings struct { } } -// ReadaheadConfig controls the use of read-ahead. +// ReadaheadConfig is a container for the settings that control the use of +// read-ahead. +// +// It stores two ReadaheadModes: +// - Informed is the type of read-ahead for operations that are known to read a +// large consecutive chunk of a file. +// - Speculative is the type of read-ahead used automatically, when consecutive +// reads are detected. +// +// The settings can be changed and read atomically. type ReadaheadConfig struct { - // Informed is the type of read-ahead for operations that are known to read a - // large consecutive chunk of a file. - Informed ReadaheadMode + value atomic.Uint32 +} + +// These are the default readahead modes when a config is not specified. +const ( + defaultReadaheadInformed = FadviseSequential + defaultReadaheadSpeculative = FadviseSequential +) - // Speculative is the type of read-ahead used automatically, when consecutive - // reads are detected. - Speculative ReadaheadMode +// NewReadaheadConfig returns a new readahead config container initialized with +// default values. 
+func NewReadaheadConfig() *ReadaheadConfig { + rc := &ReadaheadConfig{} + rc.Set(defaultReadaheadInformed, defaultReadaheadSpeculative) + return rc } -// DefaultReadaheadConfig is the readahead config used when ReadaheadConfigFn is -// not specified. -var DefaultReadaheadConfig = ReadaheadConfig{ - Informed: FadviseSequential, - Speculative: FadviseSequential, +// Set the informed and speculative readahead modes. +func (rc *ReadaheadConfig) Set(informed, speculative ReadaheadMode) { + rc.value.Store(uint32(speculative)<<8 | uint32(informed)) +} + +// Informed returns the type of read-ahead for operations that are known to read +// a large consecutive chunk of a file. +func (rc *ReadaheadConfig) Informed() ReadaheadMode { + return ReadaheadMode(rc.value.Load() & 0xff) +} + +// Speculative returns the type of read-ahead used automatically, when +// consecutive reads are detected. +func (rc *ReadaheadConfig) Speculative() ReadaheadMode { + return ReadaheadMode(rc.value.Load() >> 8) } // ReadaheadMode indicates the type of read-ahead to use, either for informed @@ -170,10 +197,10 @@ const ( // The prefetch window grows dynamically as consecutive writes are detected. SysReadahead - // FadviseSequential enables to use of FADV_SEQUENTIAL. For informed + // FadviseSequential enables the use of FADV_SEQUENTIAL. For informed // read-ahead, FADV_SEQUENTIAL is used from the beginning. For speculative - // read-ahead SYS_READAHEAD is first used until the window reaches the maximum - // size, then we siwtch to FADV_SEQUENTIAL. + // read-ahead, SYS_READAHEAD is first used until the window reaches the + // maximum size, then we switch to FADV_SEQUENTIAL. 
FadviseSequential ) @@ -213,6 +240,10 @@ func open(settings Settings) (p *provider, _ error) { } }() + if settings.Local.ReadaheadConfig == nil { + settings.Local.ReadaheadConfig = NewReadaheadConfig() + } + p = &provider{ st: settings, fsDir: fsDir, @@ -263,7 +294,7 @@ func (p *provider) OpenForReading( meta, err := p.Lookup(fileType, fileNum) if err != nil { if opts.MustExist { - p.st.Logger.Fatalf("%v", err) + err = base.MarkCorruptionError(err) } return nil, err } @@ -276,6 +307,7 @@ func (p *provider) OpenForReading( if err != nil && p.isNotExistError(meta, err) { // Wrap the error so that IsNotExistError functions properly. err = errors.Mark(err, os.ErrNotExist) + err = base.MarkCorruptionError(err) } } if err != nil { @@ -300,10 +332,16 @@ func (p *provider) Create( if opts.PreferSharedStorage && p.st.Remote.CreateOnShared != remote.CreateOnSharedNone { w, meta, err = p.sharedCreate(ctx, fileType, fileNum, p.st.Remote.CreateOnSharedLocator, opts) } else { - w, meta, err = p.vfsCreate(ctx, fileType, fileNum) + var category vfs.DiskWriteCategory + if opts.WriteCategory != "" { + category = opts.WriteCategory + } else { + category = vfs.WriteCategoryUnspecified + } + w, meta, err = p.vfsCreate(ctx, fileType, fileNum, category) } if err != nil { - err = errors.Wrapf(err, "creating object %s", errors.Safe(fileNum)) + err = errors.Wrapf(err, "creating object %s", fileNum) return nil, objstorage.ObjectMetadata{}, err } p.addMetadata(meta) @@ -340,7 +378,7 @@ func (p *provider) Remove(fileType base.FileType, fileNum base.DiskFileNum) erro // We want to be able to retry a Remove, so we keep the object in our list. // TODO(radu): we should mark the object as "zombie" and not allow any other // operations. 
- return errors.Wrapf(err, "removing object %s", errors.Safe(fileNum)) + return errors.Wrapf(err, "removing object %s", fileNum) } p.removeMetadata(fileNum) @@ -451,14 +489,14 @@ func (p *provider) Lookup( if !ok { return objstorage.ObjectMetadata{}, errors.Wrapf( os.ErrNotExist, - "file %s (type %d) unknown to the objstorage provider", - errors.Safe(fileNum), errors.Safe(fileType), + "file %s (type %s) unknown to the objstorage provider", + fileNum, fileType, ) } if meta.FileType != fileType { - return objstorage.ObjectMetadata{}, errors.AssertionFailedf( - "file %s type mismatch (known type %d, expected type %d)", - errors.Safe(fileNum), errors.Safe(meta.FileType), errors.Safe(fileType), + return objstorage.ObjectMetadata{}, base.AssertionFailedf( + "file %s type mismatch (known type %s, expected type %s)", + fileNum, errors.Safe(meta.FileType), errors.Safe(fileType), ) } return meta, nil @@ -488,8 +526,8 @@ func (p *provider) List() []objstorage.ObjectMetadata { for _, meta := range p.mu.knownObjects { res = append(res, meta) } - sort.Slice(res, func(i, j int) bool { - return res[i].DiskFileNum.FileNum() < res[j].DiskFileNum.FileNum() + slices.SortFunc(res, func(a, b objstorage.ObjectMetadata) int { + return cmp.Compare(a.DiskFileNum, b.DiskFileNum) }) return res } @@ -502,24 +540,54 @@ func (p *provider) Metrics() sharedcache.Metrics { return sharedcache.Metrics{} } +// CheckpointState is part of the objstorage.Provider interface. +func (p *provider) CheckpointState(fs vfs.FS, dir string, fileNums []base.DiskFileNum) error { + p.mu.Lock() + defer p.mu.Unlock() + for i := range fileNums { + if _, ok := p.mu.knownObjects[fileNums[i]]; !ok { + return errors.Wrapf( + os.ErrNotExist, + "file %s unknown to the objstorage provider", + fileNums[i], + ) + } + // Prevent this object from deletion, at least for the life of this instance. 
+ p.mu.protectedObjects[fileNums[i]] = p.mu.protectedObjects[fileNums[i]] + 1 + } + + if p.remote.catalog != nil { + return p.remote.catalog.Checkpoint(fs, dir) + } + return nil +} + func (p *provider) addMetadata(meta objstorage.ObjectMetadata) { + p.mu.Lock() + defer p.mu.Unlock() + p.addMetadataLocked(meta) +} + +func (p *provider) addMetadataLocked(meta objstorage.ObjectMetadata) { if invariants.Enabled { meta.AssertValid() } - p.mu.Lock() - defer p.mu.Unlock() p.mu.knownObjects[meta.DiskFileNum] = meta if meta.IsRemote() { p.mu.remote.catalogBatch.AddObject(remoteobjcat.RemoteObjectMetadata{ - FileNum: meta.DiskFileNum, - FileType: meta.FileType, - CreatorID: meta.Remote.CreatorID, - CreatorFileNum: meta.Remote.CreatorFileNum, - Locator: meta.Remote.Locator, - CleanupMethod: meta.Remote.CleanupMethod, + FileNum: meta.DiskFileNum, + FileType: meta.FileType, + CreatorID: meta.Remote.CreatorID, + CreatorFileNum: meta.Remote.CreatorFileNum, + Locator: meta.Remote.Locator, + CleanupMethod: meta.Remote.CleanupMethod, + CustomObjectName: meta.Remote.CustomObjectName, }) + if meta.IsExternal() { + p.mu.remote.addExternalObject(meta) + } } else { - p.mu.localObjectsChanged = true + p.mu.localObjectsChangeCounter++ } } @@ -532,10 +600,13 @@ func (p *provider) removeMetadata(fileNum base.DiskFileNum) { return } delete(p.mu.knownObjects, fileNum) + if meta.IsExternal() { + p.mu.remote.removeExternalObject(meta) + } if meta.IsRemote() { p.mu.remote.catalogBatch.DeleteObject(fileNum) } else { - p.mu.localObjectsChanged = true + p.mu.localObjectsChangeCounter++ } } diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/readahead.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/readahead.go similarity index 75% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/readahead.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/readahead.go index de9ff71..cfd44ad 100644 --- 
a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/readahead.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/readahead.go @@ -4,7 +4,7 @@ package objstorageprovider -import "github.com/cockroachdb/pebble/internal/invariants" +import "github.com/cockroachdb/pebble/v2/internal/invariants" const ( // Constants for dynamic readahead of data blocks. Note that the size values @@ -42,40 +42,7 @@ func makeReadaheadState(maxReadaheadSize int64) readaheadState { } func (rs *readaheadState) recordCacheHit(offset, blockLength int64) { - currentReadEnd := offset + blockLength - if rs.numReads >= minFileReadsForReadahead { - if currentReadEnd >= rs.limit && offset <= rs.limit+rs.maxReadaheadSize { - // This is a read that would have resulted in a readahead, had it - // not been a cache hit. - rs.limit = currentReadEnd - return - } - if currentReadEnd < rs.limit-rs.prevSize || offset > rs.limit+rs.maxReadaheadSize { - // We read too far away from rs.limit to benefit from readahead in - // any scenario. Reset all variables. - rs.numReads = 1 - rs.limit = currentReadEnd - rs.size = initialReadaheadSize - rs.prevSize = 0 - return - } - // Reads in the range [rs.limit - rs.prevSize, rs.limit] end up - // here. This is a read that is potentially benefitting from a past - // readahead. - return - } - if currentReadEnd >= rs.limit && offset <= rs.limit+rs.maxReadaheadSize { - // Blocks are being read sequentially and would benefit from readahead - // down the line. - rs.numReads++ - return - } - // We read too far ahead of the last read, or before it. This indicates - // a random read, where readahead is not desirable. Reset all variables. 
- rs.numReads = 1 - rs.limit = currentReadEnd - rs.size = initialReadaheadSize - rs.prevSize = 0 + _ = rs.maybeReadaheadOrCacheHit(offset, blockLength, false) } // maybeReadahead updates state and determines whether to issue a readahead / @@ -83,6 +50,13 @@ func (rs *readaheadState) recordCacheHit(offset, blockLength int64) { // Returns a size value (greater than 0) that should be prefetched if readahead // would be beneficial. func (rs *readaheadState) maybeReadahead(offset, blockLength int64) int64 { + return rs.maybeReadaheadOrCacheHit(offset, blockLength, true) +} + +// The return value should be ignored if !readahead. +func (rs *readaheadState) maybeReadaheadOrCacheHit( + offset, blockLength int64, readahead bool, +) int64 { if invariants.Enabled && rs.maxReadaheadSize == 0 { panic("readaheadState not initialized") } @@ -124,18 +98,22 @@ func (rs *readaheadState) maybeReadahead(offset, blockLength int64) int64 { // // rs.numReads++ - rs.limit = offset + rs.size - rs.prevSize = rs.size - // Increase rs.size for the next read. - rs.size *= 2 - if rs.size > rs.maxReadaheadSize { - rs.size = rs.maxReadaheadSize + if readahead { + rs.limit = offset + rs.size + rs.prevSize = rs.size + // Increase rs.size for the next read. + rs.size *= 2 + if rs.size > rs.maxReadaheadSize { + rs.size = rs.maxReadaheadSize + } + } else { + // This is a read that would have resulted in a readahead, had it + // not been a cache hit. + rs.limit = currentReadEnd } return rs.prevSize } if currentReadEnd < rs.limit-rs.prevSize || offset > rs.limit+rs.maxReadaheadSize { - // The above conditional has rs.limit > rs.prevSize to confirm that - // rs.limit - rs.prevSize would not underflow. // We read too far away from rs.limit to benefit from readahead in // any scenario. Reset all variables. 
// The case where we read too far ahead: @@ -161,7 +139,16 @@ func (rs *readaheadState) maybeReadahead(offset, blockLength int64) int64 { return 0 } - // Reads in the range [rs.limit - rs.prevSize, rs.limit] end up + // The previous if-block predicates were all false. This mechanically implies: + // + // INVARIANT: + // !(currentReadEnd >= rs.limit && offset <= rs.limit+rs.maxReadaheadSize) && + // !(currentReadEnd < rs.limit-rs.prevSize || offset > rs.limit+rs.maxReadaheadSize) + // Which mechanically simplifies to: + // currentReadEnd < rs.limit && currentReadEnd >= rs.limit-rs.prevSize && + // offset <= rs.limit+rs.maxReadaheadSize + // + // So reads in the range [rs.limit - rs.prevSize, rs.limit] end up // here. This is a read that is potentially benefitting from a past // readahead, but there's no reason to issue a readahead call at the // moment. @@ -176,6 +163,8 @@ func (rs *readaheadState) maybeReadahead(offset, blockLength int64) int64 { rs.numReads++ return 0 } + // Not yet at the numReads threshold to justify readahead. But we want to + // capture whether readahead will be beneficial in the future. if currentReadEnd >= rs.limit && offset <= rs.limit+rs.maxReadaheadSize { // Blocks are being read sequentially and would benefit from readahead // down the line. @@ -187,6 +176,16 @@ func (rs *readaheadState) maybeReadahead(offset, blockLength int64) int64 { // offset currentReadEnd // rs.numReads++ + // It is possible to fall here when rs.limit has not been initialized. If + // we don't initialize, rs.limit, it is possible that the first read + // offset was at rs.limit+rs.maxReadaheadSize-delta and the enclosing + // if-block predicate was true, and the next read is sequential but has + // offset > rs.limit+rs.maxReadaheadSize (if we left rs.limit at 0), and + // the enclosing if-block predicate will be false and we will incorrectly + // think that readahead is not beneficial. 
The same issue arises if + // rs.limit has been initialized and currentReadEnd is advancing past + // rs.limit. + rs.limit = currentReadEnd return 0 } // We read too far ahead of the last read, or before it. This indicates diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remote.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remote.go similarity index 80% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remote.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remote.go index e8ee42b..0006aa4 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remote.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remote.go @@ -8,16 +8,17 @@ import ( "context" "fmt" "runtime" + "slices" "sync" "sync/atomic" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/remoteobjcat" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache" - "github.com/cockroachdb/pebble/objstorage/remote" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remoteobjcat" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/sharedcache" + "github.com/cockroachdb/pebble/v2/objstorage/remote" ) // remoteSubsystem contains the provider fields related to remote storage. @@ -45,6 +46,43 @@ type remoteSubsystem struct { } } +type remoteLockedState struct { + // catalogBatch accumulates remote object creations and deletions until + // Sync is called. 
+ catalogBatch remoteobjcat.Batch + + storageObjects map[remote.Locator]remote.Storage + + externalObjects map[remote.ObjectKey][]base.DiskFileNum +} + +func (rs *remoteLockedState) addExternalObject(meta objstorage.ObjectMetadata) { + if rs.externalObjects == nil { + rs.externalObjects = make(map[remote.ObjectKey][]base.DiskFileNum) + } + key := remote.MakeObjectKey(meta.Remote.Locator, meta.Remote.CustomObjectName) + rs.externalObjects[key] = append(rs.externalObjects[key], meta.DiskFileNum) +} + +func (rs *remoteLockedState) removeExternalObject(meta objstorage.ObjectMetadata) { + key := remote.MakeObjectKey(meta.Remote.Locator, meta.Remote.CustomObjectName) + newSlice := slices.DeleteFunc(rs.externalObjects[key], func(n base.DiskFileNum) bool { + return n == meta.DiskFileNum + }) + if len(newSlice) == 0 { + delete(rs.externalObjects, key) + } else { + rs.externalObjects[key] = newSlice + } +} + +func (rs *remoteLockedState) getExternalObjects( + locator remote.Locator, objName string, +) []base.DiskFileNum { + key := remote.MakeObjectKey(locator, objName) + return rs.externalObjects[key] +} + // remoteInit initializes the remote object subsystem (if configured) and finds // any remote objects. func (p *provider) remoteInit() error { @@ -109,6 +147,9 @@ func (p *provider) remoteInit() error { o.AssertValid() } p.mu.knownObjects[o.DiskFileNum] = o + if o.IsExternal() { + p.mu.remote.addExternalObject(o) + } } return nil } @@ -125,9 +166,9 @@ func (p *provider) sharedClose() error { if p.st.Remote.StorageFactory == nil { return nil } - var err error + err := p.sharedSync() if p.remote.cache != nil { - err = p.remote.cache.Close() + err = firstError(err, p.remote.cache.Close()) p.remote.cache = nil } if p.remote.catalog != nil { @@ -140,7 +181,7 @@ func (p *provider) sharedClose() error { // SetCreatorID is part of the objstorage.Provider interface. 
func (p *provider) SetCreatorID(creatorID objstorage.CreatorID) error { if p.st.Remote.StorageFactory == nil { - return errors.AssertionFailedf("attempt to set CreatorID but remote storage not enabled") + return base.AssertionFailedf("attempt to set CreatorID but remote storage not enabled") } // Note: this call is a cheap no-op if the creator ID was already set. This // call also checks if we are trying to change the ID. @@ -233,7 +274,7 @@ func (p *provider) sharedCreateRef(meta objstorage.ObjectMetadata) error { err = writer.Close() } if err != nil { - return errors.Wrapf(err, "creating marker object %q", refName) + return errors.Wrapf(err, "creating marker object %q", errors.Safe(refName)) } return nil } @@ -265,7 +306,7 @@ func (p *provider) sharedCreate( objName := remoteObjectName(meta) writer, err := storage.CreateObject(objName) if err != nil { - return nil, objstorage.ObjectMetadata{}, errors.Wrapf(err, "creating object %q", objName) + return nil, objstorage.ObjectMetadata{}, errors.Wrapf(err, "creating object %q", errors.Safe(objName)) } return &sharedWritable{ p: p, @@ -289,25 +330,25 @@ func (p *provider) remoteOpenForReading( refName := p.sharedObjectRefName(meta) if _, err := meta.Remote.Storage.Size(refName); err != nil { if meta.Remote.Storage.IsNotExistError(err) { + err = errors.Wrapf(err, "marker object %q does not exist", errors.Safe(refName)) if opts.MustExist { - p.st.Logger.Fatalf("marker object %q does not exist", refName) - // TODO(radu): maybe list references for the object. 
+ err = base.MarkCorruptionError(err) } - return nil, errors.Errorf("marker object %q does not exist", refName) + return nil, err } - return nil, errors.Wrapf(err, "checking marker object %q", refName) + return nil, errors.Wrapf(err, "checking marker object %q", errors.Safe(refName)) } } objName := remoteObjectName(meta) reader, size, err := meta.Remote.Storage.ReadObject(ctx, objName) if err != nil { if opts.MustExist && meta.Remote.Storage.IsNotExistError(err) { - p.st.Logger.Fatalf("object %q does not exist", objName) // TODO(radu): maybe list references for the object. + err = base.MarkCorruptionError(err) } return nil, err } - return p.newRemoteReadable(reader, size, meta.DiskFileNum), nil + return p.newRemoteReadable(reader, size, meta.DiskFileNum, meta.Remote.Storage.IsNotExistError), nil } func (p *provider) remoteSize(meta objstorage.ObjectMetadata) (int64, error) { @@ -374,3 +415,10 @@ func (p *provider) ensureStorage(locator remote.Locator) (remote.Storage, error) defer p.mu.Unlock() return p.ensureStorageLocked(locator) } + +// GetExternalObjects is part of the Provider interface. 
+func (p *provider) GetExternalObjects(locator remote.Locator, objName string) []base.DiskFileNum { + p.mu.Lock() + defer p.mu.Unlock() + return slices.Clone(p.mu.remote.getExternalObjects(locator, objName)) +} diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remote_backing.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remote_backing.go similarity index 87% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remote_backing.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remote_backing.go index d302151..dea0fa2 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remote_backing.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remote_backing.go @@ -10,10 +10,9 @@ import ( "io" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/remoteobjcat" - "github.com/cockroachdb/pebble/objstorage/remote" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/remote" ) const ( @@ -45,7 +44,7 @@ func (p *provider) encodeRemoteObjectBacking( meta *objstorage.ObjectMetadata, ) (objstorage.RemoteObjectBacking, error) { if !meta.IsRemote() { - return nil, errors.AssertionFailedf("object %s not on remote storage", meta.DiskFileNum) + return nil, base.AssertionFailedf("object %s not on remote storage", meta.DiskFileNum) } buf := make([]byte, 0, binary.MaxVarintLen64*4) @@ -53,13 +52,13 @@ func (p *provider) encodeRemoteObjectBacking( buf = binary.AppendUvarint(buf, uint64(meta.Remote.CreatorID)) // TODO(radu): encode file type as well? 
buf = binary.AppendUvarint(buf, tagCreatorFileNum) - buf = binary.AppendUvarint(buf, uint64(meta.Remote.CreatorFileNum.FileNum())) + buf = binary.AppendUvarint(buf, uint64(meta.Remote.CreatorFileNum)) buf = binary.AppendUvarint(buf, tagCleanupMethod) buf = binary.AppendUvarint(buf, uint64(meta.Remote.CleanupMethod)) if meta.Remote.CleanupMethod == objstorage.SharedRefTracking { buf = binary.AppendUvarint(buf, tagRefCheckID) buf = binary.AppendUvarint(buf, uint64(p.remote.shared.creatorID)) - buf = binary.AppendUvarint(buf, uint64(meta.DiskFileNum.FileNum())) + buf = binary.AppendUvarint(buf, uint64(meta.DiskFileNum)) } if meta.Remote.Locator != "" { buf = binary.AppendUvarint(buf, tagLocator) @@ -196,14 +195,14 @@ func decodeRemoteObjectBacking( res.meta.DiskFileNum = fileNum res.meta.FileType = fileType res.meta.Remote.CreatorID = objstorage.CreatorID(creatorID) - res.meta.Remote.CreatorFileNum = base.FileNum(creatorFileNum).DiskFileNum() + res.meta.Remote.CreatorFileNum = base.DiskFileNum(creatorFileNum) res.meta.Remote.CleanupMethod = objstorage.SharedCleanupMethod(cleanupMethod) if res.meta.Remote.CleanupMethod == objstorage.SharedRefTracking { if refCheckCreatorID == 0 || refCheckFileNum == 0 { return decodedBacking{}, errors.Newf("remote object backing missing ref to check") } res.refToCheck.creatorID = objstorage.CreatorID(refCheckCreatorID) - res.refToCheck.fileNum = base.FileNum(refCheckFileNum).DiskFileNum() + res.refToCheck.fileNum = base.DiskFileNum(refCheckFileNum) } res.meta.Remote.Locator = remote.Locator(locator) res.meta.Remote.CustomObjectName = customObjName @@ -264,31 +263,13 @@ func (p *provider) AttachRemoteObjects( _ = p.sharedUnref(d.meta) // TODO(radu): clean up references previously created in this loop. 
if d.meta.Remote.Storage.IsNotExistError(err) { - return nil, errors.Errorf("origin marker object %q does not exist;"+ + return nil, base.CorruptionErrorf("origin marker object %q does not exist;"+ " object probably removed from the provider which created the backing", refName) } return nil, errors.Wrapf(err, "checking origin's marker object %s", refName) } } - func() { - p.mu.Lock() - defer p.mu.Unlock() - for _, d := range decoded { - p.mu.remote.catalogBatch.AddObject(remoteobjcat.RemoteObjectMetadata{ - FileNum: d.meta.DiskFileNum, - FileType: d.meta.FileType, - CreatorID: d.meta.Remote.CreatorID, - CreatorFileNum: d.meta.Remote.CreatorFileNum, - CleanupMethod: d.meta.Remote.CleanupMethod, - Locator: d.meta.Remote.Locator, - }) - } - }() - if err := p.sharedSync(); err != nil { - return nil, err - } - metas := make([]objstorage.ObjectMetadata, len(decoded)) for i, d := range decoded { metas[i] = d.meta @@ -296,8 +277,8 @@ func (p *provider) AttachRemoteObjects( p.mu.Lock() defer p.mu.Unlock() - for _, meta := range metas { - p.mu.knownObjects[meta.DiskFileNum] = meta + for i := range metas { + p.addMetadataLocked(metas[i]) } return metas, nil } diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remote_obj_name.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remote_obj_name.go similarity index 81% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remote_obj_name.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remote_obj_name.go index b33a908..2cdcb73 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remote_obj_name.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remote_obj_name.go @@ -7,8 +7,8 @@ package objstorageprovider import ( "fmt" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/objstorage" + "github.com/cockroachdb/pebble/v2/internal/base" + 
"github.com/cockroachdb/pebble/v2/objstorage" ) // remoteObjectName returns the name of an object on remote storage. @@ -23,7 +23,12 @@ func remoteObjectName(meta objstorage.ObjectMetadata) string { case base.FileTypeTable: return fmt.Sprintf( "%04x-%d-%06d.sst", - objHash(meta), meta.Remote.CreatorID, meta.Remote.CreatorFileNum.FileNum(), + objHash(meta), meta.Remote.CreatorID, meta.Remote.CreatorFileNum, + ) + case base.FileTypeBlob: + return fmt.Sprintf( + "%04x-%d-%06d.blob", + objHash(meta), meta.Remote.CreatorID, meta.Remote.CreatorFileNum, ) } panic("unknown FileType") @@ -42,14 +47,19 @@ func sharedObjectRefName( } if meta.Remote.CustomObjectName != "" { return fmt.Sprintf( - "%s.ref.%d.%06d", meta.Remote.CustomObjectName, refCreatorID, refFileNum.FileNum(), + "%s.ref.%d.%06d", meta.Remote.CustomObjectName, refCreatorID, refFileNum, ) } switch meta.FileType { case base.FileTypeTable: return fmt.Sprintf( "%04x-%d-%06d.sst.ref.%d.%06d", - objHash(meta), meta.Remote.CreatorID, meta.Remote.CreatorFileNum.FileNum(), refCreatorID, refFileNum.FileNum(), + objHash(meta), meta.Remote.CreatorID, meta.Remote.CreatorFileNum, refCreatorID, refFileNum, + ) + case base.FileTypeBlob: + return fmt.Sprintf( + "%04x-%d-%06d.blob.ref.%d.%06d", + objHash(meta), meta.Remote.CreatorID, meta.Remote.CreatorFileNum, refCreatorID, refFileNum, ) } panic("unknown FileType") @@ -63,7 +73,12 @@ func sharedObjectRefPrefix(meta objstorage.ObjectMetadata) string { case base.FileTypeTable: return fmt.Sprintf( "%04x-%d-%06d.sst.ref.", - objHash(meta), meta.Remote.CreatorID, meta.Remote.CreatorFileNum.FileNum(), + objHash(meta), meta.Remote.CreatorID, meta.Remote.CreatorFileNum, + ) + case base.FileTypeBlob: + return fmt.Sprintf( + "%04x-%d-%06d.blob.ref.", + objHash(meta), meta.Remote.CreatorID, meta.Remote.CreatorFileNum, ) } panic("unknown FileType") @@ -87,5 +102,5 @@ func (p *provider) sharedObjectRefName(meta objstorage.ObjectMetadata) string { func objHash(meta 
objstorage.ObjectMetadata) uint16 { const prime1 = 7459 const prime2 = 17539 - return uint16(uint64(meta.Remote.CreatorID)*prime1 + uint64(meta.Remote.CreatorFileNum.FileNum())*prime2) + return uint16(uint64(meta.Remote.CreatorID)*prime1 + uint64(meta.Remote.CreatorFileNum)*prime2) } diff --git a/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remote_readable.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remote_readable.go new file mode 100644 index 0000000..b7d59ac --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remote_readable.go @@ -0,0 +1,285 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package objstorageprovider + +import ( + "context" + "io" + "sync" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/sharedcache" + "github.com/cockroachdb/pebble/v2/objstorage/remote" +) + +// NewRemoteReadable creates an objstorage.Readable out of a remote.ObjectReader. +func NewRemoteReadable(objReader remote.ObjectReader, size int64) objstorage.Readable { + return &remoteReadable{ + objReader: objReader, + size: size, + } +} + +const remoteMaxReadaheadSize = 1024 * 1024 /* 1MB */ + +// Number of concurrent compactions is bounded and significantly lower than +// the number of concurrent queries, and compactions consume reads from a few +// levels, so there is no risk of high memory usage due to a higher readahead +// size. So set this higher than remoteMaxReadaheadSize +const remoteReadaheadSizeForCompaction = 8 * 1024 * 1024 /* 8MB */ + +// remoteReadable is a very simple implementation of Readable on top of the +// remote.ObjectReader returned by remote.Storage.ReadObject. It is stateless +// and can be called concurrently. 
+type remoteReadable struct { + objReader remote.ObjectReader + size int64 + fileNum base.DiskFileNum + cache *sharedcache.Cache + errIsNotExist func(error) bool +} + +var _ objstorage.Readable = (*remoteReadable)(nil) + +func (p *provider) newRemoteReadable( + objReader remote.ObjectReader, + size int64, + fileNum base.DiskFileNum, + errIsNotExist func(error) bool, +) *remoteReadable { + return &remoteReadable{ + objReader: objReader, + size: size, + fileNum: fileNum, + cache: p.remote.cache, + errIsNotExist: errIsNotExist, + } +} + +// ReadAt is part of the objstorage.Readable interface. +func (r *remoteReadable) ReadAt(ctx context.Context, p []byte, offset int64) error { + return r.readInternal(ctx, p, offset, false /* forCompaction */) +} + +// readInternal performs a read for the object, using the cache when +// appropriate. +func (r *remoteReadable) readInternal( + ctx context.Context, p []byte, offset int64, forCompaction bool, +) error { + var err error + if r.cache != nil { + flags := sharedcache.ReadFlags{ + // Don't add data to the cache if this read is for a compaction. + ReadOnly: forCompaction, + } + err = r.cache.ReadAt(ctx, r.fileNum, p, offset, r.objReader, r.size, flags) + } else { + err = r.objReader.ReadAt(ctx, p, offset) + } + if err != nil && r.errIsNotExist(err) { + // If a file goes missing, we consider this a corruption error. + err = base.MarkCorruptionError(err) + } + return err +} + +func (r *remoteReadable) Close() error { + defer func() { r.objReader = nil }() + return r.objReader.Close() +} + +func (r *remoteReadable) Size() int64 { + return r.size +} + +// TODO(sumeer): both readBeforeSize and ReadHandle.SetupForCompaction are +// initial configuration of a ReadHandle. So they should both be passed as +// Options to NewReadHandle. But currently the latter is a separate method. +// This is because of how the sstable.Reader calls setupForCompaction on the +// iterators after they are constructed. Consider fixing this oddity. 
+ +func (r *remoteReadable) NewReadHandle( + readBeforeSize objstorage.ReadBeforeSize, +) objstorage.ReadHandle { + rh := remoteReadHandlePool.Get().(*remoteReadHandle) + *rh = remoteReadHandle{readable: r, readBeforeSize: readBeforeSize, buffered: rh.buffered} + rh.readAheadState = makeReadaheadState(remoteMaxReadaheadSize) + return rh +} + +// TODO(sumeer): add test for remoteReadHandle. + +// remoteReadHandle supports doing larger reads, and buffering the additional +// data, to serve future reads. It is not thread-safe. There are two kinds of +// larger reads (a) read-ahead (for sequential data reads), (b) read-before, +// for non-data reads. +// +// For both (a) and (b), the goal is to reduce the number of reads since +// remote read latency and cost can be high. We have to balance this with +// buffers consuming too much memory, since there can be a large number of +// iterators holding remoteReadHandles open for every level. +// +// For (b) we have to two use-cases: +// +// - When a sstable.Reader is opened, it needs to read the footer, metaindex +// block and meta properties block. It starts by reading the footer which is +// at the end of the table and then proceeds to read the other two. Instead +// of doing 3 tiny reads, we would like to do one read. +// +// - When a single-level or two-level iterator is opened, it reads the +// (top-level) index block first. When the iterator is used, it will +// typically follow this by reading the filter block (since SeeKPrefixGE is +// common in CockroachDB). For a two-level iterator it will also read the +// lower-level index blocks which are after the filter block and before the +// top-level index block. It would be ideal if reading the top-level index +// block read enough to include the filter block. And for two-level +// iterators this would also include the lower-level index blocks. 
+// +// In both use-cases we want the first read from the remoteReadable to do a +// larger read, and read bytes earlier than the requested read, hence +// "read-before". Subsequent reads from the remoteReadable can use the usual +// readahead logic (for the second use-case above, this can help with +// sequential reads of the lower-level index blocks when the read-before was +// insufficient to satisfy such reads). In the first use-case, the block cache +// is not used. In the second use-case, the block cache is used, and if the +// first read, which reads the top-level index, has a cache hit, we do not do +// any read-before, since we assume that with some locality in the workload +// the other reads will also have a cache hit (it is also messier code to try +// to preserve some read-before). +// +// Note that both use-cases can often occur near each other if there is enough +// locality of access, in which case file cache and block cache misses are +// mainly happening for new sstables created by compactions -- in this case a +// user-facing read will cause a file cache miss and a new sstable.Reader to +// be created, followed by an iterator creation. We don't currently combine +// the reads across the Reader and the iterator creation, since the code +// structure is not simple enough, but we could consider that in the future. +type remoteReadHandle struct { + readable *remoteReadable + readBeforeSize objstorage.ReadBeforeSize + readAheadState readaheadState + buffered struct { + data []byte + offset int64 + } + forCompaction bool +} + +var _ objstorage.ReadHandle = (*remoteReadHandle)(nil) + +var remoteReadHandlePool = sync.Pool{ + New: func() interface{} { + return &remoteReadHandle{} + }, +} + +// ReadAt is part of the objstorage.ReadHandle interface. 
+func (r *remoteReadHandle) ReadAt(ctx context.Context, p []byte, offset int64) error { + var extraBytesBefore int64 + if r.readBeforeSize > 0 { + if int(r.readBeforeSize) > len(p) { + extraBytesBefore = min(int64(int(r.readBeforeSize)-len(p)), offset) + } + // Only first read uses read-before. + r.readBeforeSize = 0 + } + readaheadSize := r.maybeReadahead(offset, len(p)) + + // Prefer read-before to read-ahead since only first call does read-before. + // Also, since this is the first call, the buffer must be empty. + if extraBytesBefore > 0 { + r.buffered.offset = offset - extraBytesBefore + err := r.readToBuffer(ctx, offset-extraBytesBefore, len(p)+int(extraBytesBefore)) + if err != nil { + return err + } + copy(p, r.buffered.data[int(extraBytesBefore):]) + return nil + } + // Check if we already have the data from a previous read-ahead/read-before. + if rhSize := int64(len(r.buffered.data)); rhSize > 0 { + // We only consider the case where we have a prefix of the needed data. We + // could enhance this to utilize a suffix of the needed data. + if r.buffered.offset <= offset && r.buffered.offset+rhSize > offset { + n := copy(p, r.buffered.data[offset-r.buffered.offset:]) + if n == len(p) { + // All data was available. + return nil + } + // Use the data that we had and do a shorter read. + offset += int64(n) + p = p[n:] + readaheadSize -= n + } + } + + if readaheadSize > len(p) { + // Don't try to read past EOF. + if offset+int64(readaheadSize) > r.readable.size { + readaheadSize = int(r.readable.size - offset) + if readaheadSize <= 0 { + // This shouldn't happen in practice (Pebble should never try to read + // past EOF). 
+ return io.EOF + } + } + if err := r.readToBuffer(ctx, offset, readaheadSize); err != nil { + return err + } + copy(p, r.buffered.data) + return nil + } + + return r.readable.readInternal(ctx, p, offset, r.forCompaction) +} + +func (r *remoteReadHandle) maybeReadahead(offset int64, len int) int { + if r.forCompaction { + return remoteReadaheadSizeForCompaction + } + return int(r.readAheadState.maybeReadahead(offset, int64(len))) +} + +func (r *remoteReadHandle) readToBuffer(ctx context.Context, offset int64, length int) error { + r.buffered.offset = offset + // TODO(radu): we need to somehow account for this memory. + if cap(r.buffered.data) >= length { + r.buffered.data = r.buffered.data[:length] + } else { + r.buffered.data = make([]byte, length) + } + if err := r.readable.readInternal( + ctx, r.buffered.data, r.buffered.offset, r.forCompaction); err != nil { + // Make sure we don't treat the data as valid next time. + r.buffered.data = r.buffered.data[:0] + return err + } + return nil +} + +// Close is part of the objstorage.ReadHandle interface. +func (r *remoteReadHandle) Close() error { + buf := r.buffered.data[:0] + *r = remoteReadHandle{} + r.buffered.data = buf + remoteReadHandlePool.Put(r) + return nil +} + +// SetupForCompaction is part of the objstorage.ReadHandle interface. +func (r *remoteReadHandle) SetupForCompaction() { + r.forCompaction = true +} + +// RecordCacheHit is part of the objstorage.ReadHandle interface. 
+func (r *remoteReadHandle) RecordCacheHit(_ context.Context, offset, size int64) { + if !r.forCompaction { + r.readAheadState.recordCacheHit(offset, size) + } + if r.readBeforeSize > 0 { + r.readBeforeSize = 0 + } +} diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remoteobjcat/catalog.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remoteobjcat/catalog.go similarity index 85% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remoteobjcat/catalog.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remoteobjcat/catalog.go index 8508f19..f01ef38 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remoteobjcat/catalog.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remoteobjcat/catalog.go @@ -5,18 +5,20 @@ package remoteobjcat import ( + "cmp" "fmt" "io" - "sort" + "path/filepath" + "slices" "sync" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/objstorage/remote" - "github.com/cockroachdb/pebble/record" - "github.com/cockroachdb/pebble/vfs" - "github.com/cockroachdb/pebble/vfs/atomicfs" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/remote" + "github.com/cockroachdb/pebble/v2/record" + "github.com/cockroachdb/pebble/v2/vfs" + "github.com/cockroachdb/pebble/v2/vfs/atomicfs" ) // Catalog is used to manage the on-disk remote object catalog. @@ -114,8 +116,8 @@ func Open(fs vfs.FS, dirname string) (*Catalog, CatalogContents, error) { res.Objects = append(res.Objects, meta) } // Sort the objects so the function is deterministic. 
- sort.Slice(res.Objects, func(i, j int) bool { - return res.Objects[i].FileNum.FileNum() < res.Objects[j].FileNum.FileNum() + slices.SortFunc(res.Objects, func(a, b RemoteObjectMetadata) int { + return cmp.Compare(a.FileNum, b.FileNum) }) return c, res, nil } @@ -123,7 +125,7 @@ func Open(fs vfs.FS, dirname string) (*Catalog, CatalogContents, error) { // SetCreatorID sets the creator ID. If it is already set, it must match. func (c *Catalog) SetCreatorID(id objstorage.CreatorID) error { if !id.IsSet() { - return errors.AssertionFailedf("attempt to unset CreatorID") + return base.AssertionFailedf("attempt to unset CreatorID") } c.mu.Lock() @@ -131,14 +133,14 @@ func (c *Catalog) SetCreatorID(id objstorage.CreatorID) error { if c.mu.creatorID.IsSet() { if c.mu.creatorID != id { - return errors.AssertionFailedf("attempt to change CreatorID from %s to %s", c.mu.creatorID, id) + return base.AssertionFailedf("attempt to change CreatorID from %s to %s", c.mu.creatorID, id) } return nil } ve := VersionEdit{CreatorID: id} if err := c.writeToCatalogFileLocked(&ve); err != nil { - return errors.Wrapf(err, "pebble: could not write to remote object catalog: %v", err) + return errors.Wrapf(err, "pebble: could not write to remote object catalog") } c.mu.creatorID = id return nil @@ -146,7 +148,12 @@ func (c *Catalog) SetCreatorID(id objstorage.CreatorID) error { // Close any open files. 
func (c *Catalog) Close() error { - return c.closeCatalogFile() + var err error + if c.mu.marker != nil { + err = c.mu.marker.Close() + c.mu.marker = nil + } + return errors.CombineErrors(err, c.closeCatalogFile()) } func (c *Catalog) closeCatalogFile() error { @@ -229,18 +236,18 @@ func (c *Catalog) ApplyBatch(b Batch) error { } for _, meta := range b.ve.NewObjects { if exists(meta.FileNum) { - return errors.AssertionFailedf("adding existing object %s", meta.FileNum) + return base.AssertionFailedf("adding existing object %s", meta.FileNum) } toAdd[meta.FileNum] = struct{}{} } for _, n := range b.ve.DeletedObjects { if !exists(n) { - return errors.AssertionFailedf("deleting non-existent object %s", n) + return base.AssertionFailedf("deleting non-existent object %s", n) } } if err := c.writeToCatalogFileLocked(&b.ve); err != nil { - return errors.Wrapf(err, "pebble: could not write to remote object catalog: %v", err) + return errors.Wrapf(err, "pebble: could not write to remote object catalog") } // Add new objects before deleting any objects. This allows for cases where @@ -324,11 +331,11 @@ func makeCatalogFilename(iter uint64) string { // current catalog and sets c.mu.catalogFile and c.mu.catalogRecWriter. func (c *Catalog) createNewCatalogFileLocked() (outErr error) { if c.mu.catalogFile != nil { - return errors.AssertionFailedf("catalogFile already open") + return base.AssertionFailedf("catalogFile already open") } filename := makeCatalogFilename(c.mu.marker.NextIter()) filepath := c.fs.PathJoin(c.dirname, filename) - file, err := c.fs.Create(filepath) + file, err := c.fs.Create(filepath, "pebble-manifest") if err != nil { return err } @@ -372,6 +379,28 @@ func (c *Catalog) createNewCatalogFileLocked() (outErr error) { return nil } +// Checkpoint copies catalog state to a file in the specified directory +func (c *Catalog) Checkpoint(fs vfs.FS, dir string) error { + c.mu.Lock() + defer c.mu.Unlock() + + // NB: Every write to recWriter is flushed. 
We don't need to worry about + // this new file descriptor not getting all the saved catalog entries. + existingCatalogFilepath := filepath.Join(c.dirname, c.mu.catalogFilename) + destPath := filepath.Join(dir, c.mu.catalogFilename) + if err := vfs.CopyAcrossFS(c.fs, existingCatalogFilepath, fs, destPath); err != nil { + return err + } + catalogMarker, _, err := atomicfs.LocateMarker(fs, dir, catalogMarkerName) + if err != nil { + return err + } + if err := catalogMarker.Move(c.mu.catalogFilename); err != nil { + return err + } + return catalogMarker.Close() +} + func writeRecord(ve *VersionEdit, file vfs.File, recWriter *record.Writer) error { w, err := recWriter.Next() if err != nil { diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remoteobjcat/version_edit.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remoteobjcat/version_edit.go similarity index 88% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remoteobjcat/version_edit.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remoteobjcat/version_edit.go index 44552f5..4a003d1 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/remoteobjcat/version_edit.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/remoteobjcat/version_edit.go @@ -10,10 +10,10 @@ import ( "io" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/objstorage/remote" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/remote" ) // VersionEdit is a modification to the remote object state which can be encoded @@ -48,12 +48,15 @@ const ( // more general (and we want 
freedom to change it in the future). const ( objTypeTable = 1 + objTypeBlob = 2 ) func objTypeToFileType(objType uint64) (base.FileType, error) { switch objType { case objTypeTable: return base.FileTypeTable, nil + case objTypeBlob: + return base.FileTypeBlob, nil default: return 0, errors.Newf("unknown object type %d", objType) } @@ -63,7 +66,8 @@ func fileTypeToObjType(fileType base.FileType) (uint64, error) { switch fileType { case base.FileTypeTable: return objTypeTable, nil - + case base.FileTypeBlob: + return objTypeBlob, nil default: return 0, errors.Newf("unknown object type for file type %d", fileType) } @@ -78,10 +82,10 @@ func (v *VersionEdit) Encode(w io.Writer) error { return err } buf = binary.AppendUvarint(buf, uint64(tagNewObject)) - buf = binary.AppendUvarint(buf, uint64(meta.FileNum.FileNum())) + buf = binary.AppendUvarint(buf, uint64(meta.FileNum)) buf = binary.AppendUvarint(buf, objType) buf = binary.AppendUvarint(buf, uint64(meta.CreatorID)) - buf = binary.AppendUvarint(buf, uint64(meta.CreatorFileNum.FileNum())) + buf = binary.AppendUvarint(buf, uint64(meta.CreatorFileNum)) buf = binary.AppendUvarint(buf, uint64(meta.CleanupMethod)) if meta.Locator != "" { buf = binary.AppendUvarint(buf, uint64(tagNewObjectLocator)) @@ -97,7 +101,7 @@ func (v *VersionEdit) Encode(w io.Writer) error { for _, dfn := range v.DeletedObjects { buf = binary.AppendUvarint(buf, uint64(tagDeletedObject)) - buf = binary.AppendUvarint(buf, uint64(dfn.FileNum())) + buf = binary.AppendUvarint(buf, uint64(dfn)) } if v.CreatorID.IsSet() { buf = binary.AppendUvarint(buf, uint64(tagCreatorID)) @@ -166,10 +170,10 @@ func (v *VersionEdit) Decode(r io.Reader) error { if err == nil { v.NewObjects = append(v.NewObjects, RemoteObjectMetadata{ - FileNum: base.FileNum(fileNum).DiskFileNum(), + FileNum: base.DiskFileNum(fileNum), FileType: fileType, CreatorID: objstorage.CreatorID(creatorID), - CreatorFileNum: base.FileNum(creatorFileNum).DiskFileNum(), + CreatorFileNum: 
base.DiskFileNum(creatorFileNum), CleanupMethod: objstorage.SharedCleanupMethod(cleanupMethod), Locator: remote.Locator(locator), CustomObjectName: customName, @@ -180,7 +184,7 @@ func (v *VersionEdit) Decode(r io.Reader) error { var fileNum uint64 fileNum, err = binary.ReadUvarint(br) if err == nil { - v.DeletedObjects = append(v.DeletedObjects, base.FileNum(fileNum).DiskFileNum()) + v.DeletedObjects = append(v.DeletedObjects, base.DiskFileNum(fileNum)) } case tagCreatorID: @@ -237,7 +241,7 @@ func (v *VersionEdit) Apply( for _, meta := range v.NewObjects { if invariants.Enabled { if _, exists := objects[meta.FileNum]; exists { - return errors.AssertionFailedf("version edit adds existing object %s", meta.FileNum) + return base.AssertionFailedf("version edit adds existing object %s", meta.FileNum) } } objects[meta.FileNum] = meta @@ -245,7 +249,7 @@ func (v *VersionEdit) Apply( for _, fileNum := range v.DeletedObjects { if invariants.Enabled { if _, exists := objects[fileNum]; !exists { - return errors.AssertionFailedf("version edit deletes non-existent object %s", fileNum) + return base.AssertionFailedf("version edit deletes non-existent object %s", fileNum) } } delete(objects, fileNum) diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/shared_writable.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/shared_writable.go similarity index 97% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/shared_writable.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/shared_writable.go index 5e8d45a..e03fd12 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/shared_writable.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/shared_writable.go @@ -7,7 +7,7 @@ package objstorageprovider import ( "io" - "github.com/cockroachdb/pebble/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage" ) // 
NewRemoteWritable creates an objstorage.Writable out of an io.WriteCloser. diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache/shared_cache.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/sharedcache/shared_cache.go similarity index 98% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache/shared_cache.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/sharedcache/shared_cache.go index 6d6409e..942f259 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache/shared_cache.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/sharedcache/shared_cache.go @@ -14,10 +14,10 @@ import ( "time" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/objstorage/remote" - "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/objstorage/remote" + "github.com/cockroachdb/pebble/v2/vfs" "github.com/prometheus/client_golang/prometheus" ) @@ -365,7 +365,7 @@ func (c *Cache) set(fileNum base.DiskFileNum, p []byte, ofs int64) error { func (c *Cache) getShard(fileNum base.DiskFileNum, ofs int64) *shard { const prime64 = 1099511628211 - hash := uint64(fileNum.FileNum())*prime64 + uint64(ofs/c.shardingBlockSize) + hash := uint64(fileNum)*prime64 + uint64(ofs/c.shardingBlockSize) // TODO(josh): Instance change ops are often run in production. Such an operation // updates len(c.shards); see openSharedCache. As a result, the behavior of this // function changes, and the cache empties out at restart time. 
We may want a better @@ -446,7 +446,7 @@ func (s *shard) init( } s.bm = makeBlockMath(blockSize) s.shardingBlockSize = shardingBlockSize - file, err := fs.OpenReadWrite(fs.PathJoin(fsDir, fmt.Sprintf("SHARED-CACHE-%03d", shardIdx))) + file, err := fs.OpenReadWrite(fs.PathJoin(fsDir, fmt.Sprintf("SHARED-CACHE-%03d", shardIdx)), vfs.WriteCategoryUnspecified) if err != nil { return err } @@ -568,7 +568,7 @@ func (s *shard) get(fileNum base.DiskFileNum, p []byte, ofs int64) (n int, _ err // than blob storage read latency) that miss on the same logical block ID will not necessarily // be rare. We may want to do only one read, with the later readers blocking on the first read // completing. This could be implemented either here or in the primary block cache. See - // https://github.com/cockroachdb/pebble/pull/2586 for additional discussion. + // https://github.com/cockroachdb/pebble/v2/pull/2586 for additional discussion. if !ok { s.mu.Unlock() return n, nil @@ -710,11 +710,10 @@ func (s *shard) set(fileNum base.DiskFileNum, p []byte, ofs int64) error { if err != nil { // Free the block. s.mu.Lock() - defer s.mu.Unlock() - delete(s.mu.where, k) s.lruUnlink(cacheBlockIdx) s.freePush(cacheBlockIdx) + s.mu.Unlock() return err } s.dropWriteLock(cacheBlockIdx) @@ -872,7 +871,7 @@ func (w *writeWorkers) Start(c *Cache, numWorkers int) { if err != nil { c.metrics.writeBackFailures.Add(1) // TODO(radu): throttle logs. 
- c.logger.Infof("writing back to cache after miss failed: %v", err) + c.logger.Errorf("writing back to cache after miss failed: %v", err) } } } diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/vfs.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/vfs.go similarity index 68% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/vfs.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/vfs.go index 9e23815..6c3a0ed 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/vfs.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/vfs.go @@ -8,9 +8,9 @@ import ( "context" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/vfs" ) func (p *provider) vfsPath(fileType base.FileType, fileNum base.DiskFileNum) string { @@ -26,23 +26,23 @@ func (p *provider) vfsOpenForReading( filename := p.vfsPath(fileType, fileNum) file, err := p.st.FS.Open(filename, vfs.RandomReadsOption) if err != nil { - if opts.MustExist { - base.MustExist(p.st.FS, filename, p.st.Logger, err) + if opts.MustExist && p.IsNotExistError(err) { + err = base.AddDetailsToNotExistError(p.st.FS, filename, err) + err = base.MarkCorruptionError(err) } return nil, err } - readaheadConfig := DefaultReadaheadConfig - if f := p.st.Local.ReadaheadConfigFn; f != nil { - readaheadConfig = f() - } - return newFileReadable(file, p.st.FS, readaheadConfig, filename) + return newFileReadable(file, p.st.FS, p.st.Local.ReadaheadConfig, filename) } func (p *provider) vfsCreate( - _ context.Context, fileType base.FileType, fileNum base.DiskFileNum, + _ context.Context, + fileType base.FileType, + fileNum base.DiskFileNum, + category 
vfs.DiskWriteCategory, ) (objstorage.Writable, objstorage.ObjectMetadata, error) { filename := p.vfsPath(fileType, fileNum) - file, err := p.st.FS.Create(filename) + file, err := p.st.FS.Create(filename, category) if err != nil { return nil, objstorage.ObjectMetadata{}, err } @@ -74,12 +74,15 @@ func (p *provider) vfsInit() error { for _, filename := range listing { fileType, fileNum, ok := base.ParseFilename(p.st.FS, filename) - if ok && fileType == base.FileTypeTable { - o := objstorage.ObjectMetadata{ - FileType: fileType, - DiskFileNum: fileNum, + if ok { + switch fileType { + case base.FileTypeTable, base.FileTypeBlob: + o := objstorage.ObjectMetadata{ + FileType: fileType, + DiskFileNum: fileNum, + } + p.mu.knownObjects[o.DiskFileNum] = o } - p.mu.knownObjects[o.DiskFileNum] = o } } return nil @@ -87,19 +90,23 @@ func (p *provider) vfsInit() error { func (p *provider) vfsSync() error { p.mu.Lock() - shouldSync := p.mu.localObjectsChanged - p.mu.localObjectsChanged = false + counterVal := p.mu.localObjectsChangeCounter + lastSynced := p.mu.localObjectsChangeCounterSynced p.mu.Unlock() - if !shouldSync { + if lastSynced >= counterVal { return nil } if err := p.fsDir.Sync(); err != nil { - p.mu.Lock() - defer p.mu.Unlock() - p.mu.localObjectsChanged = true return err } + + p.mu.Lock() + if p.mu.localObjectsChangeCounterSynced < counterVal { + p.mu.localObjectsChangeCounterSynced = counterVal + } + p.mu.Unlock() + return nil } diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/vfs_readable.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/vfs_readable.go similarity index 79% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/vfs_readable.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/vfs_readable.go index 584f6a9..8bf7bfd 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/vfs_readable.go +++ 
b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/vfs_readable.go @@ -8,11 +8,12 @@ import ( "context" "fmt" "os" + "runtime/debug" "sync" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/vfs" ) const fileMaxReadaheadSize = 256 * 1024 /* 256KB */ @@ -25,17 +26,18 @@ type fileReadable struct { file vfs.File size int64 + readaheadConfig *ReadaheadConfig + // The following fields are used to possibly open the file again using the // sequential reads option (see vfsReadHandle). - filename string - fs vfs.FS - readaheadConfig ReadaheadConfig + filename string + fs vfs.FS } var _ objstorage.Readable = (*fileReadable)(nil) func newFileReadable( - file vfs.File, fs vfs.FS, readaheadConfig ReadaheadConfig, filename string, + file vfs.File, fs vfs.FS, readaheadConfig *ReadaheadConfig, filename string, ) (*fileReadable, error) { info, err := file.Stat() if err != nil { @@ -48,12 +50,15 @@ func newFileReadable( fs: fs, readaheadConfig: readaheadConfig, } - invariants.SetFinalizer(r, func(obj interface{}) { - if obj.(*fileReadable).file != nil { - fmt.Fprintf(os.Stderr, "Readable was not closed") - os.Exit(1) - } - }) + if invariants.UseFinalizers { + stack := debug.Stack() + invariants.SetFinalizer(r, func(obj interface{}) { + if obj.(*fileReadable).file != nil { + fmt.Fprintf(os.Stderr, "Readable %s was not closed\n%s", filename, stack) + os.Exit(1) + } + }) + } return r, nil } @@ -78,7 +83,9 @@ func (r *fileReadable) Size() int64 { } // NewReadHandle is part of the objstorage.Readable interface. 
-func (r *fileReadable) NewReadHandle(_ context.Context) objstorage.ReadHandle { +func (r *fileReadable) NewReadHandle( + readBeforeSize objstorage.ReadBeforeSize, +) objstorage.ReadHandle { rh := readHandlePool.Get().(*vfsReadHandle) rh.init(r) return rh @@ -101,13 +108,14 @@ var _ objstorage.ReadHandle = (*vfsReadHandle)(nil) var readHandlePool = sync.Pool{ New: func() interface{} { i := &vfsReadHandle{} - // Note: this is a no-op if invariants are disabled or race is enabled. - invariants.SetFinalizer(i, func(obj interface{}) { - if obj.(*vfsReadHandle).r != nil { - fmt.Fprintf(os.Stderr, "ReadHandle was not closed") - os.Exit(1) - } - }) + if invariants.UseFinalizers { + invariants.SetFinalizer(i, func(obj interface{}) { + if obj.(*vfsReadHandle).r != nil { + fmt.Fprintf(os.Stderr, "ReadHandle was not closed") + os.Exit(1) + } + }) + } return i }, } @@ -116,7 +124,7 @@ func (rh *vfsReadHandle) init(r *fileReadable) { *rh = vfsReadHandle{ r: r, rs: makeReadaheadState(fileMaxReadaheadSize), - readaheadMode: r.readaheadConfig.Speculative, + readaheadMode: r.readaheadConfig.Speculative(), } } @@ -161,7 +169,7 @@ func (rh *vfsReadHandle) ReadAt(_ context.Context, p []byte, offset int64) error // SetupForCompaction is part of the objstorage.ReadHandle interface. func (rh *vfsReadHandle) SetupForCompaction() { - rh.readaheadMode = rh.r.readaheadConfig.Informed + rh.readaheadMode = rh.r.readaheadConfig.Informed() if rh.readaheadMode == FadviseSequential { rh.switchToOSReadahead() } @@ -192,6 +200,14 @@ func (rh *vfsReadHandle) RecordCacheHit(_ context.Context, offset, size int64) { rh.rs.recordCacheHit(offset, size) } +// NewFileReadable returns a new objstorage.Readable that reads from the given +// file. It should not be used directly, except in tools or tests. 
+func NewFileReadable( + file vfs.File, fs vfs.FS, readaheadConfig *ReadaheadConfig, filename string, +) (objstorage.Readable, error) { + return newFileReadable(file, fs, readaheadConfig, filename) +} + // TestingCheckMaxReadahead returns true if the ReadHandle has switched to // OS-level read-ahead. func TestingCheckMaxReadahead(rh objstorage.ReadHandle) bool { @@ -226,11 +242,13 @@ func (rh *PreallocatedReadHandle) Close() error { // (currently this happens if we are reading from a local file). // The returned handle still needs to be closed. func UsePreallocatedReadHandle( - ctx context.Context, readable objstorage.Readable, rh *PreallocatedReadHandle, + readable objstorage.Readable, + readBeforeSize objstorage.ReadBeforeSize, + rh *PreallocatedReadHandle, ) objstorage.ReadHandle { if r, ok := readable.(*fileReadable); ok { rh.init(r) return rh } - return readable.NewReadHandle(ctx) + return readable.NewReadHandle(readBeforeSize) } diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/vfs_writable.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/vfs_writable.go similarity index 79% rename from vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/vfs_writable.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/vfs_writable.go index c9b207c..403621a 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/objstorageprovider/vfs_writable.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/vfs_writable.go @@ -7,8 +7,9 @@ package objstorageprovider import ( "bufio" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/vfs" ) // NewFileWritable returns a Writable that uses a file as underlying storage. 
@@ -35,6 +36,14 @@ func (w *fileBufferedWritable) Write(p []byte) error { // Ignoring the length written since bufio.Writer.Write is guaranteed to // return an error if the length written is < len(p). _, err := w.bw.Write(p) + + // Write is allowed to mangle the buffer. Do it sometimes in invariant builds + // to catch callers that don't handle this. + if invariants.Enabled && invariants.Sometimes(1) { + for i := range p { + p[i] = 0xFF + } + } return err } diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/remote/factory.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/remote/factory.go similarity index 100% rename from vendor/github.com/cockroachdb/pebble/objstorage/remote/factory.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/remote/factory.go diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/remote/localfs.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/remote/localfs.go similarity index 68% rename from vendor/github.com/cockroachdb/pebble/objstorage/remote/localfs.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/remote/localfs.go index 539ecb1..c52b32e 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/remote/localfs.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/remote/localfs.go @@ -7,10 +7,12 @@ package remote import ( "context" "io" - "os" "path" + "strings" - "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/v2/vfs" ) // NewLocalFS returns a vfs-backed implementation of the remote.Storage @@ -78,24 +80,66 @@ func (r *localFSReader) Close() error { return nil } +func (f *localFSStore) sync() error { + file, err := f.vfs.OpenDir(f.dirname) + if err != nil { + return err + } + return errors.CombineErrors(file.Sync(), file.Close()) +} + +type objWriter struct { + vfs.File + store *localFSStore +} + +func (w *objWriter) Close() error { + if w.File == nil { + return nil + } + 
err := w.File.Sync() + err = errors.CombineErrors(err, w.File.Close()) + err = errors.CombineErrors(err, w.store.sync()) + *w = objWriter{} + return err +} + // CreateObject is part of the remote.Storage interface. func (s *localFSStore) CreateObject(objName string) (io.WriteCloser, error) { - file, err := s.vfs.Create(path.Join(s.dirname, objName)) - return file, err + file, err := s.vfs.Create(path.Join(s.dirname, objName), vfs.WriteCategoryUnspecified) + if err != nil { + return nil, err + } + return &objWriter{ + File: file, + store: s, + }, nil } // List is part of the remote.Storage interface. func (s *localFSStore) List(prefix, delimiter string) ([]string, error) { - // TODO(josh): For the intended use case of localfs.go of running 'pebble bench', - // List can always return , since this indicates a file has only one ref, - // and since `pebble bench` implies running in a single-pebble-instance context. - // https://github.com/cockroachdb/pebble/blob/a9a079d4fb6bf4a9ebc52e4d83a76ad4cbf676cb/objstorage/objstorageprovider/shared.go#L292 - return nil, nil + if delimiter != "" { + panic("delimiter unimplemented") + } + files, err := s.vfs.List(s.dirname) + if err != nil { + return nil, err + } + res := make([]string, 0, len(files)) + for _, name := range files { + if strings.HasPrefix(name, prefix) { + res = append(res, name) + } + } + return res, nil } // Delete is part of the remote.Storage interface. func (s *localFSStore) Delete(objName string) error { - return s.vfs.Remove(path.Join(s.dirname, objName)) + if err := s.vfs.Remove(path.Join(s.dirname, objName)); err != nil { + return err + } + return s.sync() } // Size is part of the remote.Storage interface. @@ -114,5 +158,5 @@ func (s *localFSStore) Size(objName string) (int64, error) { // IsNotExistError is part of the remote.Storage interface. 
func (s *localFSStore) IsNotExistError(err error) bool { - return err == os.ErrNotExist + return oserror.IsNotExist(err) } diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/remote/logging.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/remote/logging.go similarity index 100% rename from vendor/github.com/cockroachdb/pebble/objstorage/remote/logging.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/remote/logging.go diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/remote/mem.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/remote/mem.go similarity index 80% rename from vendor/github.com/cockroachdb/pebble/objstorage/remote/mem.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/remote/mem.go index 9bbda71..3c49e83 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/remote/mem.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/remote/mem.go @@ -8,9 +8,10 @@ import ( "bytes" "context" "io" - "os" "strings" "sync" + + "github.com/cockroachdb/errors" ) // NewInMem returns an in-memory implementation of the remote.Storage @@ -49,25 +50,32 @@ func (s *inMemStore) ReadObject( if err != nil { return nil, 0, err } - return &inMemReader{data: obj.data}, int64(len(obj.data)), nil + return &inMemReader{objName: objName, store: s}, int64(len(obj.data)), nil } type inMemReader struct { - data []byte + objName string + store *inMemStore } var _ ObjectReader = (*inMemReader)(nil) func (r *inMemReader) ReadAt(ctx context.Context, p []byte, offset int64) error { - if offset+int64(len(p)) > int64(len(r.data)) { + // We don't just store obj.data in the inMemReader because we want to emit an + // error if the object is deleted from under us. 
+ obj, err := r.store.getObj(r.objName) + if err != nil { + return err + } + if offset+int64(len(p)) > int64(len(obj.data)) { return io.EOF } - copy(p, r.data[offset:]) + copy(p, obj.data[offset:]) return nil } func (r *inMemReader) Close() error { - r.data = nil + r.store = nil return nil } @@ -135,15 +143,19 @@ func (s *inMemStore) Size(objName string) (int64, error) { } func (s *inMemStore) IsNotExistError(err error) bool { - return err == os.ErrNotExist + return errors.Is(err, inMemStoreNotExistErr) } +// We use a custom "not exists" error to make sure that callers correctly use +// IsNotExistError. +var inMemStoreNotExistErr = errors.Newf("in-mem remote storage object does not exist") + func (s *inMemStore) getObj(name string) (*inMemObj, error) { s.mu.Lock() defer s.mu.Unlock() obj, ok := s.mu.objects[name] if !ok { - return nil, os.ErrNotExist + return nil, inMemStoreNotExistErr } return obj, nil } diff --git a/vendor/github.com/cockroachdb/pebble/objstorage/remote/storage.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/remote/storage.go similarity index 89% rename from vendor/github.com/cockroachdb/pebble/objstorage/remote/storage.go rename to vendor/github.com/cockroachdb/pebble/v2/objstorage/remote/storage.go index dccb006..5a2ee54 100644 --- a/vendor/github.com/cockroachdb/pebble/objstorage/remote/storage.go +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/remote/storage.go @@ -7,6 +7,8 @@ package remote import ( "context" "io" + + "github.com/cockroachdb/redact" ) // Locator is an opaque string identifying a remote.Storage implementation. @@ -16,6 +18,11 @@ import ( // RemoteObjectBacking; they can also appear in error messages. type Locator string +// SafeFormat implements redact.SafeFormatter. +func (l Locator) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("%s", redact.SafeString(l)) +} + // StorageFactory is used to return Storage implementations based on locators. 
A // Pebble store that uses shared storage is configured with a StorageFactory. type StorageFactory interface { @@ -124,3 +131,18 @@ type ObjectReader interface { Close() error } + +// ObjectKey is a (locator, object name) pair which uniquely identifies a remote +// object and can be used as a map key. +type ObjectKey struct { + Locator Locator + ObjectName string +} + +// MakeObjectKey is a convenience constructor for ObjectKey. +func MakeObjectKey(locator Locator, objectName string) ObjectKey { + return ObjectKey{ + Locator: locator, + ObjectName: objectName, + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/objstorage/test_utils.go b/vendor/github.com/cockroachdb/pebble/v2/objstorage/test_utils.go new file mode 100644 index 0000000..c0e0733 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/objstorage/test_utils.go @@ -0,0 +1,86 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package objstorage + +import ( + "bytes" + "context" + + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/pkg/errors" +) + +// MemObj is an in-memory implementation of the Writable and Readable that holds +// all data in memory. +// +// A zero MemObj can be populated with data through its Writable methods, and +// then can be repeatedly used as a Readable. +type MemObj struct { + buf bytes.Buffer +} + +var _ Writable = (*MemObj)(nil) +var _ Readable = (*MemObj)(nil) + +// Finish is part of the Writable interface. +func (f *MemObj) Finish() error { return nil } + +// Abort is part of the Writable interface. +func (f *MemObj) Abort() { f.buf.Reset() } + +// Write is part of the Writable interface. +func (f *MemObj) Write(p []byte) error { + _, err := f.buf.Write(p) + // Write is allowed to mangle the buffer. Do it sometimes in invariant + // builds to catch callers that don't handle this. 
+ if invariants.Enabled && invariants.Sometimes(1) { + for i := range p { + p[i] = 0xFF + } + } + return err +} + +// Data returns the in-memory buffer behind this MemObj. +func (f *MemObj) Data() []byte { + return f.buf.Bytes() +} + +// ReadAt is part of the Readable interface. +func (f *MemObj) ReadAt(ctx context.Context, p []byte, off int64) error { + if f.Size() < off+int64(len(p)) { + return errors.Errorf("read past the end of object") + } + copy(p, f.Data()[off:off+int64(len(p))]) + return nil +} + +// Close is part of the Readable interface. +func (f *MemObj) Close() error { return nil } + +// Size is part of the Readable interface. +func (f *MemObj) Size() int64 { + return int64(f.buf.Len()) +} + +// NewReadHandle is part of the Readable interface. +func (f *MemObj) NewReadHandle(readBeforeSize ReadBeforeSize) ReadHandle { + return (*memObjReadHandle)(f) +} + +// memObjReadHandle implements ReadHandle for MemObj. +type memObjReadHandle MemObj + +var _ ReadHandle = (*memObjReadHandle)(nil) + +func (h *memObjReadHandle) ReadAt(ctx context.Context, p []byte, off int64) error { + return (*MemObj)(h).ReadAt(ctx, p, off) +} + +func (h *memObjReadHandle) Close() error { return nil } + +func (h *memObjReadHandle) SetupForCompaction() {} + +func (h *memObjReadHandle) RecordCacheHit(ctx context.Context, offset, size int64) {} diff --git a/vendor/github.com/cockroachdb/pebble/v2/obsolete_files.go b/vendor/github.com/cockroachdb/pebble/v2/obsolete_files.go new file mode 100644 index 0000000..16edebd --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/obsolete_files.go @@ -0,0 +1,738 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "cmp" + "context" + "runtime/pprof" + "slices" + "sync" + "time" + + "github.com/cockroachdb/crlib/crtime" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/vfs" + "github.com/cockroachdb/pebble/v2/wal" + "github.com/cockroachdb/tokenbucket" +) + +// Cleaner exports the base.Cleaner type. +type Cleaner = base.Cleaner + +// DeleteCleaner exports the base.DeleteCleaner type. +type DeleteCleaner = base.DeleteCleaner + +// ArchiveCleaner exports the base.ArchiveCleaner type. +type ArchiveCleaner = base.ArchiveCleaner + +type cleanupManager struct { + opts *Options + objProvider objstorage.Provider + deletePacer *deletionPacer + + // jobsCh is used as the cleanup job queue. + jobsCh chan *cleanupJob + // waitGroup is used to wait for the background goroutine to exit. + waitGroup sync.WaitGroup + + mu struct { + sync.Mutex + // totalJobs is the total number of enqueued jobs (completed or in progress). + totalJobs int + completedStats obsoleteObjectStats + completedJobs int + completedJobsCond sync.Cond + jobsQueueWarningIssued bool + } +} + +// CompletedStats returns the stats summarizing objects deleted. The returned +// stats increase monotonically over the lifetime of the DB. +func (m *cleanupManager) CompletedStats() obsoleteObjectStats { + m.mu.Lock() + defer m.mu.Unlock() + return m.mu.completedStats +} + +// We can queue this many jobs before we have to block EnqueueJob. +const jobsQueueDepth = 1000 + +// obsoleteFile holds information about a file that needs to be deleted soon. 
+type obsoleteFile struct { + fileType base.FileType + fs vfs.FS + path string + fileNum base.DiskFileNum + fileSize uint64 // approx for log files + isLocal bool +} + +func (of *obsoleteFile) needsPacing() bool { + // We only need to pace local objects--sstables and blob files. + return of.isLocal && (of.fileType == base.FileTypeTable || of.fileType == base.FileTypeBlob) +} + +type cleanupJob struct { + jobID JobID + obsoleteFiles []obsoleteFile + stats obsoleteObjectStats +} + +// openCleanupManager creates a cleanupManager and starts its background goroutine. +// The cleanupManager must be Close()d. +func openCleanupManager( + opts *Options, objProvider objstorage.Provider, getDeletePacerInfo func() deletionPacerInfo, +) *cleanupManager { + cm := &cleanupManager{ + opts: opts, + objProvider: objProvider, + deletePacer: newDeletionPacer( + crtime.NowMono(), + opts.FreeSpaceThresholdBytes, + int64(opts.TargetByteDeletionRate), + opts.FreeSpaceTimeframe, + opts.ObsoleteBytesMaxRatio, + opts.ObsoleteBytesTimeframe, + getDeletePacerInfo, + ), + jobsCh: make(chan *cleanupJob, jobsQueueDepth), + } + cm.mu.completedJobsCond.L = &cm.mu.Mutex + cm.waitGroup.Add(1) + + go func() { + pprof.Do(context.Background(), gcLabels, func(context.Context) { + cm.mainLoop() + }) + }() + + return cm +} + +// Close stops the background goroutine, waiting until all queued jobs are completed. +// Delete pacing is disabled for the remaining jobs. +func (cm *cleanupManager) Close() { + close(cm.jobsCh) + cm.waitGroup.Wait() +} + +// EnqueueJob adds a cleanup job to the manager's queue. +func (cm *cleanupManager) EnqueueJob( + jobID JobID, obsoleteFiles []obsoleteFile, stats obsoleteObjectStats, +) { + job := &cleanupJob{ + jobID: jobID, + obsoleteFiles: obsoleteFiles, + stats: stats, + } + + // Report deleted bytes to the pacer, which can use this data to potentially + // increase the deletion rate to keep up. 
We want to do this at enqueue time + // rather than when we get to the job, otherwise the reported bytes will be + // subject to the throttling rate which defeats the purpose. + var pacingBytes uint64 + for _, of := range obsoleteFiles { + if of.needsPacing() { + pacingBytes += of.fileSize + } + } + if pacingBytes > 0 { + cm.deletePacer.ReportDeletion(crtime.NowMono(), pacingBytes) + } + + cm.mu.Lock() + cm.mu.totalJobs++ + cm.maybeLogLocked() + cm.mu.Unlock() + + cm.jobsCh <- job +} + +// Wait until the completion of all jobs that were already queued. +// +// Does not wait for jobs that are enqueued during the call. +// +// Note that DB.mu should not be held while calling this method; the background +// goroutine needs to acquire DB.mu to update deleted table metrics. +func (cm *cleanupManager) Wait() { + cm.mu.Lock() + defer cm.mu.Unlock() + n := cm.mu.totalJobs + for cm.mu.completedJobs < n { + cm.mu.completedJobsCond.Wait() + } +} + +// mainLoop runs the manager's background goroutine. +func (cm *cleanupManager) mainLoop() { + defer cm.waitGroup.Done() + + var tb tokenbucket.TokenBucket + // Use a token bucket with 1 token / second refill rate and 1 token burst. + tb.Init(1.0, 1.0) + for job := range cm.jobsCh { + for _, of := range job.obsoleteFiles { + switch of.fileType { + case base.FileTypeTable: + cm.maybePace(&tb, &of) + cm.deleteObsoleteObject(of.fileType, job.jobID, of.fileNum) + case base.FileTypeBlob: + cm.maybePace(&tb, &of) + cm.deleteObsoleteObject(of.fileType, job.jobID, of.fileNum) + default: + cm.deleteObsoleteFile(of.fs, of.fileType, job.jobID, of.path, of.fileNum) + } + } + cm.mu.Lock() + cm.mu.completedJobs++ + cm.mu.completedStats.Add(job.stats) + cm.mu.completedJobsCond.Broadcast() + cm.maybeLogLocked() + cm.mu.Unlock() + } +} + +// maybePace sleeps before deleting an object if appropriate. It is always +// called from the background goroutine. 
+func (cm *cleanupManager) maybePace(tb *tokenbucket.TokenBucket, of *obsoleteFile) { + if !of.needsPacing() { + return + } + + tokens := cm.deletePacer.PacingDelay(crtime.NowMono(), of.fileSize) + if tokens == 0.0 { + // The token bucket might be in debt; it could make us wait even for 0 + // tokens. We don't want that if the pacer decided throttling should be + // disabled. + return + } + // Wait for tokens. We use a token bucket instead of sleeping outright because + // the token bucket accumulates up to one second of unused tokens. + for { + ok, d := tb.TryToFulfill(tokenbucket.Tokens(tokens)) + if ok { + break + } + time.Sleep(d) + } +} + +// deleteObsoleteFile deletes a (non-object) file that is no longer needed. +func (cm *cleanupManager) deleteObsoleteFile( + fs vfs.FS, fileType base.FileType, jobID JobID, path string, fileNum base.DiskFileNum, +) { + // TODO(peter): need to handle this error, probably by re-adding the + // file that couldn't be deleted to one of the obsolete slices map. 
+ err := cm.opts.Cleaner.Clean(fs, fileType, path) + if oserror.IsNotExist(err) { + return + } + + switch fileType { + case base.FileTypeLog: + cm.opts.EventListener.WALDeleted(WALDeleteInfo{ + JobID: int(jobID), + Path: path, + FileNum: fileNum, + Err: err, + }) + case base.FileTypeManifest: + cm.opts.EventListener.ManifestDeleted(ManifestDeleteInfo{ + JobID: int(jobID), + Path: path, + FileNum: fileNum, + Err: err, + }) + case base.FileTypeTable, base.FileTypeBlob: + panic("invalid deletion of object file") + } +} + +func (cm *cleanupManager) deleteObsoleteObject( + fileType base.FileType, jobID JobID, fileNum base.DiskFileNum, +) { + if fileType != base.FileTypeTable && fileType != base.FileTypeBlob { + panic("not an object") + } + + var path string + meta, err := cm.objProvider.Lookup(fileType, fileNum) + if err != nil { + path = "" + } else { + path = cm.objProvider.Path(meta) + err = cm.objProvider.Remove(fileType, fileNum) + } + if cm.objProvider.IsNotExistError(err) { + return + } + + switch fileType { + case base.FileTypeTable: + cm.opts.EventListener.TableDeleted(TableDeleteInfo{ + JobID: int(jobID), + Path: path, + FileNum: fileNum, + Err: err, + }) + case base.FileTypeBlob: + cm.opts.EventListener.BlobFileDeleted(BlobFileDeleteInfo{ + JobID: int(jobID), + Path: path, + FileNum: fileNum, + Err: err, + }) + } +} + +// maybeLogLocked issues a log if the job queue gets 75% full and issues a log +// when the job queue gets back to less than 10% full. +// +// Must be called with cm.mu locked. 
+func (cm *cleanupManager) maybeLogLocked() {
+	const highThreshold = jobsQueueDepth * 3 / 4
+	const lowThreshold = jobsQueueDepth / 10
+
+	jobsInQueue := cm.mu.totalJobs - cm.mu.completedJobs
+
+	if !cm.mu.jobsQueueWarningIssued && jobsInQueue > highThreshold {
+		cm.mu.jobsQueueWarningIssued = true
+		cm.opts.Logger.Infof("cleanup falling behind; job queue has over %d jobs", highThreshold)
+	}
+
+	if cm.mu.jobsQueueWarningIssued && jobsInQueue < lowThreshold {
+		cm.mu.jobsQueueWarningIssued = false
+		cm.opts.Logger.Infof("cleanup back to normal; job queue has under %d jobs", lowThreshold)
+	}
+}
+
+func (d *DB) getDeletionPacerInfo() deletionPacerInfo {
+	var pacerInfo deletionPacerInfo
+	// Call GetDiskUsage after every file deletion. This may seem inefficient,
+	// but in practice this was observed to take constant time, regardless of
+	// volume size used, at least on linux with ext4 and zfs. All invocations
+	// take 10 microseconds or less.
+	pacerInfo.freeBytes = d.calculateDiskAvailableBytes()
+	d.mu.Lock()
+	pacerInfo.obsoleteBytes = d.mu.versions.metrics.Table.ObsoleteSize
+	total := d.mu.versions.metrics.Total()
+	d.mu.Unlock()
+	pacerInfo.liveBytes = uint64(total.AggregateSize())
+	return pacerInfo
+}
+
+// scanObsoleteFiles scans the filesystem for files that are no longer needed
+// and adds those to the internal lists of obsolete files. Note that the files
+// are not actually deleted by this method. A subsequent call to
+// deleteObsoleteFiles must be performed. Must not be called concurrently
+// with compactions and flushes. db.mu must be held when calling this function.
+func (d *DB) scanObsoleteFiles(list []string, flushableIngests []*ingestedFlushable) {
+	// Disable automatic compactions temporarily to avoid concurrent compactions /
+	// flushes from interfering. The original value is restored on completion.
+ disabledPrev := d.opts.DisableAutomaticCompactions + defer func() { + d.opts.DisableAutomaticCompactions = disabledPrev + }() + d.opts.DisableAutomaticCompactions = true + + // Wait for any ongoing compaction to complete before continuing. + for d.mu.compact.compactingCount > 0 || d.mu.compact.downloadingCount > 0 || d.mu.compact.flushing { + d.mu.compact.cond.Wait() + } + + liveFileNums := make(map[base.DiskFileNum]struct{}) + d.mu.versions.addLiveFileNums(liveFileNums) + // Protect against files which are only referred to by the ingestedFlushable + // from being deleted. These are added to the flushable queue on WAL replay + // and handle their own obsoletion/deletion. We exclude them from this obsolete + // file scan to avoid double-deleting these files. + for _, f := range flushableIngests { + for _, file := range f.files { + liveFileNums[file.TableBacking.DiskFileNum] = struct{}{} + } + } + + manifestFileNum := d.mu.versions.manifestFileNum + + var obsoleteTables []obsoleteFile + var obsoleteBlobs []obsoleteFile + var obsoleteOptions []obsoleteFile + var obsoleteManifests []obsoleteFile + + for _, filename := range list { + fileType, diskFileNum, ok := base.ParseFilename(d.opts.FS, filename) + if !ok { + continue + } + makeObsoleteFile := func() obsoleteFile { + of := obsoleteFile{ + fileType: fileType, + fs: d.opts.FS, + path: d.opts.FS.PathJoin(d.dirname, filename), + fileNum: diskFileNum, + isLocal: true, + } + if stat, err := d.opts.FS.Stat(filename); err == nil { + of.fileSize = uint64(stat.Size()) + } + return of + } + switch fileType { + case base.FileTypeManifest: + if diskFileNum >= manifestFileNum { + continue + } + obsoleteManifests = append(obsoleteManifests, makeObsoleteFile()) + case base.FileTypeOptions: + if diskFileNum >= d.optionsFileNum { + continue + } + obsoleteOptions = append(obsoleteOptions, makeObsoleteFile()) + case base.FileTypeTable, base.FileTypeBlob: + // Objects are handled through the objstorage provider below. 
+ default: + // Don't delete files we don't know about. + } + } + + objects := d.objProvider.List() + for _, obj := range objects { + if _, ok := liveFileNums[obj.DiskFileNum]; ok { + continue + } + if obj.FileType != base.FileTypeTable && obj.FileType != base.FileTypeBlob { + // Ignore object types we don't know about. + continue + } + of := obsoleteFile{ + fileType: obj.FileType, + fs: d.opts.FS, + path: base.MakeFilepath(d.opts.FS, d.dirname, obj.FileType, obj.DiskFileNum), + fileNum: obj.DiskFileNum, + isLocal: true, + } + if size, err := d.objProvider.Size(obj); err == nil { + of.fileSize = uint64(size) + } + if obj.FileType == base.FileTypeTable { + obsoleteTables = append(obsoleteTables, of) + } else { + obsoleteBlobs = append(obsoleteBlobs, of) + } + } + + d.mu.versions.obsoleteTables = mergeObsoleteFiles(d.mu.versions.obsoleteTables, obsoleteTables) + d.mu.versions.obsoleteBlobs = mergeObsoleteFiles(d.mu.versions.obsoleteBlobs, obsoleteBlobs) + d.mu.versions.obsoleteManifests = mergeObsoleteFiles(d.mu.versions.obsoleteManifests, obsoleteManifests) + d.mu.versions.obsoleteOptions = mergeObsoleteFiles(d.mu.versions.obsoleteOptions, obsoleteOptions) + d.mu.versions.updateObsoleteObjectMetricsLocked() +} + +// disableFileDeletions disables file deletions and then waits for any +// in-progress deletion to finish. The caller is required to call +// enableFileDeletions in order to enable file deletions again. It is ok for +// multiple callers to disable file deletions simultaneously, though they must +// all invoke enableFileDeletions in order for file deletions to be re-enabled +// (there is an internal reference count on file deletion disablement). +// +// d.mu must be held when calling this method. +func (d *DB) disableFileDeletions() { + d.mu.fileDeletions.disableCount++ + d.mu.Unlock() + defer d.mu.Lock() + d.cleanupManager.Wait() +} + +// enableFileDeletions enables previously disabled file deletions. A cleanup job +// is queued if necessary. 
+//
+// d.mu must be held when calling this method.
+func (d *DB) enableFileDeletions() {
+	if d.mu.fileDeletions.disableCount <= 0 {
+		panic("pebble: file deletion disablement invariant violated")
+	}
+	d.mu.fileDeletions.disableCount--
+	if d.mu.fileDeletions.disableCount > 0 {
+		return
+	}
+	d.deleteObsoleteFiles(d.newJobIDLocked())
+}
+
+type fileInfo = base.FileInfo
+
+// deleteObsoleteFiles enqueues a cleanup job to the cleanup manager, if necessary.
+//
+// d.mu must be held when calling this. The function will release and re-acquire the mutex.
+//
+// Does nothing if file deletions are disabled (see disableFileDeletions). A
+// cleanup job will be scheduled when file deletions are re-enabled.
+func (d *DB) deleteObsoleteFiles(jobID JobID) {
+	if d.mu.fileDeletions.disableCount > 0 {
+		return
+	}
+	_, noRecycle := d.opts.Cleaner.(base.NeedsFileContents)
+
+	// NB: d.mu.versions.minUnflushedLogNum is the log number of the earliest
+	// log that has not had its contents flushed to an sstable.
+	obsoleteLogs, err := d.mu.log.manager.Obsolete(wal.NumWAL(d.mu.versions.minUnflushedLogNum), noRecycle)
+	if err != nil {
+		panic(err)
+	}
+
+	obsoleteTables := slices.Clone(d.mu.versions.obsoleteTables)
+	d.mu.versions.obsoleteTables = d.mu.versions.obsoleteTables[:0]
+	obsoleteBlobs := slices.Clone(d.mu.versions.obsoleteBlobs)
+	d.mu.versions.obsoleteBlobs = d.mu.versions.obsoleteBlobs[:0]
+
+	// Ensure everything is already sorted. We want determinism for testing, and
+	// we need the manifests to be sorted because we want to delete some
+	// contiguous prefix of the older manifests.
+ if invariants.Enabled { + switch { + case !slices.IsSortedFunc(d.mu.versions.obsoleteManifests, cmpObsoleteFileNumbers): + d.opts.Logger.Fatalf("obsoleteManifests is not sorted") + case !slices.IsSortedFunc(d.mu.versions.obsoleteOptions, cmpObsoleteFileNumbers): + d.opts.Logger.Fatalf("obsoleteOptions is not sorted") + case !slices.IsSortedFunc(obsoleteTables, cmpObsoleteFileNumbers): + d.opts.Logger.Fatalf("obsoleteTables is not sorted") + case !slices.IsSortedFunc(obsoleteBlobs, cmpObsoleteFileNumbers): + d.opts.Logger.Fatalf("obsoleteBlobs is not sorted") + } + } + + var obsoleteManifests []obsoleteFile + manifestsToDelete := len(d.mu.versions.obsoleteManifests) - d.opts.NumPrevManifest + if manifestsToDelete > 0 { + obsoleteManifests = d.mu.versions.obsoleteManifests[:manifestsToDelete] + d.mu.versions.obsoleteManifests = d.mu.versions.obsoleteManifests[manifestsToDelete:] + if len(d.mu.versions.obsoleteManifests) == 0 { + d.mu.versions.obsoleteManifests = nil + } + } + + obsoleteOptions := d.mu.versions.obsoleteOptions + d.mu.versions.obsoleteOptions = nil + + // Compute the stats for the files being queued for deletion and add them to + // the running total. These stats will be used during DB.Metrics() to + // calculate the count and size of pending obsolete files by diffing these + // stats and the stats reported by the cleanup manager. + var objectStats obsoleteObjectStats + objectStats.tablesAll, objectStats.tablesLocal = calculateObsoleteObjectStats(obsoleteTables) + objectStats.blobFilesAll, objectStats.blobFilesLocal = calculateObsoleteObjectStats(obsoleteBlobs) + d.mu.fileDeletions.queuedStats.Add(objectStats) + d.mu.versions.updateObsoleteObjectMetricsLocked() + + // Release d.mu while preparing the cleanup job and possibly waiting. + // Note the unusual order: Unlock and then Lock. 
+ d.mu.Unlock() + defer d.mu.Lock() + + n := len(obsoleteLogs) + len(obsoleteTables) + len(obsoleteBlobs) + len(obsoleteManifests) + len(obsoleteOptions) + filesToDelete := make([]obsoleteFile, 0, n) + filesToDelete = append(filesToDelete, obsoleteManifests...) + filesToDelete = append(filesToDelete, obsoleteOptions...) + filesToDelete = append(filesToDelete, obsoleteTables...) + filesToDelete = append(filesToDelete, obsoleteBlobs...) + for _, f := range obsoleteLogs { + filesToDelete = append(filesToDelete, obsoleteFile{ + fileType: base.FileTypeLog, + fs: f.FS, + path: f.Path, + fileNum: base.DiskFileNum(f.NumWAL), + fileSize: f.ApproxFileSize, + isLocal: true, + }) + } + for _, f := range obsoleteTables { + d.fileCache.Evict(f.fileNum, base.FileTypeTable) + } + for _, f := range obsoleteBlobs { + d.fileCache.Evict(f.fileNum, base.FileTypeBlob) + } + if len(filesToDelete) > 0 { + d.cleanupManager.EnqueueJob(jobID, filesToDelete, objectStats) + } + if d.opts.private.testingAlwaysWaitForCleanup { + d.cleanupManager.Wait() + } +} + +func (d *DB) maybeScheduleObsoleteObjectDeletion() { + d.mu.Lock() + defer d.mu.Unlock() + if len(d.mu.versions.obsoleteTables) > 0 || len(d.mu.versions.obsoleteBlobs) > 0 { + d.deleteObsoleteFiles(d.newJobIDLocked()) + } +} + +func mergeObsoleteFiles(a, b []obsoleteFile) []obsoleteFile { + if len(b) == 0 { + return a + } + + a = append(a, b...) + slices.SortFunc(a, cmpObsoleteFileNumbers) + return slices.CompactFunc(a, func(a, b obsoleteFile) bool { + return a.fileNum == b.fileNum + }) +} + +func cmpObsoleteFileNumbers(a, b obsoleteFile) int { + return cmp.Compare(a.fileNum, b.fileNum) +} + +// objectInfo describes an object in object storage (either a sstable or a blob +// file). 
+type objectInfo struct { + fileInfo + isLocal bool +} + +func (o objectInfo) asObsoleteFile(fs vfs.FS, fileType base.FileType, dirname string) obsoleteFile { + return obsoleteFile{ + fileType: fileType, + fs: fs, + path: base.MakeFilepath(fs, dirname, fileType, o.FileNum), + fileNum: o.FileNum, + fileSize: o.FileSize, + isLocal: o.isLocal, + } +} + +func calculateObsoleteObjectStats(files []obsoleteFile) (total, local countAndSize) { + for _, of := range files { + if of.isLocal { + local.count++ + local.size += of.fileSize + } + total.count++ + total.size += of.fileSize + } + return total, local +} + +type obsoleteObjectStats struct { + tablesLocal countAndSize + tablesAll countAndSize + blobFilesLocal countAndSize + blobFilesAll countAndSize +} + +func (s *obsoleteObjectStats) Add(other obsoleteObjectStats) { + s.tablesLocal.Add(other.tablesLocal) + s.tablesAll.Add(other.tablesAll) + s.blobFilesLocal.Add(other.blobFilesLocal) + s.blobFilesAll.Add(other.blobFilesAll) +} + +func (s *obsoleteObjectStats) Sub(other obsoleteObjectStats) { + s.tablesLocal.Sub(other.tablesLocal) + s.tablesAll.Sub(other.tablesAll) + s.blobFilesLocal.Sub(other.blobFilesLocal) + s.blobFilesAll.Sub(other.blobFilesAll) +} + +type countAndSize struct { + count uint64 + size uint64 +} + +func (c *countAndSize) Add(other countAndSize) { + c.count += other.count + c.size += other.size +} + +func (c *countAndSize) Sub(other countAndSize) { + c.count = invariants.SafeSub(c.count, other.count) + c.size = invariants.SafeSub(c.size, other.size) +} + +func makeZombieObjects() zombieObjects { + return zombieObjects{ + objs: make(map[base.DiskFileNum]objectInfo), + } +} + +// zombieObjects tracks a set of objects that are no longer required by the most +// recent version of the LSM, but may still need to be accessed by an open +// iterator. Such objects are 'dead,' but cannot be deleted until iterators that +// may access them are closed. 
+type zombieObjects struct { + objs map[base.DiskFileNum]objectInfo + totalSize uint64 + localSize uint64 + localCount uint64 +} + +// Add adds an object to the set of zombie objects. +func (z *zombieObjects) Add(obj objectInfo) { + if _, ok := z.objs[obj.FileNum]; ok { + panic(errors.AssertionFailedf("zombie object %s already exists", obj.FileNum)) + } + z.objs[obj.FileNum] = obj + z.totalSize += obj.FileSize + if obj.isLocal { + z.localSize += obj.FileSize + z.localCount++ + } +} + +// AddMetadata is like Add, but takes an ObjectMetadata and the object's size. +func (z *zombieObjects) AddMetadata(meta *objstorage.ObjectMetadata, size uint64) { + z.Add(objectInfo{ + fileInfo: fileInfo{ + FileNum: meta.DiskFileNum, + FileSize: size, + }, + isLocal: !meta.IsRemote(), + }) +} + +// Count returns the number of zombie objects. +func (z *zombieObjects) Count() int { + return len(z.objs) +} + +// Extract removes an object from the set of zombie objects, returning the +// object that was removed. +func (z *zombieObjects) Extract(fileNum base.DiskFileNum) objectInfo { + obj, ok := z.objs[fileNum] + if !ok { + panic(errors.AssertionFailedf("zombie object %s not found", fileNum)) + } + delete(z.objs, fileNum) + + // Detect underflow in case we have a bug that causes an object's size to be + // mutated. + if z.totalSize < obj.FileSize { + panic(errors.AssertionFailedf("zombie object %s size %d is greater than total size %d", fileNum, obj.FileSize, z.totalSize)) + } + if obj.isLocal && z.localSize < obj.FileSize { + panic(errors.AssertionFailedf("zombie object %s size %d is greater than local size %d", fileNum, obj.FileSize, z.localSize)) + } + + z.totalSize -= obj.FileSize + if obj.isLocal { + z.localSize -= obj.FileSize + z.localCount-- + } + return obj +} + +// TotalSize returns the size of all objects in the set. +func (z *zombieObjects) TotalSize() uint64 { + return z.totalSize +} + +// LocalStats returns the count and size of all local objects in the set. 
+func (z *zombieObjects) LocalStats() (count uint64, size uint64) { + return z.localCount, z.localSize +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/open.go b/vendor/github.com/cockroachdb/pebble/v2/open.go new file mode 100644 index 0000000..3ad6e90 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/open.go @@ -0,0 +1,1319 @@ +// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package pebble + +import ( + "bytes" + "context" + "encoding/binary" + "fmt" + "io" + "math" + "os" + "slices" + "sync/atomic" + "time" + + "github.com/cockroachdb/crlib/crtime" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/errors/oserror" + "github.com/cockroachdb/pebble/v2/batchrepr" + "github.com/cockroachdb/pebble/v2/internal/arenaskl" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/cache" + "github.com/cockroachdb/pebble/v2/internal/constants" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/manual" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider" + "github.com/cockroachdb/pebble/v2/objstorage/remote" + "github.com/cockroachdb/pebble/v2/record" + "github.com/cockroachdb/pebble/v2/vfs" + "github.com/cockroachdb/pebble/v2/wal" + "github.com/cockroachdb/redact" + "github.com/prometheus/client_golang/prometheus" +) + +const ( + initialMemTableSize = 256 << 10 // 256 KB + + // The max batch size is limited by the uint32 offsets stored in + // internal/batchskl.node, DeferredBatchOp, and flushableBatchEntry. + // + // We limit the size to MaxUint32 (just short of 4GB) so that the exclusive + // end of an allocation fits in uint32. 
+	//
+	// On 32-bit systems, slices are naturally limited to MaxInt (just short of
+	// 2GB).
+	maxBatchSize = constants.MaxUint32OrInt
+
+	// The max memtable size is limited by the uint32 offsets stored in
+	// internal/arenaskl.node, DeferredBatchOp, and flushableBatchEntry.
+	//
+	// We limit the size to MaxUint32 (just short of 4GB) so that the exclusive
+	// end of an allocation fits in uint32.
+	//
+	// On 32-bit systems, slices are naturally limited to MaxInt (just short of
+	// 2GB).
+	maxMemTableSize = constants.MaxUint32OrInt
+)
+
+// FileCacheSize can be used to determine the file
+// cache size for a single db, given the maximum open
+// files which can be used by a file cache which is
+// only used by a single db.
+func FileCacheSize(maxOpenFiles int) int {
+	fileCacheSize := maxOpenFiles - numNonFileCacheFiles
+	if fileCacheSize < minFileCacheSize {
+		fileCacheSize = minFileCacheSize
+	}
+	return fileCacheSize
+}
+
+// Open opens a DB whose files live in the given directory.
+//
+// IsCorruptionError() can be used to determine if the error is caused by on-disk
+// corruption.
+func Open(dirname string, opts *Options) (db *DB, err error) {
+	// Make a copy of the options so that we don't mutate the passed in options.
+	opts = opts.Clone()
+	opts.EnsureDefaults()
+	if opts.Experimental.CompactionScheduler == nil {
+		opts.Experimental.CompactionScheduler = newConcurrencyLimitScheduler(defaultTimeSource{})
+	}
+	if err := opts.Validate(); err != nil {
+		return nil, err
+	}
+	if opts.LoggerAndTracer == nil {
+		opts.LoggerAndTracer = &base.LoggerWithNoopTracer{Logger: opts.Logger}
+	} else {
+		opts.Logger = opts.LoggerAndTracer
+	}
+
+	if invariants.Sometimes(5) {
+		assertComparer := base.MakeAssertComparer(*opts.Comparer)
+		opts.Comparer = &assertComparer
+	}
+
+	// In all error cases, we return db = nil; this is used by various
+	// deferred cleanups.
+
+	// Open the database and WAL directories first.
+ walDirname, dataDir, err := prepareAndOpenDirs(dirname, opts) + if err != nil { + return nil, errors.Wrapf(err, "error opening database at %q", dirname) + } + defer func() { + if db == nil { + dataDir.Close() + } + }() + + // Lock the database directory. + var fileLock *Lock + if opts.Lock != nil { + // The caller already acquired the database lock. Ensure that the + // directory matches. + if err := opts.Lock.pathMatches(dirname); err != nil { + return nil, err + } + if err := opts.Lock.refForOpen(); err != nil { + return nil, err + } + fileLock = opts.Lock + } else { + fileLock, err = LockDirectory(dirname, opts.FS) + if err != nil { + return nil, err + } + } + defer func() { + if db == nil { + _ = fileLock.Close() + } + }() + + // List the directory contents. This also happens to include WAL log files, if + // they are in the same dir, but we will ignore those below. The provider is + // also given this list, but it ignores non sstable files. + ls, err := opts.FS.List(dirname) + if err != nil { + return nil, err + } + + // Establish the format major version. + formatVersion, formatVersionMarker, err := lookupFormatMajorVersion(opts.FS, dirname, ls) + if err != nil { + return nil, err + } + defer func() { + if db == nil { + _ = formatVersionMarker.Close() + } + }() + + noFormatVersionMarker := formatVersion == FormatDefault + if noFormatVersionMarker { + // We will initialize the store at the minimum possible format, then upgrade + // the format to the desired one. This helps test the format upgrade code. + formatVersion = FormatMinSupported + if opts.Experimental.CreateOnShared != remote.CreateOnSharedNone { + formatVersion = FormatMinForSharedObjects + } + // There is no format version marker file. There are three cases: + // - we are trying to open an existing store that was created at + // FormatMostCompatible (the only one without a version marker file) + // - we are creating a new store; + // - we are retrying a failed creation. 
+	//
+	// To error in the first case, we set ErrorIfNotPristine.
+	opts.ErrorIfNotPristine = true
+	defer func() {
+		if err != nil && errors.Is(err, ErrDBNotPristine) {
+			// We must be trying to open an existing store at FormatMostCompatible.
+			// Correct the error in this case.
+			err = errors.Newf(
+				"pebble: database %q written in format major version 1 which is no longer supported",
+				dirname)
+		}
+	}()
+	}
+
+	// Find the currently active manifest, if there is one.
+	manifestMarker, manifestFileNum, manifestExists, err := findCurrentManifest(opts.FS, dirname, ls)
+	if err != nil {
+		return nil, errors.Wrapf(err, "pebble: database %q", dirname)
+	}
+	defer func() {
+		if db == nil {
+			_ = manifestMarker.Close()
+		}
+	}()
+
+	// Atomic markers may leave behind obsolete files if there's a crash
+	// mid-update. Clean these up if we're not in read-only mode.
+	if !opts.ReadOnly {
+		if err := formatVersionMarker.RemoveObsolete(); err != nil {
+			return nil, err
+		}
+		if err := manifestMarker.RemoveObsolete(); err != nil {
+			return nil, err
+		}
+	}
+
+	if opts.Cache == nil {
+		opts.Cache = cache.New(opts.CacheSize)
+		defer opts.Cache.Unref()
+	}
+
+	d := &DB{
+		cacheHandle:         opts.Cache.NewHandle(),
+		dirname:             dirname,
+		opts:                opts,
+		cmp:                 opts.Comparer.Compare,
+		equal:               opts.Comparer.Equal,
+		merge:               opts.Merger.Merge,
+		split:               opts.Comparer.Split,
+		abbreviatedKey:      opts.Comparer.AbbreviatedKey,
+		largeBatchThreshold: (opts.MemTableSize - uint64(memTableEmptySize)) / 2,
+		fileLock:            fileLock,
+		dataDir:             dataDir,
+		closed:              new(atomic.Value),
+		closedCh:            make(chan struct{}),
+	}
+	d.mu.versions = &versionSet{}
+	d.diskAvailBytes.Store(math.MaxUint64)
+	d.problemSpans.Init(manifest.NumLevels, opts.Comparer.Compare)
+
+	defer func() {
+		// If an error or panic occurs during open, attempt to release the manually
+		// allocated memory resources. Note that rather than look for an error, we
+		// look for the return of a nil DB pointer.
+ if r := recover(); db == nil { + // If there's an unused, recycled memtable, we need to release its memory. + if obsoleteMemTable := d.memTableRecycle.Swap(nil); obsoleteMemTable != nil { + d.freeMemTable(obsoleteMemTable) + } + + if d.fileCache != nil { + _ = d.fileCache.Close() + } + d.cacheHandle.Close() + + for _, mem := range d.mu.mem.queue { + switch t := mem.flushable.(type) { + case *memTable: + manual.Free(manual.MemTable, t.arenaBuf) + t.arenaBuf = manual.Buf{} + } + } + if d.cleanupManager != nil { + d.cleanupManager.Close() + } + if d.objProvider != nil { + _ = d.objProvider.Close() + } + if r != nil { + panic(r) + } + } + }() + + d.commit = newCommitPipeline(commitEnv{ + logSeqNum: &d.mu.versions.logSeqNum, + visibleSeqNum: &d.mu.versions.visibleSeqNum, + apply: d.commitApply, + write: d.commitWrite, + }) + d.mu.nextJobID = 1 + d.mu.mem.nextSize = opts.MemTableSize + if d.mu.mem.nextSize > initialMemTableSize { + d.mu.mem.nextSize = initialMemTableSize + } + d.mu.compact.cond.L = &d.mu.Mutex + d.mu.compact.inProgress = make(map[compaction]struct{}) + d.mu.compact.noOngoingFlushStartTime = crtime.NowMono() + d.mu.snapshots.init() + // logSeqNum is the next sequence number that will be assigned. + // Start assigning sequence numbers from base.SeqNumStart to leave + // room for reserved sequence numbers (see comments around + // SeqNumStart). 
+ d.mu.versions.logSeqNum.Store(base.SeqNumStart) + d.mu.formatVers.vers.Store(uint64(formatVersion)) + d.mu.formatVers.marker = formatVersionMarker + + d.timeNow = time.Now + d.openedAt = d.timeNow() + + d.mu.Lock() + defer d.mu.Unlock() + + jobID := d.newJobIDLocked() + + providerSettings := opts.MakeObjStorageProviderSettings(dirname) + providerSettings.FSDirInitialListing = ls + d.objProvider, err = objstorageprovider.Open(providerSettings) + if err != nil { + return nil, err + } + + blobRewriteHeuristic := manifest.BlobRewriteHeuristic{ + CurrentTime: d.timeNow, + MinimumAge: opts.Experimental.ValueSeparationPolicy().RewriteMinimumAge, + } + + if !manifestExists { + // DB does not exist. + if d.opts.ErrorIfNotExists || d.opts.ReadOnly { + return nil, errors.Wrapf(ErrDBDoesNotExist, "dirname=%q", dirname) + } + + // Create the DB. + if err := d.mu.versions.create( + jobID, dirname, d.objProvider, opts, manifestMarker, d.FormatMajorVersion, blobRewriteHeuristic, &d.mu.Mutex); err != nil { + return nil, err + } + } else { + if opts.ErrorIfExists { + return nil, errors.Wrapf(ErrDBAlreadyExists, "dirname=%q", dirname) + } + // Load the version set. + if err := d.mu.versions.load( + dirname, d.objProvider, opts, manifestFileNum, manifestMarker, d.FormatMajorVersion, blobRewriteHeuristic, &d.mu.Mutex); err != nil { + return nil, err + } + if opts.ErrorIfNotPristine { + liveFileNums := make(map[base.DiskFileNum]struct{}) + d.mu.versions.addLiveFileNums(liveFileNums) + if len(liveFileNums) != 0 { + return nil, errors.Wrapf(ErrDBNotPristine, "dirname=%q", dirname) + } + } + } + + // In read-only mode, we replay directly into the mutable memtable but never + // flush it. We need to delay creation of the memtable until we know the + // sequence number of the first batch that will be inserted. 
+ if !d.opts.ReadOnly { + var entry *flushableEntry + d.mu.mem.mutable, entry = d.newMemTable(0 /* logNum */, d.mu.versions.logSeqNum.Load(), 0 /* minSize */) + d.mu.mem.queue = append(d.mu.mem.queue, entry) + } + + d.mu.log.metrics.fsyncLatency = prometheus.NewHistogram(prometheus.HistogramOpts{ + Buckets: FsyncLatencyBuckets, + }) + walOpts := wal.Options{ + Primary: wal.Dir{FS: opts.FS, Dirname: walDirname}, + Secondary: wal.Dir{}, + MinUnflushedWALNum: wal.NumWAL(d.mu.versions.minUnflushedLogNum), + MaxNumRecyclableLogs: opts.MemTableStopWritesThreshold + 1, + NoSyncOnClose: opts.NoSyncOnClose, + BytesPerSync: opts.WALBytesPerSync, + PreallocateSize: d.walPreallocateSize, + MinSyncInterval: opts.WALMinSyncInterval, + FsyncLatency: d.mu.log.metrics.fsyncLatency, + QueueSemChan: d.commit.logSyncQSem, + Logger: opts.Logger, + EventListener: walEventListenerAdaptor{l: opts.EventListener}, + WriteWALSyncOffsets: func() bool { return d.FormatMajorVersion() >= FormatWALSyncChunks }, + } + if opts.WALFailover != nil { + walOpts.Secondary = opts.WALFailover.Secondary + walOpts.Secondary.Dirname = resolveStorePath(dirname, walOpts.Secondary.Dirname) + walOpts.FailoverOptions = opts.WALFailover.FailoverOptions + walOpts.FailoverWriteAndSyncLatency = prometheus.NewHistogram(prometheus.HistogramOpts{ + Buckets: FsyncLatencyBuckets, + }) + } + walDirs := walOpts.Dirs() + for _, dir := range opts.WALRecoveryDirs { + dir.Dirname = resolveStorePath(dirname, dir.Dirname) + walDirs = append(walDirs, dir) + } + wals, err := wal.Scan(walDirs...) 
+ if err != nil { + return nil, err + } + d.opts.Logger.Infof("Found %d WALs", redact.Safe(len(wals))) + for i := range wals { + d.opts.Logger.Infof(" - %s", wals[i]) + } + walManager, err := wal.Init(walOpts, wals) + if err != nil { + return nil, err + } + defer func() { + if db == nil { + _ = walManager.Close() + } + }() + + d.mu.log.manager = walManager + + d.cleanupManager = openCleanupManager(opts, d.objProvider, d.getDeletionPacerInfo) + + if manifestExists && !opts.DisableConsistencyCheck { + curVersion := d.mu.versions.currentVersion() + if err := checkConsistency(curVersion, d.objProvider); err != nil { + return nil, err + } + } + + fileCacheSize := FileCacheSize(opts.MaxOpenFiles) + if opts.FileCache == nil { + opts.FileCache = NewFileCache(opts.Experimental.FileCacheShards, fileCacheSize) + defer opts.FileCache.Unref() + } + d.fileCache = opts.FileCache.newHandle(d.cacheHandle, d.objProvider, d.opts.LoggerAndTracer, d.opts.MakeReaderOptions(), d.reportCorruption) + d.newIters = d.fileCache.newIters + d.tableNewRangeKeyIter = tableNewRangeKeyIter(d.newIters) + + d.mu.annotators.totalFileSize = d.makeFileSizeAnnotator(func(f *manifest.TableMetadata) bool { + return true + }) + d.mu.annotators.remoteSize = d.makeFileSizeAnnotator(func(f *manifest.TableMetadata) bool { + meta, err := d.objProvider.Lookup(base.FileTypeTable, f.TableBacking.DiskFileNum) + if err != nil { + return false + } + return meta.IsRemote() + }) + d.mu.annotators.externalSize = d.makeFileSizeAnnotator(func(f *manifest.TableMetadata) bool { + meta, err := d.objProvider.Lookup(base.FileTypeTable, f.TableBacking.DiskFileNum) + if err != nil { + return false + } + return meta.IsRemote() && meta.Remote.CleanupMethod == objstorage.SharedNoCleanup + }) + + var previousOptionsFileNum base.DiskFileNum + var previousOptionsFilename string + for _, filename := range ls { + ft, fn, ok := base.ParseFilename(opts.FS, filename) + if !ok { + continue + } + + // Don't reuse any obsolete file numbers to 
avoid modifying an + // ingested sstable's original external file. + d.mu.versions.markFileNumUsed(fn) + + switch ft { + case base.FileTypeLog: + // Ignore. + case base.FileTypeOptions: + if previousOptionsFileNum < fn { + previousOptionsFileNum = fn + previousOptionsFilename = filename + } + case base.FileTypeTemp, base.FileTypeOldTemp: + if !d.opts.ReadOnly { + // Some codepaths write to a temporary file and then + // rename it to its final location when complete. A + // temp file is leftover if a process exits before the + // rename. Remove it. + err := opts.FS.Remove(opts.FS.PathJoin(dirname, filename)) + if err != nil { + return nil, err + } + } + } + } + if n := len(wals); n > 0 { + // Don't reuse any obsolete file numbers to avoid modifying an + // ingested sstable's original external file. + d.mu.versions.markFileNumUsed(base.DiskFileNum(wals[n-1].Num)) + } + + // Ratchet d.mu.versions.nextFileNum ahead of all known objects in the + // objProvider. This avoids FileNum collisions with obsolete sstables. + objects := d.objProvider.List() + for _, obj := range objects { + d.mu.versions.markFileNumUsed(obj.DiskFileNum) + } + + // Validate the most-recent OPTIONS file, if there is one. + if previousOptionsFilename != "" { + path := opts.FS.PathJoin(dirname, previousOptionsFilename) + previousOptions, err := readOptionsFile(opts, path) + if err != nil { + return nil, err + } + if err := opts.CheckCompatibility(dirname, previousOptions); err != nil { + return nil, err + } + } + + // Replay any newer log files than the ones named in the manifest. + var replayWALs wal.Logs + for i, w := range wals { + if base.DiskFileNum(w.Num) >= d.mu.versions.minUnflushedLogNum { + replayWALs = wals[i:] + break + } + } + var flushableIngests []*ingestedFlushable + for i, lf := range replayWALs { + // WALs other than the last one would have been closed cleanly. 
+ // + // Note: we used to never require strict WAL tails when reading from older + // versions: RocksDB 6.2.1 and the version of Pebble included in CockroachDB + // 20.1 do not guarantee that closed WALs end cleanly. But the earliest + // compatible Pebble format is newer and guarantees a clean EOF. + strictWALTail := i < len(replayWALs)-1 + fi, maxSeqNum, err := d.replayWAL(jobID, lf, strictWALTail) + if err != nil { + return nil, err + } + if len(fi) > 0 { + flushableIngests = append(flushableIngests, fi...) + } + if d.mu.versions.logSeqNum.Load() < maxSeqNum { + d.mu.versions.logSeqNum.Store(maxSeqNum) + } + } + if d.mu.mem.mutable == nil { + // Recreate the mutable memtable if replayWAL got rid of it. + var entry *flushableEntry + d.mu.mem.mutable, entry = d.newMemTable(d.mu.versions.getNextDiskFileNum(), d.mu.versions.logSeqNum.Load(), 0 /* minSize */) + d.mu.mem.queue = append(d.mu.mem.queue, entry) + } + d.mu.versions.visibleSeqNum.Store(d.mu.versions.logSeqNum.Load()) + + // Register with the CompactionScheduler before calling + // d.maybeScheduleFlush, since completion of the flush can trigger + // compactions. + d.opts.Experimental.CompactionScheduler.Register(2, d) + if !d.opts.ReadOnly { + d.maybeScheduleFlush() + for d.mu.compact.flushing { + d.mu.compact.cond.Wait() + } + + // Create an empty .log file for the mutable memtable. + newLogNum := d.mu.versions.getNextDiskFileNum() + d.mu.log.writer, err = d.mu.log.manager.Create(wal.NumWAL(newLogNum), int(jobID)) + if err != nil { + return nil, err + } + + // This isn't strictly necessary as we don't use the log number for + // memtables being flushed, only for the next unflushed memtable. + d.mu.mem.queue[len(d.mu.mem.queue)-1].logNum = newLogNum + } + d.updateReadStateLocked(d.opts.DebugCheck) + + if !d.opts.ReadOnly { + // If the Options specify a format major version higher than the + // loaded database's, upgrade it. 
If this is a new database, this + // code path also performs an initial upgrade from the starting + // implicit MinSupported version. + // + // We ratchet the version this far into Open so that migrations have a read + // state available. Note that this also results in creating/updating the + // format version marker file. + if opts.FormatMajorVersion > d.FormatMajorVersion() { + if err := d.ratchetFormatMajorVersionLocked(opts.FormatMajorVersion); err != nil { + return nil, err + } + } else if noFormatVersionMarker { + // We are creating a new store. Create the format version marker file. + if err := d.writeFormatVersionMarker(d.FormatMajorVersion()); err != nil { + return nil, err + } + } + + // Write the current options to disk. + d.optionsFileNum = d.mu.versions.getNextDiskFileNum() + tmpPath := base.MakeFilepath(opts.FS, dirname, base.FileTypeTemp, d.optionsFileNum) + optionsPath := base.MakeFilepath(opts.FS, dirname, base.FileTypeOptions, d.optionsFileNum) + + // Write them to a temporary file first, in case we crash before + // we're done. A corrupt options file prevents opening the + // database. + optionsFile, err := opts.FS.Create(tmpPath, vfs.WriteCategoryUnspecified) + if err != nil { + return nil, err + } + serializedOpts := []byte(opts.String()) + if _, err := optionsFile.Write(serializedOpts); err != nil { + return nil, errors.CombineErrors(err, optionsFile.Close()) + } + d.optionsFileSize = uint64(len(serializedOpts)) + if err := optionsFile.Sync(); err != nil { + return nil, errors.CombineErrors(err, optionsFile.Close()) + } + if err := optionsFile.Close(); err != nil { + return nil, err + } + // Atomically rename to the OPTIONS-XXXXXX path. This rename is + // guaranteed to be atomic because the destination path does not + // exist. 
+ if err := opts.FS.Rename(tmpPath, optionsPath); err != nil {
+ return nil, err
+ }
+ if err := d.dataDir.Sync(); err != nil {
+ return nil, err
+ }
+ }
+
+ if !d.opts.ReadOnly {
+ // Get a fresh list of files, in case some of the earlier flushes/compactions
+ // have deleted some files.
+ ls, err := opts.FS.List(dirname)
+ if err != nil {
+ return nil, err
+ }
+ d.scanObsoleteFiles(ls, flushableIngests)
+ d.deleteObsoleteFiles(jobID)
+ }
+ // Else, nothing is obsolete.
+
+ d.mu.tableStats.cond.L = &d.mu.Mutex
+ d.mu.tableValidation.cond.L = &d.mu.Mutex
+ if !d.opts.ReadOnly {
+ d.maybeCollectTableStatsLocked()
+ }
+ d.calculateDiskAvailableBytes()
+
+ d.maybeScheduleFlush()
+ d.maybeScheduleCompaction()
+
+ // Note: this is a no-op if invariants are disabled or race is enabled.
+ //
+ // Setting a finalizer on *DB causes *DB to never be reclaimed and the
+ // finalizer to never be run. The problem is due to this limitation of
+ // finalizers mentioned in the SetFinalizer docs:
+ //
+ // If a cyclic structure includes a block with a finalizer, that cycle is
+ // not guaranteed to be garbage collected and the finalizer is not
+ // guaranteed to run, because there is no ordering that respects the
+ // dependencies.
+ //
+ // DB has cycles with several of its internal structures: readState,
+ // newIters, fileCache, versions, etc. Each of these individually causes a
+ // cycle and prevents the finalizer from being run. But we can work around this
+ // finalizer limitation by setting a finalizer on another object that is
+ // tied to the lifetime of DB: the DB.closed atomic.Value.
+ dPtr := fmt.Sprintf("%p", d)
+ invariants.SetFinalizer(d.closed, func(obj interface{}) {
+ v := obj.(*atomic.Value)
+ if err := v.Load(); err == nil {
+ fmt.Fprintf(os.Stderr, "%s: unreferenced DB not closed\n", dPtr)
+ os.Exit(1)
+ }
+ })
+
+ return d, nil
+}
+
+// prepareAndOpenDirs opens the directories for the store (and creates them if
+// necessary). 
+// +// Returns an error if ReadOnly is set and the directories don't exist. +func prepareAndOpenDirs( + dirname string, opts *Options, +) (walDirname string, dataDir vfs.File, err error) { + walDirname = dirname + if opts.WALDir != "" { + walDirname = resolveStorePath(dirname, opts.WALDir) + } + + // Create directories if needed. + if !opts.ReadOnly { + f, err := mkdirAllAndSyncParents(opts.FS, dirname) + if err != nil { + return "", nil, err + } + f.Close() + if walDirname != dirname { + f, err := mkdirAllAndSyncParents(opts.FS, walDirname) + if err != nil { + return "", nil, err + } + f.Close() + } + if opts.WALFailover != nil { + secondary := opts.WALFailover.Secondary + f, err := mkdirAllAndSyncParents(secondary.FS, resolveStorePath(dirname, secondary.Dirname)) + if err != nil { + return "", nil, err + } + f.Close() + } + } + + dataDir, err = opts.FS.OpenDir(dirname) + if err != nil { + if opts.ReadOnly && oserror.IsNotExist(err) { + return "", nil, errors.Errorf("pebble: database %q does not exist", dirname) + } + return "", nil, err + } + if opts.ReadOnly && walDirname != dirname { + // Check that the wal dir exists. + walDir, err := opts.FS.OpenDir(walDirname) + if err != nil { + dataDir.Close() + return "", nil, err + } + walDir.Close() + } + + return walDirname, dataDir, nil +} + +// GetVersion returns the engine version string from the latest options +// file present in dir. Used to check what Pebble or RocksDB version was last +// used to write to the database stored in this directory. An empty string is +// returned if no valid OPTIONS file with a version key was found. 
+func GetVersion(dir string, fs vfs.FS) (string, error) { + ls, err := fs.List(dir) + if err != nil { + return "", err + } + var version string + lastOptionsSeen := base.DiskFileNum(0) + for _, filename := range ls { + ft, fn, ok := base.ParseFilename(fs, filename) + if !ok { + continue + } + switch ft { + case base.FileTypeOptions: + // If this file has a higher number than the last options file + // processed, reset version. This is because rocksdb often + // writes multiple options files without deleting previous ones. + // Otherwise, skip parsing this options file. + if fn > lastOptionsSeen { + version = "" + lastOptionsSeen = fn + } else { + continue + } + f, err := fs.Open(fs.PathJoin(dir, filename)) + if err != nil { + return "", err + } + data, err := io.ReadAll(f) + f.Close() + + if err != nil { + return "", err + } + err = parseOptions(string(data), parseOptionsFuncs{ + visitKeyValue: func(i, j int, section, key, value string) error { + switch { + case section == "Version": + switch key { + case "pebble_version": + version = value + case "rocksdb_version": + version = fmt.Sprintf("rocksdb v%s", value) + } + } + return nil + }, + }) + if err != nil { + return "", err + } + } + } + return version, nil +} + +func (d *DB) replayIngestedFlushable( + b *Batch, logNum base.DiskFileNum, +) (entry *flushableEntry, err error) { + br := b.Reader() + seqNum := b.SeqNum() + + fileNums := make([]base.DiskFileNum, 0, b.Count()) + var exciseSpan KeyRange + addFileNum := func(encodedFileNum []byte) { + fileNum, n := binary.Uvarint(encodedFileNum) + if n <= 0 { + panic("pebble: ingest sstable file num is invalid") + } + fileNums = append(fileNums, base.DiskFileNum(fileNum)) + } + + for i := 0; i < int(b.Count()); i++ { + kind, key, val, ok, err := br.Next() + if err != nil { + return nil, err + } + if kind != InternalKeyKindIngestSST && kind != InternalKeyKindExcise { + panic("pebble: invalid batch key kind") + } + if !ok { + panic("pebble: invalid batch count") + } + if 
kind == base.InternalKeyKindExcise { + if exciseSpan.Valid() { + panic("pebble: multiple excise spans in a single batch") + } + exciseSpan.Start = slices.Clone(key) + exciseSpan.End = slices.Clone(val) + continue + } + addFileNum(key) + } + + if _, _, _, ok, err := br.Next(); err != nil { + return nil, err + } else if ok { + panic("pebble: invalid number of entries in batch") + } + + meta := make([]*manifest.TableMetadata, len(fileNums)) + var lastRangeKey keyspan.Span + for i, n := range fileNums { + readable, err := d.objProvider.OpenForReading(context.TODO(), base.FileTypeTable, n, + objstorage.OpenOptions{MustExist: true}) + if err != nil { + return nil, errors.Wrap(err, "pebble: error when opening flushable ingest files") + } + // NB: ingestLoad1 will close readable. + meta[i], lastRangeKey, err = ingestLoad1(context.TODO(), d.opts, d.FormatMajorVersion(), + readable, d.cacheHandle, base.PhysicalTableFileNum(n), disableRangeKeyChecks()) + if err != nil { + return nil, errors.Wrap(err, "pebble: error when loading flushable ingest files") + } + } + if lastRangeKey.Valid() && d.opts.Comparer.Split.HasSuffix(lastRangeKey.End) { + return nil, errors.AssertionFailedf("pebble: last ingest sstable has suffixed range key end %s", + d.opts.Comparer.FormatKey(lastRangeKey.End)) + } + + numFiles := len(meta) + if exciseSpan.Valid() { + numFiles++ + } + if uint32(numFiles) != b.Count() { + panic("pebble: couldn't load all files in WAL entry") + } + + return d.newIngestedFlushableEntry(meta, seqNum, logNum, exciseSpan) +} + +// replayWAL replays the edits in the specified WAL. If the DB is in read +// only mode, then the WALs are replayed into memtables and not flushed. If +// the DB is not in read only mode, then the contents of the WAL are +// guaranteed to be flushed when a flush is scheduled after this method is run. 
+// Note that this flushing is very important for guaranteeing durability: +// the application may have had a number of pending +// fsyncs to the WAL before the process crashed, and those fsyncs may not have +// happened but the corresponding data may now be readable from the WAL (while +// sitting in write-back caches in the kernel or the storage device). By +// reading the WAL (including the non-fsynced data) and then flushing all +// these changes (flush does fsyncs), we are able to guarantee that the +// initial state of the DB is durable. +// +// This method mutates d.mu.mem.queue and possibly d.mu.mem.mutable and replays +// WALs into the flushable queue. Flushing of the queue is expected to be handled +// by callers. A list of flushable ingests (but not memtables) replayed is returned. +// +// d.mu must be held when calling this, but the mutex may be dropped and +// re-acquired during the course of this method. +func (d *DB) replayWAL( + jobID JobID, ll wal.LogicalLog, strictWALTail bool, +) (flushableIngests []*ingestedFlushable, maxSeqNum base.SeqNum, err error) { + rr := ll.OpenForRead() + defer func() { _ = rr.Close() }() + var ( + b Batch + buf bytes.Buffer + mem *memTable + entry *flushableEntry + offset wal.Offset + lastFlushOffset int64 + keysReplayed int64 // number of keys replayed + batchesReplayed int64 // number of batches replayed + ) + + // TODO(jackson): This function is interspersed with panics, in addition to + // corruption error propagation. Audit them to ensure we're truly only + // panicking where the error points to Pebble bug and not user or + // hardware-induced corruption. + + // "Flushes" (ie. closes off) the current memtable, if not nil. 
+ flushMem := func() { + if mem == nil { + return + } + mem.writerUnref() + if d.mu.mem.mutable == mem { + d.mu.mem.mutable = nil + } + entry.flushForced = !d.opts.ReadOnly + var logSize uint64 + mergedOffset := offset.Physical + offset.PreviousFilesBytes + if mergedOffset >= lastFlushOffset { + logSize = uint64(mergedOffset - lastFlushOffset) + } + // Else, this was the initial memtable in the read-only case which must have + // been empty, but we need to flush it since we don't want to add to it later. + lastFlushOffset = mergedOffset + entry.logSize = logSize + mem, entry = nil, nil + } + + mem = d.mu.mem.mutable + if mem != nil { + entry = d.mu.mem.queue[len(d.mu.mem.queue)-1] + if !d.opts.ReadOnly { + flushMem() + } + } + + // Creates a new memtable if there is no current memtable. + ensureMem := func(seqNum base.SeqNum) { + if mem != nil { + return + } + mem, entry = d.newMemTable(base.DiskFileNum(ll.Num), seqNum, 0 /* minSize */) + d.mu.mem.mutable = mem + d.mu.mem.queue = append(d.mu.mem.queue, entry) + } + + defer func() { + if err != nil { + err = errors.WithDetailf(err, "replaying wal %d, offset %s", ll.Num, offset) + } + }() + + for { + var r io.Reader + var err error + r, offset, err = rr.NextRecord() + if err == nil { + _, err = io.Copy(&buf, r) + } + if err != nil { + // It is common to encounter a zeroed or invalid chunk due to WAL + // preallocation and WAL recycling. However zeroed or invalid chunks + // can also be a consequence of corruption / disk rot. When the log + // reader encounters one of these cases, it attempts to disambiguate + // by reading ahead looking for a future record. If a future chunk + // indicates the chunk at the original offset should've been valid, it + // surfaces record.ErrInvalidChunk or record.ErrZeroedChunk. These + // errors are always indicative of corruption and data loss. + // + // Otherwise, the reader surfaces record.ErrUnexpectedEOF indicating + // that the WAL terminated uncleanly and ambiguously. 
If the WAL is + // the most recent logical WAL, the caller passes in + // (strictWALTail=false), indicating we should tolerate the unclean + // ending. If the WAL is an older WAL, the caller passes in + // (strictWALTail=true), indicating that the WAL should have been + // closed cleanly, and we should interpret the + // `record.ErrUnexpectedEOF` as corruption and stop recovery. + if errors.Is(err, io.EOF) { + break + } else if errors.Is(err, record.ErrUnexpectedEOF) && !strictWALTail { + break + } else if (errors.Is(err, record.ErrUnexpectedEOF) && strictWALTail) || + errors.Is(err, record.ErrInvalidChunk) || errors.Is(err, record.ErrZeroedChunk) { + // If a read-ahead returns record.ErrInvalidChunk or + // record.ErrZeroedChunk, then there's definitively corruption. + // + // If strictWALTail=true, then record.ErrUnexpectedEOF should + // also be considered corruption because the strictWALTail + // indicates we expect a clean end to the WAL. + // + // Other I/O related errors should not be marked with corruption + // and simply returned. + err = errors.Mark(err, ErrCorruption) + } + + return nil, 0, errors.Wrap(err, "pebble: error when replaying WAL") + } + + if buf.Len() < batchrepr.HeaderLen { + return nil, 0, base.CorruptionErrorf("pebble: corrupt wal %s (offset %s)", + errors.Safe(base.DiskFileNum(ll.Num)), offset) + } + + if d.opts.ErrorIfNotPristine { + return nil, 0, errors.WithDetailf(ErrDBNotPristine, "location: %q", d.dirname) + } + + // Specify Batch.db so that Batch.SetRepr will compute Batch.memTableSize + // which is used below. 
+ b = Batch{} + b.db = d + if err := b.SetRepr(buf.Bytes()); err != nil { + return nil, 0, err + } + seqNum := b.SeqNum() + maxSeqNum = seqNum + base.SeqNum(b.Count()) + keysReplayed += int64(b.Count()) + batchesReplayed++ + { + br := b.Reader() + if kind, _, _, ok, err := br.Next(); err != nil { + return nil, 0, err + } else if ok && (kind == InternalKeyKindIngestSST || kind == InternalKeyKindExcise) { + // We're in the flushable ingests (+ possibly excises) case. + // + // Ingests require an up-to-date view of the LSM to determine the target + // level of ingested sstables, and to accurately compute excises. Instead of + // doing an ingest in this function, we just enqueue a flushable ingest + // in the flushables queue and run a regular flush. + flushMem() + // mem is nil here. + entry, err = d.replayIngestedFlushable(&b, base.DiskFileNum(ll.Num)) + if err != nil { + return nil, 0, err + } + fi := entry.flushable.(*ingestedFlushable) + flushableIngests = append(flushableIngests, fi) + d.mu.mem.queue = append(d.mu.mem.queue, entry) + // A flushable ingest is always followed by a WAL rotation. + break + } + } + + if b.memTableSize >= uint64(d.largeBatchThreshold) { + flushMem() + // Make a copy of the data slice since it is currently owned by buf and will + // be reused in the next iteration. + b.data = slices.Clone(b.data) + b.flushable, err = newFlushableBatch(&b, d.opts.Comparer) + if err != nil { + return nil, 0, err + } + entry := d.newFlushableEntry(b.flushable, base.DiskFileNum(ll.Num), b.SeqNum()) + // Disable memory accounting by adding a reader ref that will never be + // removed. 
+ entry.readerRefs.Add(1) + d.mu.mem.queue = append(d.mu.mem.queue, entry) + } else { + ensureMem(seqNum) + if err = mem.prepare(&b); err != nil && err != arenaskl.ErrArenaFull { + return nil, 0, err + } + // We loop since DB.newMemTable() slowly grows the size of allocated memtables, so the + // batch may not initially fit, but will eventually fit (since it is smaller than + // largeBatchThreshold). + for err == arenaskl.ErrArenaFull { + flushMem() + ensureMem(seqNum) + err = mem.prepare(&b) + if err != nil && err != arenaskl.ErrArenaFull { + return nil, 0, err + } + } + if err = mem.apply(&b, seqNum); err != nil { + return nil, 0, err + } + mem.writerUnref() + } + buf.Reset() + } + + d.opts.Logger.Infof("[JOB %d] WAL %s stopped reading at offset: %s; replayed %d keys in %d batches", + jobID, ll.String(), offset, keysReplayed, batchesReplayed) + if !d.opts.ReadOnly { + flushMem() + } + + // mem is nil here, if !ReadOnly. + return flushableIngests, maxSeqNum, err +} + +func readOptionsFile(opts *Options, path string) (string, error) { + f, err := opts.FS.Open(path) + if err != nil { + return "", err + } + defer f.Close() + + data, err := io.ReadAll(f) + if err != nil { + return "", err + } + return string(data), nil +} + +// DBDesc briefly describes high-level state about a database. +type DBDesc struct { + // Exists is true if an existing database was found. + Exists bool + // FormatMajorVersion indicates the database's current format + // version. + FormatMajorVersion FormatMajorVersion + // ManifestFilename is the filename of the current active manifest, + // if the database exists. + ManifestFilename string + // OptionsFilename is the filename of the most recent OPTIONS file, if it + // exists. + OptionsFilename string +} + +// String implements fmt.Stringer. 
+func (d *DBDesc) String() string { + if !d.Exists { + return "uninitialized" + } + var buf bytes.Buffer + fmt.Fprintf(&buf, "initialized at format major version %s\n", d.FormatMajorVersion) + fmt.Fprintf(&buf, "manifest: %s\n", d.ManifestFilename) + fmt.Fprintf(&buf, "options: %s", d.OptionsFilename) + return buf.String() +} + +// Peek looks for an existing database in dirname on the provided FS. It +// returns a brief description of the database. Peek is read-only and +// does not open the database +func Peek(dirname string, fs vfs.FS) (*DBDesc, error) { + ls, err := fs.List(dirname) + if err != nil { + return nil, err + } + + vers, versMarker, err := lookupFormatMajorVersion(fs, dirname, ls) + if err != nil { + return nil, err + } + // TODO(jackson): Immediately closing the marker is clunky. Add a + // PeekMarker variant that avoids opening the directory. + if err := versMarker.Close(); err != nil { + return nil, err + } + + // Find the currently active manifest, if there is one. + manifestMarker, manifestFileNum, exists, err := findCurrentManifest(fs, dirname, ls) + if err != nil { + return nil, err + } + // TODO(jackson): Immediately closing the marker is clunky. Add a + // PeekMarker variant that avoids opening the directory. + if err := manifestMarker.Close(); err != nil { + return nil, err + } + + desc := &DBDesc{ + Exists: exists, + FormatMajorVersion: vers, + } + + // Find the OPTIONS file with the highest file number within the list of + // directory entries. 
+ var previousOptionsFileNum base.DiskFileNum
+ for _, filename := range ls {
+ ft, fn, ok := base.ParseFilename(fs, filename)
+ if !ok || ft != base.FileTypeOptions || fn < previousOptionsFileNum {
+ continue
+ }
+ previousOptionsFileNum = fn
+ desc.OptionsFilename = fs.PathJoin(dirname, filename)
+ }
+
+ if exists {
+ desc.ManifestFilename = base.MakeFilepath(fs, dirname, base.FileTypeManifest, manifestFileNum)
+ }
+ return desc, nil
+}
+
+// LockDirectory acquires the database directory lock in the named directory,
+// preventing another process from opening the database. LockDirectory returns a
+// handle to the held lock that may be passed to Open through Options.Lock to
+// subsequently open the database, skipping lock acquisition during Open.
+//
+// LockDirectory may be used to expand the critical section protected by the
+// database lock to include setup before the call to Open.
+func LockDirectory(dirname string, fs vfs.FS) (*Lock, error) {
+ fileLock, err := fs.Lock(base.MakeFilepath(fs, dirname, base.FileTypeLock, base.DiskFileNum(0)))
+ if err != nil {
+ return nil, err
+ }
+ l := &Lock{dirname: dirname, fileLock: fileLock}
+ l.refs.Store(1)
+ invariants.SetFinalizer(l, func(obj interface{}) {
+ if refs := obj.(*Lock).refs.Load(); refs > 0 {
+ panic(errors.AssertionFailedf("lock for %q finalized with %d refs", dirname, refs))
+ }
+ })
+ return l, nil
+}
+
+// Lock represents a file lock on a directory. It may be passed to Open through
+// Options.Lock to elide lock acquisition during Open.
+type Lock struct {
+ dirname string
+ fileLock io.Closer
+ // refs is a count of the number of handles on the lock. refs must be 0, 1
+ // or 2.
+ //
+ // When acquired by the client and passed to Open, refs = 1 and the Open
+ // call increments it to 2. When the database is closed, it's decremented to
+ // 1. Finally, when the original caller calls Close on the Lock, it's
+ // decremented to zero and the underlying file lock is released. 
+ // + // When Open acquires the file lock, refs remains at 1 until the database is + // closed. + refs atomic.Int32 +} + +func (l *Lock) refForOpen() error { + // During Open, when a user passed in a lock, the reference count must be + // exactly 1. If it's zero, the lock is no longer held and is invalid. If + // it's 2, the lock is already in use by another database within the + // process. + if !l.refs.CompareAndSwap(1, 2) { + return errors.Errorf("pebble: unexpected Lock reference count; is the lock already in use?") + } + return nil +} + +// Close releases the lock, permitting another process to lock and open the +// database. Close must not be called until after a database using the Lock has +// been closed. +func (l *Lock) Close() error { + if l.refs.Add(-1) > 0 { + return nil + } + defer func() { l.fileLock = nil }() + return l.fileLock.Close() +} + +func (l *Lock) pathMatches(dirname string) error { + if dirname == l.dirname { + return nil + } + // Check for relative paths, symlinks, etc. This isn't ideal because we're + // circumventing the vfs.FS interface here. + // + // TODO(jackson): We could add support for retrieving file inodes through Stat + // calls in the VFS interface on platforms where it's available and use that + // to differentiate. + dirStat, err1 := os.Stat(dirname) + lockDirStat, err2 := os.Stat(l.dirname) + if err1 == nil && err2 == nil && os.SameFile(dirStat, lockDirStat) { + return nil + } + return errors.Join( + errors.Newf("pebble: opts.Lock acquired in %q not %q", l.dirname, dirname), + err1, err2) +} + +// ErrDBDoesNotExist is generated when ErrorIfNotExists is set and the database +// does not exist. +// +// Note that errors can be wrapped with more details; use errors.Is(). +var ErrDBDoesNotExist = errors.New("pebble: database does not exist") + +// ErrDBAlreadyExists is generated when ErrorIfExists is set and the database +// already exists. +// +// Note that errors can be wrapped with more details; use errors.Is(). 
+var ErrDBAlreadyExists = errors.New("pebble: database already exists") + +// ErrDBNotPristine is generated when ErrorIfNotPristine is set and the database +// already exists and is not pristine. +// +// Note that errors can be wrapped with more details; use errors.Is(). +var ErrDBNotPristine = errors.New("pebble: database already exists and is not pristine") + +func checkConsistency(v *manifest.Version, objProvider objstorage.Provider) error { + var errs []error + dedup := make(map[base.DiskFileNum]struct{}) + for level, files := range v.Levels { + for f := range files.All() { + backingState := f.TableBacking + if _, ok := dedup[backingState.DiskFileNum]; ok { + continue + } + dedup[backingState.DiskFileNum] = struct{}{} + fileNum := backingState.DiskFileNum + fileSize := backingState.Size + // We skip over remote objects; those are instead checked asynchronously + // by the table stats loading job. + meta, err := objProvider.Lookup(base.FileTypeTable, fileNum) + var size int64 + if err == nil { + if meta.IsRemote() { + continue + } + size, err = objProvider.Size(meta) + } + if err != nil { + errs = append(errs, errors.Wrapf(err, "L%d: %s", errors.Safe(level), fileNum)) + continue + } + + if size != int64(fileSize) { + errs = append(errs, errors.Errorf( + "L%d: %s: object size mismatch (%s): %d (disk) != %d (MANIFEST)", + errors.Safe(level), fileNum, objProvider.Path(meta), + errors.Safe(size), errors.Safe(fileSize))) + continue + } + } + } + return errors.Join(errs...) +} + +type walEventListenerAdaptor struct { + l *EventListener +} + +func (l walEventListenerAdaptor) LogCreated(ci wal.CreateInfo) { + // TODO(sumeer): extend WALCreateInfo for the failover case in case the path + // is insufficient to infer whether primary or secondary. 
+ wci := WALCreateInfo{ + JobID: ci.JobID, + Path: ci.Path, + FileNum: base.DiskFileNum(ci.Num), + RecycledFileNum: ci.RecycledFileNum, + Err: ci.Err, + } + l.l.WALCreated(wci) +} diff --git a/vendor/github.com/cockroachdb/pebble/options.go b/vendor/github.com/cockroachdb/pebble/v2/options.go similarity index 51% rename from vendor/github.com/cockroachdb/pebble/options.go rename to vendor/github.com/cockroachdb/pebble/v2/options.go index b6f240d..dd598c4 100644 --- a/vendor/github.com/cockroachdb/pebble/options.go +++ b/vendor/github.com/cockroachdb/pebble/v2/options.go @@ -8,23 +8,32 @@ import ( "bytes" "fmt" "io" + "regexp" "runtime" + "sort" "strconv" "strings" "time" + "unicode" + "github.com/cockroachdb/crlib/fifo" "github.com/cockroachdb/errors" - "github.com/cockroachdb/fifo" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/cache" - "github.com/cockroachdb/pebble/internal/humanize" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider" - "github.com/cockroachdb/pebble/objstorage/remote" - "github.com/cockroachdb/pebble/rangekey" - "github.com/cockroachdb/pebble/sstable" - "github.com/cockroachdb/pebble/vfs" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/cache" + "github.com/cockroachdb/pebble/v2/internal/humanize" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/testkeys" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider" + "github.com/cockroachdb/pebble/v2/objstorage/remote" + "github.com/cockroachdb/pebble/v2/rangekey" + "github.com/cockroachdb/pebble/v2/sstable" + "github.com/cockroachdb/pebble/v2/sstable/blob" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/sstable/colblk" + 
"github.com/cockroachdb/pebble/v2/vfs" + "github.com/cockroachdb/pebble/v2/wal" + "github.com/cockroachdb/redact" ) const ( @@ -32,17 +41,6 @@ const ( defaultLevelMultiplier = 10 ) -// Compression exports the base.Compression type. -type Compression = sstable.Compression - -// Exported Compression constants. -const ( - DefaultCompression = sstable.DefaultCompression - NoCompression = sstable.NoCompression - SnappyCompression = sstable.SnappyCompression - ZstdCompression = sstable.ZstdCompression -) - // FilterType exports the base.FilterType type. type FilterType = base.FilterType @@ -57,8 +55,10 @@ type FilterWriter = base.FilterWriter // FilterPolicy exports the base.FilterPolicy type. type FilterPolicy = base.FilterPolicy -// TablePropertyCollector exports the sstable.TablePropertyCollector type. -type TablePropertyCollector = sstable.TablePropertyCollector +var NoFilterPolicy = base.NoFilterPolicy + +// KeySchema exports the colblk.KeySchema type. +type KeySchema = colblk.KeySchema // BlockPropertyCollector exports the sstable.BlockPropertyCollector type. type BlockPropertyCollector = sstable.BlockPropertyCollector @@ -116,11 +116,6 @@ type IterOptions struct { // boundary the iterator will return Valid()==false. Setting UpperBound // effectively truncates the key space visible to the iterator. UpperBound []byte - // TableFilter can be used to filter the tables that are scanned during - // iteration based on the user properties. Return true to scan the table and - // false to skip scanning. This function must be thread-safe since the same - // function can be used by multiple iterators, if the iterator is cloned. - TableFilter func(userProps map[string]string) bool // SkipPoint may be used to skip over point keys that don't match an // arbitrary predicate during iteration. If set, the Iterator invokes // SkipPoint for keys encountered. 
If SkipPoint returns true, the iterator @@ -188,19 +183,24 @@ type IterOptions struct { // existing is not low or if we just expect a one-time Seek (where loading the // data block directly is better). UseL6Filters bool + // Category is used for categorized iterator stats. This should not be + // changed by calling SetOptions. + Category block.Category + + DebugRangeKeyStack bool // Internal options. logger Logger - // Level corresponding to this file. Only passed in if constructed by a + // Layer corresponding to this file. Only passed in if constructed by a // levelIter. - level manifest.Level + layer manifest.Layer // disableLazyCombinedIteration is an internal testing option. disableLazyCombinedIteration bool // snapshotForHideObsoletePoints is specified for/by levelIter when opening // files and is used to decide whether to hide obsolete points. A value of 0 // implies obsolete points should not be hidden. - snapshotForHideObsoletePoints uint64 + snapshotForHideObsoletePoints base.SeqNum // NB: If adding new Options, you must account for them in iterator // construction and Iterator.SetOptions. @@ -258,14 +258,13 @@ func (o *IterOptions) SpanIterOptions() keyspan.SpanIterOptions { type scanInternalOptions struct { IterOptions - visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error - visitRangeDel func(start, end []byte, seqNum uint64) error - visitRangeKey func(start, end []byte, keys []rangekey.Key) error - visitSharedFile func(sst *SharedSSTMeta) error + category block.Category - // skipSharedLevels skips levels that are shareable (level >= - // sharedLevelStart). 
- skipSharedLevels bool + visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error + visitRangeDel func(start, end []byte, seqNum SeqNum) error + visitRangeKey func(start, end []byte, keys []rangekey.Key) error + visitSharedFile func(sst *SharedSSTMeta) error + visitExternalFile func(sst *ExternalFile) error // includeObsoleteKeys specifies whether keys shadowed by newer internal keys // are exposed. If false, only one internal key per user key is exposed. @@ -384,25 +383,31 @@ type LevelOptions struct { // BlockRestartInterval is the number of keys between restart points // for delta encoding of keys. // - // The default value is 16. + // The default value is 16 for L0, and the value from the previous level for + // all other levels. BlockRestartInterval int // BlockSize is the target uncompressed size in bytes of each table block. // - // The default value is 4096. + // The default value is 4096 for L0, and the value from the previous level for + // all other levels. BlockSize int // BlockSizeThreshold finishes a block if the block size is larger than the // specified percentage of the target block size and adding the next entry // would cause the block to be larger than the target block size. // - // The default value is 90 + // The default value is 90 for L0, and the value from the previous level for + // all other levels. BlockSizeThreshold int // Compression defines the per-block compression to use. // - // The default value (DefaultCompression) uses snappy compression. - Compression Compression + // The default value is Snappy for L0, or the function from the previous level + // for all other levels. + // + // ApplyCompressionSettings can be used to initialize this field for all levels. + Compression func() *sstable.CompressionProfile // FilterPolicy defines a filter algorithm (such as a Bloom filter) that can // reduce disk reads for Get calls. 
@@ -410,16 +415,12 @@ type LevelOptions struct { // One such implementation is bloom.FilterPolicy(10) from the pebble/bloom // package. // - // The default value means to use no filter. + // The default value for L0 is NoFilterPolicy (no filter), and the value from + // the previous level for all other levels. FilterPolicy FilterPolicy - // FilterType defines whether an existing filter policy is applied at a - // block-level or table-level. Block-level filters use less memory to create, - // but are slower to access as a check for the key in the index must first be - // performed to locate the filter block. A table-level filter will require - // memory proportional to the number of keys in an sstable to create, but - // avoids the index lookup when determining if a key is present. Table-level - // filters should be preferred except under constrained memory situations. + // FilterType is a legacy field. The default and only possible value is + // TableFilter. FilterType FilterType // IndexBlockSize is the target uncompressed size in bytes of each index @@ -428,41 +429,59 @@ type LevelOptions struct { // (such as math.MaxInt32) disables the automatic creation of two-level // indexes. // - // The default value is the value of BlockSize. + // The default value is the value of BlockSize for L0, or the value from the + // previous level for all other levels. IndexBlockSize int - - // The target file size for the level. - TargetFileSize int64 } -// EnsureDefaults ensures that the default values for all of the options have -// been initialized. It is valid to call EnsureDefaults on a nil receiver. A -// non-nil result will always be returned. -func (o *LevelOptions) EnsureDefaults() *LevelOptions { - if o == nil { - o = &LevelOptions{} - } +// EnsureL0Defaults ensures that the L0 default values for the options have been +// initialized. 
+func (o *LevelOptions) EnsureL0Defaults() { if o.BlockRestartInterval <= 0 { o.BlockRestartInterval = base.DefaultBlockRestartInterval } if o.BlockSize <= 0 { o.BlockSize = base.DefaultBlockSize - } else if o.BlockSize > sstable.MaximumBlockSize { - panic(errors.Errorf("BlockSize %d exceeds MaximumBlockSize", o.BlockSize)) + } else if o.BlockSize > sstable.MaximumRestartOffset { + panic(errors.Errorf("BlockSize %d exceeds MaximumRestartOffset", o.BlockSize)) } if o.BlockSizeThreshold <= 0 { o.BlockSizeThreshold = base.DefaultBlockSizeThreshold } - if o.Compression <= DefaultCompression || o.Compression >= sstable.NCompression { - o.Compression = SnappyCompression + if o.Compression == nil { + o.Compression = func() *sstable.CompressionProfile { return sstable.SnappyCompression } + } + if o.FilterPolicy == nil { + o.FilterPolicy = NoFilterPolicy } if o.IndexBlockSize <= 0 { o.IndexBlockSize = o.BlockSize } - if o.TargetFileSize <= 0 { - o.TargetFileSize = 2 << 20 // 2 MB +} + +// EnsureL1PlusDefaults ensures that the L1+ default values for the options have +// been initialized. Requires the fully initialized options for the level above. +func (o *LevelOptions) EnsureL1PlusDefaults(previousLevel *LevelOptions) { + if o.BlockRestartInterval <= 0 { + o.BlockRestartInterval = previousLevel.BlockRestartInterval + } + if o.BlockSize <= 0 { + o.BlockSize = previousLevel.BlockSize + } else if o.BlockSize > sstable.MaximumRestartOffset { + panic(errors.Errorf("BlockSize %d exceeds MaximumRestartOffset", o.BlockSize)) + } + if o.BlockSizeThreshold <= 0 { + o.BlockSizeThreshold = previousLevel.BlockSizeThreshold + } + if o.Compression == nil { + o.Compression = previousLevel.Compression + } + if o.FilterPolicy == nil { + o.FilterPolicy = previousLevel.FilterPolicy + } + if o.IndexBlockSize <= 0 { + o.IndexBlockSize = previousLevel.IndexBlockSize } - return o } // Options holds the optional parameters for configuring pebble. 
These options @@ -478,10 +497,11 @@ type Options struct { // The default value is 512KB. BytesPerSync int - // Cache is used to cache uncompressed blocks from sstables. - // - // The default cache size is 8 MB. + // Cache is used to cache uncompressed blocks from sstables. If it is nil, + // a block cache of CacheSize will be created for each DB. Cache *cache.Cache + // CacheSize is used when Cache is not set. The default value is 8 MB. + CacheSize int64 // LoadBlockSema, if set, is used to limit the number of blocks that can be // loaded (i.e. read from the filesystem) in parallel. Each load acquires one @@ -495,9 +515,9 @@ type Options struct { // Local contains option that pertain to files stored on the local filesystem. Local struct { - // ReadaheadConfigFn is a function used to retrieve the current readahead - // mode. This function is consulted when a table enters the table cache. - ReadaheadConfigFn func() ReadaheadConfig + // ReadaheadConfig is used to retrieve the current readahead mode; it is + // consulted whenever a read handle is initialized. + ReadaheadConfig *ReadaheadConfig // TODO(radu): move BytesPerSync, LoadBlockSema, Cleaner here. } @@ -555,7 +575,7 @@ type Options struct { // The threshold of L0 read-amplification at which compaction concurrency // is enabled (if CompactionDebtConcurrency was not already exceeded). // Every multiple of this value enables another concurrent - // compaction up to MaxConcurrentCompactions. + // compaction up to CompactionConcurrencyRange. L0CompactionConcurrency int // CompactionDebtConcurrency controls the threshold of compaction debt @@ -566,13 +586,20 @@ type Options struct { // concurrency slots as determined by the two options is chosen. CompactionDebtConcurrency uint64 + // CompactionGarbageFractionForMaxConcurrency is the fraction of garbage + // due to DELs and RANGEDELs that causes MaxConcurrentCompactions to be + // allowed. 
Concurrent compactions are allowed in a linear manner upto + // this limit being reached. A value <= 0.0 disables adding concurrency + // due to garbage. + CompactionGarbageFractionForMaxConcurrency func() float64 + // IngestSplit, if it returns true, allows for ingest-time splitting of // existing sstables into two virtual sstables to allow ingestion sstables to // slot into a lower level than they otherwise would have. IngestSplit func() bool // ReadCompactionRate controls the frequency of read triggered - // compactions by adjusting `AllowedSeeks` in manifest.FileMetadata: + // compactions by adjusting `AllowedSeeks` in manifest.TableMetadata: // // AllowedSeeks = FileSize / ReadCompactionRate // @@ -601,28 +628,47 @@ type Options struct { // gets multiplied with a constant of 1 << 16 to yield 1 << 20 (1MB). ReadSamplingMultiplier int64 - // TableCacheShards is the number of shards per table cache. + // NumDeletionsThreshold defines the minimum number of point tombstones + // that must be present in a single data block for that block to be + // considered tombstone-dense for the purposes of triggering a + // tombstone density compaction. Data blocks may also be considered + // tombstone-dense if they meet the criteria defined by + // DeletionSizeRatioThreshold below. Tombstone-dense blocks are identified + // when sstables are written, and so this is effectively an option for + // sstable writers. The default value is 100. + NumDeletionsThreshold int + + // DeletionSizeRatioThreshold defines the minimum ratio of the size of + // point tombstones to the size of a data block that must be reached + // for that block to be considered tombstone-dense for the purposes of + // triggering a tombstone density compaction. Data blocks may also be + // considered tombstone-dense if they meet the criteria defined by + // NumDeletionsThreshold above. 
Tombstone-dense blocks are identified + // when sstables are written, and so this is effectively an option for + // sstable writers. The default value is 0.5. + DeletionSizeRatioThreshold float32 + + // TombstoneDenseCompactionThreshold is the minimum percent of data + // blocks in a table that must be tombstone-dense for that table to be + // eligible for a tombstone density compaction. It should be defined as a + // ratio out of 1. The default value is 0.10. + // + // If multiple tables are eligible for a tombstone density compaction, then + // tables with a higher percent of tombstone-dense blocks are still + // prioritized for compaction. + // + // A zero or negative value disables tombstone density compactions. + TombstoneDenseCompactionThreshold float64 + + // FileCacheShards is the number of shards per file cache. // Reducing the value can reduce the number of idle goroutines per DB // instance which can be useful in scenarios with a lot of DB instances // and a large number of CPUs, but doing so can lead to higher contention - // in the table cache and reduced performance. + // in the file cache and reduced performance. // // The default value is the number of logical CPUs, which can be // limited by runtime.GOMAXPROCS. - TableCacheShards int - - // KeyValidationFunc is a function to validate a user key in an SSTable. - // - // Currently, this function is used to validate the smallest and largest - // keys in an SSTable undergoing compaction. In this case, returning an - // error from the validation function will result in a panic at runtime, - // given that there is rarely any way of recovering from malformed keys - // present in compacted files. By default, validation is not performed. - // - // Additional use-cases may be added in the future. - // - // NOTE: callers should take care to not mutate the key being validated. 
- KeyValidationFunc func(userKey []byte) error + FileCacheShards int // ValidateOnIngest schedules validation of sstables after they have // been ingested. @@ -639,23 +685,13 @@ type Options struct { // compaction will never get triggered. MultiLevelCompactionHeuristic MultiLevelHeuristic - // MaxWriterConcurrency is used to indicate the maximum number of - // compression workers the compression queue is allowed to use. If - // MaxWriterConcurrency > 0, then the Writer will use parallelism, to - // compress and write blocks to disk. Otherwise, the writer will - // compress and write blocks to disk synchronously. - MaxWriterConcurrency int - - // ForceWriterParallelism is used to force parallelism in the sstable - // Writer for the metamorphic tests. Even with the MaxWriterConcurrency - // option set, we only enable parallelism in the sstable Writer if there - // is enough CPU available, and this option bypasses that. - ForceWriterParallelism bool - - // CPUWorkPermissionGranter should be set if Pebble should be given the - // ability to optionally schedule additional CPU. See the documentation - // for CPUWorkPermissionGranter for more details. - CPUWorkPermissionGranter CPUWorkPermissionGranter + // EnableColumnarBlocks is used to decide whether to enable writing + // TableFormatPebblev5 sstables. This setting is only respected by + // FormatColumnarBlocks. In lower format major versions, the + // TableFormatPebblev5 format is prohibited. If EnableColumnarBlocks is + // nil and the DB is at FormatColumnarBlocks, the DB defaults to not + // writing columnar blocks. + EnableColumnarBlocks func() bool // EnableValueBlocks is used to decide whether to enable writing // TableFormatPebblev3 sstables. This setting is only respected by a @@ -670,29 +706,8 @@ type Options struct { // value and stored with the key, when the value is stored elsewhere. 
ShortAttributeExtractor ShortAttributeExtractor - // RequiredInPlaceValueBound specifies an optional span of user key - // prefixes that are not-MVCC, but have a suffix. For these the values - // must be stored with the key, since the concept of "older versions" is - // not defined. It is also useful for statically known exclusions to value - // separation. In CockroachDB, this will be used for the lock table key - // space that has non-empty suffixes, but those locks don't represent - // actual MVCC versions (the suffix ordering is arbitrary). We will also - // need to add support for dynamically configured exclusions (we want the - // default to be to allow Pebble to decide whether to separate the value - // or not, hence this is structured as exclusions), for example, for users - // of CockroachDB to dynamically exclude certain tables. - // - // Any change in exclusion behavior takes effect only on future written - // sstables, and does not start rewriting existing sstables. - // - // Even ignoring changes in this setting, exclusions are interpreted as a - // guidance by Pebble, and not necessarily honored. Specifically, user - // keys with multiple Pebble-versions *may* have the older versions stored - // in value blocks. - RequiredInPlaceValueBound UserKeyPrefixBound - // DisableIngestAsFlushable disables lazy ingestion of sstables through - // a WAL write and memtable rotation. Only effectual if the the format + // a WAL write and memtable rotation. Only effectual if the format // major version is at least `FormatFlushableIngest`. DisableIngestAsFlushable func() bool @@ -716,35 +731,34 @@ type Options struct { // on shared storage in bytes. If it is 0, no cache is used. SecondaryCacheSizeBytes int64 - // IneffectualPointDeleteCallback is called in compactions/flushes if any - // single delete is being elided without deleting a point set/merge. 
- IneffectualSingleDeleteCallback func(userKey []byte) + // EnableDeleteOnlyCompactionExcises enables delete-only compactions to also + // apply delete-only compaction hints on sstables that partially overlap + // with it. This application happens through an excise, similar to + // the excise phase of IngestAndExcise. + EnableDeleteOnlyCompactionExcises func() bool - // SingleDeleteInvariantViolationCallback is called in compactions/flushes if any - // single delete has consumed a Set/Merge, and there is another immediately older - // Set/SetWithDelete/Merge. The user of Pebble has violated the invariant under - // which SingleDelete can be used correctly. - // - // Consider the sequence SingleDelete#3, Set#2, Set#1. There are three - // ways some of these keys can first meet in a compaction. - // - // - All 3 keys in the same compaction: this callback will detect the - // violation. - // - // - SingleDelete#3, Set#2 meet in a compaction first: Both keys will - // disappear. The violation will not be detected, and the DB will have - // Set#1 which is likely incorrect (from the user's perspective). - // - // - Set#2, Set#1 meet in a compaction first: The output will be Set#2, - // which will later be consumed by SingleDelete#3. The violation will - // not be detected and the DB will be correct. - SingleDeleteInvariantViolationCallback func(userKey []byte) + // CompactionScheduler, if set, is used to limit concurrent compactions as + // well as to pace compactions already chosen. If nil, a default scheduler + // is created and used. + CompactionScheduler CompactionScheduler + + UserKeyCategories UserKeyCategories + + // ValueSeparationPolicy controls the policy for separating values into + // external blob files. If nil, value separation defaults to disabled. + // The value separation policy is ignored if EnableColumnarBlocks() is + // false. 
+ ValueSeparationPolicy func() ValueSeparationPolicy + + // SpanPolicyFunc is used to determine the SpanPolicy for a key region. + SpanPolicyFunc SpanPolicyFunc } // Filters is a map from filter policy name to filter policy. It is used for // debugging tools which may be used on multiple databases configured with // different filter policies. It is not necessary to populate this filters - // map during normal usage of a DB. + // map during normal usage of a DB (it will be done automatically by + // EnsureDefaults). Filters map[string]FilterPolicy // FlushDelayDeleteRange configures how long the database should wait before @@ -792,6 +806,23 @@ type Options struct { // The default value uses the underlying operating system's file system. FS vfs.FS + // KeySchema is the name of the key schema that should be used when writing + // new sstables. There must be a key schema with this name defined in + // KeySchemas. If not set, colblk.DefaultKeySchema is used to construct a + // default key schema. + KeySchema string + + // KeySchemas defines the set of known schemas of user keys. When columnar + // blocks are in use (see FormatColumnarBlocks), the user may specify how a + // key should be decomposed into columns. Each KeySchema must have a unique + // name. The schema named by Options.KeySchema is used while writing + // sstables during flushes and compactions. + // + // Multiple KeySchemas may be used over the lifetime of a database. Once a + // KeySchema is used, it must be provided in KeySchemas in subsequent calls + // to Open for perpetuity. + KeySchemas sstable.KeySchemas + // Lock, if set, must be a database lock acquired through LockDirectory for // the same directory passed to Open. If provided, Open will skip locking // the directory. Closing the database will not release the lock, and it's @@ -820,9 +851,20 @@ type Options struct { // maximum number of bytes for a level is exceeded, compaction is requested. LBaseMaxBytes int64 - // Per-level options. 
Options for at least one level must be specified. The - // options for the last level are used for all subsequent levels. - Levels []LevelOptions + // TargetFileSizes contains the target file size for each level, ignoring + // unpopulated levels. Specifically: + // - TargetFileSizes[0] is the target file size for L0; + // - TargetFileSizes[1] is the target file size for Lbase; + // - TargetFileSizes[2] is the target file size for Lbase+1; + // and so on. + // + // The default value for TargetFileSizes[0] is 2MB. + // The default value for TargetFileSizes[i] is TargetFileSizes[i-1] * 2. + TargetFileSizes [manifest.NumLevels]int64 + + // Per-level options. Levels[i] contains the options for Li (regardless of + // what Lbase is). + Levels [manifest.NumLevels]LevelOptions // LoggerAndTracer will be used, if non-nil, else Logger will be used and // tracing will be a noop. @@ -873,19 +915,78 @@ type Options struct { // The default merger concatenates values. Merger *Merger - // MaxConcurrentCompactions specifies the maximum number of concurrent - // compactions. The default is 1. Concurrent compactions are performed - // - when L0 read-amplification passes the L0CompactionConcurrency threshold - // - for automatic background compactions - // - when a manual compaction for a level is split and parallelized - // MaxConcurrentCompactions must be greater than 0. - MaxConcurrentCompactions func() int + // CompactionConcurrencyRange returns a [lower, upper] range for the number of + // compactions Pebble runs in parallel (with the caveats below), not including + // download compactions (which have a separate limit specified by + // MaxConcurrentDownloads). + // + // The lower value is the concurrency allowed under normal circumstances. + // Pebble can dynamically increase the concurrency based on heuristics (like + // high read amplification or compaction debt) up to the maximum. 
+ // + // The upper value is a rough upper bound since delete-only compactions (a) do + // not use the CompactionScheduler, and (b) the CompactionScheduler may use + // other criteria to decide on how many compactions to permit. + // + // Elaborating on (b), when the ConcurrencyLimitScheduler is being used, the + // value returned by DB.GetAllowedWithoutPermission fully controls how many + // compactions get to run. Other CompactionSchedulers may use additional + // criteria, like resource availability. + // + // Elaborating on (a), we don't use the CompactionScheduler to schedule + // delete-only compactions since they are expected to be almost free from a + // CPU and disk usage perspective. Since the CompactionScheduler does not + // know about their existence, the total running count can exceed this + // value. For example, consider CompactionConcurrencyRange returns 3, and the + // current value returned from DB.GetAllowedWithoutPermission is also 3. Say + // 3 delete-only compactions are also running. Then the + // ConcurrencyLimitScheduler can also start 3 other compactions, for a total + // of 6. + // + // DB.GetAllowedWithoutPermission returns a value in the interval + // [lower, upper]. A value > lower is returned: + // - when L0 read-amplification passes the L0CompactionConcurrency threshold; + // - when compaction debt passes the CompactionDebtConcurrency threshold; + // - when there are multiple manual compactions waiting to run. + // + // lower and upper must be greater than 0. If lower > upper, then upper is + // used for both. + // + // The default values are 1, 1. + CompactionConcurrencyRange func() (lower, upper int) + + // MaxConcurrentDownloads specifies the maximum number of download + // compactions. These are compactions that copy an external file to the local + // store. 
+ // + // This limit is independent of CompactionConcurrencyRange; at any point in + // time, we may be running CompactionConcurrencyRange non-download compactions + // and MaxConcurrentDownloads download compactions. + // + // MaxConcurrentDownloads() must be greater than 0. + // + // The default value is 1. + MaxConcurrentDownloads func() int // DisableAutomaticCompactions dictates whether automatic compactions are // scheduled or not. The default is false (enabled). This option is only used // externally when running a manual compaction, and internally for tests. DisableAutomaticCompactions bool + // DisableConsistencyCheck disables the consistency check that is performed on + // open. Should only be used when a database cannot be opened normally (e.g. + // some of the tables don't exist / aren't accessible). + DisableConsistencyCheck bool + + // DisableTableStats dictates whether tables should be loaded asynchronously + // to compute statistics that inform compaction heuristics. The collection + // of table stats improves compaction of tombstones, reclaiming disk space + // more quickly and in some cases reducing write amplification in the + // presence of tombstones. Disabling table stats may be useful in tests + // that require determinism as the asynchronicity of table stats collection + // introduces significant nondeterminism. + DisableTableStats bool + // NoSyncOnClose decides whether the Pebble instance will enforce a // close-time synchronization (e.g., fdatasync() or sync_file_range()) // on files it writes to. Setting this to true removes the guarantee for a @@ -903,18 +1004,13 @@ type Options struct { // disabled. ReadOnly bool - // TableCache is an initialized TableCache which should be set as an - // option if the DB needs to be initialized with a pre-existing table cache. - // If TableCache is nil, then a table cache which is unique to the DB instance - // is created. TableCache can be shared between db instances by setting it here. 
- // The TableCache set here must use the same underlying cache as Options.Cache + // FileCache is an initialized FileCache which should be set as an + // option if the DB needs to be initialized with a pre-existing file cache. + // If FileCache is nil, then a file cache which is unique to the DB instance + // is created. FileCache can be shared between db instances by setting it here. + // The FileCache set here must use the same underlying cache as Options.Cache // and pebble will panic otherwise. - TableCache *TableCache - - // TablePropertyCollectors is a list of TablePropertyCollector creation - // functions. A new TablePropertyCollector is created for each sstable built - // and lives for the lifetime of the table. - TablePropertyCollectors []func() TablePropertyCollector + FileCache *FileCache // BlockPropertyCollectors is a list of BlockPropertyCollector creation // functions. A new BlockPropertyCollector is created for each sstable @@ -937,6 +1033,25 @@ type Options struct { // (i.e. the directory passed to pebble.Open). WALDir string + // WALFailover may be set to configure Pebble to monitor writes to its + // write-ahead log and failover to writing write-ahead log entries to a + // secondary location (eg, a separate physical disk). WALFailover may be + // used to improve write availability in the presence of transient disk + // unavailability. + WALFailover *WALFailoverOptions + + // WALRecoveryDirs is a list of additional directories that should be + // scanned for the existence of additional write-ahead logs. WALRecoveryDirs + // is expected to be used when starting Pebble with a new WALDir or a new + // WALFailover configuration. The directories associated with the previous + // configuration may still contain WALs that are required for recovery of + // the current database state. 
+ // + // If a previous WAL configuration may have stored WALs elsewhere but there + // is not a corresponding entry in WALRecoveryDirs, Open will error (unless + // Unsafe.AllowMissingWALDirs is true). + WALRecoveryDirs []wal.Dir + // WALMinSyncInterval is the minimum duration between syncs of the WAL. If // WAL syncs are requested faster than this interval, they will be // artificially delayed. Introducing a small artificial delay (500us) between @@ -949,33 +1064,68 @@ type Options struct { // changing options dynamically? WALMinSyncInterval func() time.Duration + // The controls below manage deletion pacing, which slows down + // deletions when compactions finish or when readers close and + // obsolete files must be cleaned up. Rapid deletion of many + // files simultaneously can increase disk latency on certain + // SSDs, and this functionality helps protect against that. + // TargetByteDeletionRate is the rate (in bytes per second) at which sstable file // deletions are limited to (under normal circumstances). // - // Deletion pacing is used to slow down deletions when compactions finish up - // or readers close and newly-obsolete files need cleaning up. Deleting lots - // of files at once can cause disk latency to go up on some SSDs, which this - // functionality guards against. - // // This value is only a best-effort target; the effective rate can be // higher if deletions are falling behind or disk space is running low. // // Setting this to 0 disables deletion pacing, which is also the default. TargetByteDeletionRate int + // FreeSpaceThresholdBytes specifies the minimum amount of free disk space that Pebble + // attempts to maintain. If free disk space drops below this threshold, deletions + // are accelerated above TargetByteDeletionRate until the threshold is restored. + // Default is 16GB. 
+ FreeSpaceThresholdBytes uint64 + + // FreeSpaceTimeframe sets the duration within which Pebble attempts + // to restore the free disk space back to FreeSpaceThresholdBytes. A lower value means + // more aggressive deletions. Default is 10s. + FreeSpaceTimeframe time.Duration + + // ObsoleteBytesMaxRatio specifies the maximum allowed ratio of obsolete files to + // live files. If this ratio is exceeded, Pebble speeds up deletions above the + // TargetByteDeletionRate until the ratio is restored. Default is 0.20. + ObsoleteBytesMaxRatio float64 + + // ObsoleteBytesTimeframe sets the duration within which Pebble aims + // to restore the obsolete-to-live bytes ratio below ObsoleteBytesMaxRatio. A lower + // value means more aggressive deletions. Default is 300s. + ObsoleteBytesTimeframe time.Duration + + // EnableSQLRowSpillMetrics specifies whether the Pebble instance will only be used + // to temporarily persist data spilled to disk for row-oriented SQL query execution. + EnableSQLRowSpillMetrics bool + + // AllocatorSizeClasses provides a sorted list containing the supported size + // classes of the underlying memory allocator. This provides hints to the + // sstable block writer's flushing policy to select block sizes that + // preemptively reduce internal fragmentation when loaded into the block cache. + AllocatorSizeClasses []int + + // Unsafe contains options that must be used very carefully and in exceptional + // circumstances. + Unsafe struct { + // AllowMissingWALDirs, if set to true, allows opening a DB when the WAL or + // WAL secondary directory was changed and the previous directory is not in + // WALRecoveryDirs. This can be used to move WALs without having to keep the + // previous directory in the options forever. + // + // CAUTION: Enabling this option will lead to data loss if the missing + // directory contained any WAL files that were not flushed to sstables.
+ AllowMissingWALDirs bool + } + // private options are only used by internal tests or are used internally // for facilitating upgrade paths of unconfigurable functionality. private struct { - // strictWALTail configures whether or not a database's WALs created - // prior to the most recent one should be interpreted strictly, - // requiring a clean EOF. RocksDB 6.2.1 and the version of Pebble - // included in CockroachDB 20.1 do not guarantee that closed WALs end - // cleanly. If this option is set within an OPTIONS file, Pebble - // interprets previous WALs strictly, requiring a clean EOF. - // Otherwise, it interprets them permissively in the same manner as - // RocksDB 6.2.1. - strictWALTail bool - // disableDeleteOnlyCompactions prevents the scheduling of delete-only // compactions that drop sstables wholy covered by range tombstones or // range key tombstones. @@ -993,9 +1143,6 @@ type Options struct { // do not want to allow users to actually configure. disableLazyCombinedIteration bool - // A private option to disable stats collection. - disableTableStats bool - // testingAlwaysWaitForCleanup is set by some tests to force waiting for // obsolete file deletion (to make events deterministic). testingAlwaysWaitForCleanup bool @@ -1012,9 +1159,146 @@ type Options struct { } } +// ValueSeparationPolicy controls the policy for separating values into +// external blob files. +type ValueSeparationPolicy struct { + // Enabled controls whether value separation is enabled. + Enabled bool + // MinimumSize imposes a lower bound on the size of values that can be + // separated into a blob file. Values smaller than this are always written + // to the sstable (but may still be written to a value block within the + // sstable). + // + // MinimumSize must be > 0. + MinimumSize int + // MaxBlobReferenceDepth limits the number of potentially overlapping (in + // the keyspace) blob files that can be referenced by a single sstable. 
If a + // compaction may produce an output sstable referencing more than this many + // overlapping blob files, the compaction will instead rewrite referenced + // values into new blob files. + // + // MaxBlobReferenceDepth must be > 0. + MaxBlobReferenceDepth int + // RewriteMinimumAge specifies how old a blob file must be in order for it + // to be eligible for a rewrite that reclaims disk space. Lower values + // reduce space amplification at the cost of write amplification. + RewriteMinimumAge time.Duration + // TargetGarbageRatio is a value in the range [0, 1.0] and configures how + // aggressively blob files should be written in order to reduce space + // amplification induced by value separation. As compactions rewrite blob + // files, data may be duplicated. Older blob files containing the + // duplicated data may need to remain because other sstables are referencing + // other values contained in the same file. + // + // The DB can rewrite these blob files in place in order to reduce this + // space amplification, but this incurs write amplification. This option + // configures how much garbage may accrue before the DB will attempt to + // rewrite blob files to reduce it. A value of 0.20 indicates that once 20% + // of values in blob files are unreferenced, the DB should attempt to + // rewrite blob files to reclaim disk space. + // + // A value of 1.0 indicates that the DB should never attempt to rewrite blob + // files. + TargetGarbageRatio float64 +} + +// SpanPolicy contains policies that can vary by key range. The zero value is +// the default value. +type SpanPolicy struct { + // Prefer a faster compression algorithm for the keys in this span. + // + // This is useful for keys that are frequently read or written but which don't + // amount to a significant amount of space. + PreferFastCompression bool + + // DisableValueSeparationBySuffix disables discriminating KVs depending on + // suffix.
+ // + // Among a set of keys with the same prefix, Pebble's default heuristics + // optimize access to the KV with the smallest suffix. This is useful for MVCC + // keys (where the smallest suffix is the latest version), but should be + // disabled for keys where the suffix does not correspond to a version. + DisableValueSeparationBySuffix bool + + // ValueStoragePolicy is a hint used to determine where to store the values + // for KVs. + ValueStoragePolicy ValueStoragePolicy +} + +// ValueStoragePolicy is a hint used to determine where to store the values for +// KVs. +type ValueStoragePolicy uint8 + +const ( + // ValueStorageDefault is the default value; Pebble will respect global + // configuration for value blocks and value separation. + ValueStorageDefault ValueStoragePolicy = iota + + // ValueStorageLowReadLatency indicates Pebble should prefer storing values + // in-place. + ValueStorageLowReadLatency +) + +// SpanPolicyFunc is used to determine the SpanPolicy for a key region. +// +// The returned policy is valid from the start key until (and not including) the +// end key. +// +// A flush or compaction will call this function once for the first key to be +// output. If the compaction reaches the end key, the current output sst is +// finished and the function is called again. +// +// The end key can be empty, in which case the policy is valid for the entire +// keyspace after startKey. +type SpanPolicyFunc func(startKey []byte) (policy SpanPolicy, endKey []byte, err error) + +// MakeStaticSpanPolicyFunc returns a SpanPolicyFunc that applies a given policy +// to the given span (and the default policy outside the span). 
+func MakeStaticSpanPolicyFunc(cmp base.Compare, span KeyRange, policy SpanPolicy) SpanPolicyFunc { + return func(startKey []byte) (_ SpanPolicy, endKey []byte, _ error) { + if cmp(startKey, span.End) >= 0 { + // Start End + // v v + // -----|---------|-----|--- + // ^ + // startKey + return SpanPolicy{}, nil, nil + } + if cmp(startKey, span.Start) < 0 { + // Start End + // v v + // --|--|---------|----- + // ^ + // startKey + return SpanPolicy{}, span.Start, nil + } + // Start End + // v v + // -----|----|----|----- + // ^ + // startKey + return policy, span.End, nil + } +} + +// WALFailoverOptions configures the WAL failover mechanics to use during +// transient write unavailability on the primary WAL volume. +type WALFailoverOptions struct { + // Secondary indicates the secondary directory and VFS to use in the event a + // write to the primary WAL stalls. + Secondary wal.Dir + // FailoverOptions provides configuration of the thresholds and intervals + // involved in WAL failover. If any of its fields are left unspecified, + // reasonable defaults will be used. + wal.FailoverOptions +} + // ReadaheadConfig controls the use of read-ahead. type ReadaheadConfig = objstorageprovider.ReadaheadConfig +// JemallocSizeClasses exports sstable.JemallocSizeClasses. +var JemallocSizeClasses = sstable.JemallocSizeClasses + // DebugCheckLevels calls CheckLevels on the provided database. // It may be set in the DebugCheck field of Options to check // level invariants whenever a new version is installed. @@ -1022,21 +1306,89 @@ func DebugCheckLevels(db *DB) error { return db.CheckLevels(nil) } +// DBCompressionSettings contains compression settings for the entire store. It +// defines compression profiles for each LSM level. +type DBCompressionSettings struct { + Name string + Levels [manifest.NumLevels]*block.CompressionProfile +} + +// Predefined compression settings. 
+var ( + DBCompressionNone = UniformDBCompressionSettings(block.NoCompression) + DBCompressionFastest = UniformDBCompressionSettings(block.FastestCompression) + DBCompressionBalanced = func() DBCompressionSettings { + cs := DBCompressionSettings{Name: "Balanced"} + for i := 0; i < manifest.NumLevels-2; i++ { + cs.Levels[i] = block.FastestCompression + } + cs.Levels[manifest.NumLevels-2] = block.FastCompression // Zstd1 for value blocks. + cs.Levels[manifest.NumLevels-1] = block.BalancedCompression // Zstd1 for data and value blocks. + return cs + }() + DBCompressionGood = func() DBCompressionSettings { + cs := DBCompressionSettings{Name: "Good"} + for i := 0; i < manifest.NumLevels-2; i++ { + cs.Levels[i] = block.FastestCompression + } + cs.Levels[manifest.NumLevels-2] = block.BalancedCompression // Zstd1 for data and value blocks. + cs.Levels[manifest.NumLevels-1] = block.GoodCompression // Zstd3 for data and value blocks. + return cs + }() +) + +// UniformDBCompressionSettings returns a DBCompressionSettings which uses the +// same compression profile on all LSM levels. +func UniformDBCompressionSettings(profile *block.CompressionProfile) DBCompressionSettings { + cs := DBCompressionSettings{Name: profile.Name} + for i := range cs.Levels { + cs.Levels[i] = profile + } + return cs +} + +// ApplyCompressionSettings sets the Compression field in each LevelOptions to +// call the given function and return the compression profile for that level. +func (o *Options) ApplyCompressionSettings(csFn func() DBCompressionSettings) { + for i := range o.Levels { + levelIdx := i + o.Levels[i].Compression = func() *block.CompressionProfile { + return csFn().Levels[levelIdx] + } + } +} + // EnsureDefaults ensures that the default values for all options are set if a -// valid value was not already specified. Returns the new options. -func (o *Options) EnsureDefaults() *Options { - if o == nil { - o = &Options{} +// valid value was not already specified. 
+func (o *Options) EnsureDefaults() { + if o.Cache == nil && o.CacheSize == 0 { + o.CacheSize = cacheDefaultSize } + o.Comparer = o.Comparer.EnsureDefaults() + if o.BytesPerSync <= 0 { o.BytesPerSync = 512 << 10 // 512 KB } if o.Cleaner == nil { o.Cleaner = DeleteCleaner{} } - if o.Comparer == nil { - o.Comparer = DefaultComparer + + if o.FreeSpaceThresholdBytes == 0 { + o.FreeSpaceThresholdBytes = 16 << 30 // 16 GB + } + + if o.FreeSpaceTimeframe == 0 { + o.FreeSpaceTimeframe = 10 * time.Second + } + + if o.ObsoleteBytesMaxRatio == 0 { + o.ObsoleteBytesMaxRatio = 0.20 } + + if o.ObsoleteBytesTimeframe == 0 { + o.ObsoleteBytesTimeframe = 300 * time.Second + } + if o.Experimental.DisableIngestAsFlushable == nil { o.Experimental.DisableIngestAsFlushable = func() bool { return false } } @@ -1046,8 +1398,20 @@ func (o *Options) EnsureDefaults() *Options { if o.Experimental.CompactionDebtConcurrency <= 0 { o.Experimental.CompactionDebtConcurrency = 1 << 30 // 1 GB } - if o.Experimental.KeyValidationFunc == nil { - o.Experimental.KeyValidationFunc = func([]byte) error { return nil } + if o.Experimental.CompactionGarbageFractionForMaxConcurrency == nil { + // When 40% of the DB is garbage, the compaction concurrency is at the + // maximum permitted. 
+ o.Experimental.CompactionGarbageFractionForMaxConcurrency = func() float64 { return 0.4 } + } + if o.Experimental.ValueSeparationPolicy == nil { + o.Experimental.ValueSeparationPolicy = func() ValueSeparationPolicy { + return ValueSeparationPolicy{Enabled: false} + } + } + if o.KeySchema == "" && len(o.KeySchemas) == 0 { + ks := colblk.DefaultKeySchema(o.Comparer, 16 /* bundleSize */) + o.KeySchema = ks.Name + o.KeySchemas = sstable.MakeKeySchemas(&ks) } if o.L0CompactionThreshold <= 0 { o.L0CompactionThreshold = 4 @@ -1084,22 +1448,18 @@ func (o *Options) EnsureDefaults() *Options { if o.LBaseMaxBytes <= 0 { o.LBaseMaxBytes = 64 << 20 // 64 MB } - if o.Levels == nil { - o.Levels = make([]LevelOptions, 1) - for i := range o.Levels { - if i > 0 { - l := &o.Levels[i] - if l.TargetFileSize <= 0 { - l.TargetFileSize = o.Levels[i-1].TargetFileSize * 2 - } - } - o.Levels[i].EnsureDefaults() - } - } else { - for i := range o.Levels { - o.Levels[i].EnsureDefaults() + if o.TargetFileSizes[0] <= 0 { + o.TargetFileSizes[0] = 2 << 20 // 2 MB + } + for i := 1; i < len(o.TargetFileSizes); i++ { + if o.TargetFileSizes[i] <= 0 { + o.TargetFileSizes[i] = o.TargetFileSizes[i-1] * 2 } } + o.Levels[0].EnsureL0Defaults() + for i := 1; i < len(o.Levels); i++ { + o.Levels[i].EnsureL1PlusDefaults(&o.Levels[i-1]) + } if o.Logger == nil { o.Logger = DefaultLogger } @@ -1122,23 +1482,31 @@ func (o *Options) EnsureDefaults() *Options { if o.Merger == nil { o.Merger = DefaultMerger } - o.private.strictWALTail = true - if o.MaxConcurrentCompactions == nil { - o.MaxConcurrentCompactions = func() int { return 1 } + if o.CompactionConcurrencyRange == nil { + o.CompactionConcurrencyRange = func() (int, int) { return 1, 1 } + } + if o.MaxConcurrentDownloads == nil { + o.MaxConcurrentDownloads = func() int { return 1 } } if o.NumPrevManifest <= 0 { o.NumPrevManifest = 1 } if o.FormatMajorVersion == FormatDefault { - o.FormatMajorVersion = FormatMostCompatible + o.FormatMajorVersion = 
FormatMinSupported + if o.Experimental.CreateOnShared != remote.CreateOnSharedNone { + o.FormatMajorVersion = FormatMinForSharedObjects + } } if o.FS == nil { o.WithFSDefaults() } if o.FlushSplitBytes <= 0 { - o.FlushSplitBytes = 2 * o.Levels[0].TargetFileSize + o.FlushSplitBytes = 2 * o.TargetFileSizes[0] + } + if o.WALFailover != nil { + o.WALFailover.FailoverOptions.EnsureDefaults() } if o.Experimental.LevelMultiplier <= 0 { o.Experimental.LevelMultiplier = defaultLevelMultiplier @@ -1149,31 +1517,61 @@ func (o *Options) EnsureDefaults() *Options { if o.Experimental.ReadSamplingMultiplier == 0 { o.Experimental.ReadSamplingMultiplier = 1 << 4 } - if o.Experimental.TableCacheShards <= 0 { - o.Experimental.TableCacheShards = runtime.GOMAXPROCS(0) + if o.Experimental.NumDeletionsThreshold == 0 { + o.Experimental.NumDeletionsThreshold = sstable.DefaultNumDeletionsThreshold + } + if o.Experimental.DeletionSizeRatioThreshold == 0 { + o.Experimental.DeletionSizeRatioThreshold = sstable.DefaultDeletionSizeRatioThreshold } - if o.Experimental.CPUWorkPermissionGranter == nil { - o.Experimental.CPUWorkPermissionGranter = defaultCPUWorkGranter{} + if o.Experimental.EnableColumnarBlocks == nil { + o.Experimental.EnableColumnarBlocks = func() bool { return true } + } + if o.Experimental.TombstoneDenseCompactionThreshold == 0 { + o.Experimental.TombstoneDenseCompactionThreshold = 0.10 + } + if o.Experimental.FileCacheShards <= 0 { + o.Experimental.FileCacheShards = runtime.GOMAXPROCS(0) } if o.Experimental.MultiLevelCompactionHeuristic == nil { o.Experimental.MultiLevelCompactionHeuristic = WriteAmpHeuristic{} } + if o.Experimental.SpanPolicyFunc == nil { + o.Experimental.SpanPolicyFunc = func(startKey []byte) (SpanPolicy, []byte, error) { return SpanPolicy{}, nil, nil } + } + // TODO(jackson): Enable value separation by default once we have confidence + // in a default policy. o.initMaps() +} + +// TargetFileSize computes the target file size for the given output level. 
+func (o *Options) TargetFileSize(outputLevel int, baseLevel int) int64 { + if outputLevel == 0 { + return o.TargetFileSizes[0] + } + if baseLevel > outputLevel { + panic(fmt.Sprintf("invalid base level %d (output level %d)", baseLevel, outputLevel)) + } + return o.TargetFileSizes[outputLevel-baseLevel+1] +} + +// DefaultOptions returns a new Options object with the default values set. +func DefaultOptions() *Options { + o := &Options{} + o.EnsureDefaults() return o } // WithFSDefaults configures the Options to wrap the configured filesystem with // the default virtual file system middleware, like disk-health checking. -func (o *Options) WithFSDefaults() *Options { +func (o *Options) WithFSDefaults() { if o.FS == nil { o.FS = vfs.Default } - o.FS, o.private.fsCloser = vfs.WithDiskHealthChecks(o.FS, 5*time.Second, + o.FS, o.private.fsCloser = vfs.WithDiskHealthChecks(o.FS, 5*time.Second, nil, func(info vfs.DiskSlowInfo) { o.EventListener.DiskSlow(info) }) - return o } // AddEventListener adds the provided event listener to the Options, in addition @@ -1185,18 +1583,11 @@ func (o *Options) AddEventListener(l EventListener) { o.EventListener = &l } -func (o *Options) equal() Equal { - if o.Comparer.Equal == nil { - return bytes.Equal - } - return o.Comparer.Equal -} - // initMaps initializes the Comparers, Filters, and Mergers maps. func (o *Options) initMaps() { for i := range o.Levels { l := &o.Levels[i] - if l.FilterPolicy != nil { + if l.FilterPolicy != NoFilterPolicy { if o.Filters == nil { o.Filters = make(map[string]FilterPolicy) } @@ -1208,39 +1599,23 @@ func (o *Options) initMaps() { } } -// Level returns the LevelOptions for the specified level. -func (o *Options) Level(level int) LevelOptions { - if level < len(o.Levels) { - return o.Levels[level] - } - n := len(o.Levels) - 1 - l := o.Levels[n] - for i := n; i < level; i++ { - l.TargetFileSize *= 2 - } - return l -} - // Clone creates a shallow-copy of the supplied options. 
func (o *Options) Clone() *Options { - n := &Options{} - if o != nil { - *n = *o + if o == nil { + return &Options{} } - return n -} - -func filterPolicyName(p FilterPolicy) string { - if p == nil { - return "none" + n := *o + if o.WALFailover != nil { + c := *o.WALFailover + n.WALFailover = &c } - return p.Name() + return &n } func (o *Options) String() string { var buf bytes.Buffer - cacheSize := int64(cacheDefaultSize) + cacheSize := o.CacheSize if o.Cache != nil { cacheSize = o.Cache.MaxSize() } @@ -1253,15 +1628,21 @@ func (o *Options) String() string { fmt.Fprintf(&buf, " cache_size=%d\n", cacheSize) fmt.Fprintf(&buf, " cleaner=%s\n", o.Cleaner) fmt.Fprintf(&buf, " compaction_debt_concurrency=%d\n", o.Experimental.CompactionDebtConcurrency) + fmt.Fprintf(&buf, " compaction_garbage_fraction_for_max_concurrency=%.2f\n", + o.Experimental.CompactionGarbageFractionForMaxConcurrency()) fmt.Fprintf(&buf, " comparer=%s\n", o.Comparer.Name) fmt.Fprintf(&buf, " disable_wal=%t\n", o.DisableWAL) if o.Experimental.DisableIngestAsFlushable != nil && o.Experimental.DisableIngestAsFlushable() { fmt.Fprintf(&buf, " disable_ingest_as_flushable=%t\n", true) } + if o.Experimental.EnableColumnarBlocks != nil && o.Experimental.EnableColumnarBlocks() { + fmt.Fprintf(&buf, " enable_columnar_blocks=%t\n", true) + } fmt.Fprintf(&buf, " flush_delay_delete_range=%s\n", o.FlushDelayDeleteRange) fmt.Fprintf(&buf, " flush_delay_range_key=%s\n", o.FlushDelayRangeKey) fmt.Fprintf(&buf, " flush_split_bytes=%d\n", o.FlushSplitBytes) fmt.Fprintf(&buf, " format_major_version=%d\n", o.FormatMajorVersion) + fmt.Fprintf(&buf, " key_schema=%s\n", o.KeySchema) fmt.Fprintf(&buf, " l0_compaction_concurrency=%d\n", o.Experimental.L0CompactionConcurrency) fmt.Fprintf(&buf, " l0_compaction_file_threshold=%d\n", o.L0CompactionFileThreshold) fmt.Fprintf(&buf, " l0_compaction_threshold=%d\n", o.L0CompactionThreshold) @@ -1270,32 +1651,35 @@ func (o *Options) String() string { if 
o.Experimental.LevelMultiplier != defaultLevelMultiplier { fmt.Fprintf(&buf, " level_multiplier=%d\n", o.Experimental.LevelMultiplier) } - fmt.Fprintf(&buf, " max_concurrent_compactions=%d\n", o.MaxConcurrentCompactions()) + lower, upper := o.CompactionConcurrencyRange() + fmt.Fprintf(&buf, " concurrent_compactions=%d\n", lower) + fmt.Fprintf(&buf, " max_concurrent_compactions=%d\n", upper) + fmt.Fprintf(&buf, " max_concurrent_downloads=%d\n", o.MaxConcurrentDownloads()) fmt.Fprintf(&buf, " max_manifest_file_size=%d\n", o.MaxManifestFileSize) fmt.Fprintf(&buf, " max_open_files=%d\n", o.MaxOpenFiles) fmt.Fprintf(&buf, " mem_table_size=%d\n", o.MemTableSize) fmt.Fprintf(&buf, " mem_table_stop_writes_threshold=%d\n", o.MemTableStopWritesThreshold) fmt.Fprintf(&buf, " min_deletion_rate=%d\n", o.TargetByteDeletionRate) + fmt.Fprintf(&buf, " free_space_threshold_bytes=%d\n", o.FreeSpaceThresholdBytes) + fmt.Fprintf(&buf, " free_space_timeframe=%s\n", o.FreeSpaceTimeframe.String()) + fmt.Fprintf(&buf, " obsolete_bytes_max_ratio=%f\n", o.ObsoleteBytesMaxRatio) + fmt.Fprintf(&buf, " obsolete_bytes_timeframe=%s\n", o.ObsoleteBytesTimeframe.String()) fmt.Fprintf(&buf, " merger=%s\n", o.Merger.Name) + if o.Experimental.MultiLevelCompactionHeuristic != nil { + fmt.Fprintf(&buf, " multilevel_compaction_heuristic=%s\n", o.Experimental.MultiLevelCompactionHeuristic.String()) + } fmt.Fprintf(&buf, " read_compaction_rate=%d\n", o.Experimental.ReadCompactionRate) fmt.Fprintf(&buf, " read_sampling_multiplier=%d\n", o.Experimental.ReadSamplingMultiplier) - fmt.Fprintf(&buf, " strict_wal_tail=%t\n", o.private.strictWALTail) - fmt.Fprintf(&buf, " table_cache_shards=%d\n", o.Experimental.TableCacheShards) - fmt.Fprintf(&buf, " table_property_collectors=[") - for i := range o.TablePropertyCollectors { - if i > 0 { - fmt.Fprintf(&buf, ",") - } - // NB: This creates a new TablePropertyCollector, but Options.String() is - // called rarely so the overhead of doing so is not consequential. 
- fmt.Fprintf(&buf, "%s", o.TablePropertyCollectors[i]().Name()) - } - fmt.Fprintf(&buf, "]\n") + fmt.Fprintf(&buf, " num_deletions_threshold=%d\n", o.Experimental.NumDeletionsThreshold) + fmt.Fprintf(&buf, " deletion_size_ratio_threshold=%f\n", o.Experimental.DeletionSizeRatioThreshold) + fmt.Fprintf(&buf, " tombstone_dense_compaction_threshold=%f\n", o.Experimental.TombstoneDenseCompactionThreshold) + // We no longer care about strict_wal_tail, but set it to true in case an + // older version reads the options. + fmt.Fprintf(&buf, " strict_wal_tail=%t\n", true) + fmt.Fprintf(&buf, " table_cache_shards=%d\n", o.Experimental.FileCacheShards) fmt.Fprintf(&buf, " validate_on_ingest=%t\n", o.Experimental.ValidateOnIngest) fmt.Fprintf(&buf, " wal_dir=%s\n", o.WALDir) fmt.Fprintf(&buf, " wal_bytes_per_sync=%d\n", o.WALBytesPerSync) - fmt.Fprintf(&buf, " max_writer_concurrency=%d\n", o.Experimental.MaxWriterConcurrency) - fmt.Fprintf(&buf, " force_writer_parallelism=%t\n", o.Experimental.ForceWriterParallelism) fmt.Fprintf(&buf, " secondary_cache_size_bytes=%d\n", o.Experimental.SecondaryCacheSizeBytes) fmt.Fprintf(&buf, " create_on_shared=%d\n", o.Experimental.CreateOnShared) @@ -1315,6 +1699,32 @@ func (o *Options) String() string { fmt.Fprintln(&buf, " disable_lazy_combined_iteration=true") } + if o.Experimental.ValueSeparationPolicy != nil { + policy := o.Experimental.ValueSeparationPolicy() + if policy.Enabled { + fmt.Fprintln(&buf) + fmt.Fprintln(&buf, "[Value Separation]") + fmt.Fprintf(&buf, " enabled=%t\n", policy.Enabled) + fmt.Fprintf(&buf, " minimum_size=%d\n", policy.MinimumSize) + fmt.Fprintf(&buf, " max_blob_reference_depth=%d\n", policy.MaxBlobReferenceDepth) + fmt.Fprintf(&buf, " rewrite_minimum_age=%s\n", policy.RewriteMinimumAge) + fmt.Fprintf(&buf, " target_garbage_ratio=%.2f\n", policy.TargetGarbageRatio) + } + } + + if o.WALFailover != nil { + unhealthyThreshold, _ := o.WALFailover.FailoverOptions.UnhealthyOperationLatencyThreshold() + 
fmt.Fprintf(&buf, "\n") + fmt.Fprintf(&buf, "[WAL Failover]\n") + fmt.Fprintf(&buf, " secondary_dir=%s\n", o.WALFailover.Secondary.Dirname) + fmt.Fprintf(&buf, " primary_dir_probe_interval=%s\n", o.WALFailover.FailoverOptions.PrimaryDirProbeInterval) + fmt.Fprintf(&buf, " healthy_probe_latency_threshold=%s\n", o.WALFailover.FailoverOptions.HealthyProbeLatencyThreshold) + fmt.Fprintf(&buf, " healthy_interval=%s\n", o.WALFailover.FailoverOptions.HealthyInterval) + fmt.Fprintf(&buf, " unhealthy_sampling_interval=%s\n", o.WALFailover.FailoverOptions.UnhealthySamplingInterval) + fmt.Fprintf(&buf, " unhealthy_operation_latency_threshold=%s\n", unhealthyThreshold) + fmt.Fprintf(&buf, " elevated_write_stall_threshold_lag=%s\n", o.WALFailover.FailoverOptions.ElevatedWriteStallThresholdLag) + } + for i := range o.Levels { l := &o.Levels[i] fmt.Fprintf(&buf, "\n") @@ -1322,32 +1732,66 @@ func (o *Options) String() string { fmt.Fprintf(&buf, " block_restart_interval=%d\n", l.BlockRestartInterval) fmt.Fprintf(&buf, " block_size=%d\n", l.BlockSize) fmt.Fprintf(&buf, " block_size_threshold=%d\n", l.BlockSizeThreshold) - fmt.Fprintf(&buf, " compression=%s\n", l.Compression) - fmt.Fprintf(&buf, " filter_policy=%s\n", filterPolicyName(l.FilterPolicy)) + fmt.Fprintf(&buf, " compression=%s\n", l.Compression().Name) + fmt.Fprintf(&buf, " filter_policy=%s\n", l.FilterPolicy.Name()) fmt.Fprintf(&buf, " filter_type=%s\n", l.FilterType) fmt.Fprintf(&buf, " index_block_size=%d\n", l.IndexBlockSize) - fmt.Fprintf(&buf, " target_file_size=%d\n", l.TargetFileSize) + fmt.Fprintf(&buf, " target_file_size=%d\n", o.TargetFileSizes[i]) } return buf.String() } -func parseOptions(s string, fn func(section, key, value string) error) error { - var section string - for _, line := range strings.Split(s, "\n") { - line = strings.TrimSpace(line) - if len(line) == 0 { - // Skip blank lines. 
- continue +type parseOptionsFuncs struct { + visitNewSection func(i, j int, section string) error + visitKeyValue func(i, j int, section, key, value string) error + visitCommentOrWhitespace func(i, j int, whitespace string) error +} + +// parseOptions takes options serialized by Options.String() and parses them +// into keys and values. It calls fns.visitNewSection for the beginning of each +// new section, fns.visitKeyValue for each key-value pair, and +// visitCommentOrWhitespace for comments and whitespace between key-value pairs. +func parseOptions(s string, fns parseOptionsFuncs) error { + var section, mappedSection string + i := 0 + for i < len(s) { + rem := s[i:] + j := strings.IndexByte(rem, '\n') + if j < 0 { + j = len(rem) + } else { + j += 1 // Include the newline. } - if line[0] == ';' || line[0] == '#' { - // Skip comments. + line := strings.TrimSpace(s[i : i+j]) + startOff, endOff := i, i+j + i += j + + if len(line) == 0 || line[0] == ';' || line[0] == '#' { + // Skip blank lines and comments. + if fns.visitCommentOrWhitespace != nil { + if err := fns.visitCommentOrWhitespace(startOff, endOff, line); err != nil { + return err + } + } continue } n := len(line) if line[0] == '[' && line[n-1] == ']' { // Parse section. section = line[1 : n-1] + // RocksDB uses a similar (INI-style) syntax for the OPTIONS file, but + // different section names and keys. The "CFOptions ..." paths are the + // RocksDB versions which we map to the Pebble paths. 
+ mappedSection = section + if section == `CFOptions "default"` { + mappedSection = "Options" + } + if fns.visitNewSection != nil { + if err := fns.visitNewSection(startOff, endOff, mappedSection); err != nil { + return err + } + } continue } @@ -1363,12 +1807,7 @@ func parseOptions(s string, fn func(section, key, value string) error) error { key := strings.TrimSpace(line[:pos]) value := strings.TrimSpace(line[pos+1:]) - // RocksDB uses a similar (INI-style) syntax for the OPTIONS file, but - // different section names and keys. The "CFOptions ..." paths are the - // RocksDB versions which we map to the Pebble paths. - mappedSection := section if section == `CFOptions "default"` { - mappedSection = "Options" switch key { case "comparator": key = "comparer" @@ -1376,9 +1815,10 @@ func parseOptions(s string, fn func(section, key, value string) error) error { key = "merger" } } - - if err := fn(mappedSection, key, value); err != nil { - return err + if fns.visitKeyValue != nil { + if err := fns.visitKeyValue(startOff, endOff, mappedSection, key, value); err != nil { + return err + } } } return nil @@ -1387,10 +1827,10 @@ func parseOptions(s string, fn func(section, key, value string) error) error { // ParseHooks contains callbacks to create options fields which can have // user-defined implementations. type ParseHooks struct { - NewCache func(size int64) *Cache NewCleaner func(name string) (Cleaner, error) NewComparer func(name string) (*Comparer, error) NewFilterPolicy func(name string) (FilterPolicy, error) + NewKeySchema func(name string) (KeySchema, error) NewMerger func(name string) (*Merger, error) SkipUnknown func(name, value string) bool } @@ -1399,12 +1839,34 @@ type ParseHooks struct { // options cannot be parsed into populated fields. For example, comparer and // merger. 
func (o *Options) Parse(s string, hooks *ParseHooks) error { - return parseOptions(s, func(section, key, value string) error { + var valSepPolicy ValueSeparationPolicy + var concurrencyLimit struct { + lower int + lowerSet bool + upper int + upperSet bool + } + + visitKeyValue := func(i, j int, section, key, value string) error { // WARNING: DO NOT remove entries from the switches below because doing so // causes a key previously written to the OPTIONS file to be considered unknown, // a backwards incompatible change. Instead, leave in support for parsing the // key but simply don't parse the value. + parseComparer := func(name string) (*Comparer, error) { + switch name { + case DefaultComparer.Name: + return DefaultComparer, nil + case testkeys.Comparer.Name: + return testkeys.Comparer, nil + default: + if hooks != nil && hooks.NewComparer != nil { + return hooks.NewComparer(name) + } + return nil, nil + } + } + switch { case section == "Version": switch key { @@ -1424,16 +1886,7 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { case "bytes_per_sync": o.BytesPerSync, err = strconv.Atoi(value) case "cache_size": - var n int64 - n, err = strconv.ParseInt(value, 10, 64) - if err == nil && hooks != nil && hooks.NewCache != nil { - if o.Cache != nil { - o.Cache.Unref() - } - o.Cache = hooks.NewCache(n) - } - // We avoid calling cache.New in parsing because it makes it - // too easy to leak a cache. 
+ o.CacheSize, err = strconv.ParseInt(value, 10, 64) case "cleaner": switch value { case "archive": @@ -1446,16 +1899,20 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { } } case "comparer": - switch value { - case "leveldb.BytewiseComparator": - o.Comparer = DefaultComparer - default: - if hooks != nil && hooks.NewComparer != nil { - o.Comparer, err = hooks.NewComparer(value) - } + var comparer *Comparer + comparer, err = parseComparer(value) + if comparer != nil { + o.Comparer = comparer } case "compaction_debt_concurrency": o.Experimental.CompactionDebtConcurrency, err = strconv.ParseUint(value, 10, 64) + case "compaction_garbage_fraction_for_max_concurrency": + var frac float64 + frac, err = strconv.ParseFloat(value, 64) + if err == nil { + o.Experimental.CompactionGarbageFractionForMaxConcurrency = + func() float64 { return frac } + } case "delete_range_flush_delay": // NB: This is a deprecated serialization of the // `flush_delay_delete_range`. @@ -1474,6 +1931,11 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { o.private.disableLazyCombinedIteration, err = strconv.ParseBool(value) case "disable_wal": o.DisableWAL, err = strconv.ParseBool(value) + case "enable_columnar_blocks": + var v bool + if v, err = strconv.ParseBool(value); err == nil { + o.Experimental.EnableColumnarBlocks = func() bool { return v } + } case "flush_delay_delete_range": o.FlushDelayDeleteRange, err = time.ParseDuration(value) case "flush_delay_range_key": @@ -1488,11 +1950,41 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { var v uint64 v, err = strconv.ParseUint(value, 10, 64) if vers := FormatMajorVersion(v); vers > internalFormatNewest || vers == FormatDefault { - err = errors.Newf("unknown format major version %d", o.FormatMajorVersion) + err = errors.Newf("unsupported format major version %d", o.FormatMajorVersion) } if err == nil { o.FormatMajorVersion = FormatMajorVersion(v) } + case "key_schema": + o.KeySchema = value + if 
o.KeySchemas == nil { + o.KeySchemas = make(map[string]*KeySchema) + } + if _, ok := o.KeySchemas[o.KeySchema]; !ok { + if strings.HasPrefix(value, "DefaultKeySchema(") && strings.HasSuffix(value, ")") { + argsStr := strings.TrimSuffix(strings.TrimPrefix(value, "DefaultKeySchema("), ")") + args := strings.FieldsFunc(argsStr, func(r rune) bool { + return unicode.IsSpace(r) || r == ',' + }) + var comparer *base.Comparer + var bundleSize int + comparer, err = parseComparer(args[0]) + if err == nil { + bundleSize, err = strconv.Atoi(args[1]) + } + if err == nil { + schema := colblk.DefaultKeySchema(comparer, bundleSize) + o.KeySchema = schema.Name + o.KeySchemas[o.KeySchema] = &schema + } + } else if hooks != nil && hooks.NewKeySchema != nil { + var schema KeySchema + schema, err = hooks.NewKeySchema(value) + if err == nil { + o.KeySchemas[value] = &schema + } + } + } case "l0_compaction_concurrency": o.Experimental.L0CompactionConcurrency, err = strconv.Atoi(value) case "l0_compaction_file_threshold": @@ -1507,13 +1999,19 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { o.LBaseMaxBytes, err = strconv.ParseInt(value, 10, 64) case "level_multiplier": o.Experimental.LevelMultiplier, err = strconv.Atoi(value) + case "concurrent_compactions": + concurrencyLimit.lowerSet = true + concurrencyLimit.lower, err = strconv.Atoi(value) case "max_concurrent_compactions": - var concurrentCompactions int - concurrentCompactions, err = strconv.Atoi(value) - if concurrentCompactions <= 0 { + concurrencyLimit.upperSet = true + concurrencyLimit.upper, err = strconv.Atoi(value) + case "max_concurrent_downloads": + var concurrentDownloads int + concurrentDownloads, err = strconv.Atoi(value) + if concurrentDownloads <= 0 { err = errors.New("max_concurrent_compactions cannot be <= 0") } else { - o.MaxConcurrentCompactions = func() int { return concurrentCompactions } + o.MaxConcurrentDownloads = func() int { return concurrentDownloads } } case "max_manifest_file_size": 
o.MaxManifestFileSize, err = strconv.ParseInt(value, 10, 64) @@ -1528,13 +2026,51 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { // may be meaningful again eventually. case "min_deletion_rate": o.TargetByteDeletionRate, err = strconv.Atoi(value) + case "free_space_threshold_bytes": + o.FreeSpaceThresholdBytes, err = strconv.ParseUint(value, 10, 64) + case "free_space_timeframe": + o.FreeSpaceTimeframe, err = time.ParseDuration(value) + case "obsolete_bytes_max_ratio": + o.ObsoleteBytesMaxRatio, err = strconv.ParseFloat(value, 64) + case "obsolete_bytes_timeframe": + o.ObsoleteBytesTimeframe, err = time.ParseDuration(value) case "min_flush_rate": // Do nothing; option existed in older versions of pebble, and // may be meaningful again eventually. + case "multilevel_compaction_heuristic": + switch { + case value == "none": + o.Experimental.MultiLevelCompactionHeuristic = NoMultiLevel{} + case strings.HasPrefix(value, "wamp"): + fields := strings.FieldsFunc(strings.TrimPrefix(value, "wamp"), func(r rune) bool { + return unicode.IsSpace(r) || r == ',' || r == '(' || r == ')' + }) + if len(fields) != 2 { + err = errors.Newf("require 2 arguments") + } + var h WriteAmpHeuristic + if err == nil { + h.AddPropensity, err = strconv.ParseFloat(fields[0], 64) + } + if err == nil { + h.AllowL0, err = strconv.ParseBool(fields[1]) + } + if err == nil { + o.Experimental.MultiLevelCompactionHeuristic = h + } else { + err = errors.Wrapf(err, "unexpected wamp heuristic arguments: %s", value) + } + default: + err = errors.Newf("unrecognized multilevel compaction heuristic: %s", value) + } case "point_tombstone_weight": // Do nothing; deprecated. 
case "strict_wal_tail": - o.private.strictWALTail, err = strconv.ParseBool(value) + var strictWALTail bool + strictWALTail, err = strconv.ParseBool(value) + if err == nil && !strictWALTail { + err = errors.Newf("reading from versions with strict_wal_tail=false no longer supported") + } case "merger": switch value { case "nullptr": @@ -1550,8 +2086,16 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { o.Experimental.ReadCompactionRate, err = strconv.ParseInt(value, 10, 64) case "read_sampling_multiplier": o.Experimental.ReadSamplingMultiplier, err = strconv.ParseInt(value, 10, 64) + case "num_deletions_threshold": + o.Experimental.NumDeletionsThreshold, err = strconv.Atoi(value) + case "deletion_size_ratio_threshold": + val, parseErr := strconv.ParseFloat(value, 32) + o.Experimental.DeletionSizeRatioThreshold = float32(val) + err = parseErr + case "tombstone_dense_compaction_threshold": + o.Experimental.TombstoneDenseCompactionThreshold, err = strconv.ParseFloat(value, 64) case "table_cache_shards": - o.Experimental.TableCacheShards, err = strconv.Atoi(value) + o.Experimental.FileCacheShards, err = strconv.Atoi(value) case "table_format": switch value { case "leveldb": @@ -1560,7 +2104,7 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { return errors.Errorf("pebble: unknown table format: %q", errors.Safe(value)) } case "table_property_collectors": - // TODO(peter): set o.TablePropertyCollectors + // No longer implemented; ignore. case "validate_on_ingest": o.Experimental.ValidateOnIngest, err = strconv.ParseBool(value) case "wal_dir": @@ -1568,9 +2112,9 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { case "wal_bytes_per_sync": o.WALBytesPerSync, err = strconv.Atoi(value) case "max_writer_concurrency": - o.Experimental.MaxWriterConcurrency, err = strconv.Atoi(value) + // No longer implemented; ignore. 
case "force_writer_parallelism": - o.Experimental.ForceWriterParallelism, err = strconv.ParseBool(value) + // No longer implemented; ignore. case "secondary_cache_size_bytes": o.Experimental.SecondaryCacheSizeBytes, err = strconv.ParseInt(value, 10, 64) case "create_on_shared": @@ -1586,22 +2130,69 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { } return err - case strings.HasPrefix(section, "Level "): - var index int - if n, err := fmt.Sscanf(section, `Level "%d"`, &index); err != nil { - return err - } else if n != 1 { - if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section, value) { + case section == "Value Separation": + var err error + switch key { + case "enabled": + valSepPolicy.Enabled, err = strconv.ParseBool(value) + case "minimum_size": + var minimumSize int + minimumSize, err = strconv.Atoi(value) + valSepPolicy.MinimumSize = minimumSize + case "max_blob_reference_depth": + valSepPolicy.MaxBlobReferenceDepth, err = strconv.Atoi(value) + case "rewrite_minimum_age": + valSepPolicy.RewriteMinimumAge, err = time.ParseDuration(value) + case "target_garbage_ratio": + valSepPolicy.TargetGarbageRatio, err = strconv.ParseFloat(value, 64) + default: + if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section+"."+key, value) { return nil } - return errors.Errorf("pebble: unknown section: %q", errors.Safe(section)) + return errors.Errorf("pebble: unknown option: %s.%s", errors.Safe(section), errors.Safe(key)) } + return err - if len(o.Levels) <= index { - newLevels := make([]LevelOptions, index+1) - copy(newLevels, o.Levels) - o.Levels = newLevels + case section == "WAL Failover": + if o.WALFailover == nil { + o.WALFailover = new(WALFailoverOptions) } + var err error + switch key { + case "secondary_dir": + o.WALFailover.Secondary = wal.Dir{Dirname: value, FS: vfs.Default} + case "primary_dir_probe_interval": + o.WALFailover.PrimaryDirProbeInterval, err = time.ParseDuration(value) + case 
"healthy_probe_latency_threshold": + o.WALFailover.HealthyProbeLatencyThreshold, err = time.ParseDuration(value) + case "healthy_interval": + o.WALFailover.HealthyInterval, err = time.ParseDuration(value) + case "unhealthy_sampling_interval": + o.WALFailover.UnhealthySamplingInterval, err = time.ParseDuration(value) + case "unhealthy_operation_latency_threshold": + var threshold time.Duration + threshold, err = time.ParseDuration(value) + o.WALFailover.UnhealthyOperationLatencyThreshold = func() (time.Duration, bool) { + return threshold, true + } + case "elevated_write_stall_threshold_lag": + o.WALFailover.ElevatedWriteStallThresholdLag, err = time.ParseDuration(value) + default: + if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section+"."+key, value) { + return nil + } + return errors.Errorf("pebble: unknown option: %s.%s", + errors.Safe(section), errors.Safe(key)) + } + return err + + case strings.HasPrefix(section, "Level "): + m := regexp.MustCompile(`Level\s*"?(\d+)"?\s*$`).FindStringSubmatch(section) + if m == nil { + return errors.Errorf("pebble: unknown section: %q", errors.Safe(section)) + } + index, _ := strconv.Atoi(m[1]) + l := &o.Levels[index] var err error @@ -1613,21 +2204,16 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { case "block_size_threshold": l.BlockSizeThreshold, err = strconv.Atoi(value) case "compression": - switch value { - case "Default": - l.Compression = DefaultCompression - case "NoCompression": - l.Compression = NoCompression - case "Snappy": - l.Compression = SnappyCompression - case "ZSTD": - l.Compression = ZstdCompression - default: + profile := block.CompressionProfileByName(value) + if profile == nil { return errors.Errorf("pebble: unknown compression: %q", errors.Safe(value)) } + l.Compression = func() *sstable.CompressionProfile { return profile } case "filter_policy": if hooks != nil && hooks.NewFilterPolicy != nil { l.FilterPolicy, err = hooks.NewFilterPolicy(value) + } else { + 
l.FilterPolicy = NoFilterPolicy } case "filter_type": switch value { @@ -1639,7 +2225,7 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { case "index_block_size": l.IndexBlockSize, err = strconv.Atoi(value) case "target_file_size": - l.TargetFileSize, err = strconv.ParseInt(value, 10, 64) + o.TargetFileSizes[index], err = strconv.ParseInt(value, 10, 64) default: if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section+"."+key, value) { return nil @@ -1651,13 +2237,56 @@ if hooks != nil && hooks.SkipUnknown != nil && hooks.SkipUnknown(section+"."+key, value) { return nil } - return errors.Errorf("pebble: unknown section: %q", errors.Safe(section)) + return errors.Errorf("pebble: unknown section %q or key %q", errors.Safe(section), errors.Safe(key)) + } + err := parseOptions(s, parseOptionsFuncs{ + visitKeyValue: visitKeyValue, }) + if err != nil { + return err + } + o.Experimental.ValueSeparationPolicy = func() ValueSeparationPolicy { return valSepPolicy } + if concurrencyLimit.lowerSet || concurrencyLimit.upperSet { + if !concurrencyLimit.lowerSet { + concurrencyLimit.lower = 1 + } else if concurrencyLimit.lower < 1 { + return errors.New("baseline_concurrent_compactions cannot be <= 0") + } + if !concurrencyLimit.upperSet { + concurrencyLimit.upper = concurrencyLimit.lower + } else if concurrencyLimit.upper < concurrencyLimit.lower { + return errors.Newf("max_concurrent_compactions cannot be < %d", concurrencyLimit.lower) + } + o.CompactionConcurrencyRange = func() (int, int) { + return concurrencyLimit.lower, concurrencyLimit.upper + } + } + return nil +} + +// ErrMissingWALRecoveryDir is an error returned when a database is attempted to be +// opened without supplying an Options.WALRecoveryDirs entry for a directory that +// may contain WALs required to recover a consistent database state. 
+type ErrMissingWALRecoveryDir struct { + Dir string + ExtraInfo string +} + +// Error implements error. +func (e ErrMissingWALRecoveryDir) Error() string { + return fmt.Sprintf("directory %q may contain relevant WALs but is not in WALRecoveryDirs%s", e.Dir, e.ExtraInfo) } -func (o *Options) checkOptions(s string) (strictWALTail bool, err error) { - // TODO(jackson): Refactor to avoid awkwardness of the strictWALTail return value. - return strictWALTail, parseOptions(s, func(section, key, value string) error { +// CheckCompatibility verifies the options are compatible with the previous options +// serialized by Options.String(). For example, the Comparer and Merger must be +// the same, or data will not be able to be properly read from the DB. +// +// This function only looks at specific keys and does not error out if the +// options are newer and contain unknown keys. +func (o *Options) CheckCompatibility(storeDir string, previousOptions string) error { + previousWALDir := "" + + visitKeyValue := func(i, j int, section, key, value string) error { switch section + "." 
+ key { case "Options.comparer": if value != o.Comparer.Name { @@ -1671,22 +2300,71 @@ func (o *Options) checkOptions(s string) (strictWALTail bool, err error) { return errors.Errorf("pebble: merger name from file %q != merger name from options %q", errors.Safe(value), errors.Safe(o.Merger.Name)) } - case "Options.strict_wal_tail": - strictWALTail, err = strconv.ParseBool(value) - if err != nil { - return errors.Errorf("pebble: error parsing strict_wal_tail value %q: %w", value, err) + case "Options.wal_dir": + previousWALDir = value + case "WAL Failover.secondary_dir": + previousWALSecondaryDir := value + if err := o.checkWALDir(storeDir, previousWALSecondaryDir, "WALFailover.Secondary changed from previous options"); err != nil { + return err } } return nil - }) + } + if err := parseOptions(previousOptions, parseOptionsFuncs{visitKeyValue: visitKeyValue}); err != nil { + return err + } + if err := o.checkWALDir(storeDir, previousWALDir, "WALDir changed from previous options"); err != nil { + return err + } + return nil } -// Check verifies the options are compatible with the previous options -// serialized by Options.String(). For example, the Comparer and Merger must be -// the same, or data will not be able to be properly read from the DB. -func (o *Options) Check(s string) error { - _, err := o.checkOptions(s) - return err +// checkWALDir verifies that walDir is among o.WALDir, o.WALFailover.Secondary, +// or o.WALRecoveryDirs. An empty "walDir" maps to the storeDir. 
+func (o *Options) checkWALDir(storeDir, walDir, errContext string) error { + walPath := resolveStorePath(storeDir, walDir) + if walDir == "" { + walPath = storeDir + } + + if o.WALDir == "" { + if walPath == storeDir { + return nil + } + } else { + if walPath == resolveStorePath(storeDir, o.WALDir) { + return nil + } + } + + if o.WALFailover != nil && walPath == resolveStorePath(storeDir, o.WALFailover.Secondary.Dirname) { + return nil + } + + for _, d := range o.WALRecoveryDirs { + // TODO(radu): should we also check that d.FS is the same as walDir's FS? + if walPath == resolveStorePath(storeDir, d.Dirname) { + return nil + } + } + + if o.Unsafe.AllowMissingWALDirs { + o.Logger.Infof("directory %q may contain relevant WALs but is not in WALRecoveryDirs (AllowMissingWALDirs enabled)", walDir) + return nil + } + + var buf bytes.Buffer + fmt.Fprintf(&buf, "\n %s\n", errContext) + fmt.Fprintf(&buf, " o.WALDir: %q\n", o.WALDir) + if o.WALFailover != nil { + fmt.Fprintf(&buf, " o.WALFailover.Secondary.Dirname: %q\n", o.WALFailover.Secondary.Dirname) + } + fmt.Fprintf(&buf, " o.WALRecoveryDirs: %d", len(o.WALRecoveryDirs)) + for _, d := range o.WALRecoveryDirs { + fmt.Fprintf(&buf, "\n %q", d.Dirname) + } + + return ErrMissingWALRecoveryDir{Dir: walPath, ExtraInfo: buf.String()} } // Validate verifies that the options are mutually consistent. 
For example, @@ -1713,13 +2391,31 @@ func (o *Options) Validate() error { fmt.Fprintf(&buf, "MemTableStopWritesThreshold (%d) must be >= 2\n", o.MemTableStopWritesThreshold) } - if o.FormatMajorVersion > internalFormatNewest { - fmt.Fprintf(&buf, "FormatMajorVersion (%d) must be <= %d\n", - o.FormatMajorVersion, internalFormatNewest) + if o.FormatMajorVersion < FormatMinSupported || o.FormatMajorVersion > internalFormatNewest { + fmt.Fprintf(&buf, "FormatMajorVersion (%d) must be between %d and %d\n", + o.FormatMajorVersion, FormatMinSupported, internalFormatNewest) + } + if o.Experimental.CreateOnShared != remote.CreateOnSharedNone && o.FormatMajorVersion < FormatMinForSharedObjects { + fmt.Fprintf(&buf, "FormatMajorVersion (%d) when CreateOnShared is set must be at least %d\n", + o.FormatMajorVersion, FormatMinForSharedObjects) } - if o.TableCache != nil && o.Cache != o.TableCache.cache { - fmt.Fprintf(&buf, "underlying cache in the TableCache and the Cache dont match\n") + if len(o.KeySchemas) > 0 { + if o.KeySchema == "" { + fmt.Fprintf(&buf, "KeySchemas is set but KeySchema is not\n") + } + if _, ok := o.KeySchemas[o.KeySchema]; !ok { + fmt.Fprintf(&buf, "KeySchema %q not found in KeySchemas\n", o.KeySchema) + } + } + if policy := o.Experimental.ValueSeparationPolicy(); policy.Enabled { + if policy.MinimumSize <= 0 { + fmt.Fprintf(&buf, "ValueSeparationPolicy.MinimumSize (%d) must be > 0\n", policy.MinimumSize) + } + if policy.MaxBlobReferenceDepth <= 0 { + fmt.Fprintf(&buf, "ValueSeparationPolicy.MaxBlobReferenceDepth (%d) must be > 0\n", policy.MaxBlobReferenceDepth) + } } + if buf.Len() == 0 { return nil } @@ -1731,15 +2427,12 @@ func (o *Options) Validate() error { func (o *Options) MakeReaderOptions() sstable.ReaderOptions { var readerOpts sstable.ReaderOptions if o != nil { - readerOpts.Cache = o.Cache - readerOpts.LoadBlockSema = o.LoadBlockSema readerOpts.Comparer = o.Comparer readerOpts.Filters = o.Filters - if o.Merger != nil { - readerOpts.Merge = 
o.Merger.Merge - readerOpts.MergerName = o.Merger.Name - } + readerOpts.KeySchemas = o.KeySchemas + readerOpts.LoadBlockSema = o.LoadBlockSema readerOpts.LoggerAndTracer = o.LoggerAndTracer + readerOpts.Merger = o.Merger } return readerOpts } @@ -1750,28 +2443,187 @@ func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstab var writerOpts sstable.WriterOptions writerOpts.TableFormat = format if o != nil { - writerOpts.Cache = o.Cache writerOpts.Comparer = o.Comparer if o.Merger != nil { writerOpts.MergerName = o.Merger.Name } - writerOpts.TablePropertyCollectors = o.TablePropertyCollectors writerOpts.BlockPropertyCollectors = o.BlockPropertyCollectors } if format >= sstable.TableFormatPebblev3 { writerOpts.ShortAttributeExtractor = o.Experimental.ShortAttributeExtractor - writerOpts.RequiredInPlaceValueBound = o.Experimental.RequiredInPlaceValueBound if format >= sstable.TableFormatPebblev4 && level == numLevels-1 { writerOpts.WritingToLowestLevel = true } } - levelOpts := o.Level(level) + levelOpts := o.Levels[level] writerOpts.BlockRestartInterval = levelOpts.BlockRestartInterval writerOpts.BlockSize = levelOpts.BlockSize writerOpts.BlockSizeThreshold = levelOpts.BlockSizeThreshold - writerOpts.Compression = levelOpts.Compression + writerOpts.Compression = levelOpts.Compression() writerOpts.FilterPolicy = levelOpts.FilterPolicy writerOpts.FilterType = levelOpts.FilterType writerOpts.IndexBlockSize = levelOpts.IndexBlockSize + if o.KeySchema != "" { + var ok bool + writerOpts.KeySchema, ok = o.KeySchemas[o.KeySchema] + if !ok { + panic(fmt.Sprintf("invalid schema %q", redact.Safe(o.KeySchema))) + } + } + writerOpts.AllocatorSizeClasses = o.AllocatorSizeClasses + writerOpts.NumDeletionsThreshold = o.Experimental.NumDeletionsThreshold + writerOpts.DeletionSizeRatioThreshold = o.Experimental.DeletionSizeRatioThreshold return writerOpts } + +// MakeBlobWriterOptions constructs blob.FileWriterOptions from the corresponding +// options in the 
receiver. +func (o *Options) MakeBlobWriterOptions(level int) blob.FileWriterOptions { + lo := o.Levels[level] + return blob.FileWriterOptions{ + Compression: lo.Compression(), + ChecksumType: block.ChecksumTypeCRC32c, + FlushGovernor: block.MakeFlushGovernor( + lo.BlockSize, + lo.BlockSizeThreshold, + base.SizeClassAwareBlockSizeThreshold, + o.AllocatorSizeClasses, + ), + } +} + +func (o *Options) MakeObjStorageProviderSettings(dirname string) objstorageprovider.Settings { + s := objstorageprovider.Settings{ + Logger: o.Logger, + FS: o.FS, + FSDirName: dirname, + FSCleaner: o.Cleaner, + NoSyncOnClose: o.NoSyncOnClose, + BytesPerSync: o.BytesPerSync, + } + s.Local.ReadaheadConfig = o.Local.ReadaheadConfig + s.Remote.StorageFactory = o.Experimental.RemoteStorage + s.Remote.CreateOnShared = o.Experimental.CreateOnShared + s.Remote.CreateOnSharedLocator = o.Experimental.CreateOnSharedLocator + s.Remote.CacheSizeBytes = o.Experimental.SecondaryCacheSizeBytes + return s +} + +// UserKeyCategories describes a partitioning of the user key space. Each +// partition is a category with a name. The categories are used for informative +// purposes only (like pprof labels). Pebble does not treat keys differently +// based on the UserKeyCategories. +// +// The partitions are defined by their upper bounds. The last partition is +// assumed to go until the end of keyspace; its UpperBound is ignored. The rest +// of the partitions are ordered by their UpperBound. +type UserKeyCategories struct { + categories []UserKeyCategory + cmp base.Compare + // rangeNames[i][j] contains the string referring to the categories in the + // range [i, j], with j > i. + rangeNames [][]string +} + +// UserKeyCategory describes a partition of the user key space. +// +// User keys >= the previous category's UpperBound and < this category's +// UpperBound are part of this category. +type UserKeyCategory struct { + Name string + // UpperBound is the exclusive upper bound of the category. 
All user keys >= the + // previous category's UpperBound and < this UpperBound are part of this + // category. + UpperBound []byte +} + +// MakeUserKeyCategories creates a UserKeyCategories object with the given +// categories. The object is immutable and can be reused across different +// stores. +func MakeUserKeyCategories(cmp base.Compare, categories ...UserKeyCategory) UserKeyCategories { + n := len(categories) + if n == 0 { + return UserKeyCategories{} + } + if categories[n-1].UpperBound != nil { + panic("last category UpperBound must be nil") + } + // Verify that the partitions are ordered as expected. + for i := 1; i < n-1; i++ { + if cmp(categories[i-1].UpperBound, categories[i].UpperBound) >= 0 { + panic("invalid UserKeyCategories: key prefixes must be sorted") + } + } + + // Precalculate a table of range names to avoid allocations in the + // categorization path. + rangeNamesBuf := make([]string, n*n) + rangeNames := make([][]string, n) + for i := range rangeNames { + rangeNames[i] = rangeNamesBuf[:n] + rangeNamesBuf = rangeNamesBuf[n:] + for j := i + 1; j < n; j++ { + rangeNames[i][j] = categories[i].Name + "-" + categories[j].Name + } + } + return UserKeyCategories{ + categories: categories, + cmp: cmp, + rangeNames: rangeNames, + } +} + +// Len returns the number of categories defined. +func (kc *UserKeyCategories) Len() int { + return len(kc.categories) +} + +// CategorizeKey returns the name of the category containing the key. +func (kc *UserKeyCategories) CategorizeKey(userKey []byte) string { + idx := sort.Search(len(kc.categories)-1, func(i int) bool { + return kc.cmp(userKey, kc.categories[i].UpperBound) < 0 + }) + return kc.categories[idx].Name +} + +// CategorizeKeyRange returns the name of the category containing the key range. +// If the key range spans multiple categories, the result shows the first and +// last category separated by a dash, e.g. `cat1-cat5`. 
+func (kc *UserKeyCategories) CategorizeKeyRange(startUserKey, endUserKey []byte) string { + n := len(kc.categories) + p := sort.Search(n-1, func(i int) bool { + return kc.cmp(startUserKey, kc.categories[i].UpperBound) < 0 + }) + if p == n-1 || kc.cmp(endUserKey, kc.categories[p].UpperBound) < 0 { + // Fast path for a single category. + return kc.categories[p].Name + } + // Binary search among the remaining categories. + q := p + 1 + sort.Search(n-2-p, func(i int) bool { + return kc.cmp(endUserKey, kc.categories[p+1+i].UpperBound) < 0 + }) + return kc.rangeNames[p][q] +} + +const storePathIdentifier = "{store_path}" + +// MakeStoreRelativePath takes a path that is relative to the store directory +// and creates a path that can be used for Options.WALDir and wal.Dir.Dirname. +// +// This is used in metamorphic tests, so that the test run directory can be +// copied or moved. +func MakeStoreRelativePath(fs vfs.FS, relativePath string) string { + if relativePath == "" { + return storePathIdentifier + } + return fs.PathJoin(storePathIdentifier, relativePath) +} + +// resolveStorePath is the inverse of MakeStoreRelativePath(). It replaces any +// storePathIdentifier prefix with the store dir. +func resolveStorePath(storeDir, path string) string { + if remainder, ok := strings.CutPrefix(path, storePathIdentifier); ok { + return storeDir + remainder + } + return path +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/overlap.go b/vendor/github.com/cockroachdb/pebble/v2/overlap.go new file mode 100644 index 0000000..3f49176 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/overlap.go @@ -0,0 +1,67 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package pebble + +import ( + "context" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/overlap" +) + +// An overlapChecker provides facilities for checking whether any keys within a +// particular LSM version overlap a set of bounds. It is a thin wrapper for +// overlap.Checker. +type overlapChecker struct { + comparer *base.Comparer + newIters tableNewIters + opts IterOptions + v *manifest.Version +} + +// DetermineLSMOverlap calculates the overlap.WithLSM for the given bounds. +func (c *overlapChecker) DetermineLSMOverlap( + ctx context.Context, bounds base.UserKeyBounds, +) (overlap.WithLSM, error) { + checker := overlap.MakeChecker(c.comparer.Compare, c) + return checker.LSMOverlap(ctx, bounds, c.v) +} + +var _ overlap.IteratorFactory = (*overlapChecker)(nil) + +// Points is part of the overlap.IteratorFactory implementation. +func (c *overlapChecker) Points( + ctx context.Context, m *manifest.TableMetadata, +) (base.InternalIterator, error) { + iters, err := c.newIters(ctx, m, &c.opts, internalIterOpts{}, iterPointKeys) + if err != nil { + return nil, err + } + return iters.point, nil +} + +// RangeDels is part of the overlap.IteratorFactory implementation. +func (c *overlapChecker) RangeDels( + ctx context.Context, m *manifest.TableMetadata, +) (keyspan.FragmentIterator, error) { + iters, err := c.newIters(ctx, m, &c.opts, internalIterOpts{}, iterRangeDeletions) + if err != nil { + return nil, err + } + return iters.rangeDeletion, nil +} + +// RangeKeys is part of the overlap.IteratorFactory implementation. 
+func (c *overlapChecker) RangeKeys( + ctx context.Context, m *manifest.TableMetadata, +) (keyspan.FragmentIterator, error) { + iters, err := c.newIters(ctx, m, &c.opts, internalIterOpts{}, iterRangeKeys) + if err != nil { + return nil, err + } + return iters.rangeKey, nil +} diff --git a/vendor/github.com/cockroachdb/pebble/pacer.go b/vendor/github.com/cockroachdb/pebble/v2/pacer.go similarity index 85% rename from vendor/github.com/cockroachdb/pebble/pacer.go rename to vendor/github.com/cockroachdb/pebble/v2/pacer.go index a959ff4..ce45798 100644 --- a/vendor/github.com/cockroachdb/pebble/pacer.go +++ b/vendor/github.com/cockroachdb/pebble/v2/pacer.go @@ -7,6 +7,8 @@ package pebble import ( "sync" "time" + + "github.com/cockroachdb/crlib/crtime" ) // deletionPacerInfo contains any info from the db necessary to make deletion @@ -58,18 +60,25 @@ const deletePacerHistory = 5 * time.Minute // normally limit deletes (when we are not falling behind or running out of // space). A value of 0.0 disables pacing. func newDeletionPacer( - now time.Time, targetByteDeletionRate int64, getInfo func() deletionPacerInfo, + now crtime.Mono, + freeSpaceThreshold uint64, + targetByteDeletionRate int64, + freeSpaceTimeframe time.Duration, + obsoleteBytesMaxRatio float64, + obsoleteBytesTimeframe time.Duration, + getInfo func() deletionPacerInfo, ) *deletionPacer { d := &deletionPacer{ - freeSpaceThreshold: 16 << 30, // 16 GB - freeSpaceTimeframe: 10 * time.Second, + freeSpaceThreshold: freeSpaceThreshold, + freeSpaceTimeframe: freeSpaceTimeframe, - obsoleteBytesMaxRatio: 0.20, - obsoleteBytesTimeframe: 5 * time.Minute, + obsoleteBytesMaxRatio: obsoleteBytesMaxRatio, + obsoleteBytesTimeframe: obsoleteBytesTimeframe, targetByteDeletionRate: targetByteDeletionRate, getInfo: getInfo, } + d.mu.history.Init(now, deletePacerHistory) return d } @@ -79,7 +88,7 @@ func newDeletionPacer( // deletion rate accordingly. // // ReportDeletion is thread-safe. 
-func (p *deletionPacer) ReportDeletion(now time.Time, bytesToDelete uint64) { +func (p *deletionPacer) ReportDeletion(now crtime.Mono, bytesToDelete uint64) { p.mu.Lock() defer p.mu.Unlock() p.mu.history.Add(now, int64(bytesToDelete)) @@ -89,7 +98,7 @@ func (p *deletionPacer) ReportDeletion(now time.Time, bytesToDelete uint64) { // deleting the given number of bytes. // // PacingDelay is thread-safe. -func (p *deletionPacer) PacingDelay(now time.Time, bytesToDelete uint64) (waitSeconds float64) { +func (p *deletionPacer) PacingDelay(now crtime.Mono, bytesToDelete uint64) (waitSeconds float64) { if p.targetByteDeletionRate == 0 { // Pacing disabled. return 0.0 @@ -136,7 +145,7 @@ func (p *deletionPacer) PacingDelay(now time.Time, bytesToDelete uint64) (waitSe // are effectively rounded down to the nearest epoch boundary. type history struct { epochDuration time.Duration - startTime time.Time + startTime crtime.Mono // currEpoch is the epoch of the most recent operation. currEpoch int64 // val contains the recent epoch values. @@ -151,7 +160,7 @@ const historyEpochs = 100 // Init the history helper to keep track of data over the given number of // seconds. -func (h *history) Init(now time.Time, timeframe time.Duration) { +func (h *history) Init(now crtime.Mono, timeframe time.Duration) { *h = history{ epochDuration: timeframe / time.Duration(historyEpochs), startTime: now, @@ -161,7 +170,7 @@ func (h *history) Init(now time.Time, timeframe time.Duration) { } // Add adds a value for the current time. -func (h *history) Add(now time.Time, val int64) { +func (h *history) Add(now crtime.Mono, val int64) { h.advance(now) h.val[h.currEpoch%historyEpochs] += val h.sum += val @@ -169,17 +178,17 @@ func (h *history) Add(now time.Time, val int64) { // Sum returns the sum of recent values. The result is approximate in that the // cut-off time is within 1% of the exact one. 
-func (h *history) Sum(now time.Time) int64 { +func (h *history) Sum(now crtime.Mono) int64 { h.advance(now) return h.sum } -func (h *history) epoch(t time.Time) int64 { +func (h *history) epoch(t crtime.Mono) int64 { return int64(t.Sub(h.startTime) / h.epochDuration) } // advance advances the time to the given time. -func (h *history) advance(now time.Time) { +func (h *history) advance(now crtime.Mono) { epoch := h.epoch(now) for h.currEpoch < epoch { h.currEpoch++ diff --git a/vendor/github.com/cockroachdb/pebble/range_keys.go b/vendor/github.com/cockroachdb/pebble/v2/range_keys.go similarity index 79% rename from vendor/github.com/cockroachdb/pebble/range_keys.go rename to vendor/github.com/cockroachdb/pebble/v2/range_keys.go index 13b5822..a308b42 100644 --- a/vendor/github.com/cockroachdb/pebble/range_keys.go +++ b/vendor/github.com/cockroachdb/pebble/v2/range_keys.go @@ -5,11 +5,15 @@ package pebble import ( - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" - "github.com/cockroachdb/pebble/sstable" + "context" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" + "github.com/cockroachdb/pebble/v2/sstable" ) // constructRangeKeyIter constructs the range-key iterator stack, populating @@ -19,10 +23,17 @@ func (i *Iterator) constructRangeKeyIter() { &i.comparer, i.seqNum, i.opts.LowerBound, i.opts.UpperBound, &i.hasPrefix, &i.prefixOrFullSeekKey, false /* internalKeys */, &i.rangeKey.rangeKeyBuffers.internal) + if i.opts.DebugRangeKeyStack { + // The default logger is preferable to i.opts.getLogger(), at least in the + // metamorphic test. 
+ i.rangeKey.rangeKeyIter = keyspan.InjectLogging(i.rangeKey.rangeKeyIter, base.DefaultLogger) + } + // If there's an indexed batch with range keys, include it. if i.batch != nil { if i.batch.index == nil { - i.rangeKey.iterConfig.AddLevel(newErrorKeyspanIter(ErrNotIndexed)) + // This isn't an indexed batch. We shouldn't have gotten this far. + panic(errors.AssertionFailedf("creating an iterator over an unindexed batch")) } else { // Only include the batch's range key iterator if it has any keys. // NB: This can force reconstruction of the rangekey iterator stack @@ -35,58 +46,85 @@ func (i *Iterator) constructRangeKeyIter() { } } - // Next are the flushables: memtables and large batches. - if i.readState != nil { - for j := len(i.readState.memtables) - 1; j >= 0; j-- { - mem := i.readState.memtables[j] - // We only need to read from memtables which contain sequence numbers older - // than seqNum. - if logSeqNum := mem.logSeqNum; logSeqNum >= i.seqNum { - continue + if !i.batchOnlyIter { + // Next are the flushables: memtables and large batches. + if i.readState != nil { + for j := len(i.readState.memtables) - 1; j >= 0; j-- { + mem := i.readState.memtables[j] + // We only need to read from memtables which contain sequence numbers older + // than seqNum. + if logSeqNum := mem.logSeqNum; logSeqNum >= i.seqNum { + continue + } + if rki := mem.newRangeKeyIter(&i.opts); rki != nil { + i.rangeKey.iterConfig.AddLevel(rki) + } } - if rki := mem.newRangeKeyIter(&i.opts); rki != nil { - i.rangeKey.iterConfig.AddLevel(rki) + } + + current := i.version + if current == nil { + current = i.readState.current + } + // Next are the file levels: L0 sub-levels followed by lower levels. + + // Add file-specific iterators for L0 files containing range keys. We + // maintain a separate manifest.LevelMetadata for each level containing only + // files that contain range keys, however we don't compute a separate + // L0Sublevels data structure too. 
+ // + // We first use L0's LevelMetadata to peek and see whether L0 contains any + // range keys at all. If it does, we create a range key level iterator per + // level that contains range keys using the information from L0Sublevels. + // Some sublevels may not contain any range keys, and we need to iterate + // through the fileMetadata to determine that. Since L0's file count should + // not significantly exceed ~1000 files (see L0CompactionFileThreshold), + // this should be okay. + if !current.RangeKeyLevels[0].Empty() { + // L0 contains at least 1 file containing range keys. + // Add level iterators for the L0 sublevels, iterating from newest to + // oldest. + for j := len(current.L0SublevelFiles) - 1; j >= 0; j-- { + iter := current.L0SublevelFiles[j].Iter() + if !containsAnyRangeKeys(iter) { + continue + } + + li := i.rangeKey.iterConfig.NewLevelIter() + li.Init( + i.ctx, + i.opts.SpanIterOptions(), + i.cmp, + i.newIterRangeKey, + iter.Filter(manifest.KeyTypeRange), + manifest.L0Sublevel(j), + manifest.KeyTypeRange, + ) + i.rangeKey.iterConfig.AddLevel(li) } } - } - current := i.version - if current == nil { - current = i.readState.current - } - // Next are the file levels: L0 sub-levels followed by lower levels. - // - // Add file-specific iterators for L0 files containing range keys. This is less - // efficient than using levelIters for sublevels of L0 files containing - // range keys, but range keys are expected to be sparse anyway, reducing the - // cost benefit of maintaining a separate L0Sublevels instance for range key - // files and then using it here. - // - // NB: We iterate L0's files in reverse order. They're sorted by - // LargestSeqNum ascending, and we need to add them to the merging iterator - // in LargestSeqNum descending to preserve the merging iterator's invariants - // around Key Trailer order. 
- iter := current.RangeKeyLevels[0].Iter() - for f := iter.Last(); f != nil; f = iter.Prev() { - spanIter, err := i.newIterRangeKey(f, i.opts.SpanIterOptions()) - if err != nil { - i.rangeKey.iterConfig.AddLevel(&errorKeyspanIter{err: err}) - continue + // Add level iterators for the non-empty non-L0 levels. + for level := 1; level < len(current.RangeKeyLevels); level++ { + if current.RangeKeyLevels[level].Empty() { + continue + } + li := i.rangeKey.iterConfig.NewLevelIter() + spanIterOpts := i.opts.SpanIterOptions() + li.Init(i.ctx, spanIterOpts, i.cmp, i.newIterRangeKey, current.RangeKeyLevels[level].Iter(), + manifest.Level(level), manifest.KeyTypeRange) + i.rangeKey.iterConfig.AddLevel(li) } - i.rangeKey.iterConfig.AddLevel(spanIter) } +} - // Add level iterators for the non-empty non-L0 levels. - for level := 1; level < len(current.RangeKeyLevels); level++ { - if current.RangeKeyLevels[level].Empty() { - continue +func containsAnyRangeKeys(iter manifest.LevelIterator) bool { + for f := iter.First(); f != nil; f = iter.Next() { + if f.HasRangeKeys { + return true } - li := i.rangeKey.iterConfig.NewLevelIter() - spanIterOpts := i.opts.SpanIterOptions() - li.Init(spanIterOpts, i.cmp, i.newIterRangeKey, current.RangeKeyLevels[level].Iter(), - manifest.Level(level), manifest.KeyTypeRange) - i.rangeKey.iterConfig.AddLevel(li) } + return false } // Range key masking @@ -187,9 +225,10 @@ func (i *Iterator) constructRangeKeyIter() { // result is ignored, and the block is read. type rangeKeyMasking struct { - cmp base.Compare - split base.Split - filter BlockPropertyFilterMask + cmp base.Compare + suffixCmp base.CompareRangeSuffixes + split base.Split + filter BlockPropertyFilterMask // maskActiveSuffix holds the suffix of a range key currently acting as a // mask, hiding point keys with suffixes greater than it. maskActiveSuffix // is only ever non-nil if IterOptions.RangeKeyMasking.Suffix is non-nil. 
@@ -205,9 +244,10 @@ type rangeKeyMasking struct { parent *Iterator } -func (m *rangeKeyMasking) init(parent *Iterator, cmp base.Compare, split base.Split) { - m.cmp = cmp - m.split = split +func (m *rangeKeyMasking) init(parent *Iterator, c *base.Comparer) { + m.cmp = c.Compare + m.suffixCmp = c.CompareRangeSuffixes + m.split = c.Split if parent.opts.RangeKeyMasking.Filter != nil { m.filter = parent.opts.RangeKeyMasking.Filter() } @@ -232,10 +272,10 @@ func (m *rangeKeyMasking) SpanChanged(s *keyspan.Span) { if s.Keys[j].Suffix == nil { continue } - if m.cmp(s.Keys[j].Suffix, m.parent.opts.RangeKeyMasking.Suffix) < 0 { + if m.suffixCmp(s.Keys[j].Suffix, m.parent.opts.RangeKeyMasking.Suffix) < 0 { continue } - if len(m.maskActiveSuffix) == 0 || m.cmp(m.maskActiveSuffix, s.Keys[j].Suffix) > 0 { + if len(m.maskActiveSuffix) == 0 || m.suffixCmp(m.maskActiveSuffix, s.Keys[j].Suffix) > 0 { m.maskSpan = s m.maskActiveSuffix = append(m.maskActiveSuffix[:0], s.Keys[j].Suffix...) } @@ -314,7 +354,7 @@ func (m *rangeKeyMasking) SkipPoint(userKey []byte) bool { // the InterleavingIter). Skip the point key if the range key's suffix is // greater than the point key's suffix. pointSuffix := userKey[m.split(userKey):] - if len(pointSuffix) > 0 && m.cmp(m.maskActiveSuffix, pointSuffix) < 0 { + if len(pointSuffix) > 0 && m.suffixCmp(m.maskActiveSuffix, pointSuffix) < 0 { m.parent.stats.RangeKeyStats.SkippedPoints++ return true } @@ -367,6 +407,14 @@ func (m *rangeKeyMasking) Intersects(prop []byte) (bool, error) { return m.filter.Intersects(prop) } +func (m *rangeKeyMasking) SyntheticSuffixIntersects(prop []byte, suffix []byte) (bool, error) { + if m.maskSpan == nil { + // No span is actively masking. + return true, nil + } + return m.filter.SyntheticSuffixIntersects(prop, suffix) +} + // KeyIsWithinLowerBound implements the limitedBlockPropertyFilter interface // defined in the sstable package. 
It's used to restrict the masking block // property filter to only applying within the bounds of the active range key. @@ -447,8 +495,8 @@ var _ internalIterator = (*lazyCombinedIter)(nil) // operations that land in the middle of a range key and must truncate to the // user-provided seek key. func (i *lazyCombinedIter) initCombinedIteration( - dir int8, pointKey *InternalKey, pointValue base.LazyValue, seekKey []byte, -) (*InternalKey, base.LazyValue) { + dir int8, pointKV *base.InternalKV, seekKey []byte, +) *base.InternalKV { // Invariant: i.parent.rangeKey is nil. // Invariant: !i.combinedIterState.initialized. if invariants.Enabled { @@ -496,11 +544,11 @@ func (i *lazyCombinedIter) initCombinedIteration( // key instead to `bar`. It is guaranteed that no range key exists // earlier than `bar`, otherwise a levelIter would've observed it and // set `combinedIterState.key` to its start key. - if pointKey != nil { - if dir == +1 && i.parent.cmp(i.combinedIterState.key, pointKey.UserKey) > 0 { - seekKey = pointKey.UserKey - } else if dir == -1 && i.parent.cmp(seekKey, pointKey.UserKey) < 0 { - seekKey = pointKey.UserKey + if pointKV != nil { + if dir == +1 && i.parent.cmp(i.combinedIterState.key, pointKV.K.UserKey) > 0 { + seekKey = pointKV.K.UserKey + } else if dir == -1 && i.parent.cmp(seekKey, pointKV.K.UserKey) < 0 { + seekKey = pointKV.K.UserKey } } } @@ -510,7 +558,6 @@ func (i *lazyCombinedIter) initCombinedIteration( // the range key iterator stack. It must not exist, otherwise we'd already // be performing combined iteration. i.parent.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState) - i.parent.rangeKey.init(i.parent.comparer.Compare, i.parent.comparer.Split, &i.parent.opts) i.parent.constructRangeKeyIter() // Initialize the Iterator's interleaving iterator. 
@@ -538,7 +585,7 @@ func (i *lazyCombinedIter) initCombinedIteration( // // In the forward direction (invert for backwards), the seek key is a key // guaranteed to find the smallest range key that's greater than the last - // key the iterator returned. The range key may be less than pointKey, in + // key the iterator returned. The range key may be less than pointKV, in // which case the range key will be interleaved next instead of the point // key. if dir == +1 { @@ -546,103 +593,99 @@ func (i *lazyCombinedIter) initCombinedIteration( if i.parent.hasPrefix { prefix = i.parent.prefixOrFullSeekKey } - return i.parent.rangeKey.iiter.InitSeekGE(prefix, seekKey, pointKey, pointValue) + return i.parent.rangeKey.iiter.InitSeekGE(prefix, seekKey, pointKV) } - return i.parent.rangeKey.iiter.InitSeekLT(seekKey, pointKey, pointValue) + return i.parent.rangeKey.iiter.InitSeekLT(seekKey, pointKV) } -func (i *lazyCombinedIter) SeekGE( - key []byte, flags base.SeekGEFlags, -) (*InternalKey, base.LazyValue) { +func (i *lazyCombinedIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { if i.combinedIterState.initialized { return i.parent.rangeKey.iiter.SeekGE(key, flags) } - k, v := i.pointIter.SeekGE(key, flags) + kv := i.pointIter.SeekGE(key, flags) if i.combinedIterState.triggered { - return i.initCombinedIteration(+1, k, v, key) + return i.initCombinedIteration(+1, kv, key) } - return k, v + return kv } func (i *lazyCombinedIter) SeekPrefixGE( prefix, key []byte, flags base.SeekGEFlags, -) (*InternalKey, base.LazyValue) { +) *base.InternalKV { if i.combinedIterState.initialized { return i.parent.rangeKey.iiter.SeekPrefixGE(prefix, key, flags) } - k, v := i.pointIter.SeekPrefixGE(prefix, key, flags) + kv := i.pointIter.SeekPrefixGE(prefix, key, flags) if i.combinedIterState.triggered { - return i.initCombinedIteration(+1, k, v, key) + return i.initCombinedIteration(+1, kv, key) } - return k, v + return kv } -func (i *lazyCombinedIter) SeekLT( - key []byte, flags 
base.SeekLTFlags, -) (*InternalKey, base.LazyValue) { +func (i *lazyCombinedIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { if i.combinedIterState.initialized { return i.parent.rangeKey.iiter.SeekLT(key, flags) } - k, v := i.pointIter.SeekLT(key, flags) + kv := i.pointIter.SeekLT(key, flags) if i.combinedIterState.triggered { - return i.initCombinedIteration(-1, k, v, key) + return i.initCombinedIteration(-1, kv, key) } - return k, v + return kv } -func (i *lazyCombinedIter) First() (*InternalKey, base.LazyValue) { +func (i *lazyCombinedIter) First() *base.InternalKV { if i.combinedIterState.initialized { return i.parent.rangeKey.iiter.First() } - k, v := i.pointIter.First() + kv := i.pointIter.First() if i.combinedIterState.triggered { - return i.initCombinedIteration(+1, k, v, nil) + return i.initCombinedIteration(+1, kv, nil) } - return k, v + return kv } -func (i *lazyCombinedIter) Last() (*InternalKey, base.LazyValue) { +func (i *lazyCombinedIter) Last() *base.InternalKV { if i.combinedIterState.initialized { return i.parent.rangeKey.iiter.Last() } - k, v := i.pointIter.Last() + kv := i.pointIter.Last() if i.combinedIterState.triggered { - return i.initCombinedIteration(-1, k, v, nil) + return i.initCombinedIteration(-1, kv, nil) } - return k, v + return kv } -func (i *lazyCombinedIter) Next() (*InternalKey, base.LazyValue) { +func (i *lazyCombinedIter) Next() *base.InternalKV { if i.combinedIterState.initialized { return i.parent.rangeKey.iiter.Next() } - k, v := i.pointIter.Next() + kv := i.pointIter.Next() if i.combinedIterState.triggered { - return i.initCombinedIteration(+1, k, v, nil) + return i.initCombinedIteration(+1, kv, nil) } - return k, v + return kv } -func (i *lazyCombinedIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { +func (i *lazyCombinedIter) NextPrefix(succKey []byte) *base.InternalKV { if i.combinedIterState.initialized { return i.parent.rangeKey.iiter.NextPrefix(succKey) } - k, v := 
i.pointIter.NextPrefix(succKey) + kv := i.pointIter.NextPrefix(succKey) if i.combinedIterState.triggered { - return i.initCombinedIteration(+1, k, v, nil) + return i.initCombinedIteration(+1, kv, nil) } - return k, v + return kv } -func (i *lazyCombinedIter) Prev() (*InternalKey, base.LazyValue) { +func (i *lazyCombinedIter) Prev() *base.InternalKV { if i.combinedIterState.initialized { return i.parent.rangeKey.iiter.Prev() } - k, v := i.pointIter.Prev() + kv := i.pointIter.Prev() if i.combinedIterState.triggered { - return i.initCombinedIteration(-1, k, v, nil) + return i.initCombinedIteration(-1, kv, nil) } - return k, v + return kv } func (i *lazyCombinedIter) Error() error { @@ -667,6 +710,24 @@ func (i *lazyCombinedIter) SetBounds(lower, upper []byte) { i.pointIter.SetBounds(lower, upper) } +func (i *lazyCombinedIter) SetContext(ctx context.Context) { + if i.combinedIterState.initialized { + i.parent.rangeKey.iiter.SetContext(ctx) + return + } + i.pointIter.SetContext(ctx) +} + +// DebugTree is part of the InternalIterator interface. 
+func (i *lazyCombinedIter) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", i, i) + if i.combinedIterState.initialized { + i.parent.rangeKey.iiter.DebugTree(n) + } else { + i.pointIter.DebugTree(n) + } +} + func (i *lazyCombinedIter) String() string { if i.combinedIterState.initialized { return i.parent.rangeKey.iiter.String() diff --git a/vendor/github.com/cockroachdb/pebble/rangekey/rangekey.go b/vendor/github.com/cockroachdb/pebble/v2/rangekey/rangekey.go similarity index 72% rename from vendor/github.com/cockroachdb/pebble/rangekey/rangekey.go rename to vendor/github.com/cockroachdb/pebble/v2/rangekey/rangekey.go index 93e7fbe..8717b41 100644 --- a/vendor/github.com/cockroachdb/pebble/rangekey/rangekey.go +++ b/vendor/github.com/cockroachdb/pebble/v2/rangekey/rangekey.go @@ -6,9 +6,9 @@ package rangekey import ( - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/rangekey" - "github.com/cockroachdb/pebble/sstable" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/rangekey" + "github.com/cockroachdb/pebble/v2/sstable" ) // Fragmenter exports the keyspan.Fragmenter type. @@ -26,8 +26,8 @@ func IsRangeKey(ik sstable.InternalKey) bool { return rangekey.IsRangeKey(ik.Kind()) } -// Decode decodes an InternalKey into a keyspan.Span, if it is a range key. If +// Decode decodes an InternalKey into a Span, if it is a range key. If // keysDst is provided, keys will be appended to keysDst to reduce allocations. 
-func Decode(ik sstable.InternalKey, val []byte, keysDst []keyspan.Key) (Span, error) { +func Decode(ik sstable.InternalKey, val []byte, keysDst []Key) (Span, error) { return rangekey.Decode(ik, val, keysDst) } diff --git a/vendor/github.com/cockroachdb/pebble/read_compaction_queue.go b/vendor/github.com/cockroachdb/pebble/v2/read_compaction_queue.go similarity index 97% rename from vendor/github.com/cockroachdb/pebble/read_compaction_queue.go rename to vendor/github.com/cockroachdb/pebble/v2/read_compaction_queue.go index 450b7e9..7ceb6c8 100644 --- a/vendor/github.com/cockroachdb/pebble/read_compaction_queue.go +++ b/vendor/github.com/cockroachdb/pebble/v2/read_compaction_queue.go @@ -1,6 +1,6 @@ package pebble -import "github.com/cockroachdb/pebble/internal/base" +import "github.com/cockroachdb/pebble/v2/internal/base" // The maximum number of elements in the readCompactions queue. // We want to limit the number of elements so that we only do diff --git a/vendor/github.com/cockroachdb/pebble/read_state.go b/vendor/github.com/cockroachdb/pebble/v2/read_state.go similarity index 92% rename from vendor/github.com/cockroachdb/pebble/read_state.go rename to vendor/github.com/cockroachdb/pebble/v2/read_state.go index d3a78ba..c4ebff1 100644 --- a/vendor/github.com/cockroachdb/pebble/read_state.go +++ b/vendor/github.com/cockroachdb/pebble/v2/read_state.go @@ -4,7 +4,11 @@ package pebble -import "sync/atomic" +import ( + "sync/atomic" + + "github.com/cockroachdb/pebble/v2/internal/manifest" +) // readState encapsulates the state needed for reading (the current version and // list of memtables). Loading the readState is done without grabbing @@ -21,7 +25,7 @@ import "sync/atomic" type readState struct { db *DB refcnt atomic.Int32 - current *version + current *manifest.Version memtables flushableList } @@ -44,8 +48,8 @@ func (s *readState) unref() { } // The last reference to the readState was released. Check to see if there - // are new obsolete tables to delete. 
- s.db.maybeScheduleObsoleteTableDeletion() + // are new obsolete objects to delete. + s.db.maybeScheduleObsoleteObjectDeletion() } // unrefLocked removes a reference to the readState. If this was the last @@ -62,8 +66,8 @@ func (s *readState) unrefLocked() { mem.readerUnrefLocked(true) } - // In this code path, the caller is responsible for scheduling obsolete table - // deletion as necessary. + // In this code path, the caller is responsible for scheduling obsolete + // object deletions as necessary. } // loadReadState returns the current readState. The returned readState must be diff --git a/vendor/github.com/cockroachdb/pebble/record/log_writer.go b/vendor/github.com/cockroachdb/pebble/v2/record/log_writer.go similarity index 61% rename from vendor/github.com/cockroachdb/pebble/record/log_writer.go rename to vendor/github.com/cockroachdb/pebble/v2/record/log_writer.go index 4aa5d2d..a6380e3 100644 --- a/vendor/github.com/cockroachdb/pebble/record/log_writer.go +++ b/vendor/github.com/cockroachdb/pebble/v2/record/log_writer.go @@ -13,9 +13,10 @@ import ( "sync/atomic" "time" + "github.com/cockroachdb/crlib/crtime" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/crc" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/crc" "github.com/prometheus/client_golang/prometheus" ) @@ -84,11 +85,12 @@ type syncQueue struct { const dequeueBits = 32 +// unpack extracts the head and tail indices from a 64-bit unsigned integer. 
func (q *syncQueue) unpack(ptrs uint64) (head, tail uint32) { const mask = 1<> dequeueBits) & mask) tail = uint32(ptrs & mask) - return + return head, tail } func (q *syncQueue) push(wg *sync.WaitGroup, err *error) { @@ -150,12 +152,14 @@ func (q *syncQueue) pop(head, tail uint32, err error, queueSemChan chan struct{} *slot.err = err slot.wg = nil slot.err = nil - // We need to bump the tail count before signalling the wait group as - // signalling the wait group can trigger release a blocked goroutine which - // will try to enqueue before we've "freed" space in the queue. + // We need to bump the tail count before releasing the queueSemChan + // semaphore as releasing the semaphore can cause a blocked goroutine to + // acquire the semaphore and enqueue before we've "freed" space in the + // queue. q.headTail.Add(1) wg.Done() - // Is always non-nil in production. + // Is always non-nil in production, unless using wal package for WAL + // failover. if queueSemChan != nil { <-queueSemChan } @@ -164,17 +168,180 @@ func (q *syncQueue) pop(head, tail uint32, err error, queueSemChan chan struct{} return nil } +// pendingSyncs abstracts out the handling of pending sync requests. In +// standalone mode the implementation is a thin wrapper around syncQueue. In +// the mode where the LogWriter can be subject to failover, there is no queue +// kept in the LogWriter and the signaling to those waiting for sync is +// handled in the wal package. +// +// To avoid heap allocations due to the use of this interface, the parameters +// and return values follow some strict rules: +// - The PendingSync parameter can be reused by the caller after push returns. +// The implementation should be a pointer backed by a struct that is already +// heap allocated, which the caller can reuse for the next push call. 
+// - The pendingSyncSnapshot return value must be backed by the pendingSyncs +// implementation, so calling snapshotForPop again will cause the previous +// snapshot to be overwritten. +type pendingSyncs interface { + push(PendingSync) + setBlocked() + clearBlocked() + empty() bool + snapshotForPop() pendingSyncsSnapshot + pop(snap pendingSyncsSnapshot, err error) error +} + +type pendingSyncsSnapshot interface { + empty() bool +} + +// PendingSync abstracts the sync specification for a record queued on the +// LogWriter. The only implementations are provided in this package since +// syncRequested is not exported. +type PendingSync interface { + syncRequested() bool +} + +// The implementation of pendingSyncs in standalone mode. +type pendingSyncsWithSyncQueue struct { + syncQueue + syncQueueLen *base.GaugeSampleMetric + snapshotBacking syncQueueSnapshot + // See the comment for LogWriterConfig.QueueSemChan. + queueSemChan chan struct{} +} + +var _ pendingSyncs = &pendingSyncsWithSyncQueue{} + +func (q *pendingSyncsWithSyncQueue) push(ps PendingSync) { + ps2 := ps.(*pendingSyncForSyncQueue) + q.syncQueue.push(ps2.wg, ps2.err) +} + +func (q *pendingSyncsWithSyncQueue) snapshotForPop() pendingSyncsSnapshot { + head, tail, realLength := q.syncQueue.load() + q.snapshotBacking = syncQueueSnapshot{ + head: head, + tail: tail, + } + q.syncQueueLen.AddSample(int64(realLength)) + return &q.snapshotBacking +} + +func (q *pendingSyncsWithSyncQueue) pop(snap pendingSyncsSnapshot, err error) error { + s := snap.(*syncQueueSnapshot) + return q.syncQueue.pop(s.head, s.tail, err, q.queueSemChan) +} + +// The implementation of pendingSyncsSnapshot in standalone mode. +type syncQueueSnapshot struct { + head, tail uint32 +} + +func (s *syncQueueSnapshot) empty() bool { + return s.head == s.tail +} + +// The implementation of pendingSync in standalone mode. 
+type pendingSyncForSyncQueue struct { + wg *sync.WaitGroup + err *error +} + +func (ps *pendingSyncForSyncQueue) syncRequested() bool { + return ps.wg != nil +} + +// The implementation of pendingSyncs in failover mode. +type pendingSyncsWithHighestSyncIndex struct { + // The highest "index" queued that is requesting a sync. Initialized + // to NoSyncIndex, and reset to NoSyncIndex after the sync. + index atomic.Int64 + snapshotBacking PendingSyncIndex + // blocked is an atomic boolean which indicates whether syncing is currently + // blocked or can proceed. It is used by the implementation of + // min-sync-interval to block syncing until the min interval has passed. + blocked atomic.Bool + externalSyncQueueCallback ExternalSyncQueueCallback +} + +// NoSyncIndex is the value of PendingSyncIndex when a sync is not requested. +const NoSyncIndex = -1 + +func (si *pendingSyncsWithHighestSyncIndex) init( + externalSyncQueueCallback ExternalSyncQueueCallback, +) { + si.index.Store(NoSyncIndex) + si.externalSyncQueueCallback = externalSyncQueueCallback +} + +func (si *pendingSyncsWithHighestSyncIndex) push(ps PendingSync) { + ps2 := ps.(*PendingSyncIndex) + si.index.Store(ps2.Index) +} + +func (si *pendingSyncsWithHighestSyncIndex) setBlocked() { + si.blocked.Store(true) +} + +func (si *pendingSyncsWithHighestSyncIndex) clearBlocked() { + si.blocked.Store(false) +} + +func (si *pendingSyncsWithHighestSyncIndex) empty() bool { + return si.load() == NoSyncIndex +} + +func (si *pendingSyncsWithHighestSyncIndex) snapshotForPop() pendingSyncsSnapshot { + si.snapshotBacking = PendingSyncIndex{Index: si.load()} + return &si.snapshotBacking +} + +func (si *pendingSyncsWithHighestSyncIndex) load() int64 { + index := si.index.Load() + if index != NoSyncIndex && si.blocked.Load() { + index = NoSyncIndex + } + return index +} + +func (si *pendingSyncsWithHighestSyncIndex) pop(snap pendingSyncsSnapshot, err error) error { + index := snap.(*PendingSyncIndex) + if index.Index == 
NoSyncIndex { + return nil + } + // Set to NoSyncIndex if a higher index has not queued. + si.index.CompareAndSwap(index.Index, NoSyncIndex) + si.externalSyncQueueCallback(*index, err) + return nil +} + +// PendingSyncIndex implements both pendingSyncsSnapshot and PendingSync. +type PendingSyncIndex struct { + // Index is some state meaningful to the user of LogWriter. The LogWriter + // itself only examines whether Index is equal to NoSyncIndex. + Index int64 +} + +func (s *PendingSyncIndex) empty() bool { + return s.Index == NoSyncIndex +} + +func (s *PendingSyncIndex) syncRequested() bool { + return s.Index != NoSyncIndex +} + // flusherCond is a specialized condition variable that allows its condition to // change and readiness be signalled without holding its associated mutex. In // particular, when a waiter is added to syncQueue atomically, this condition // variable can be signalled without holding flusher.Mutex. type flusherCond struct { mu *sync.Mutex - q *syncQueue + q pendingSyncs cond sync.Cond } -func (c *flusherCond) init(mu *sync.Mutex, q *syncQueue) { +func (c *flusherCond) init(mu *sync.Mutex, q pendingSyncs) { c.mu = mu c.q = q // Yes, this is a bit circular, but that is intentional. flusherCond.cond.L @@ -259,8 +426,12 @@ type LogWriter struct { logNum uint32 // blockNum is the zero based block number for the current block. blockNum int64 - // err is any accumulated error. TODO(peter): This needs to be protected in - // some fashion. Perhaps using atomic.Value. + // err is any accumulated error. It originates in flusher.err, and is + // updated to reflect flusher.err when a block is full and getting enqueued. + // Therefore, there is a lag between when flusher.err has a non-nil error, + // and when that non-nil error is reflected in LogWriter.err. On close, it + // is set to errClosedWriter to inform accidental future calls to + // SyncRecord*. err error // block is the current block being written. Protected by flusher.Mutex. 
block *block @@ -286,8 +457,10 @@ type LogWriter struct { minSyncInterval durationFunc fsyncLatency prometheus.Histogram pending []*block - syncQ syncQueue - metrics *LogWriterMetrics + // Pushing and popping from pendingSyncs does not require flusher mutex to + // be held. + pendingSyncs pendingSyncs + metrics *LogWriterMetrics } // afterFunc is a hook to allow tests to mock out the timer functionality @@ -295,8 +468,22 @@ type LogWriter struct { // time.AfterFunc. afterFunc func(d time.Duration, f func()) syncTimer - // See the comment for LogWriterConfig.QueueSemChan. - queueSemChan chan struct{} + // Backing for both pendingSyncs implementations. + pendingSyncsBackingQ pendingSyncsWithSyncQueue + pendingSyncsBackingIndex pendingSyncsWithHighestSyncIndex + + pendingSyncForSyncQueueBacking pendingSyncForSyncQueue + + // syncedOffset is the offset in the log that is durably synced after a + // flush. This member is used to write the WAL Sync chunk format's "Offset" + // field in the header. + syncedOffset atomic.Uint64 + + // emitFragment is set at runtime depending on which FormatMajorVersion + // is used. emitFragment will be set to writing WAL Sync chunk formats + // if the FormatMajorVersion is greater than or equal to FormatWALSyncChunks, + // otherwise it will write the recyclable chunk format. + emitFragment func(n int, p []byte) (remainingP []byte) } // LogWriterConfig is a struct used for configuring new LogWriters @@ -308,8 +495,30 @@ type LogWriterConfig struct { // the syncQueue from overflowing (which will cause a panic). All production // code ensures this is non-nil. QueueSemChan chan struct{} + + // ExternalSyncQueueCallback is set to non-nil when the LogWriter is used + // as part of a WAL implementation that can failover between LogWriters. + // + // In this case, QueueSemChan is always nil, and SyncRecordGeneralized must + // be used with a PendingSync parameter that is implemented by + // PendingSyncIndex. 
When an index is synced (which implies all earlier + // indices are also synced), this callback is invoked. The caller must not + // hold any mutex when invoking this callback, since the lock ordering + // requirement in this case is that any higher layer locks (in the wal + // package) precede the lower layer locks (in the record package). These + // callbacks are serialized since they are invoked from the flushLoop. + ExternalSyncQueueCallback ExternalSyncQueueCallback + + // WriteWALSyncOffsets determines whether to write WAL sync chunk offsets. + // The format major version can change (ratchet) at runtime, so this must be + // a function rather than a static bool to ensure we use the latest format version. + WriteWALSyncOffsets func() bool } +// ExternalSyncQueueCallback is to be run when a PendingSync has been +// processed, either successfully or with an error. +type ExternalSyncQueueCallback func(doneSync PendingSyncIndex, err error) + // initialAllocatedBlocksCap is the initial capacity of the various slices // intended to hold LogWriter blocks. The LogWriter may allocate more blocks // than this threshold allows. @@ -323,7 +532,12 @@ var blockPool = sync.Pool{ } // NewLogWriter returns a new LogWriter. -func NewLogWriter(w io.Writer, logNum base.FileNum, logWriterConfig LogWriterConfig) *LogWriter { +// +// The io.Writer may also be used as an io.Closer and syncer. No other methods +// will be called on the writer. 
+func NewLogWriter( + w io.Writer, logNum base.DiskFileNum, logWriterConfig LogWriterConfig, +) *LogWriter { c, _ := w.(io.Closer) s, _ := w.(syncer) r := &LogWriter{ @@ -338,14 +552,32 @@ func NewLogWriter(w io.Writer, logNum base.FileNum, logWriterConfig LogWriterCon afterFunc: func(d time.Duration, f func()) syncTimer { return time.AfterFunc(d, f) }, - queueSemChan: logWriterConfig.QueueSemChan, } + + if logWriterConfig.WriteWALSyncOffsets() { + r.emitFragment = r.emitFragmentSyncOffsets + } else { + r.emitFragment = r.emitFragmentRecyclable + } + + m := &LogWriterMetrics{} + if logWriterConfig.ExternalSyncQueueCallback != nil { + r.pendingSyncsBackingIndex.init(logWriterConfig.ExternalSyncQueueCallback) + r.flusher.pendingSyncs = &r.pendingSyncsBackingIndex + } else { + r.pendingSyncsBackingQ = pendingSyncsWithSyncQueue{ + syncQueueLen: &m.SyncQueueLen, + queueSemChan: logWriterConfig.QueueSemChan, + } + r.flusher.pendingSyncs = &r.pendingSyncsBackingQ + } + r.free.blocks = make([]*block, 0, initialAllocatedBlocksCap) r.block = blockPool.Get().(*block) - r.flusher.ready.init(&r.flusher.Mutex, &r.flusher.syncQ) + r.flusher.ready.init(&r.flusher.Mutex, r.flusher.pendingSyncs) r.flusher.closed = make(chan struct{}) r.flusher.pending = make([]*block, 0, cap(r.free.blocks)) - r.flusher.metrics = &LogWriterMetrics{} + r.flusher.metrics = m f := &r.flusher f.minSyncInterval = logWriterConfig.WALMinSyncInterval @@ -362,12 +594,12 @@ func (w *LogWriter) flushLoop(context.Context) { f.Lock() // Initialize idleStartTime to when the loop starts. - idleStartTime := time.Now() + idleStartTime := crtime.NowMono() var syncTimer syncTimer defer func() { // Capture the idle duration between the last piece of work and when the // loop terminated. 
- f.metrics.WriteThroughput.IdleDuration += time.Since(idleStartTime) + f.metrics.WriteThroughput.IdleDuration += idleStartTime.Elapsed() if syncTimer != nil { syncTimer.Stop() } @@ -375,6 +607,11 @@ func (w *LogWriter) flushLoop(context.Context) { f.Unlock() }() + // writtenOffset is the amount of data that has been written + // but not necessarily synced. This is used to update logWriter's + // syncedOffset after a sync. + var writtenOffset uint64 = 0 + // The flush loop performs flushing of full and partial data blocks to the // underlying writer (LogWriter.w), syncing of the writer, and notification // to sync requests that they have completed. @@ -421,14 +658,14 @@ func (w *LogWriter) flushLoop(context.Context) { // the current block can be added to the pending blocks list after we release // the flusher lock, but it won't be part of pending. written := w.block.written.Load() - if len(f.pending) > 0 || written > w.block.flushed || !f.syncQ.empty() { + if len(f.pending) > 0 || written > w.block.flushed || !f.pendingSyncs.empty() { break } if f.close { // If the writer is closed, pretend the sync timer fired immediately so // that we can process any queued sync requests. - f.syncQ.clearBlocked() - if !f.syncQ.empty() { + f.pendingSyncs.clearBlocked() + if !f.pendingSyncs.empty() { break } return @@ -437,7 +674,19 @@ func (w *LogWriter) flushLoop(context.Context) { continue } // Found work to do, so no longer idle. - workStartTime := time.Now() + // + // NB: it is safe to read pending before loading from the syncQ since + // mutations to pending require the w.flusher mutex, which is held here. + // There is no risk that someone will concurrently add to pending, so the + // following sequence, which would pick up a syncQ entry without the + // corresponding data, is impossible: + // + // Thread enqueueing This thread + // 1. read pending + // 2. add block to pending + // 3. add to syncQ + // 4. 
read syncQ + workStartTime := crtime.NowMono() idleDuration := workStartTime.Sub(idleStartTime) pending = append(pending[:0], f.pending...) f.pending = f.pending[:0] @@ -446,8 +695,7 @@ func (w *LogWriter) flushLoop(context.Context) { // Grab the list of sync waiters. Note that syncQueue.load() will return // 0,0 while we're waiting for the min-sync-interval to expire. This // allows flushing to proceed even if we're not ready to sync. - head, tail, realSyncQLen := f.syncQ.load() - f.metrics.SyncQueueLen.AddSample(int64(realSyncQLen)) + snap := f.pendingSyncs.snapshotForPop() // Grab the portion of the current block that requires flushing. Note that // the current block can be added to the pending blocks list after we @@ -459,29 +707,35 @@ func (w *LogWriter) flushLoop(context.Context) { data := w.block.buf[w.block.flushed:written] w.block.flushed = written + fErr := f.err + f.Unlock() // If flusher has an error, we propagate it to waiters. Note in spite of // error we consume the pending list above to free blocks for writers. - if f.err != nil { - f.syncQ.pop(head, tail, f.err, w.queueSemChan) + if fErr != nil { + // NB: pop may invoke ExternalSyncQueueCallback, which is why we have + // called f.Unlock() above. We will acquire the lock again below. + _ = f.pendingSyncs.pop(snap, fErr) // Update the idleStartTime if work could not be done, so that we don't // include the duration we tried to do work as idle. We don't bother // with the rest of the accounting, which means we will undercount. 
- idleStartTime = time.Now() + idleStartTime = crtime.NowMono() + f.Lock() continue } - f.Unlock() - synced, syncLatency, bytesWritten, err := w.flushPending(data, pending, head, tail) + writtenOffset += uint64(len(data)) + synced, syncLatency, bytesWritten, err := w.flushPending(data, pending, snap) f.Lock() if synced && f.fsyncLatency != nil { + w.syncedOffset.Store(writtenOffset) f.fsyncLatency.Observe(float64(syncLatency)) } f.err = err if f.err != nil { - f.syncQ.clearBlocked() + f.pendingSyncs.clearBlocked() // Update the idleStartTime if work could not be done, so that we don't // include the duration we tried to do work as idle. We don't bother // with the rest of the accounting, which means we will undercount. - idleStartTime = time.Now() + idleStartTime = crtime.NowMono() continue } @@ -489,10 +743,10 @@ func (w *LogWriter) flushLoop(context.Context) { // A sync was performed. Make sure we've waited for the min sync // interval before syncing again. if min := f.minSyncInterval(); min > 0 { - f.syncQ.setBlocked() + f.pendingSyncs.setBlocked() if syncTimer == nil { syncTimer = w.afterFunc(min, func() { - f.syncQ.clearBlocked() + f.pendingSyncs.clearBlocked() f.ready.Signal() }) } else { @@ -501,7 +755,7 @@ func (w *LogWriter) flushLoop(context.Context) { } } // Finished work, and started idling. - idleStartTime = time.Now() + idleStartTime = crtime.NowMono() workDuration := idleStartTime.Sub(workStartTime) f.metrics.WriteThroughput.Bytes += bytesWritten f.metrics.WriteThroughput.WorkDuration += workDuration @@ -510,7 +764,7 @@ func (w *LogWriter) flushLoop(context.Context) { } func (w *LogWriter) flushPending( - data []byte, pending []*block, head, tail uint32, + data []byte, pending []*block, snap pendingSyncsSnapshot, ) (synced bool, syncLatency time.Duration, bytesWritten int64, err error) { defer func() { // Translate panics into errors. 
The errors will cause flushLoop to shut @@ -533,14 +787,16 @@ func (w *LogWriter) flushPending( _, err = w.w.Write(data) } - synced = head != tail + synced = !snap.empty() if synced { if err == nil && w.s != nil { syncLatency, err = w.syncWithLatency() + } else { + synced = false } f := &w.flusher - if popErr := f.syncQ.pop(head, tail, err, w.queueSemChan); popErr != nil { - return synced, syncLatency, bytesWritten, popErr + if popErr := f.pendingSyncs.pop(snap, err); popErr != nil { + return synced, syncLatency, bytesWritten, firstError(err, popErr) } } @@ -548,9 +804,9 @@ func (w *LogWriter) flushPending( } func (w *LogWriter) syncWithLatency() (time.Duration, error) { - start := time.Now() + start := crtime.NowMono() err := w.s.Sync() - syncLatency := time.Since(start) + syncLatency := start.Elapsed() return syncLatency, err } @@ -593,6 +849,16 @@ func (w *LogWriter) queueBlock() { // Close flushes and syncs any unwritten data and closes the writer. // Where required, external synchronisation is provided by commitPipeline.mu. func (w *LogWriter) Close() error { + return w.closeInternal(PendingSyncIndex{Index: NoSyncIndex}) +} + +// CloseWithLastQueuedRecord is like Close, but optionally accepts a +// lastQueuedRecord, that the caller will be notified about when synced. +func (w *LogWriter) CloseWithLastQueuedRecord(lastQueuedRecord PendingSyncIndex) error { + return w.closeInternal(lastQueuedRecord) +} + +func (w *LogWriter) closeInternal(lastQueuedRecord PendingSyncIndex) error { f := &w.flusher // Emit an EOF trailer signifying the end of this log. 
This helps readers @@ -619,18 +885,21 @@ func (w *LogWriter) Close() error { syncLatency, err = w.syncWithLatency() } f.Lock() - if f.fsyncLatency != nil { + if err == nil && f.fsyncLatency != nil { f.fsyncLatency.Observe(float64(syncLatency)) } free := w.free.blocks f.Unlock() + // NB: the caller of closeInternal may not care about a non-nil cerr below + // if all queued writes have been successfully written and synced. + if lastQueuedRecord.Index != NoSyncIndex { + w.pendingSyncsBackingIndex.externalSyncQueueCallback(lastQueuedRecord, err) + } if w.c != nil { cerr := w.c.Close() w.c = nil - if cerr != nil { - return cerr - } + err = firstError(err, cerr) } for _, b := range free { @@ -643,6 +912,15 @@ func (w *LogWriter) Close() error { return err } +// firstError returns the first non-nil error of err0 and err1, or nil if both +// are nil. +func firstError(err0, err1 error) error { + if err0 != nil { + return err0 + } + return err1 +} + // WriteRecord writes a complete record. Returns the offset just past the end // of the record. // External synchronisation provided by commitPipeline.mu. @@ -659,6 +937,16 @@ func (w *LogWriter) WriteRecord(p []byte) (int64, error) { func (w *LogWriter) SyncRecord( p []byte, wg *sync.WaitGroup, err *error, ) (logSize int64, err2 error) { + w.pendingSyncForSyncQueueBacking = pendingSyncForSyncQueue{ + wg: wg, + err: err, + } + return w.SyncRecordGeneralized(p, &w.pendingSyncForSyncQueueBacking) +} + +// SyncRecordGeneralized is a version of SyncRecord that accepts a +// PendingSync. +func (w *LogWriter) SyncRecordGeneralized(p []byte, ps PendingSync) (logSize int64, err2 error) { if w.err != nil { return -1, w.err } @@ -671,14 +959,14 @@ func (w *LogWriter) SyncRecord( p = w.emitFragment(i, p) } - if wg != nil { + if ps.syncRequested() { // If we've been asked to persist the record, add the WaitGroup to the sync // queue and signal the flushLoop. 
Note that flushLoop will write partial // blocks to the file if syncing has been requested. The contract is that // any record written to the LogWriter to this point will be flushed to the // OS and synced to disk. f := &w.flusher - f.syncQ.push(wg, err) + f.pendingSyncs.push(ps) f.ready.Signal() } @@ -696,6 +984,11 @@ func (w *LogWriter) Size() int64 { return w.blockNum*blockSize + int64(w.block.written.Load()) } +// emitEOFTrailer writes a special recyclable chunk header to signal EOF. +// The reason why this function writes the recyclable chunk header instead +// of having a function for writing recyclable and WAL sync chunks as +// emitFragment does it because there is no reason to add 8 additional +// bytes to the EOFTrailer for the SyncedOffset as it will be zeroed out anyway. func (w *LogWriter) emitEOFTrailer() { // Write a recyclable chunk header with a different log number. Readers // will treat the header as EOF when the log number does not match. @@ -703,12 +996,12 @@ func (w *LogWriter) emitEOFTrailer() { i := b.written.Load() binary.LittleEndian.PutUint32(b.buf[i+0:i+4], 0) // CRC binary.LittleEndian.PutUint16(b.buf[i+4:i+6], 0) // Size - b.buf[i+6] = recyclableFullChunkType + b.buf[i+6] = recyclableFullChunkEncoding binary.LittleEndian.PutUint32(b.buf[i+7:i+11], w.logNum+1) // Log number b.written.Store(i + int32(recyclableHeaderSize)) } -func (w *LogWriter) emitFragment(n int, p []byte) (remainingP []byte) { +func (w *LogWriter) emitFragmentRecyclable(n int, p []byte) (remainingP []byte) { b := w.block i := b.written.Load() first := n == 0 @@ -716,15 +1009,15 @@ func (w *LogWriter) emitFragment(n int, p []byte) (remainingP []byte) { if last { if first { - b.buf[i+6] = recyclableFullChunkType + b.buf[i+6] = recyclableFullChunkEncoding } else { - b.buf[i+6] = recyclableLastChunkType + b.buf[i+6] = recyclableLastChunkEncoding } } else { if first { - b.buf[i+6] = recyclableFirstChunkType + b.buf[i+6] = recyclableFirstChunkEncoding } else { - b.buf[i+6] 
= recyclableMiddleChunkType + b.buf[i+6] = recyclableMiddleChunkEncoding } } @@ -739,18 +1032,59 @@ func (w *LogWriter) emitFragment(n int, p []byte) (remainingP []byte) { if blockSize-b.written.Load() < recyclableHeaderSize { // There is no room for another fragment in the block, so fill the // remaining bytes with zeros and queue the block for flushing. - for i := b.written.Load(); i < blockSize; i++ { - b.buf[i] = 0 + clear(b.buf[b.written.Load():]) + w.queueBlock() + } + return p[r:] +} + +func (w *LogWriter) emitFragmentSyncOffsets(n int, p []byte) (remainingP []byte) { + b := w.block + i := b.written.Load() + first := n == 0 + last := blockSize-i-walSyncHeaderSize >= int32(len(p)) + + if last { + if first { + b.buf[i+6] = walSyncFullChunkEncoding + } else { + b.buf[i+6] = walSyncLastChunkEncoding + } + } else { + if first { + b.buf[i+6] = walSyncFirstChunkEncoding + } else { + b.buf[i+6] = walSyncMiddleChunkEncoding } + } + + binary.LittleEndian.PutUint32(b.buf[i+7:i+11], w.logNum) + binary.LittleEndian.PutUint64(b.buf[i+11:i+19], w.syncedOffset.Load()) + + r := copy(b.buf[i+walSyncHeaderSize:], p) + j := i + int32(walSyncHeaderSize+r) + binary.LittleEndian.PutUint32(b.buf[i+0:i+4], crc.New(b.buf[i+6:j]).Value()) + binary.LittleEndian.PutUint16(b.buf[i+4:i+6], uint16(r)) + b.written.Store(j) + + if blockSize-b.written.Load() < walSyncHeaderSize { + // There is no room for another fragment in the block, so fill the + // remaining bytes with zeros and queue the block for flushing. + clear(b.buf[b.written.Load():]) w.queueBlock() } return p[r:] } -// Metrics must be called after Close. The callee will no longer modify the -// returned LogWriterMetrics. -func (w *LogWriter) Metrics() *LogWriterMetrics { - return w.flusher.metrics +// Metrics must typically be called after Close, since the callee will no +// longer modify the returned LogWriterMetrics. 
It is also current if there is +// nothing left to flush in the flush loop, but that is an implementation +// detail that callers should not rely on. +func (w *LogWriter) Metrics() LogWriterMetrics { + w.flusher.Lock() + defer w.flusher.Unlock() + m := *w.flusher.metrics + return m } // LogWriterMetrics contains misc metrics for the log writer. diff --git a/vendor/github.com/cockroachdb/pebble/v2/record/record.go b/vendor/github.com/cockroachdb/pebble/v2/record/record.go new file mode 100644 index 0000000..1addb81 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/record/record.go @@ -0,0 +1,893 @@ +// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package record reads and writes sequences of records. Each record is a stream +// of bytes that completes before the next record starts. +// +// When reading, call Next to obtain an io.Reader for the next record. Next will +// return io.EOF when there are no more records. It is valid to call Next +// without reading the current record to exhaustion. +// +// When writing, call Next to obtain an io.Writer for the next record. Calling +// Next finishes the current record. Call Close to finish the final record. +// +// Optionally, call Flush to finish the current record and flush the underlying +// writer without starting a new record. To start a new record after flushing, +// call Next. +// +// Neither Readers or Writers are safe to use concurrently. 
+// +// Example code: +// +// func read(r io.Reader) ([]string, error) { +// var ss []string +// records := record.NewReader(r) +// for { +// rec, err := records.Next() +// if err == io.EOF { +// break +// } +// s, err := io.ReadAll(rec) +// ss = append(ss, string(s)) +// } +// return ss, nil +// } +// +// func write(w io.Writer, ss []string) error { +// records := record.NewWriter(w) +// for _, s := range ss { +// rec, err := records.Next() +// if err != nil { +// return err +// } +// if _, err := rec.Write([]byte(s)), err != nil { +// return err +// } +// } +// return records.Close() +// } +// +// The wire format is that the stream is divided into 32KiB blocks, and each +// block contains a number of tightly packed chunks. Chunks cannot cross block +// boundaries. The last block may be shorter than 32 KiB. Any unused bytes in a +// block must be zero. +// +// A record maps to one or more chunks. There are two chunk formats: legacy and +// recyclable. The legacy chunk format: +// +// +----------+-----------+-----------+--- ... ---+ +// | CRC (4B) | Size (2B) | Type (1B) | Payload | +// +----------+-----------+-----------+--- ... ---+ +// +// CRC is computed over the type and payload +// Size is the length of the payload in bytes +// Type is the chunk type +// +// There are four chunk types: whether the chunk is the full record, or the +// first, middle or last chunk of a multi-chunk record. A multi-chunk record +// has one first chunk, zero or more middle chunks, and one last chunk. +// +// The recyclable chunk format is similar to the legacy format, but extends +// the chunk header with an additional log number field. This allows reuse +// (recycling) of log files which can provide significantly better performance +// when syncing frequently as it avoids needing to update the file +// metadata. Additionally, recycling log files is a prequisite for using direct +// IO with log writing. 
The recyclable format is: +// +// +----------+-----------+-----------+----------------+--- ... ---+ +// | CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload | +// +----------+-----------+-----------+----------------+--- ... ---+ +// +// Recyclable chunks are distinguished from legacy chunks by the addition of 4 +// extra "recyclable" chunk types that map directly to the legacy chunk types +// (i.e. full, first, middle, last). The CRC is computed over the type, log +// number, and payload. +// +// The WAL sync chunk format allows for detection of data corruption in some +// circumstances. The WAL sync format extends the recyclable header with an +// additional offset field. This allows "reading ahead" to be done in order to +// decipher whether an invalid or zeroed chunk was an artifact of corruption or the +// logical end of the log. SyncOffset is a promise that the log should have been +// synced up until the offset. A promised synced offset is needed because cloud +// providers may write blocks out of order, rendering "read aheads" scanning for +// logNum inaccurate. +// The WAL sync format is: +// +----------+-----------+-----------+----------------+------------------+--- ... ---+ +// | CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Sync Offset (8B) | Payload | +// +----------+-----------+-----------+----------------+------------------+--- ... ---+ +// + +package record + +// The C++ Level-DB code calls this the log, but it has been renamed to record +// to avoid clashing with the standard log package, and because it is generally +// useful outside of logging. The C++ code also uses the term "physical record" +// instead of "chunk", but "chunk" is shorter and less confusing. 
+ +import ( + "encoding/binary" + "io" + "math" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/bitflip" + "github.com/cockroachdb/pebble/v2/internal/crc" +) + +// These constants are part of the wire format and should not be changed. +const ( + invalidChunkEncoding = 0 + + fullChunkEncoding = 1 + firstChunkEncoding = 2 + middleChunkEncoding = 3 + lastChunkEncoding = 4 + + recyclableFullChunkEncoding = 5 + recyclableFirstChunkEncoding = 6 + recyclableMiddleChunkEncoding = 7 + recyclableLastChunkEncoding = 8 + + walSyncFullChunkEncoding = 9 + walSyncFirstChunkEncoding = 10 + walSyncMiddleChunkEncoding = 11 + walSyncLastChunkEncoding = 12 +) + +const ( + blockSize = 32 * 1024 + blockSizeMask = blockSize - 1 + legacyHeaderSize = 7 + recyclableHeaderSize = legacyHeaderSize + 4 + walSyncHeaderSize = recyclableHeaderSize + 8 +) + +// chunkPosition represents the type of a chunk in the log. +// Records can be split into multiple chunks and marked +// by the following position types: +// - invalidChunkPosition: an invalid chunk +// - fullChunkPosition: a complete record stored in a single chunk +// - firstChunkPosition: first chunk of a multi-chunk record +// - middleChunkPosition: intermediate chunk in a multi-chunk record +// - lastChunkPosition: final chunk of a multi-chunk record +type chunkPosition int + +const ( + invalidChunkPosition chunkPosition = iota + fullChunkPosition + firstChunkPosition + middleChunkPosition + lastChunkPosition +) + +// wireFormat specifies the encoding format used for chunks. +// wireFormat is used for backwards compatibility and new +// wire formats may be introduced to support additional +// WAL chunks. +type wireFormat int + +const ( + invalidWireFormat wireFormat = iota + legacyWireFormat + recyclableWireFormat + walSyncWireFormat +) + +// headerFormat represents the format of a chunk which has +// a chunkPosition, wireFormat, and a headerSize. 
+type headerFormat struct { + chunkPosition + wireFormat + headerSize int +} + +// headerFormatMappings translates encodings to headerFormats +var headerFormatMappings = [...]headerFormat{ + invalidChunkEncoding: {chunkPosition: invalidChunkPosition, wireFormat: invalidWireFormat, headerSize: 0}, + fullChunkEncoding: {chunkPosition: fullChunkPosition, wireFormat: legacyWireFormat, headerSize: legacyHeaderSize}, + firstChunkEncoding: {chunkPosition: firstChunkPosition, wireFormat: legacyWireFormat, headerSize: legacyHeaderSize}, + middleChunkEncoding: {chunkPosition: middleChunkPosition, wireFormat: legacyWireFormat, headerSize: legacyHeaderSize}, + lastChunkEncoding: {chunkPosition: lastChunkPosition, wireFormat: legacyWireFormat, headerSize: legacyHeaderSize}, + recyclableFullChunkEncoding: {chunkPosition: fullChunkPosition, wireFormat: recyclableWireFormat, headerSize: recyclableHeaderSize}, + recyclableFirstChunkEncoding: {chunkPosition: firstChunkPosition, wireFormat: recyclableWireFormat, headerSize: recyclableHeaderSize}, + recyclableMiddleChunkEncoding: {chunkPosition: middleChunkPosition, wireFormat: recyclableWireFormat, headerSize: recyclableHeaderSize}, + recyclableLastChunkEncoding: {chunkPosition: lastChunkPosition, wireFormat: recyclableWireFormat, headerSize: recyclableHeaderSize}, + walSyncFullChunkEncoding: {chunkPosition: fullChunkPosition, wireFormat: walSyncWireFormat, headerSize: walSyncHeaderSize}, + walSyncFirstChunkEncoding: {chunkPosition: firstChunkPosition, wireFormat: walSyncWireFormat, headerSize: walSyncHeaderSize}, + walSyncMiddleChunkEncoding: {chunkPosition: middleChunkPosition, wireFormat: walSyncWireFormat, headerSize: walSyncHeaderSize}, + walSyncLastChunkEncoding: {chunkPosition: lastChunkPosition, wireFormat: walSyncWireFormat, headerSize: walSyncHeaderSize}, +} + +var ( + // ErrNotAnIOSeeker is returned if the io.Reader underlying a Reader does not implement io.Seeker. 
+ ErrNotAnIOSeeker = errors.New("pebble/record: reader does not implement io.Seeker") + + // ErrNoLastRecord is returned if LastRecordOffset is called and there is no previous record. + ErrNoLastRecord = errors.New("pebble/record: no last record exists") + + // ErrZeroedChunk is returned if a chunk is encountered that is zeroed. This + // usually occurs due to log file preallocation. + ErrZeroedChunk = errors.New("pebble/record: zeroed chunk") + + // ErrInvalidChunk is returned if a chunk is encountered with an invalid + // header, length, or checksum. This usually occurs when a log is recycled, + // but can also occur due to corruption. + ErrInvalidChunk = errors.New("pebble/record: invalid chunk") + + // ErrUnexpectedEOF is returned if a log file ends unexpectedly. It + // indicates the unexpected end of the log file or an in-progress record + // envelope itself. ErrUnexpectedEOF may be returned by Reader when it + // encounters an invalid chunk but observes no evidence that the invalid + // chunk is caused by corruption (i.e., no future chunk indicates that + // offset should be valid and durably synced.) + // + // This error is defined separately from io.ErrUnexpectedEOF to disambiguate + // this case from from the case of an unexpected end of the record's payload + // while decoding at a higher-level (eg, version edit decoding). If a + // higher-level decoding routine returns record.ErrUnexpectedEOF, it + // unambiguously indicates that the log file itself ended unexpectedly. The + // record.Reader will never return io.ErrUnexpectedEOF, just record.ErrUnexpectedEOF. + ErrUnexpectedEOF = errors.New("pebble/record: unexpected EOF") +) + +// IsInvalidRecord returns true if the error matches one of the error types +// returned for invalid records. These are treated in a way similar to io.EOF +// in recovery code. 
+func IsInvalidRecord(err error) bool { + return err == ErrZeroedChunk || err == ErrInvalidChunk || err == ErrUnexpectedEOF +} + +// Reader reads records from an underlying io.Reader. +type Reader struct { + // r is the underlying reader. + r io.Reader + // logNum is the low 32-bits of the log's file number. May be zero when used + // with log files that do not have a file number (e.g. the MANIFEST). + logNum uint32 + // blockNum is the zero based block number currently held in buf. + blockNum int64 + // seq is the sequence number of the current record. + seq int + // buf[begin:end] is the unread portion of the current chunk's payload. The + // low bound, begin, excludes the chunk header. + begin, end int + // n is the number of bytes of buf that are valid. Once reading has started, + // only the final block can have n < blockSize. + n int + // last is whether the current chunk is the last chunk of the record. + last bool + // err is any accumulated error. + err error + // buf is the buffer. + buf [blockSize]byte + // invalidOffset is the first encountered chunk offset found during nextChunk() + // that had garbage values. It is used to clarify whether or not a garbage chunk + // encountered during WAL replay was the logical EOF or confirmed corruption. + invalidOffset uint64 + + // loggerForTesting is a logging helper used by the Reader to accumulate log messages. + loggerForTesting loggerForTesting +} + +type loggerForTesting interface { + logf(format string, args ...interface{}) +} + +// NewReader returns a new reader. If the file contains records encoded using +// the recyclable record format, then the log number in those records must +// match the specified logNum. +func NewReader(r io.Reader, logNum base.DiskFileNum) *Reader { + return &Reader{ + r: r, + logNum: uint32(logNum), + blockNum: -1, + // invalidOffset is initialized as MaxUint64 so that reading ahead + // with the old chunk wire formats results in ErrUnexpectedEOF. 
+ invalidOffset: math.MaxUint64, + } +} + +// nextChunk sets r.buf[r.i:r.j] to hold the next chunk's payload, reading the +// next block into the buffer if necessary. +func (r *Reader) nextChunk(wantFirst bool) error { + for { + if r.end+legacyHeaderSize <= r.n { + checksum := binary.LittleEndian.Uint32(r.buf[r.end+0 : r.end+4]) + length := binary.LittleEndian.Uint16(r.buf[r.end+4 : r.end+6]) + chunkEncoding := r.buf[r.end+6] + + if int(chunkEncoding) >= len(headerFormatMappings) { + r.invalidOffset = uint64(r.blockNum)*blockSize + uint64(r.begin) + return ErrInvalidChunk + } + headerFormat := headerFormatMappings[chunkEncoding] + chunkPosition, wireFormat, headerSize := headerFormat.chunkPosition, headerFormat.wireFormat, headerFormat.headerSize + + if checksum == 0 && length == 0 && chunkPosition == invalidChunkPosition { + // remaining bytes < 11 + // The remaining bytes in the block is < 11 so regardless of which chunk format is + // being written (Recyclable or walSync), we should skip to the next block. + if r.end+recyclableHeaderSize > r.n { + // Skip the rest of the block if the recyclable header size does not + // fit within it. The end of a block will be zeroed out if the log writer + // cannot fit another chunk into it, even a chunk with no payload like + // the EOF Trailer. + r.end = r.n + continue + } + + // check if 11 <= remaining bytes < 19 + // If so, the remaining bytes in the block can fit a recyclable header but not a + // walSync header. In this case, check if the remainder of the chunk is all zeroes. + // + // If the remainder was all zeroes, then we tolerate this and continue to + // the next block. However, if there was non-zero content in the remaining chunk, + // then was possibly an artifact of corruption found and ErrZeroedChunk should be + // returned. + if r.end+walSyncHeaderSize > r.n { + // Check that the remainder of the chunk is all zeroes. 
+ for i := r.end; i < r.n; i++ { + if r.buf[i] != 0 { + r.invalidOffset = uint64(r.blockNum)*blockSize + uint64(r.begin) + return ErrZeroedChunk + } + } + r.end = r.n + continue + } + + // The last case is when there was more than 19 bytes which means there shouldn't be + // a zeroed header. Thus, this case should also return ErrZeroedChunk. + r.invalidOffset = uint64(r.blockNum)*blockSize + uint64(r.begin) + return ErrZeroedChunk + } + + if wireFormat == invalidWireFormat { + r.invalidOffset = uint64(r.blockNum)*blockSize + uint64(r.begin) + return ErrInvalidChunk + } + if wireFormat == recyclableWireFormat || wireFormat == walSyncWireFormat { + if r.end+headerSize > r.n { + r.invalidOffset = uint64(r.blockNum)*blockSize + uint64(r.begin) + return ErrInvalidChunk + } + + logNum := binary.LittleEndian.Uint32(r.buf[r.end+7 : r.end+11]) + if logNum != r.logNum { + // An EOF trailer encodes a log number that is 1 more than the + // current log number. + if logNum == 1+r.logNum && wantFirst { + return io.EOF + } + // Otherwise, treat this chunk as invalid in order to prevent reading + // of a partial record. + r.invalidOffset = uint64(r.blockNum)*blockSize + uint64(r.begin) + return ErrInvalidChunk + } + } + + r.begin = r.end + headerSize + r.end = r.begin + int(length) + if r.end > r.n { + // The chunk straddles a 32KB boundary (or the end of file). + r.invalidOffset = uint64(r.blockNum)*blockSize + uint64(r.begin) + return ErrInvalidChunk + } + data := r.buf[r.begin-headerSize+6 : r.end] + if checksum != crc.New(data).Value() { + computeChecksum := func(data []byte) uint32 { return crc.New(data).Value() } + // Check if there was a bit flip. + found, indexFound, bitFound := bitflip.CheckSliceForBitFlip(data, computeChecksum, checksum) + err := ErrInvalidChunk + if found { + err = errors.WithSafeDetails(err, ". bit flip found: block num %d. wal offset %d. byte index %d. got: 0x%x. 
want: 0x%x.", + errors.Safe(r.blockNum), errors.Safe(r.invalidOffset), errors.Safe(indexFound), errors.Safe(data[indexFound]), errors.Safe(data[indexFound]^(1<= 0 { + if !wantFirst || r.end != r.n { + // This can happen if the previous instance of the log ended with a + // partial block at the same blockNum as the new log but extended + // beyond the partial block of the new log. + r.invalidOffset = uint64(r.blockNum)*blockSize + uint64(r.begin) + return ErrInvalidChunk + } + return io.EOF + } + n, err := io.ReadFull(r.r, r.buf[:]) + if err != nil && err != io.ErrUnexpectedEOF { + if err == io.EOF && !wantFirst { + r.invalidOffset = uint64(r.blockNum)*blockSize + uint64(r.begin) + return ErrUnexpectedEOF + } + return err + } + r.begin, r.end, r.n = 0, 0, n + r.blockNum++ + } +} + +// Next returns a reader for the next record. It returns io.EOF if there are no +// more records. The reader returned becomes stale after the next Next call, +// and should no longer be used. +func (r *Reader) Next() (io.Reader, error) { + r.seq++ + if r.err != nil { + return nil, r.err + } + r.begin = r.end + r.err = r.nextChunk(true) + if errors.Is(r.err, ErrInvalidChunk) || errors.Is(r.err, ErrZeroedChunk) { + readAheadResult := r.readAheadForCorruption() + return nil, readAheadResult + } + if r.err != nil { + return nil, r.err + } + return singleReader{r, r.seq}, nil +} + +// readAheadForCorruption scans ahead in the log to detect corruption. +// It loads in blocks and reads chunks until it either detects corruption +// due to an offset (encoded in a chunk header) exceeding the invalid offset, +// or encountering end of file when loading a new block. +// +// This function is called from Reader.Read() and Reader.Next() after an error +// is recorded in r.err after a call to Reader.nextChunk(). 
Concretely, the function +// pre-conditions are that r.err has the error returned from nextChunk() when +// it is called from Read() or Next(); similarly, r.invalidOffset will have +// the first invalid offset encountered during a call to nextChunk(). +// +// The function post-conditions are that the error stored in r.err is returned +// if there is confirmation of a corruption, otherwise ErrUnexpectedEOF is +// returned after reading all the blocks without corruption confirmation. +func (r *Reader) readAheadForCorruption() error { + if r.loggerForTesting != nil { + r.loggerForTesting.logf("Starting read ahead for corruption. Block corrupted %d.\n", r.blockNum) + } + + for { + // Load the next block into r.buf. + n, err := io.ReadFull(r.r, r.buf[:]) + r.begin, r.end, r.n = 0, 0, n + r.blockNum++ + if r.loggerForTesting != nil { + r.loggerForTesting.logf("Read block %d with %d bytes\n", r.blockNum, n) + } + + if errors.Is(err, io.EOF) { + // ErrUnexpectedEOF is returned instead of io.EOF because io library + // functions clear an error when it is io.EOF. ErrUnexpectedEOF is + // returned so that the error is not cleared when the io library + // makes calls to Reader.Read(). + // + // Since no sync offset was found to indicate that the invalid chunk + // should have been valid, the chunk represents an abrupt, unclean + // termination of the logical log. This abrupt end of file + // represented by ErrUnexpectedEOF. + if r.loggerForTesting != nil { + r.loggerForTesting.logf("\tEncountered io.EOF; returning ErrUnexpectedEOF since no sync offset found.\n") + } + return ErrUnexpectedEOF + } + // The last block of a log can be less than 32KiB, which is + // the length of r.buf. Thus, we should still parse the data in + // the last block when io.ReadFull returns io.ErrUnexpectedEOF. + // However, if the error is not ErrUnexpectedEOF, then this + // error should be surfaced. 
+ if err != nil && err != io.ErrUnexpectedEOF { + if r.loggerForTesting != nil { + r.loggerForTesting.logf("\tError reading block %d: %v", r.blockNum, err) + } + return err + } + + for r.end+legacyHeaderSize <= r.n { + checksum := binary.LittleEndian.Uint32(r.buf[r.end+0 : r.end+4]) + length := binary.LittleEndian.Uint16(r.buf[r.end+4 : r.end+6]) + chunkEncoding := r.buf[r.end+6] + + if r.loggerForTesting != nil { + r.loggerForTesting.logf("\tBlock %d: Processing chunk at offset %d, checksum=%d, length=%d, encoding=%d\n", r.blockNum, r.end, checksum, length, chunkEncoding) + } + + if int(chunkEncoding) >= len(headerFormatMappings) { + if r.loggerForTesting != nil { + r.loggerForTesting.logf("\tInvalid chunk encoding encountered (value: %d); stopping chunk scan in block %d\n", chunkEncoding, r.blockNum) + } + break + } + + headerFormat := headerFormatMappings[chunkEncoding] + chunkPosition, wireFormat, headerSize := headerFormat.chunkPosition, headerFormat.wireFormat, headerFormat.headerSize + if checksum == 0 && length == 0 && chunkPosition == invalidChunkPosition { + if r.loggerForTesting != nil { + r.loggerForTesting.logf("\tFound invalid chunk marker at block %d offset %d; aborting this block scan\n", r.blockNum, r.end) + } + break + } + if wireFormat == invalidWireFormat { + if r.loggerForTesting != nil { + r.loggerForTesting.logf("\tInvalid wire format detected in block %d at offset %d\n", r.blockNum, r.end) + } + break + } + if wireFormat == recyclableWireFormat || wireFormat == walSyncWireFormat { + if r.end+headerSize > r.n { + if r.loggerForTesting != nil { + r.loggerForTesting.logf("\tIncomplete header in block %d at offset %d; breaking out\n", r.blockNum, r.end) + } + break + } + logNum := binary.LittleEndian.Uint32(r.buf[r.end+7 : r.end+11]) + if logNum != r.logNum { + if r.loggerForTesting != nil { + r.loggerForTesting.logf("\tMismatch log number in block %d at offset %d (expected %d, got %d)\n", r.blockNum, r.end, r.logNum, logNum) + } + break + } + } 
+ + r.begin = r.end + headerSize + r.end = r.begin + int(length) + if r.end > r.n { + // The chunk straddles a 32KB boundary (or the end of file). + if r.loggerForTesting != nil { + r.loggerForTesting.logf("\tChunk in block %d spans beyond block boundaries (begin=%d, end=%d, n=%d)\n", r.blockNum, r.begin, r.end, r.n) + } + break + } + if checksum != crc.New(r.buf[r.begin-headerSize+6:r.end]).Value() { + if r.loggerForTesting != nil { + r.loggerForTesting.logf("\tChecksum mismatch in block %d at offset %d; potential corruption\n", r.blockNum, r.end) + } + break + } + + // Decode offset in header when chunk has the WAL Sync wire format. + if wireFormat == walSyncWireFormat { + syncedOffset := binary.LittleEndian.Uint64(r.buf[r.begin-headerSize+11 : r.begin-headerSize+19]) + if r.loggerForTesting != nil { + r.loggerForTesting.logf("\tBlock %d: Found WAL sync chunk with syncedOffset=%d (invalidOffset=%d)\n", r.blockNum, syncedOffset, r.invalidOffset) + } + // If the encountered chunk offset promises durability beyond the invalid offset, + // the invalid offset must have been corruption. + if syncedOffset > r.invalidOffset { + if r.loggerForTesting != nil { + r.loggerForTesting.logf("\tCorruption confirmed: syncedOffset %d exceeds invalidOffset %d\n", syncedOffset, r.invalidOffset) + } + return r.err + } + } + } + } +} + +// Offset returns the current offset within the file. If called immediately +// before a call to Next(), Offset() will return the record offset. +func (r *Reader) Offset() int64 { + if r.blockNum < 0 { + return 0 + } + return int64(r.blockNum)*blockSize + int64(r.end) +} + +// seekRecord seeks in the underlying io.Reader such that calling r.Next +// returns the record whose first chunk header starts at the provided offset. +// Its behavior is undefined if the argument given is not such an offset, as +// the bytes at that offset may coincidentally appear to be a valid header. 
+// +// It returns ErrNotAnIOSeeker if the underlying io.Reader does not implement +// io.Seeker. +// +// seekRecord will fail and return an error if the Reader previously +// encountered an error, including io.EOF. +// +// The offset is always relative to the start of the underlying io.Reader, so +// negative values will result in an error as per io.Seeker. +func (r *Reader) seekRecord(offset int64) error { + r.seq++ + if r.err != nil { + return r.err + } + + s, ok := r.r.(io.Seeker) + if !ok { + return ErrNotAnIOSeeker + } + + // Only seek to an exact block offset. + c := int(offset & blockSizeMask) + if _, r.err = s.Seek(offset&^blockSizeMask, io.SeekStart); r.err != nil { + return r.err + } + + // Clear the state of the internal reader. + r.begin, r.end, r.n = 0, 0, 0 + r.blockNum, r.last = -1, false + if r.err = r.nextChunk(false); r.err != nil { + return r.err + } + + // Now skip to the offset requested within the block. A subsequent + // call to Next will return the block at the requested offset. + r.begin, r.end = c, c + + return nil +} + +type singleReader struct { + r *Reader + seq int +} + +func (x singleReader) Read(p []byte) (int, error) { + r := x.r + if r.seq != x.seq { + return 0, errors.New("pebble/record: stale reader") + } + if r.err != nil { + return 0, r.err + } + for r.begin == r.end { + if r.last { + return 0, io.EOF + } + r.err = r.nextChunk(false) + if errors.Is(r.err, ErrInvalidChunk) || errors.Is(r.err, ErrZeroedChunk) { + readAheadResult := r.readAheadForCorruption() + return 0, readAheadResult + } + if r.err != nil { + return 0, r.err + } + } + n := copy(p, r.buf[r.begin:r.end]) + r.begin += n + return n, nil +} + +// Writer writes records to an underlying io.Writer. +type Writer struct { + // w is the underlying writer. + w io.Writer + // seq is the sequence number of the current record. + seq int + // f is w as a flusher. + f flusher + // buf[i:j] is the bytes that will become the current chunk. 
+ // The low bound, i, includes the chunk header. + i, j int + // buf[:written] has already been written to w. + // written is zero unless Flush has been called. + written int + // baseOffset is the base offset in w at which writing started. If + // w implements io.Seeker, it's relative to the start of w, 0 otherwise. + baseOffset int64 + // blockNumber is the zero based block number currently held in buf. + blockNumber int64 + // lastRecordOffset is the offset in w where the last record was + // written (including the chunk header). It is a relative offset to + // baseOffset, thus the absolute offset of the last record is + // baseOffset + lastRecordOffset. + lastRecordOffset int64 + // first is whether the current chunk is the first chunk of the record. + first bool + // pending is whether a chunk is buffered but not yet written. + pending bool + // err is any accumulated error. + err error + // buf is the buffer. + buf [blockSize]byte +} + +// NewWriter returns a new Writer. +func NewWriter(w io.Writer) *Writer { + f, _ := w.(flusher) + + var o int64 + if s, ok := w.(io.Seeker); ok { + var err error + if o, err = s.Seek(0, io.SeekCurrent); err != nil { + o = 0 + } + } + return &Writer{ + w: w, + f: f, + baseOffset: o, + lastRecordOffset: -1, + } +} + +// fillHeader fills in the header for the pending chunk. 
+func (w *Writer) fillHeader(last bool) { + if w.i+legacyHeaderSize > w.j || w.j > blockSize { + panic("pebble/record: bad writer state") + } + if last { + if w.first { + w.buf[w.i+6] = fullChunkEncoding + } else { + w.buf[w.i+6] = lastChunkEncoding + } + } else { + if w.first { + w.buf[w.i+6] = firstChunkEncoding + } else { + w.buf[w.i+6] = middleChunkEncoding + } + } + binary.LittleEndian.PutUint32(w.buf[w.i+0:w.i+4], crc.New(w.buf[w.i+6:w.j]).Value()) + binary.LittleEndian.PutUint16(w.buf[w.i+4:w.i+6], uint16(w.j-w.i-legacyHeaderSize)) +} + +// writeBlock writes the buffered block to the underlying writer, and reserves +// space for the next chunk's header. +func (w *Writer) writeBlock() { + _, w.err = w.w.Write(w.buf[w.written:]) + w.i = 0 + w.j = legacyHeaderSize + w.written = 0 + w.blockNumber++ +} + +// writePending finishes the current record and writes the buffer to the +// underlying writer. +func (w *Writer) writePending() { + if w.err != nil { + return + } + if w.pending { + w.fillHeader(true) + w.pending = false + } + _, w.err = w.w.Write(w.buf[w.written:w.j]) + w.written = w.j +} + +// Close finishes the current record and closes the writer. +func (w *Writer) Close() error { + w.seq++ + w.writePending() + if w.err != nil { + return w.err + } + w.err = errors.New("pebble/record: closed Writer") + return nil +} + +// Flush finishes the current record, writes to the underlying writer, and +// flushes it if that writer implements interface{ Flush() error }. +func (w *Writer) Flush() error { + w.seq++ + w.writePending() + if w.err != nil { + return w.err + } + if w.f != nil { + w.err = w.f.Flush() + return w.err + } + return nil +} + +// Next returns a writer for the next record. The writer returned becomes stale +// after the next Close, Flush or Next call, and should no longer be used. 
+func (w *Writer) Next() (io.Writer, error) { + w.seq++ + if w.err != nil { + return nil, w.err + } + if w.pending { + w.fillHeader(true) + } + w.i = w.j + w.j = w.j + legacyHeaderSize + // Check if there is room in the block for the header. + if w.j > blockSize { + // Fill in the rest of the block with zeroes. + clear(w.buf[w.i:]) + w.writeBlock() + if w.err != nil { + return nil, w.err + } + } + w.lastRecordOffset = w.baseOffset + w.blockNumber*blockSize + int64(w.i) + w.first = true + w.pending = true + return singleWriter{w, w.seq}, nil +} + +// WriteRecord writes a complete record. Returns the offset just past the end +// of the record. +func (w *Writer) WriteRecord(p []byte) (int64, error) { + if w.err != nil { + return -1, w.err + } + t, err := w.Next() + if err != nil { + return -1, err + } + if _, err := t.Write(p); err != nil { + return -1, err + } + w.writePending() + offset := w.blockNumber*blockSize + int64(w.j) + return offset, w.err +} + +// Size returns the current size of the file. +func (w *Writer) Size() int64 { + if w == nil { + return 0 + } + return w.blockNumber*blockSize + int64(w.j) +} + +// LastRecordOffset returns the offset in the underlying io.Writer of the last +// record so far - the one created by the most recent Next call. It is the +// offset of the first chunk header, suitable to pass to Reader.SeekRecord. +// +// If that io.Writer also implements io.Seeker, the return value is an absolute +// offset, in the sense of io.SeekStart, regardless of whether the io.Writer +// was initially at the zero position when passed to NewWriter. Otherwise, the +// return value is a relative offset, being the number of bytes written between +// the NewWriter call and any records written prior to the last record. +// +// If there is no last record, i.e. nothing was written, LastRecordOffset will +// return ErrNoLastRecord. 
+func (w *Writer) LastRecordOffset() (int64, error) { + if w.err != nil { + return 0, w.err + } + if w.lastRecordOffset < 0 { + return 0, ErrNoLastRecord + } + return w.lastRecordOffset, nil +} + +type singleWriter struct { + w *Writer + seq int +} + +func (x singleWriter) Write(p []byte) (int, error) { + w := x.w + if w.seq != x.seq { + return 0, errors.New("pebble/record: stale writer") + } + if w.err != nil { + return 0, w.err + } + n0 := len(p) + for len(p) > 0 { + // Write a block, if it is full. + if w.j == blockSize { + w.fillHeader(false) + w.writeBlock() + if w.err != nil { + return 0, w.err + } + w.first = false + } + // Copy bytes into the buffer. + n := copy(w.buf[w.j:], p) + w.j += n + p = p[n:] + } + return n0, nil +} diff --git a/vendor/github.com/cockroachdb/pebble/record/rotation.go b/vendor/github.com/cockroachdb/pebble/v2/record/rotation.go similarity index 100% rename from vendor/github.com/cockroachdb/pebble/record/rotation.go rename to vendor/github.com/cockroachdb/pebble/v2/record/rotation.go diff --git a/vendor/github.com/cockroachdb/pebble/scan_internal.go b/vendor/github.com/cockroachdb/pebble/v2/scan_internal.go similarity index 66% rename from vendor/github.com/cockroachdb/pebble/scan_internal.go rename to vendor/github.com/cockroachdb/pebble/v2/scan_internal.go index 08bebfc..c87de56 100644 --- a/vendor/github.com/cockroachdb/pebble/scan_internal.go +++ b/vendor/github.com/cockroachdb/pebble/v2/scan_internal.go @@ -7,20 +7,33 @@ package pebble import ( "context" "fmt" + "slices" "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/internal/keyspan" - "github.com/cockroachdb/pebble/internal/manifest" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/objstorage/remote" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + 
"github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/keyspan/keyspanimpl" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/remote" + "github.com/cockroachdb/pebble/v2/sstable/blob" + "github.com/cockroachdb/pebble/v2/sstable/block" ) const ( - // In skip-shared iteration mode, keys in levels sharedLevelsStart and greater - // (i.e. lower in the LSM) are skipped. + // In skip-shared iteration mode, keys in levels greater than + // sharedLevelsStart (i.e. lower in the LSM) are skipped. Keys + // in sharedLevelsStart are returned iff they are not in a + // shared file. sharedLevelsStart = remote.SharedLevelsStart + + // In skip-external iteration mode, keys in levels greater + // than externalSkipStart are skipped. Keys in + // externalSkipStart are returned iff they are not in an + // external file. + externalSkipStart = 6 ) // ErrInvalidSkipSharedIteration is returned by ScanInternal if it was called @@ -34,7 +47,7 @@ var ErrInvalidSkipSharedIteration = errors.New("pebble: cannot use skip-shared i // by another pebble instance. This struct must contain all fields that are // required for a Pebble instance to ingest a foreign sstable on shared storage, // including constructing any relevant objstorage.Provider / remoteobjcat.Catalog -// data structures, as well as creating virtual FileMetadatas. +// data structures, as well as creating virtual TableMetadatas. // // Note that the Pebble instance creating and returning a SharedSSTMeta might // not be the one that created the underlying sstable on shared storage to begin @@ -72,21 +85,23 @@ type SharedSSTMeta struct { // Size contains an estimate of the size of this sstable. Size uint64 - // fileNum at time of creation in the creator instance. Only used for + // tableNum at time of creation in the creator instance. 
Only used for // debugging/tests. - fileNum base.FileNum + tableNum base.TableNum } -func (s *SharedSSTMeta) cloneFromFileMeta(f *fileMetadata) { +func (s *SharedSSTMeta) cloneFromFileMeta(f *manifest.TableMetadata) { *s = SharedSSTMeta{ - Smallest: f.Smallest.Clone(), - Largest: f.Largest.Clone(), - SmallestRangeKey: f.SmallestRangeKey.Clone(), - LargestRangeKey: f.LargestRangeKey.Clone(), - SmallestPointKey: f.SmallestPointKey.Clone(), - LargestPointKey: f.LargestPointKey.Clone(), + Smallest: f.Smallest().Clone(), + Largest: f.Largest().Clone(), + SmallestPointKey: f.PointKeyBounds.Smallest().Clone(), + LargestPointKey: f.PointKeyBounds.Largest().Clone(), Size: f.Size, - fileNum: f.FileNum, + tableNum: f.TableNum, + } + if f.HasRangeKeys { + s.SmallestRangeKey = f.RangeKeyBounds.Smallest().Clone() + s.LargestRangeKey = f.RangeKeyBounds.Largest().Clone() } } @@ -115,9 +130,9 @@ type pointCollapsingIterator struct { comparer *base.Comparer merge base.Merge err error - seqNum uint64 + seqNum base.SeqNum // The current position of `iter`. Always owned by the underlying iter. - iterKey *InternalKey + iterKV *base.InternalKV // The last saved key. findNextEntry and similar methods are expected to save // the current value of iterKey to savedKey if they're iterating away from the // current key but still need to retain it. See comments in findNextEntry on @@ -131,11 +146,9 @@ type pointCollapsingIterator struct { // current key owned by this iterator (i.e. backed by savedKeyBuf). savedKey InternalKey savedKeyBuf []byte - // Value at the current iterator position, at iterKey. - iterValue base.LazyValue // If fixedSeqNum is non-zero, all emitted points are verified to have this // fixed sequence number. - fixedSeqNum uint64 + fixedSeqNum base.SeqNum } func (p *pointCollapsingIterator) Span() *keyspan.Span { @@ -145,80 +158,76 @@ func (p *pointCollapsingIterator) Span() *keyspan.Span { // SeekPrefixGE implements the InternalIterator interface. 
func (p *pointCollapsingIterator) SeekPrefixGE( prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { +) *base.InternalKV { p.resetKey() - p.iterKey, p.iterValue = p.iter.SeekPrefixGE(prefix, key, flags) + p.iterKV = p.iter.SeekPrefixGE(prefix, key, flags) p.pos = pcIterPosCur - if p.iterKey == nil { - return nil, base.LazyValue{} + if p.iterKV == nil { + return nil } return p.findNextEntry() } // SeekGE implements the InternalIterator interface. -func (p *pointCollapsingIterator) SeekGE( - key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { +func (p *pointCollapsingIterator) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { p.resetKey() - p.iterKey, p.iterValue = p.iter.SeekGE(key, flags) + p.iterKV = p.iter.SeekGE(key, flags) p.pos = pcIterPosCur - if p.iterKey == nil { - return nil, base.LazyValue{} + if p.iterKV == nil { + return nil } return p.findNextEntry() } // SeekLT implements the InternalIterator interface. -func (p *pointCollapsingIterator) SeekLT( - key []byte, flags base.SeekLTFlags, -) (*base.InternalKey, base.LazyValue) { +func (p *pointCollapsingIterator) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV { panic("unimplemented") } func (p *pointCollapsingIterator) resetKey() { p.savedKey.UserKey = p.savedKeyBuf[:0] p.savedKey.Trailer = 0 - p.iterKey = nil + p.iterKV = nil p.pos = pcIterPosCur } -func (p *pointCollapsingIterator) verifySeqNum(key *base.InternalKey) *base.InternalKey { +func (p *pointCollapsingIterator) verifySeqNum(kv *base.InternalKV) *base.InternalKV { if !invariants.Enabled { - return key + return kv } - if p.fixedSeqNum == 0 || key == nil || key.Kind() == InternalKeyKindRangeDelete { - return key + if p.fixedSeqNum == 0 || kv == nil || kv.Kind() == InternalKeyKindRangeDelete { + return kv } - if key.SeqNum() != p.fixedSeqNum { - panic(fmt.Sprintf("expected foreign point key to have seqnum %d, got %d", p.fixedSeqNum, key.SeqNum())) + if kv.SeqNum() != 
p.fixedSeqNum { + panic(fmt.Sprintf("expected foreign point key to have seqnum %d, got %d", p.fixedSeqNum, kv.SeqNum())) } - return key + return kv } // findNextEntry is called to return the next key. p.iter must be positioned at the // start of the first user key we are interested in. -func (p *pointCollapsingIterator) findNextEntry() (*base.InternalKey, base.LazyValue) { +func (p *pointCollapsingIterator) findNextEntry() *base.InternalKV { p.saveKey() // Saves a comparison in the fast path firstIteration := true - for p.iterKey != nil { - // NB: p.savedKey is either the current key (iff p.iterKey == firstKey), + for p.iterKV != nil { + // NB: p.savedKey is either the current key (iff p.iterKV == firstKey), // or the previous key. - if !firstIteration && !p.comparer.Equal(p.iterKey.UserKey, p.savedKey.UserKey) { + if !firstIteration && !p.comparer.Equal(p.iterKV.K.UserKey, p.savedKey.UserKey) { p.saveKey() continue } firstIteration = false - if s := p.iter.Span(); s != nil && s.CoversAt(p.seqNum, p.iterKey.SeqNum()) { + if s := p.iter.Span(); s != nil && s.CoversAt(p.seqNum, p.iterKV.SeqNum()) { // All future keys for this user key must be deleted. if p.savedKey.Kind() == InternalKeyKindSingleDelete { panic("cannot process singledel key in point collapsing iterator") } // Fast forward to the next user key. p.saveKey() - p.iterKey, p.iterValue = p.iter.Next() - for p.iterKey != nil && p.savedKey.SeqNum() >= p.iterKey.SeqNum() && p.comparer.Equal(p.iterKey.UserKey, p.savedKey.UserKey) { - p.iterKey, p.iterValue = p.iter.Next() + p.iterKV = p.iter.Next() + for p.iterKV != nil && p.savedKey.SeqNum() >= p.iterKV.SeqNum() && p.comparer.Equal(p.iterKV.K.UserKey, p.savedKey.UserKey) { + p.iterKV = p.iter.Next() } continue } @@ -242,7 +251,7 @@ func (p *pointCollapsingIterator) findNextEntry() (*base.InternalKey, base.LazyV // of blocks and can determine user key changes without doing key saves // or comparisons. 
p.pos = pcIterPosCur - return p.verifySeqNum(p.iterKey), p.iterValue + return p.verifySeqNum(p.iterKV) case InternalKeyKindSingleDelete: // Panic, as this iterator is not expected to observe single deletes. panic("cannot process singledel key in point collapsing iterator") @@ -254,84 +263,84 @@ func (p *pointCollapsingIterator) findNextEntry() (*base.InternalKey, base.LazyV // We should pass them as-is, but also account for any points ahead of // them. p.pos = pcIterPosCur - return p.verifySeqNum(p.iterKey), p.iterValue + return p.verifySeqNum(p.iterKV) default: - panic(fmt.Sprintf("unexpected kind: %d", p.iterKey.Kind())) + panic(fmt.Sprintf("unexpected kind: %d", p.iterKV.Kind())) } } p.resetKey() - return nil, base.LazyValue{} + return nil } // First implements the InternalIterator interface. -func (p *pointCollapsingIterator) First() (*base.InternalKey, base.LazyValue) { +func (p *pointCollapsingIterator) First() *base.InternalKV { p.resetKey() - p.iterKey, p.iterValue = p.iter.First() + p.iterKV = p.iter.First() p.pos = pcIterPosCur - if p.iterKey == nil { - return nil, base.LazyValue{} + if p.iterKV == nil { + return nil } return p.findNextEntry() } // Last implements the InternalIterator interface. -func (p *pointCollapsingIterator) Last() (*base.InternalKey, base.LazyValue) { +func (p *pointCollapsingIterator) Last() *base.InternalKV { panic("unimplemented") } func (p *pointCollapsingIterator) saveKey() { - if p.iterKey == nil { + if p.iterKV == nil { p.savedKey = InternalKey{UserKey: p.savedKeyBuf[:0]} return } - p.savedKeyBuf = append(p.savedKeyBuf[:0], p.iterKey.UserKey...) - p.savedKey = InternalKey{UserKey: p.savedKeyBuf, Trailer: p.iterKey.Trailer} + p.savedKeyBuf = append(p.savedKeyBuf[:0], p.iterKV.K.UserKey...) + p.savedKey = InternalKey{UserKey: p.savedKeyBuf, Trailer: p.iterKV.K.Trailer} } // Next implements the InternalIterator interface. 
-func (p *pointCollapsingIterator) Next() (*base.InternalKey, base.LazyValue) { +func (p *pointCollapsingIterator) Next() *base.InternalKV { switch p.pos { case pcIterPosCur: p.saveKey() - if p.iterKey != nil && p.iterKey.Kind() == InternalKeyKindRangeDelete { + if p.iterKV != nil && p.iterKV.Kind() == InternalKeyKindRangeDelete { // Step over the interleaved range delete and process the very next // internal key, even if it's at the same user key. This is because a // point for that user key has not been returned yet. - p.iterKey, p.iterValue = p.iter.Next() + p.iterKV = p.iter.Next() break } // Fast forward to the next user key. - key, val := p.iter.Next() - // p.iterKey.SeqNum() >= key.SeqNum() is an optimization that allows us to - // use p.iterKey.SeqNum() < key.SeqNum() as a sign that the user key has + kv := p.iter.Next() + // p.iterKV.SeqNum() >= key.SeqNum() is an optimization that allows us to + // use p.iterKV.SeqNum() < key.SeqNum() as a sign that the user key has // changed, without needing to do the full key comparison. - for key != nil && p.savedKey.SeqNum() >= key.SeqNum() && - p.comparer.Equal(p.savedKey.UserKey, key.UserKey) { - key, val = p.iter.Next() + for kv != nil && p.savedKey.SeqNum() >= kv.SeqNum() && + p.comparer.Equal(p.savedKey.UserKey, kv.K.UserKey) { + kv = p.iter.Next() } - if key == nil { + if kv == nil { // There are no keys to return. p.resetKey() - return nil, base.LazyValue{} + return nil } - p.iterKey, p.iterValue = key, val + p.iterKV = kv case pcIterPosNext: p.pos = pcIterPosCur } - if p.iterKey == nil { + if p.iterKV == nil { p.resetKey() - return nil, base.LazyValue{} + return nil } return p.findNextEntry() } // NextPrefix implements the InternalIterator interface. -func (p *pointCollapsingIterator) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) { +func (p *pointCollapsingIterator) NextPrefix(succKey []byte) *base.InternalKV { panic("unimplemented") } // Prev implements the InternalIterator interface. 
-func (p *pointCollapsingIterator) Prev() (*base.InternalKey, base.LazyValue) { +func (p *pointCollapsingIterator) Prev() *base.InternalKV { panic("unimplemented") } @@ -354,6 +363,16 @@ func (p *pointCollapsingIterator) SetBounds(lower, upper []byte) { p.iter.SetBounds(lower, upper) } +func (p *pointCollapsingIterator) SetContext(ctx context.Context) { + p.iter.SetContext(ctx) +} + +// DebugTree is part of the InternalIterator interface. +func (p *pointCollapsingIterator) DebugTree(tp treeprinter.Node) { + n := tp.Childf("%T(%p)", p, p) + p.iter.DebugTree(n) +} + // String implements the InternalIterator interface. func (p *pointCollapsingIterator) String() string { return p.iter.String() @@ -400,23 +419,24 @@ type IteratorLevel struct { // *must* return the range delete as well as the range key unset/delete that did // the shadowing. type scanInternalIterator struct { - db *DB - opts scanInternalOptions - comparer *base.Comparer - merge Merge - iter internalIterator - readState *readState - version *version - rangeKey *iteratorRangeKeyState - pointKeyIter internalIterator - iterKey *InternalKey - iterValue LazyValue - alloc *iterAlloc - newIters tableNewIters - newIterRangeKey keyspan.TableNewSpanIter - seqNum uint64 - iterLevels []IteratorLevel - mergingIter *mergingIter + ctx context.Context + db *DB + opts scanInternalOptions + comparer *base.Comparer + merge Merge + iter internalIterator + readState *readState + version *manifest.Version + rangeKey *iteratorRangeKeyState + pointKeyIter internalIterator + iterKV *base.InternalKV + alloc *iterAlloc + newIters tableNewIters + newIterRangeKey keyspanimpl.TableNewSpanIter + seqNum base.SeqNum + iterLevels []IteratorLevel + mergingIter *mergingIter + blobValueFetcher blob.ValueFetcher // boundsBuf holds two buffers used to store the lower and upper bounds. 
// Whenever the InternalIterator's bounds change, the new bounds are copied @@ -426,6 +446,66 @@ type scanInternalIterator struct { boundsBufIdx int } +// truncateExternalFile truncates an External file's [SmallestUserKey, +// LargestUserKey] fields to [lower, upper). A ExternalFile is +// produced that is suitable for external consumption by other Pebble +// instances. +// +// truncateSharedFile reads the file to try to create the smallest +// possible bounds. Here, we blindly truncate them. This may mean we +// include this SST in iterations it isn't really needed in. Since we +// don't expect External files to be long-lived in the pebble +// instance, We think this is OK. +// +// TODO(ssd) 2024-01-26: Potentially de-duplicate with +// truncateSharedFile. +func (d *DB) truncateExternalFile( + ctx context.Context, + lower, upper []byte, + level int, + file *manifest.TableMetadata, + objMeta objstorage.ObjectMetadata, +) (*ExternalFile, error) { + cmp := d.cmp + sst := &ExternalFile{ + Level: uint8(level), + ObjName: objMeta.Remote.CustomObjectName, + Locator: objMeta.Remote.Locator, + HasPointKey: file.HasPointKeys, + HasRangeKey: file.HasRangeKeys, + Size: file.Size, + SyntheticPrefix: slices.Clone(file.SyntheticPrefixAndSuffix.Prefix()), + SyntheticSuffix: slices.Clone(file.SyntheticPrefixAndSuffix.Suffix()), + } + + needsLowerTruncate := cmp(lower, file.Smallest().UserKey) > 0 + if needsLowerTruncate { + sst.StartKey = slices.Clone(lower) + } else { + sst.StartKey = slices.Clone(file.Smallest().UserKey) + } + + cmpUpper := cmp(upper, file.Largest().UserKey) + needsUpperTruncate := cmpUpper < 0 + if needsUpperTruncate { + sst.EndKey = slices.Clone(upper) + sst.EndKeyIsInclusive = false + } else { + sst.EndKey = slices.Clone(file.Largest().UserKey) + sst.EndKeyIsInclusive = !file.Largest().IsExclusiveSentinel() + } + + if cmp(sst.StartKey, sst.EndKey) > 0 { + return nil, base.AssertionFailedf("pebble: invalid external file bounds after truncation [%q, %q)", 
sst.StartKey, sst.EndKey) + } + + if cmp(sst.StartKey, sst.EndKey) == 0 && !sst.EndKeyIsInclusive { + return nil, base.AssertionFailedf("pebble: invalid external file bounds after truncation [%q, %q)", sst.StartKey, sst.EndKey) + } + + return sst, nil +} + // truncateSharedFile truncates a shared file's [Smallest, Largest] fields to // [lower, upper), potentially opening iterators on the file to find keys within // the requested bounds. A SharedSSTMeta is produced that is suitable for @@ -438,7 +518,7 @@ func (d *DB) truncateSharedFile( ctx context.Context, lower, upper []byte, level int, - file *fileMetadata, + file *manifest.TableMetadata, objMeta objstorage.ObjectMetadata, ) (sst *SharedSSTMeta, shouldSkip bool, err error) { cmp := d.cmp @@ -449,8 +529,8 @@ func (d *DB) truncateSharedFile( if err != nil { return nil, false, err } - needsLowerTruncate := cmp(lower, file.Smallest.UserKey) > 0 - needsUpperTruncate := cmp(upper, file.Largest.UserKey) < 0 || (cmp(upper, file.Largest.UserKey) == 0 && !file.Largest.IsExclusiveSentinel()) + needsLowerTruncate := cmp(lower, file.Smallest().UserKey) > 0 + needsUpperTruncate := cmp(upper, file.Largest().UserKey) < 0 || (cmp(upper, file.Largest().UserKey) == 0 && !file.Largest().IsExclusiveSentinel()) // Fast path: file is entirely within [lower, upper). if !needsLowerTruncate && !needsUpperTruncate { return sst, false, nil @@ -458,46 +538,38 @@ func (d *DB) truncateSharedFile( // We will need to truncate file bounds in at least one direction. Open all // relevant iterators. 
- iter, rangeDelIter, err := d.newIters(ctx, file, &IterOptions{ + iters, err := d.newIters(ctx, file, &IterOptions{ LowerBound: lower, UpperBound: upper, - level: manifest.Level(level), - }, internalIterOpts{}) + layer: manifest.Level(level), + }, internalIterOpts{}, iterPointKeys|iterRangeDeletions|iterRangeKeys) if err != nil { return nil, false, err } - defer iter.Close() + defer func() { _ = iters.CloseAll() }() + iter := iters.point + rangeDelIter := iters.rangeDeletion + rangeKeyIter := iters.rangeKey if rangeDelIter != nil { - rangeDelIter = keyspan.Truncate( - cmp, rangeDelIter, lower, upper, nil, nil, - false, /* panicOnUpperTruncate */ - ) - defer rangeDelIter.Close() - } - rangeKeyIter, err := d.tableNewRangeKeyIter(file, keyspan.SpanIterOptions{}) - if err != nil { - return nil, false, err + rangeDelIter = keyspan.Truncate(cmp, rangeDelIter, base.UserKeyBoundsEndExclusive(lower, upper)) } if rangeKeyIter != nil { - rangeKeyIter = keyspan.Truncate( - cmp, rangeKeyIter, lower, upper, nil, nil, - false, /* panicOnUpperTruncate */ - ) - defer rangeKeyIter.Close() + rangeKeyIter = keyspan.Truncate(cmp, rangeKeyIter, base.UserKeyBoundsEndExclusive(lower, upper)) } // Check if we need to truncate on the left side. This means finding a new // LargestPointKey and LargestRangeKey that is >= lower. 
if needsLowerTruncate { sst.SmallestPointKey.UserKey = sst.SmallestPointKey.UserKey[:0] sst.SmallestPointKey.Trailer = 0 - key, _ := iter.SeekGE(lower, base.SeekGEFlagsNone) - foundPointKey := key != nil - if key != nil { - sst.SmallestPointKey.CopyFrom(*key) + kv := iter.SeekGE(lower, base.SeekGEFlagsNone) + foundPointKey := kv != nil + if kv != nil { + sst.SmallestPointKey.CopyFrom(kv.K) } if rangeDelIter != nil { - span := rangeDelIter.SeekGE(lower) - if span != nil && (len(sst.SmallestPointKey.UserKey) == 0 || base.InternalCompare(cmp, span.SmallestKey(), sst.SmallestPointKey) < 0) { + if span, err := rangeDelIter.SeekGE(lower); err != nil { + return nil, false, err + } else if span != nil && (len(sst.SmallestPointKey.UserKey) == 0 || base.InternalCompare(cmp, span.SmallestKey(), sst.SmallestPointKey) < 0) { sst.SmallestPointKey.CopyFrom(span.SmallestKey()) foundPointKey = true } @@ -510,10 +582,13 @@ func (d *DB) truncateSharedFile( sst.SmallestRangeKey.UserKey = sst.SmallestRangeKey.UserKey[:0] sst.SmallestRangeKey.Trailer = 0 if rangeKeyIter != nil { - span := rangeKeyIter.SeekGE(lower) - if span != nil { + span, err := rangeKeyIter.SeekGE(lower) + switch { + case err != nil: + return nil, false, err + case span != nil: sst.SmallestRangeKey.CopyFrom(span.SmallestKey()) - } else { + default: // There are no range keys in the span we're interested in. 
sst.SmallestRangeKey = InternalKey{} sst.LargestRangeKey = InternalKey{} @@ -525,14 +600,15 @@ func (d *DB) truncateSharedFile( if needsUpperTruncate { sst.LargestPointKey.UserKey = sst.LargestPointKey.UserKey[:0] sst.LargestPointKey.Trailer = 0 - key, _ := iter.SeekLT(upper, base.SeekLTFlagsNone) - foundPointKey := key != nil - if key != nil { - sst.LargestPointKey.CopyFrom(*key) + kv := iter.SeekLT(upper, base.SeekLTFlagsNone) + foundPointKey := kv != nil + if kv != nil { + sst.LargestPointKey.CopyFrom(kv.K) } if rangeDelIter != nil { - span := rangeDelIter.SeekLT(upper) - if span != nil && (len(sst.LargestPointKey.UserKey) == 0 || base.InternalCompare(cmp, span.LargestKey(), sst.LargestPointKey) > 0) { + if span, err := rangeDelIter.SeekLT(upper); err != nil { + return nil, false, err + } else if span != nil && (len(sst.LargestPointKey.UserKey) == 0 || base.InternalCompare(cmp, span.LargestKey(), sst.LargestPointKey) > 0) { sst.LargestPointKey.CopyFrom(span.LargestKey()) foundPointKey = true } @@ -545,10 +621,13 @@ func (d *DB) truncateSharedFile( sst.LargestRangeKey.UserKey = sst.LargestRangeKey.UserKey[:0] sst.LargestRangeKey.Trailer = 0 if rangeKeyIter != nil { - span := rangeKeyIter.SeekLT(upper) - if span != nil { + span, err := rangeKeyIter.SeekLT(upper) + switch { + case err != nil: + return nil, false, err + case span != nil: sst.LargestRangeKey.CopyFrom(span.LargestKey()) - } else { + default: // There are no range keys in the span we're interested in. 
sst.SmallestRangeKey = InternalKey{} sst.LargestRangeKey = InternalKey{} @@ -583,7 +662,7 @@ func (d *DB) truncateSharedFile( if len(sst.Smallest.UserKey) == 0 { return nil, true, nil } - sst.Size, err = d.tableCache.estimateSize(file, sst.Smallest.UserKey, sst.Largest.UserKey) + sst.Size, err = d.fileCache.estimateSize(file, sst.Smallest.UserKey, sst.Largest.UserKey) if err != nil { return nil, false, err } @@ -602,6 +681,10 @@ func scanInternalImpl( if opts.visitSharedFile != nil && (lower == nil || upper == nil) { panic("lower and upper bounds must be specified in skip-shared iteration mode") } + if opts.visitSharedFile != nil && opts.visitExternalFile != nil { + return base.AssertionFailedf("cannot provide both a shared-file and external-file visitor") + } + // Before starting iteration, check if any files in levels sharedLevelsStart // and below are *not* shared. Error out if that is the case, as skip-shared // iteration will not produce a consistent point-in-time view of this range @@ -614,37 +697,73 @@ func scanInternalImpl( if current == nil { current = iter.readState.current } - if opts.visitSharedFile != nil { + + if opts.visitSharedFile != nil || opts.visitExternalFile != nil { if provider == nil { panic("expected non-nil Provider in skip-shared iteration mode") } - for level := sharedLevelsStart; level < numLevels; level++ { + + firstLevelWithRemote := opts.skipLevelForOpts() + for level := firstLevelWithRemote; level < numLevels; level++ { files := current.Levels[level].Iter() - for f := files.SeekGE(cmp, lower); f != nil && cmp(f.Smallest.UserKey, upper) < 0; f = files.Next() { + for f := files.SeekGE(cmp, lower); f != nil && cmp(f.Smallest().UserKey, upper) < 0; f = files.Next() { + if cmp(lower, f.Largest().UserKey) == 0 && f.Largest().IsExclusiveSentinel() { + continue + } + var objMeta objstorage.ObjectMetadata var err error - objMeta, err = provider.Lookup(fileTypeTable, f.FileBacking.DiskFileNum) + objMeta, err = 
provider.Lookup(base.FileTypeTable, f.TableBacking.DiskFileNum) if err != nil { return err } - if !objMeta.IsShared() { - return errors.Wrapf(ErrInvalidSkipSharedIteration, "file %s is not shared", objMeta.DiskFileNum) + + // We allow a mix of files at the first level. + if level != firstLevelWithRemote { + if !objMeta.IsShared() && !objMeta.IsExternal() { + return errors.Wrapf(ErrInvalidSkipSharedIteration, "file %s is not shared or external", objMeta.DiskFileNum) + } + } + + if objMeta.IsShared() && opts.visitSharedFile == nil { + return errors.Wrapf(ErrInvalidSkipSharedIteration, "shared file is present but no shared file visitor is defined") } - if !base.Visible(f.LargestSeqNum, seqNum, base.InternalKeySeqNumMax) { - return errors.Wrapf(ErrInvalidSkipSharedIteration, "file %s contains keys newer than snapshot", objMeta.DiskFileNum) + + if objMeta.IsExternal() && opts.visitExternalFile == nil { + return errors.Wrapf(ErrInvalidSkipSharedIteration, "external file is present but no external file visitor is defined") } - var sst *SharedSSTMeta - var skip bool - sst, skip, err = iter.db.truncateSharedFile(ctx, lower, upper, level, f, objMeta) - if err != nil { - return err + + if !base.Visible(f.LargestSeqNum, seqNum, base.SeqNumMax) { + return errors.Wrapf(ErrInvalidSkipSharedIteration, "file %s contains keys newer than snapshot", objMeta.DiskFileNum) } - if skip { - continue + + if level != firstLevelWithRemote && (!objMeta.IsShared() && !objMeta.IsExternal()) { + return errors.Wrapf(ErrInvalidSkipSharedIteration, "file %s is not shared or external", objMeta.DiskFileNum) } - if err = opts.visitSharedFile(sst); err != nil { - return err + + if objMeta.IsShared() { + var sst *SharedSSTMeta + var skip bool + sst, skip, err = iter.db.truncateSharedFile(ctx, lower, upper, level, f, objMeta) + if err != nil { + return err + } + if skip { + continue + } + if err = opts.visitSharedFile(sst); err != nil { + return err + } + } else if objMeta.IsExternal() { + sst, err := 
iter.db.truncateExternalFile(ctx, lower, upper, level, f, objMeta) + if err != nil { + return err + } + if err := opts.visitExternalFile(sst); err != nil { + return err + } } + } } } @@ -668,10 +787,10 @@ func scanInternalImpl( // call visitRangeKey. keysCopy := make([]keyspan.Key, len(span.Keys)) for i := range span.Keys { - keysCopy[i] = span.Keys[i] + keysCopy[i].CopyFrom(span.Keys[i]) keysCopy[i].Trailer = base.MakeTrailer(0, span.Keys[i].Kind()) } - keyspan.SortKeysByTrailer(&keysCopy) + keyspan.SortKeysByTrailer(keysCopy) if err := opts.visitRangeKey(span.Start, span.End, keysCopy); err != nil { return err } @@ -703,8 +822,20 @@ func scanInternalImpl( return nil } +func (opts *scanInternalOptions) skipLevelForOpts() int { + if opts.visitSharedFile != nil { + return sharedLevelsStart + } + if opts.visitExternalFile != nil { + return externalSkipStart + } + return numLevels +} + // constructPointIter constructs a merging iterator and sets i.iter to it. -func (i *scanInternalIterator) constructPointIter(memtables flushableList, buf *iterAlloc) { +func (i *scanInternalIterator) constructPointIter( + category block.Category, memtables flushableList, buf *iterAlloc, +) error { // Merging levels and levels from iterAlloc. mlevels := buf.mlevels[:0] levels := buf.levels[:0] @@ -722,11 +853,12 @@ func (i *scanInternalIterator) constructPointIter(memtables flushableList, buf * numMergingLevels += len(current.L0SublevelFiles) numLevelIters += len(current.L0SublevelFiles) + skipStart := i.opts.skipLevelForOpts() for level := 1; level < len(current.Levels); level++ { if current.Levels[level].Empty() { continue } - if i.opts.skipSharedLevels && level >= sharedLevelsStart { + if level > skipStart { continue } numMergingLevels++ @@ -740,9 +872,9 @@ func (i *scanInternalIterator) constructPointIter(memtables flushableList, buf * levels = make([]levelIter, 0, numLevelIters) } // TODO(bilal): Push these into the iterAlloc buf. 
- var rangeDelMiter keyspan.MergingIter + var rangeDelMiter keyspanimpl.MergingIter rangeDelIters := make([]keyspan.FragmentIterator, 0, numMergingLevels) - rangeDelLevels := make([]keyspan.LevelIter, 0, numLevelIters) + rangeDelLevels := make([]keyspanimpl.LevelIter, 0, numLevelIters) i.iterLevels = make([]IteratorLevel, numMergingLevels) mlevelsIndex := 0 @@ -769,17 +901,20 @@ func (i *scanInternalIterator) constructPointIter(memtables flushableList, buf * levels = levels[:numLevelIters] rangeDelLevels = rangeDelLevels[:numLevelIters] i.opts.IterOptions.snapshotForHideObsoletePoints = i.seqNum - addLevelIterForFiles := func(files manifest.LevelIterator, level manifest.Level) { + i.opts.IterOptions.Category = category + + internalOpts := internalIterOpts{ + blobValueFetcher: &i.blobValueFetcher, + } + + addLevelIterForFiles := func(files manifest.LevelIterator, level manifest.Layer) { li := &levels[levelsIndex] rli := &rangeDelLevels[levelsIndex] - li.init( - context.Background(), i.opts.IterOptions, i.comparer, i.newIters, files, level, - internalIterOpts{}) - li.initBoundaryContext(&mlevels[mlevelsIndex].levelIterBoundaryContext) + li.init(i.ctx, i.opts.IterOptions, i.comparer, i.newIters, files, level, internalOpts) mlevels[mlevelsIndex].iter = li - rli.Init(keyspan.SpanIterOptions{RangeKeyFilters: i.opts.RangeKeyFilters}, - i.comparer.Compare, tableNewRangeDelIter(context.Background(), i.newIters), files, level, + rli.Init(i.ctx, keyspan.SpanIterOptions{RangeKeyFilters: i.opts.RangeKeyFilters}, + i.comparer.Compare, tableNewRangeDelIter(i.newIters), files, level, manifest.KeyTypePoint) rangeDelIters = append(rangeDelIters, rli) @@ -800,16 +935,36 @@ func (i *scanInternalIterator) constructPointIter(memtables flushableList, buf * if current.Levels[level].Empty() { continue } - if i.opts.skipSharedLevels && level >= sharedLevelsStart { + + if level > skipStart { continue } i.iterLevels[mlevelsIndex] = IteratorLevel{Kind: IteratorLevelLSM, Level: level} - 
addLevelIterForFiles(current.Levels[level].Iter(), manifest.Level(level)) + levIter := current.Levels[level].Iter() + if level == skipStart { + nonRemoteFiles := make([]*manifest.TableMetadata, 0) + for f := levIter.First(); f != nil; f = levIter.Next() { + meta, err := i.db.objProvider.Lookup(base.FileTypeTable, f.TableBacking.DiskFileNum) + if err != nil { + return err + } + if (meta.IsShared() && i.opts.visitSharedFile != nil) || + (meta.IsExternal() && i.opts.visitExternalFile != nil) { + // Skip this file. + continue + } + nonRemoteFiles = append(nonRemoteFiles, f) + } + levSlice := manifest.NewLevelSliceKeySorted(i.db.cmp, nonRemoteFiles) + levIter = levSlice.Iter() + } + + addLevelIterForFiles(levIter, manifest.Level(level)) } buf.merging.init(&i.opts.IterOptions, &InternalIteratorStats{}, i.comparer.Compare, i.comparer.Split, mlevels...) buf.merging.snapshot = i.seqNum - rangeDelMiter.Init(i.comparer.Compare, keyspan.VisibleTransform(i.seqNum), new(keyspan.MergingBuffers), rangeDelIters...) + rangeDelMiter.Init(i.comparer, keyspan.VisibleTransform(i.seqNum), new(keyspanimpl.MergingBuffers), rangeDelIters...) if i.opts.includeObsoleteKeys { iiter := &keyspan.InterleavingIter{} @@ -832,13 +987,14 @@ func (i *scanInternalIterator) constructPointIter(memtables flushableList, buf * i.pointKeyIter = pcIter } i.iter = i.pointKeyIter + return nil } // constructRangeKeyIter constructs the range-key iterator stack, populating // i.rangeKey.rangeKeyIter with the resulting iterator. This is similar to // Iterator.constructRangeKeyIter, except it doesn't handle batches and ensures // iterConfig does *not* elide unsets/deletes. -func (i *scanInternalIterator) constructRangeKeyIter() { +func (i *scanInternalIterator) constructRangeKeyIter() error { // We want the bounded iter from iterConfig, but not the collapsing of // RangeKeyUnsets and RangeKeyDels. 
i.rangeKey.rangeKeyIter = i.rangeKey.iterConfig.Init( @@ -876,51 +1032,69 @@ func (i *scanInternalIterator) constructRangeKeyIter() { // NB: We iterate L0's files in reverse order. They're sorted by // LargestSeqNum ascending, and we need to add them to the merging iterator // in LargestSeqNum descending to preserve the merging iterator's invariants - // around Key Trailer order. + // around Key InternalKeyTrailer order. iter := current.RangeKeyLevels[0].Iter() for f := iter.Last(); f != nil; f = iter.Prev() { - spanIter, err := i.newIterRangeKey(f, i.opts.SpanIterOptions()) + spanIter, err := i.newIterRangeKey(i.ctx, f, i.opts.SpanIterOptions()) if err != nil { - i.rangeKey.iterConfig.AddLevel(&errorKeyspanIter{err: err}) - continue + return err } i.rangeKey.iterConfig.AddLevel(spanIter) } - // Add level iterators for the non-empty non-L0 levels. + skipStart := i.opts.skipLevelForOpts() for level := 1; level < len(current.RangeKeyLevels); level++ { if current.RangeKeyLevels[level].Empty() { continue } - if i.opts.skipSharedLevels && level >= sharedLevelsStart { + if level > skipStart { continue } li := i.rangeKey.iterConfig.NewLevelIter() spanIterOpts := i.opts.SpanIterOptions() - li.Init(spanIterOpts, i.comparer.Compare, i.newIterRangeKey, current.RangeKeyLevels[level].Iter(), + levIter := current.RangeKeyLevels[level].Iter() + if level == skipStart { + nonRemoteFiles := make([]*manifest.TableMetadata, 0) + for f := levIter.First(); f != nil; f = levIter.Next() { + meta, err := i.db.objProvider.Lookup(base.FileTypeTable, f.TableBacking.DiskFileNum) + if err != nil { + return err + } + if (meta.IsShared() && i.opts.visitSharedFile != nil) || + (meta.IsExternal() && i.opts.visitExternalFile != nil) { + // Skip this file. 
+ continue + } + nonRemoteFiles = append(nonRemoteFiles, f) + } + levSlice := manifest.NewLevelSliceKeySorted(i.db.cmp, nonRemoteFiles) + levIter = levSlice.Iter() + } + li.Init(i.ctx, spanIterOpts, i.comparer.Compare, i.newIterRangeKey, levIter, manifest.Level(level), manifest.KeyTypeRange) i.rangeKey.iterConfig.AddLevel(li) } + return nil } // seekGE seeks this iterator to the first key that's greater than or equal // to the specified user key. func (i *scanInternalIterator) seekGE(key []byte) bool { - i.iterKey, i.iterValue = i.iter.SeekGE(key, base.SeekGEFlagsNone) - return i.iterKey != nil + i.iterKV = i.iter.SeekGE(key, base.SeekGEFlagsNone) + return i.iterKV != nil } // unsafeKey returns the unsafe InternalKey at the current position. The value // is nil if the iterator is invalid or exhausted. func (i *scanInternalIterator) unsafeKey() *InternalKey { - return i.iterKey + return &i.iterKV.K } // lazyValue returns a value pointer to the value at the current iterator // position. Behaviour undefined if unsafeKey() returns a Range key or Rangedel // kind key. func (i *scanInternalIterator) lazyValue() LazyValue { - return i.iterValue + return i.iterKV.LazyValue() } // unsafeRangeDel returns a range key span. Behaviour undefined if UnsafeKey returns @@ -941,8 +1115,8 @@ func (i *scanInternalIterator) unsafeSpan() *keyspan.Span { // next advances the iterator in the forward direction, and returns the // iterator's new validity state. func (i *scanInternalIterator) next() bool { - i.iterKey, i.iterValue = i.iter.Next() - return i.iterKey != nil + i.iterKV = i.iter.Next() + return i.iterKV != nil } // error returns an error from the internal iterator, if there's any. @@ -951,10 +1125,9 @@ func (i *scanInternalIterator) error() error { } // close closes this iterator, and releases any pooled objects. 
-func (i *scanInternalIterator) close() error { - if err := i.iter.Close(); err != nil { - return err - } +func (i *scanInternalIterator) close() { + _ = i.iter.Close() + _ = i.blobValueFetcher.Close() if i.readState != nil { i.readState.unref() } @@ -985,7 +1158,6 @@ func (i *scanInternalIterator) close() error { iterAllocPool.Put(alloc) i.alloc = nil } - return nil } func (i *scanInternalIterator) initializeBoundBufs(lower, upper []byte) { diff --git a/vendor/github.com/cockroachdb/pebble/snapshot.go b/vendor/github.com/cockroachdb/pebble/v2/snapshot.go similarity index 82% rename from vendor/github.com/cockroachdb/pebble/snapshot.go rename to vendor/github.com/cockroachdb/pebble/v2/snapshot.go index d745e8a..9d877af 100644 --- a/vendor/github.com/cockroachdb/pebble/snapshot.go +++ b/vendor/github.com/cockroachdb/pebble/v2/snapshot.go @@ -9,24 +9,20 @@ import ( "io" "math" "sync" - "sync/atomic" "time" - "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/rangekey" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/manifest" + "github.com/cockroachdb/pebble/v2/rangekey" + "github.com/cockroachdb/pebble/v2/sstable/block" ) -// ErrSnapshotExcised is returned from WaitForFileOnlySnapshot if an excise -// overlapping with one of the EventuallyFileOnlySnapshot's KeyRanges gets -// applied before the transition of that EFOS to a file-only snapshot. -var ErrSnapshotExcised = errors.New("pebble: snapshot excised before conversion to file-only snapshot") - // Snapshot provides a read-only point-in-time view of the DB state. type Snapshot struct { // The db the snapshot was created from. db *DB - seqNum uint64 + seqNum base.SeqNum // Set if part of an EventuallyFileOnlySnapshot. 
efos *EventuallyFileOnlySnapshot @@ -67,7 +63,9 @@ func (s *Snapshot) NewIterWithContext(ctx context.Context, o *IterOptions) (*Ite if s.db == nil { panic(ErrClosed) } - return s.db.newIter(ctx, nil /* batch */, snapshotIterOpts{seqNum: s.seqNum}, o), nil + return s.db.newIter(ctx, nil /* batch */, newIterOpts{ + snapshot: snapshotIterOpts{seqNum: s.seqNum}, + }, o), nil } // ScanInternal scans all internal keys within the specified bounds, truncating @@ -78,21 +76,24 @@ func (s *Snapshot) NewIterWithContext(ctx context.Context, o *IterOptions) (*Ite // point keys deleted by range dels and keys masked by range keys. func (s *Snapshot) ScanInternal( ctx context.Context, + category block.Category, lower, upper []byte, visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error, - visitRangeDel func(start, end []byte, seqNum uint64) error, + visitRangeDel func(start, end []byte, seqNum base.SeqNum) error, visitRangeKey func(start, end []byte, keys []rangekey.Key) error, visitSharedFile func(sst *SharedSSTMeta) error, + visitExternalFile func(sst *ExternalFile) error, ) error { if s.db == nil { panic(ErrClosed) } scanInternalOpts := &scanInternalOptions{ - visitPointKey: visitPointKey, - visitRangeDel: visitRangeDel, - visitRangeKey: visitRangeKey, - visitSharedFile: visitSharedFile, - skipSharedLevels: visitSharedFile != nil, + category: category, + visitPointKey: visitPointKey, + visitRangeDel: visitRangeDel, + visitRangeKey: visitRangeKey, + visitSharedFile: visitSharedFile, + visitExternalFile: visitExternalFile, IterOptions: IterOptions{ KeyTypes: IterKeyTypePointsAndRanges, LowerBound: lower, @@ -100,7 +101,10 @@ func (s *Snapshot) ScanInternal( }, } - iter := s.db.newInternalIter(snapshotIterOpts{seqNum: s.seqNum}, scanInternalOpts) + iter, err := s.db.newInternalIter(ctx, snapshotIterOpts{seqNum: s.seqNum}, scanInternalOpts) + if err != nil { + return err + } defer iter.close() return scanInternalImpl(ctx, lower, upper, iter, 
scanInternalOpts) @@ -114,7 +118,8 @@ func (s *Snapshot) closeLocked() error { // If s was the previous earliest snapshot, we might be able to reclaim // disk space by dropping obsolete records that were pinned by s. if e := s.db.mu.snapshots.earliest(); e > s.seqNum { - s.db.maybeScheduleCompactionPicker(pickElisionOnly) + // NB: maybeScheduleCompaction also picks elision-only compactions. + s.db.maybeScheduleCompaction() } s.db = nil return nil @@ -160,19 +165,19 @@ func (l *snapshotList) count() int { return count } -func (l *snapshotList) earliest() uint64 { - v := uint64(math.MaxUint64) +func (l *snapshotList) earliest() base.SeqNum { + v := base.SeqNum(math.MaxUint64) if !l.empty() { v = l.root.next.seqNum } return v } -func (l *snapshotList) toSlice() []uint64 { +func (l *snapshotList) toSlice() []base.SeqNum { if l.empty() { return nil } - var results []uint64 + var results []base.SeqNum for i := l.root.next; i != &l.root; i = i.next { results = append(results, i.seqNum) } @@ -220,13 +225,12 @@ func (l *snapshotList) remove(s *Snapshot) { // the snapshot is closed may prefer EventuallyFileOnlySnapshots for their // reduced write amplification. Callers that desire the benefits of the file-only // state that requires no pinning of memtables should call -// `WaitForFileOnlySnapshot()` (and possibly re-mint an EFOS if it returns -// ErrSnapshotExcised) before relying on the EFOS to keep producing iterators +// `WaitForFileOnlySnapshot()` before relying on the EFOS to keep producing iterators // with zero write-amp and zero pinning of memtables in memory. // // EventuallyFileOnlySnapshots interact with the IngestAndExcise operation in -// subtle ways. No new iterators can be created once -// EventuallyFileOnlySnapshot.excised is set to true. +// subtle ways. The IngestAndExcise can force the transition of an EFOS to a +// file-only snapshot if an excise overlaps with the EFOS bounds. 
type EventuallyFileOnlySnapshot struct { mu struct { // NB: If both this mutex and db.mu are being grabbed, db.mu should be @@ -241,25 +245,19 @@ type EventuallyFileOnlySnapshot struct { // The wrapped regular snapshot, if not a file-only snapshot yet. snap *Snapshot // The wrapped version reference, if a file-only snapshot. - vers *version + vers *manifest.Version } // Key ranges to watch for an excise on. protectedRanges []KeyRange - // excised, if true, signals that the above ranges were excised during the - // lifetime of this snapshot. - excised atomic.Bool // The db the snapshot was created from. db *DB - seqNum uint64 - + seqNum base.SeqNum closed chan struct{} } -func (d *DB) makeEventuallyFileOnlySnapshot( - keyRanges []KeyRange, internalKeyRanges []internalKeyRange, -) *EventuallyFileOnlySnapshot { +func (d *DB) makeEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFileOnlySnapshot { isFileOnly := true d.mu.Lock() @@ -267,11 +265,10 @@ func (d *DB) makeEventuallyFileOnlySnapshot( seqNum := d.mu.versions.visibleSeqNum.Load() // Check if any of the keyRanges overlap with a memtable. for i := range d.mu.mem.queue { - mem := d.mu.mem.queue[i] - if ingestMemtableOverlaps(d.cmp, mem, internalKeyRanges) { + d.mu.mem.queue[i].computePossibleOverlaps(func(bounded) shouldContinue { isFileOnly = false - break - } + return stopIteration + }, sliceAsBounded(keyRanges)...) } es := &EventuallyFileOnlySnapshot{ db: d, @@ -303,7 +300,7 @@ func (d *DB) makeEventuallyFileOnlySnapshot( // call. // // d.mu must be held when calling this method. 
-func (es *EventuallyFileOnlySnapshot) transitionToFileOnlySnapshot(vers *version) error { +func (es *EventuallyFileOnlySnapshot) transitionToFileOnlySnapshot(vers *manifest.Version) error { es.mu.Lock() select { case <-es.closed: @@ -371,9 +368,6 @@ func (es *EventuallyFileOnlySnapshot) waitForFlush(ctx context.Context, dur time earliestUnflushedSeqNum = es.db.getEarliestUnflushedSeqNumLocked() } - if es.excised.Load() { - return ErrSnapshotExcised - } return nil } @@ -431,13 +425,7 @@ func (es *EventuallyFileOnlySnapshot) Get(key []byte) (value []byte, closer io.C if err != nil { return nil, nil, err } - var valid bool - if es.db.opts.Comparer.Split != nil { - valid = iter.SeekPrefixGE(key) - } else { - valid = iter.SeekGE(key) - } - if !valid { + if !iter.SeekPrefixGE(key) { if err = firstError(iter.Error(), iter.Close()); err != nil { return nil, nil, err } @@ -471,21 +459,11 @@ func (es *EventuallyFileOnlySnapshot) NewIterWithContext( defer es.mu.Unlock() if es.mu.vers != nil { sOpts := snapshotIterOpts{seqNum: es.seqNum, vers: es.mu.vers} - return es.db.newIter(ctx, nil /* batch */, sOpts, o), nil + return es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o), nil } - if es.excised.Load() { - return nil, ErrSnapshotExcised - } sOpts := snapshotIterOpts{seqNum: es.seqNum} - iter := es.db.newIter(ctx, nil /* batch */, sOpts, o) - - // If excised is true, then keys relevant to the snapshot might not be - // present in the readState being used by the iterator. Error out. - if es.excised.Load() { - iter.Close() - return nil, ErrSnapshotExcised - } + iter := es.db.newIter(ctx, nil /* batch */, newIterOpts{snapshot: sOpts}, o) return iter, nil } @@ -497,19 +475,31 @@ func (es *EventuallyFileOnlySnapshot) NewIterWithContext( // point keys deleted by range dels and keys masked by range keys. 
func (es *EventuallyFileOnlySnapshot) ScanInternal( ctx context.Context, + category block.Category, lower, upper []byte, visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error, - visitRangeDel func(start, end []byte, seqNum uint64) error, + visitRangeDel func(start, end []byte, seqNum base.SeqNum) error, visitRangeKey func(start, end []byte, keys []rangekey.Key) error, visitSharedFile func(sst *SharedSSTMeta) error, + visitExternalFile func(sst *ExternalFile) error, ) error { if es.db == nil { panic(ErrClosed) } - if es.excised.Load() { - return ErrSnapshotExcised - } var sOpts snapshotIterOpts + opts := &scanInternalOptions{ + category: category, + IterOptions: IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + LowerBound: lower, + UpperBound: upper, + }, + visitPointKey: visitPointKey, + visitRangeDel: visitRangeDel, + visitRangeKey: visitRangeKey, + visitSharedFile: visitSharedFile, + visitExternalFile: visitExternalFile, + } es.mu.Lock() if es.mu.vers != nil { sOpts = snapshotIterOpts{ @@ -522,26 +512,11 @@ func (es *EventuallyFileOnlySnapshot) ScanInternal( } } es.mu.Unlock() - opts := &scanInternalOptions{ - IterOptions: IterOptions{ - KeyTypes: IterKeyTypePointsAndRanges, - LowerBound: lower, - UpperBound: upper, - }, - visitPointKey: visitPointKey, - visitRangeDel: visitRangeDel, - visitRangeKey: visitRangeKey, - visitSharedFile: visitSharedFile, - skipSharedLevels: visitSharedFile != nil, + iter, err := es.db.newInternalIter(ctx, sOpts, opts) + if err != nil { + return err } - iter := es.db.newInternalIter(sOpts, opts) defer iter.close() - // If excised is true, then keys relevant to the snapshot might not be - // present in the readState being used by the iterator. Error out. 
- if es.excised.Load() { - return ErrSnapshotExcised - } - return scanInternalImpl(ctx, lower, upper, iter, opts) } diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/attributes.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/attributes.go new file mode 100644 index 0000000..55bac79 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/attributes.go @@ -0,0 +1,66 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package sstable + +import "strings" + +// Attributes is a bitset containing features in use in an sstable. +type Attributes uint32 + +const ( + AttributeValueBlocks Attributes = 1 << iota + AttributeRangeKeySets + AttributeRangeKeyUnsets + AttributeRangeKeyDels + AttributeRangeDels + AttributeTwoLevelIndex + AttributeBlobValues + AttributePointKeys +) + +// Intersects checks if any bits in attr are set in a. +func (a Attributes) Intersects(attr Attributes) bool { + return a&attr != 0 +} + +// Has checks if all bits in attr are set in a. +func (a Attributes) Has(attr Attributes) bool { + return a&attr == attr +} + +// Add sets the bits in attr to a. +func (a *Attributes) Add(attr Attributes) { + *a = *a | attr +} + +// String converts the Attributes fs to a string representation for testing. 
+func (a Attributes) String() string { + var attributes []string + if a.Has(AttributeValueBlocks) { + attributes = append(attributes, "ValueBlocks") + } + if a.Has(AttributeRangeKeySets) { + attributes = append(attributes, "RangeKeySets") + } + if a.Has(AttributeRangeKeyUnsets) { + attributes = append(attributes, "RangeKeyUnsets") + } + if a.Has(AttributeRangeKeyDels) { + attributes = append(attributes, "RangeKeyDels") + } + if a.Has(AttributeRangeDels) { + attributes = append(attributes, "RangeDels") + } + if a.Has(AttributeTwoLevelIndex) { + attributes = append(attributes, "TwoLevelIndex") + } + if a.Has(AttributeBlobValues) { + attributes = append(attributes, "BlobValues") + } + if a.Has(AttributePointKeys) { + attributes = append(attributes, "PointKeys") + } + return "[" + strings.Join(attributes, ",") + "]" +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/blob.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/blob.go new file mode 100644 index 0000000..d16f770 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/blob.go @@ -0,0 +1,525 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package blob + +import ( + "bytes" + "context" + "encoding/binary" + "fmt" + "sync" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/crc" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/sstable/block/blockkind" +) + +var ( + errClosed = errors.New("blob: writer closed") +) + +// FileFormat identifies the format of a blob file. +type FileFormat uint8 + +// String implements the fmt.Stringer interface. 
+func (f FileFormat) String() string { + switch f { + case FileFormatV1: + return "blobV1" + default: + return "unknown" + } +} + +const ( + // FileFormatV1 is the first version of the blob file format. + FileFormatV1 FileFormat = 1 +) + +const ( + fileFooterLength = 38 + fileMagic = "\xf0\x9f\xaa\xb3\xf0\x9f\xa6\x80" // 🪳🦀 +) + +// FileWriterOptions are used to configure the FileWriter. +type FileWriterOptions struct { + Compression *block.CompressionProfile + ChecksumType block.ChecksumType + FlushGovernor block.FlushGovernor + // Only CPUMeasurer.MeasureCPUBlobFileSecondary is used. + CpuMeasurer base.CPUMeasurer +} + +func (o *FileWriterOptions) ensureDefaults() { + if o.Compression == nil { + o.Compression = block.SnappyCompression + } + if o.ChecksumType == block.ChecksumTypeNone { + o.ChecksumType = block.ChecksumTypeCRC32c + } + if o.FlushGovernor == (block.FlushGovernor{}) { + o.FlushGovernor = block.MakeFlushGovernor( + base.DefaultBlockSize, + base.DefaultBlockSizeThreshold, + base.SizeClassAwareBlockSizeThreshold, + nil) + } + if o.CpuMeasurer == nil { + o.CpuMeasurer = base.NoopCPUMeasurer{} + } +} + +// FileWriterStats aggregates statistics about a blob file written by a +// FileWriter. +type FileWriterStats struct { + BlockCount uint32 + ValueCount uint32 + UncompressedValueBytes uint64 + FileLen uint64 +} + +// String implements the fmt.Stringer interface. +func (s FileWriterStats) String() string { + var buf bytes.Buffer + fmt.Fprintf(&buf, "{BlockCount: %d, ValueCount: %d, UncompressedValueBytes: %d, FileLen: %d}", + s.BlockCount, s.ValueCount, s.UncompressedValueBytes, s.FileLen) + return buf.String() +} + +// A FileWriter writes a blob file. +type FileWriter struct { + fileNum base.DiskFileNum + w objstorage.Writable + err error + valuesEncoder blobValueBlockEncoder + // indexEncoder is an encoder for the index block. Every blob file has an + // index block encoding the offsets at which each block is written. 
+ // Additionally, when rewriting a blob file, the index block's virtualBlocks + // column is also populated to remap blockIDs to the physical block indexes. + indexEncoder indexBlockEncoder + stats FileWriterStats + flushGov block.FlushGovernor + checksummer block.Checksummer + compressor block.Compressor + cpuMeasurer base.CPUMeasurer + writeQueue struct { + wg sync.WaitGroup + ch chan compressedBlock + err error + } +} + +type compressedBlock struct { + pb block.PhysicalBlock + bh *block.TempBuffer + off uint64 +} + +// NewFileWriter creates a new FileWriter. +func NewFileWriter(fn base.DiskFileNum, w objstorage.Writable, opts FileWriterOptions) *FileWriter { + opts.ensureDefaults() + fw := writerPool.Get().(*FileWriter) + fw.fileNum = fn + fw.w = w + fw.valuesEncoder.Init() + fw.flushGov = opts.FlushGovernor + fw.indexEncoder.Init() + fw.checksummer = block.Checksummer{Type: opts.ChecksumType} + fw.compressor = block.MakeCompressor(opts.Compression) + fw.cpuMeasurer = opts.CpuMeasurer + fw.writeQueue.ch = make(chan compressedBlock) + fw.writeQueue.wg.Add(1) + go fw.drainWriteQueue() + return fw +} + +var writerPool = sync.Pool{ + New: func() interface{} { return &FileWriter{} }, +} + +// AddValue adds the provided value to the blob file, returning a Handle +// identifying the location of the value. +func (w *FileWriter) AddValue(v []byte) Handle { + // Determine if we should first flush the block. + if sz := w.valuesEncoder.size(); w.flushGov.ShouldFlush(sz, sz+len(v)) { + w.flush() + } + valuesInBlock := w.valuesEncoder.Count() + w.stats.ValueCount++ + w.stats.UncompressedValueBytes += uint64(len(v)) + w.valuesEncoder.AddValue(v) + return Handle{ + BlobFileID: base.BlobFileID(w.fileNum), + ValueLen: uint32(len(v)), + BlockID: BlockID(w.stats.BlockCount), + ValueID: BlockValueID(valuesInBlock), + } +} + +// beginNewVirtualBlock adds a virtual block mapping to the current physical +// block and valueID offset within the block. 
+// +// When a blob file is rewritten, beginNewVirtualBlock is called for each block +// in the original blob file before adding any of the block's extant values. +// beginNewVirtualBlock records a mapping from the original block ID (referred +// to as a virtual block) to a tuple of the physical block index and the offset +// of the BlockValueIDs within the new physical block. +// +// This mapping is used by readers to determine which physical block contains a +// given virtual block, and how to map BlockValueIDs from the given virtual +// block to BlockValueIDs in the physical block. +func (w *FileWriter) beginNewVirtualBlock(vblockID BlockID) { + // TODO(jackson): Update tests to use the blob.FileRewriter type and move this + // into the FileRewriter. + w.indexEncoder.AddVirtualBlockMapping(vblockID, int(w.stats.BlockCount), + BlockValueID(w.valuesEncoder.Count())) +} + +// EstimatedSize returns an estimate of the disk space consumed by the blob file +// if it were closed now. +func (w *FileWriter) EstimatedSize() uint64 { + sz := w.stats.FileLen // Completed blocks + sz += uint64(w.valuesEncoder.size()) + block.TrailerLen // Pending uncompressed block + // We estimate the size of the index block as 4 bytes per offset, and n+1 + // offsets for n block handles. We don't use an exact accounting because the + // index block is constructed from the write queue goroutine, so using the + // exact size would introduce nondeterminism. The index block is small + // relatively speaking. In practice, offsets should use at most 4 bytes per + // offset. + sz += uint64(w.stats.BlockCount+1)*4 + block.TrailerLen // Index block + sz += fileFooterLength // Footer + return sz +} + +// FlushForTesting flushes the current block to the write queue. Writers should +// generally not call FlushForTesting, and instead let the heuristics configured +// through FileWriterOptions handle flushing. 
+// +// It's exposed so that tests can force flushes to construct blob files with +// arbitrary structures. +func (w *FileWriter) FlushForTesting() { + if w.valuesEncoder.Count() == 0 { + return + } + w.flush() +} + +// flush flushes the current block to the write queue. +func (w *FileWriter) flush() { + if w.valuesEncoder.Count() == 0 { + panic(errors.AssertionFailedf("no values to flush")) + } + pb, bh := block.CompressAndChecksumToTempBuffer(w.valuesEncoder.Finish(), blockkind.BlobValue, &w.compressor, &w.checksummer) + compressedLen := uint64(pb.LengthWithoutTrailer()) + w.stats.BlockCount++ + off := w.stats.FileLen + w.stats.FileLen += compressedLen + block.TrailerLen + w.writeQueue.ch <- compressedBlock{pb: pb, bh: bh, off: off} + w.valuesEncoder.Reset() +} + +// drainWriteQueue runs in its own goroutine and is responsible for writing +// finished, compressed data blocks to the writable. It reads from w.writeQueue +// until the channel is closed. All value blocks are written by this goroutine. +func (w *FileWriter) drainWriteQueue() { + defer w.writeQueue.wg.Done() + // Call once to initialize the CPU measurer. + w.cpuMeasurer.MeasureCPU(base.CompactionGoroutineBlobFileSecondary) + for cb := range w.writeQueue.ch { + _, err := cb.pb.WriteTo(w.w) + // Report to the CPU measurer immediately after writing (note that there + // may be a time lag until the next block is available to write). + w.cpuMeasurer.MeasureCPU(base.CompactionGoroutineBlobFileSecondary) + if err != nil { + w.writeQueue.err = err + continue + } + w.indexEncoder.AddBlockHandle(block.Handle{ + Offset: cb.off, + Length: uint64(cb.pb.LengthWithoutTrailer()), + }) + // We're done with the buffer associated with this physical block. + // Release it back to its pool. + cb.bh.Release() + } +} + +// Close finishes writing the blob file. 
+func (w *FileWriter) Close() (FileWriterStats, error) { + if w.w == nil { + return FileWriterStats{}, w.err + } + // Flush the last block to the write queue if it's non-empty. + if w.valuesEncoder.Count() > 0 { + w.flush() + } + // Inform the write queue we're finished by closing the channel and wait + // for it to complete. + close(w.writeQueue.ch) + w.writeQueue.wg.Wait() + var err error + if w.writeQueue.err != nil { + err = w.writeQueue.err + if w.w != nil { + w.w.Abort() + } + return FileWriterStats{}, err + } + stats := w.stats + if stats.BlockCount != uint32(w.indexEncoder.countBlocks) { + panic(errors.AssertionFailedf("block count mismatch: %d vs %d", + stats.BlockCount, w.indexEncoder.countBlocks)) + } + if stats.BlockCount == 0 { + panic(errors.AssertionFailedf("no blocks written")) + } + + // Write the index block. + var indexBlockHandle block.Handle + { + indexBlock := w.indexEncoder.Finish() + var compressedBuf []byte + pb := block.CopyAndChecksum(&compressedBuf, indexBlock, blockkind.Metadata, &w.compressor, &w.checksummer) + if _, w.err = pb.WriteTo(w.w); w.err != nil { + err = w.err + if w.w != nil { + w.w.Abort() + } + return FileWriterStats{}, err + } + indexBlockHandle.Offset = stats.FileLen + indexBlockHandle.Length = uint64(pb.LengthWithoutTrailer()) + stats.FileLen += uint64(pb.LengthWithTrailer()) + } + + // Write the footer. + footer := fileFooter{ + format: FileFormatV1, + checksum: w.checksummer.Type, + indexHandle: indexBlockHandle, + originalFileNum: w.fileNum, + } + footerBuf := make([]byte, fileFooterLength) + footer.encode(footerBuf) + if w.err = w.w.Write(footerBuf); w.err != nil { + err = w.err + if w.w != nil { + w.w.Abort() + } + return FileWriterStats{}, err + } + stats.FileLen += fileFooterLength + if w.err = w.w.Finish(); w.err != nil { + err = w.err + if w.w != nil { + w.w.Abort() + } + return FileWriterStats{}, err + } + + // Clean up w and return it to the pool. 
+ w.indexEncoder.Reset() + w.valuesEncoder.Reset() + w.w = nil + w.stats = FileWriterStats{} + w.err = errClosed + w.writeQueue.ch = nil + w.writeQueue.err = nil + writerPool.Put(w) + return stats, nil +} + +// fileFooter contains the information contained within the footer of a blob +// file. +// +// Blob file footer format: +// - checksum CRC over footer data (4 bytes) +// - index block offset (8 bytes) +// - index block length (8 bytes) +// - checksum type (1 byte) +// - format (1 byte) +// - original file number (8 bytes) +// - blob file magic string (8 bytes) +type fileFooter struct { + format FileFormat + checksum block.ChecksumType + indexHandle block.Handle + originalFileNum base.DiskFileNum +} + +func (f *fileFooter) decode(b []byte) error { + if uint64(len(b)) != fileFooterLength { + return errors.AssertionFailedf("invalid blob file footer length") + } + encodedChecksum := binary.LittleEndian.Uint32(b[0:]) + computedChecksum := crc.New(b[4:]).Value() + if encodedChecksum != computedChecksum { + return base.CorruptionErrorf("invalid blob file checksum 0x%04x, expected: 0x%04x", encodedChecksum, computedChecksum) + } + f.indexHandle.Offset = binary.LittleEndian.Uint64(b[4:]) + f.indexHandle.Length = binary.LittleEndian.Uint64(b[12:]) + f.checksum = block.ChecksumType(b[20]) + f.format = FileFormat(b[21]) + if f.format != FileFormatV1 { + return base.CorruptionErrorf("invalid blob file format %x", f.format) + } + f.originalFileNum = base.DiskFileNum(binary.LittleEndian.Uint64(b[22:])) + if string(b[30:]) != fileMagic { + return base.CorruptionErrorf("invalid blob file magic string %x", b[30:]) + } + return nil +} + +func (f *fileFooter) encode(b []byte) { + binary.LittleEndian.PutUint64(b[4:], f.indexHandle.Offset) + binary.LittleEndian.PutUint64(b[12:], f.indexHandle.Length) + b[20] = byte(f.checksum) + b[21] = byte(f.format) + binary.LittleEndian.PutUint64(b[22:], uint64(f.originalFileNum)) + copy(b[30:], fileMagic) + footerChecksum := crc.New(b[4 : 
30+len(fileMagic)]).Value() + binary.LittleEndian.PutUint32(b[:4], footerChecksum) +} + +// FileReader reads a blob file. +// If you update this struct, make sure you also update the magic number in +// StringForTests() in metrics.go. +type FileReader struct { + r block.Reader + footer fileFooter +} + +// Assert that FileReader implements the ValueReader interface. +var _ ValueReader = (*FileReader)(nil) + +// FileReaderOptions configures a reader of a blob file. +type FileReaderOptions struct { + block.ReaderOptions +} + +func (o FileReaderOptions) ensureDefaults() FileReaderOptions { + if o.LoggerAndTracer == nil { + o.LoggerAndTracer = base.NoopLoggerAndTracer{} + } + return o +} + +// NewFileReader opens a blob file for reading. +// +// In error cases, the objstorage.Readable is still open. The caller remains +// responsible for closing it if necessary. +func NewFileReader( + ctx context.Context, r objstorage.Readable, ro FileReaderOptions, +) (*FileReader, error) { + ro = ro.ensureDefaults() + + fileNum := ro.CacheOpts.FileNum + + var footerBuf [fileFooterLength]byte + size := r.Size() + off := size - fileFooterLength + if size < fileFooterLength { + return nil, base.CorruptionErrorf("pebble: invalid blob file %s (file size is too small)", + errors.Safe(fileNum)) + } + var preallocRH objstorageprovider.PreallocatedReadHandle + rh := objstorageprovider.UsePreallocatedReadHandle( + r, objstorage.ReadBeforeForNewReader, &preallocRH) + + encodedFooter, err := block.ReadRaw(ctx, r, rh, ro.LoggerAndTracer, fileNum, footerBuf[:], off) + _ = rh.Close() + if err != nil { + return nil, err + } + + fr := &FileReader{} + if err := fr.footer.decode(encodedFooter); err != nil { + return nil, err + } + fr.r.Init(r, ro.ReaderOptions, fr.footer.checksum) + return fr, nil +} + +// Close implements io.Closer, closing the underlying Readable. 
+func (r *FileReader) Close() error { + return r.r.Close() +} + +// InitReadHandle initializes a read handle for the file reader, using the +// provided preallocated read handle. +func (r *FileReader) InitReadHandle( + rh *objstorageprovider.PreallocatedReadHandle, +) objstorage.ReadHandle { + return objstorageprovider.UsePreallocatedReadHandle(r.r.Readable(), objstorage.NoReadBefore, rh) +} + +// ReadValueBlock reads a value block from the file. +func (r *FileReader) ReadValueBlock( + ctx context.Context, env block.ReadEnv, rh objstorage.ReadHandle, h block.Handle, +) (block.BufferHandle, error) { + return r.r.Read(ctx, env, rh, h, blockkind.BlobValue, initBlobValueBlockMetadata) +} + +// ReadIndexBlock reads the index block from the file. +func (r *FileReader) ReadIndexBlock( + ctx context.Context, env block.ReadEnv, rh objstorage.ReadHandle, +) (block.BufferHandle, error) { + return r.r.Read(ctx, env, rh, r.footer.indexHandle, blockkind.Metadata, initIndexBlockMetadata) +} + +// IndexHandle returns the block handle for the file's index block. +func (r *FileReader) IndexHandle() block.Handle { + return r.footer.indexHandle +} + +// Layout returns the layout (block organization) as a string for a blob file. 
+func (r *FileReader) Layout() (string, error) { + ctx := context.TODO() + + indexH, err := r.ReadIndexBlock(ctx, block.NoReadEnv, nil /* rh */) + if err != nil { + return "", err + } + defer indexH.Release() + + var buf bytes.Buffer + indexDecoder := indexBlockDecoder{} + indexDecoder.Init(indexH.BlockData()) + + if indexDecoder.virtualBlockCount > 0 { + fmt.Fprintf(&buf, "virtual blocks mapping:\n") + for i := range indexDecoder.virtualBlockCount { + blockIndex, valueIDOffset := indexDecoder.RemapVirtualBlockID(BlockID(i)) + fmt.Fprintf(&buf, "virtual block %d -> physical block %d (valueID offset: %d)\n", + i, blockIndex, valueIDOffset) + } + fmt.Fprintf(&buf, "\n") + } + + fmt.Fprintf(&buf, "physical blocks:\n") + for i := range indexDecoder.BlockCount() { + handle := indexDecoder.BlockHandle(i) + fmt.Fprintf(&buf, "block %d: offset=%d length=%d\n", i, handle.Offset, handle.Length) + + valueBlockH, err := r.ReadValueBlock(ctx, block.NoReadEnv, nil /* rh */, handle) + if err != nil { + return "", err + } + + valueDecoder := blobValueBlockDecoder{} + valueDecoder.Init(valueBlockH.BlockData()) + + fmt.Fprintf(&buf, "values: %d\n", valueDecoder.bd.Rows()) + fmt.Fprintf(&buf, "%s", valueDecoder.bd.FormattedString()) + + valueBlockH.Release() + } + + return buf.String(), nil +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/blocks.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/blocks.go new file mode 100644 index 0000000..740789e --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/blocks.go @@ -0,0 +1,365 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package blob + +import ( + "encoding/binary" + "unsafe" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/binfmt" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/sstable/colblk" +) + +const ( + indexBlockCustomHeaderSize = 4 + indexBlockColumnCount = 2 + indexBlockColumnVirtualBlocksIdx = 0 + indexBlockColumnOffsetsIdx = 1 + + virtualBlockIndexMask = 0x00000000ffffffff +) + +// indexBlockEncoder encodes a blob index block. +// +// A blob index block tells a reader where in the file each physical blob-value +// block is located. Its format is a columnar block with two columns. The +// offsets column is an array of uints encoding the file offset at which each +// block begins. The last entry in the array points after the last block. The +// virtualBlocks column is an array of uints that is only non-empty in the case +// of rewritten blob files. It encodes a mapping from the original blob file's +// blockIDs to a tuple of the index of the physical block containing the block's +// data and offset that should be added to the value ID to get the index of the +// block's values within the physical block. +type indexBlockEncoder struct { + // countBlocks is the number of physical blocks in the blob file. The number + // of rows in the offsets column is countBlocks+1. The last offset points to + // the first byte after the last block so that a reader can compute the + // length of the last block. + countBlocks int + countVirtualBlocks int + virtualBlocks colblk.UintBuilder + // offsets contains the offset of the start of each block. There is +1 more + // offset than there are blocks, with the last offset pointing to the first + // byte after the last block. Block lengths are inferred from the difference + // between consecutive offsets. 
+ offsets colblk.UintBuilder + enc colblk.BlockEncoder +} + +// Init initializes the index block encoder. +func (e *indexBlockEncoder) Init() { + e.countBlocks = 0 + e.countVirtualBlocks = 0 + e.offsets.Init() + e.virtualBlocks.Init() +} + +// Reset resets the index block encoder to its initial state, retaining buffers. +func (e *indexBlockEncoder) Reset() { + e.countBlocks = 0 + e.countVirtualBlocks = 0 + e.offsets.Reset() + e.virtualBlocks.Reset() + e.enc.Reset() +} + +// AddBlockHandle adds a handle to a blob-value block to the index block. +func (e *indexBlockEncoder) AddBlockHandle(h block.Handle) { + // Every call to AddBlockHandle adds its end offset (i.e, the next block's + // start offset) to the offsets column. + // + // The first call to AddBlockHandle must also add the start offset the first + // block. We also verify that for subsequent blocks, the start offset + // matches the offset encoded by the previous call to AddBlockHandle. + if e.countBlocks == 0 { + e.offsets.Set(0, h.Offset) + } else if expected := e.offsets.Get(e.countBlocks); expected != h.Offset { + panic(errors.AssertionFailedf("block handle %s doesn't have expected offset of %d", h, expected)) + } + + // Increment the number blocks, and set the endOffset. + e.countBlocks++ + endOffset := h.Offset + h.Length + block.TrailerLen + e.offsets.Set(e.countBlocks, endOffset) +} + +// AddVirtualBlockMapping adds a mapping from a virtual block ID to a physical +// block ID and a value ID offset. It's used when rewriting a blob file. +func (e *indexBlockEncoder) AddVirtualBlockMapping( + virtualBlockID BlockID, physicalBlockIndex int, valueIDOffset BlockValueID, +) { + // Require that virtual blocks are added in order. 
+ if virtualBlockID < BlockID(e.countVirtualBlocks) { + panic(errors.AssertionFailedf("virtual block ID %d is out of order; expected %d", virtualBlockID, e.countVirtualBlocks)) + } + // If there's a gap within the virtual block IDs, we fill in the gap with + // entries that clarify these blocks are empty. + for id := BlockID(e.countVirtualBlocks); id < virtualBlockID; id++ { + e.virtualBlocks.Set(int(id), virtualBlockIndexMask) + e.countVirtualBlocks++ + } + e.virtualBlocks.Set(int(virtualBlockID), uint64(physicalBlockIndex)|(uint64(valueIDOffset)<<32)) + e.countVirtualBlocks++ +} + +func (e *indexBlockEncoder) size() int { + off := colblk.HeaderSize(indexBlockColumnCount, indexBlockCustomHeaderSize) + if e.countVirtualBlocks > 0 { + off = e.virtualBlocks.Size(e.countVirtualBlocks, off) + } + if e.countBlocks > 0 { + off = e.offsets.Size(e.countBlocks+1, off) + } + off++ + return int(off) +} + +// Finish serializes the pending index block. +func (e *indexBlockEncoder) Finish() []byte { + e.enc.Init(e.size(), colblk.Header{ + Version: colblk.Version1, + Columns: indexBlockColumnCount, + Rows: uint32(e.countBlocks), + }, indexBlockCustomHeaderSize) + e.enc.Encode(e.countVirtualBlocks, &e.virtualBlocks) + e.enc.Encode(e.countBlocks+1, &e.offsets) + data := e.enc.Finish() + binary.LittleEndian.PutUint32(data, uint32(e.countVirtualBlocks)) + return data +} + +// An indexBlockDecoder decodes blob file index blocks. See the doc comment for +// details on the encoding. +type indexBlockDecoder struct { + // virtualBlockCount is zero for blob files created during ordinary + // compactions. When a blob file is rewritten, virtualBlockCount is nonzero + // and holds the count of blocks in the original blob file. The + // virtualBlocks column contains virtualBlockCount rows. + virtualBlockCount int + // virtualBlocks is a column of uints remapping a BlockID to a tuple of + // (physicalBlockIndex, valueIDOffset). 
The valueIDOffset is encoded in the + // most-significant 32 bits of each uint value. + virtualBlocks colblk.UnsafeUints + // offsets contains the offset of the start of each block. There is +1 more + // offset than there are blocks, with the last offset pointing to the first + // byte after the last block. Block lengths are inferred from the difference + // between consecutive offsets. + offsets colblk.UnsafeUints + bd colblk.BlockDecoder +} + +// Init initializes the index block decoder with the given serialized index +// block. +func (d *indexBlockDecoder) Init(data []byte) { + d.virtualBlockCount = int(binary.LittleEndian.Uint32(data)) + d.bd.Init(data, indexBlockCustomHeaderSize) + d.virtualBlocks = colblk.DecodeColumn(&d.bd, indexBlockColumnVirtualBlocksIdx, + d.virtualBlockCount, colblk.DataTypeUint, colblk.DecodeUnsafeUints) + // Decode the offsets column. We pass rows+1 because an index block encoding + // n block handles encodes n+1 offsets. + d.offsets = colblk.DecodeColumn(&d.bd, indexBlockColumnOffsetsIdx, + d.bd.Rows()+1, colblk.DataTypeUint, colblk.DecodeUnsafeUints) +} + +// BlockHandle returns the block handle for the given block index in the +// range [0, bd.Rows()). +func (d *indexBlockDecoder) BlockHandle(blockIndex int) block.Handle { + invariants.CheckBounds(blockIndex, d.bd.Rows()) + // TODO(jackson): Add an At2 method to the UnsafeUints type too. + offset := d.offsets.At(blockIndex) + offset2 := d.offsets.At(blockIndex + 1) + return block.Handle{ + Offset: offset, + Length: offset2 - offset - block.TrailerLen, + } +} + +// RemapVirtualBlockID remaps a virtual block ID to a physical block index and a +// value ID offset. RemapVirtualBlockID should only be called on index blocks +// with a non-empty virtual blocks column (i.e., index blocks for rewritten blob +// files). 
+// +// REQUIRES: d.virtualBlockCount > 0 +func (d *indexBlockDecoder) RemapVirtualBlockID( + blockID BlockID, +) (blockIndex int, valueIDOffset BlockValueID) { + invariants.CheckBounds(int(blockID), d.virtualBlockCount) + v := d.virtualBlocks.At(int(blockID)) + blockIndex = int(v & virtualBlockIndexMask) + valueIDOffset = BlockValueID(v >> 32) + return blockIndex, valueIDOffset +} + +// BlockCount returns the number of physical blocks encoded in the index block. +func (d *indexBlockDecoder) BlockCount() int { + return int(d.bd.Rows()) +} + +// DebugString prints a human-readable explanation of the index block's binary +// representation. +func (d *indexBlockDecoder) DebugString() string { + f := binfmt.New(d.bd.Data()).LineWidth(20) + tp := treeprinter.New() + d.Describe(f, tp.Child("index-block-decoder")) + return tp.String() +} + +// Describe describes the binary format of the index block, assuming f.Offset() +// is positioned at the beginning of the same index block described by d. +func (d *indexBlockDecoder) Describe(f *binfmt.Formatter, tp treeprinter.Node) { + // Set the relative offset. When loaded into memory, the beginning of blocks + // are aligned. Padding that ensures alignment is done relative to the + // current offset. Setting the relative offset ensures that if we're + // describing this block within a larger structure (eg, f.Offset()>0), we + // compute padding appropriately assuming the current byte f.Offset() is + // aligned. + f.SetAnchorOffset() + + n := tp.Child("index block header") + f.HexBytesln(4, "virtual block count: %d", d.virtualBlockCount) + d.bd.HeaderToBinFormatter(f, n) + d.bd.ColumnToBinFormatter(f, n, indexBlockColumnVirtualBlocksIdx, d.virtualBlockCount) + d.bd.ColumnToBinFormatter(f, n, indexBlockColumnOffsetsIdx, d.bd.Rows()+1) + f.HexBytesln(1, "block padding byte") + f.ToTreePrinter(n) +} + +// Assert that an IndexBlockDecoder can fit inside block.Metadata. 
+const _ uint = block.MetadataSize - uint(unsafe.Sizeof(indexBlockDecoder{})) + +// initIndexBlockMetadata initializes the index block metadata. +func initIndexBlockMetadata(md *block.Metadata, data []byte) (err error) { + d := block.CastMetadataZero[indexBlockDecoder](md) + // Initialization can panic; convert panics to corruption errors (so higher + // layers can add file number and offset information). + defer func() { + if r := recover(); r != nil { + err = base.CorruptionErrorf("error initializing index block metadata: %v", r) + } + }() + d.Init(data) + return nil +} + +const ( + blobValueBlockCustomHeaderSize = 0 + blobValueBlockColumnCount = 1 + blobValueBlockColumnValuesIdx = 0 +) + +// blobValueBlockEncoder encodes a blob value block. +// +// A blob value block is a columnar block containing a single column: an array +// of bytes encoding values. +type blobValueBlockEncoder struct { + values colblk.RawBytesBuilder + enc colblk.BlockEncoder +} + +// Init initializes the blob value block encoder. +func (e *blobValueBlockEncoder) Init() { + e.values.Init() +} + +// Reset resets the blob value block encoder to its initial state, retaining +// buffers. +func (e *blobValueBlockEncoder) Reset() { + e.values.Reset() + e.enc.Reset() +} + +// AddValue adds a value to the blob value block. +func (e *blobValueBlockEncoder) AddValue(v []byte) { + e.values.Put(v) +} + +// Count returns the number of values in the blob value block. +func (e *blobValueBlockEncoder) Count() int { + return e.values.Rows() +} + +func (e *blobValueBlockEncoder) size() int { + rows := e.values.Rows() + if rows == 0 { + return 0 + } + off := colblk.HeaderSize(blobValueBlockColumnCount, blobValueBlockCustomHeaderSize) + off = e.values.Size(rows, off) + off++ + return int(off) +} + +// Finish serializes the pending blob value block. 
+func (e *blobValueBlockEncoder) Finish() []byte { + e.enc.Init(e.size(), colblk.Header{ + Version: colblk.Version1, + Columns: blobValueBlockColumnCount, + Rows: uint32(e.values.Rows()), + }, blobValueBlockCustomHeaderSize) + e.enc.Encode(e.values.Rows(), &e.values) + return e.enc.Finish() +} + +// A blobValueBlockDecoder reads columnar blob value blocks. +type blobValueBlockDecoder struct { + values colblk.RawBytes + bd colblk.BlockDecoder +} + +// Init initializes the decoder with the given serialized blob value block. +func (d *blobValueBlockDecoder) Init(data []byte) { + d.bd.Init(data, blobValueBlockCustomHeaderSize) + d.values = d.bd.RawBytes(blobValueBlockColumnValuesIdx) +} + +// DebugString prints a human-readable explanation of the blob value block's +// binary representation. +func (d *blobValueBlockDecoder) DebugString() string { + f := binfmt.New(d.bd.Data()).LineWidth(20) + tp := treeprinter.New() + d.Describe(f, tp.Child("blob-value-block-decoder")) + return tp.String() +} + +// Describe describes the binary format of the blob value block, assuming +// f.Offset() is positioned at the beginning of the same blob value block +// described by d. +func (d *blobValueBlockDecoder) Describe(f *binfmt.Formatter, tp treeprinter.Node) { + // Set the relative offset. When loaded into memory, the beginning of blocks + // are aligned. Padding that ensures alignment is done relative to the + // current offset. Setting the relative offset ensures that if we're + // describing this block within a larger structure (eg, f.Offset()>0), we + // compute padding appropriately assuming the current byte f.Offset() is + // aligned. + f.SetAnchorOffset() + + n := tp.Child("blob value block header") + d.bd.HeaderToBinFormatter(f, n) + d.bd.ColumnToBinFormatter(f, n, blobValueBlockColumnValuesIdx, d.bd.Rows()) + f.HexBytesln(1, "block padding byte") + f.ToTreePrinter(n) +} + +// Assert that an BlobBlockDecoder can fit inside block.Metadata. 
+const _ uint = block.MetadataSize - uint(unsafe.Sizeof(blobValueBlockDecoder{})) + +// initBlobValueBlockMetadata initializes the blob value block metadata. +func initBlobValueBlockMetadata(md *block.Metadata, data []byte) (err error) { + d := block.CastMetadataZero[blobValueBlockDecoder](md) + // Initialization can panic; convert panics to corruption errors (so higher + // layers can add file number and offset information). + defer func() { + if r := recover(); r != nil { + err = base.CorruptionErrorf("error initializing blob value block metadata: %v", r) + } + }() + d.Init(data) + return nil +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/doc.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/doc.go new file mode 100644 index 0000000..9c0041a --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/doc.go @@ -0,0 +1,126 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// Package blob implements mechanics for encoding and decoding values into blob +// files. +// +// # Blob file format +// +// A blob file consists of a sequence of blob-value blocks containing values, +// followed by an index block describing the location of the blob-value blocks. +// At the tail of the file is a fixed-size footer encoding the exact offset and +// length of the index block. +// +// Semantically, a blob file implements an array of blob values. SSTables that +// reference separated blob values encode a tuple of a (blockID, blockValueID) +// to identify the value within the blob file. The blockID identifies the +// blob-value block that contains the value. The blockValueID identifies the +// value within the block. A reader retrieving a particular value uses the index +// block to identify the offset and length of the blob-value block containing +// the referenced value. 
It loads the identified blob-value block and then uses +// the block's internal structure to retrieve the value based on the +// blockValueID. +// +// A blob file may be rewritten (without rewriting referencing sstables) to +// remove unused values. Extant handles within sstables must continue to work. +// See the Sparseness section below for more details. +// +// ## Index Block +// +// The index block is used to determine which blob-value block contains a +// particular value and the block's physical offset and length within the file. +// The index block uses a columnar encoding (see pkg colblk) to encode two +// columns: +// +// **Virtual Blocks**: +// an array of uints that is only non-empty for blob files that have been +// rewritten. The length of the array is identified by the first 4 bytes of the +// index block as a custom block header. Within the array each 64-bit uint +// value's least significant 32 bits encode the index of the physical block +// containing the original block's data. This index can be used to look up the +// byte offset and length of the physical block within the index block's offsets +// column. The most significant 32 bits of each uint value encode a BlockValueID +// offset that remaps the original BlockValueID to the corresponding +// BlockValueID in the new physical block. A reader adds this BlockValueID +// offset to a handle's BlockValueID to get the index of the value within the +// physical block. +// +// TODO(jackson,radu): Consider interleaving the encoding of the uints so that +// in the common case of <64K blocks and <64K values per-block, the uint column +// can be encoded in 32-bits. +// See related issue https://github.com/cockroachdb/pebble/v2/issues/4426. +// +// **Offsets**: +// an array of uints encoding the offset in the blob file at which each block +// begins. There are +1 offsets. The last offset points to +// the first byte after the last block. 
 The length of each block is inferred
+// through the difference between consecutive offsets.
+//
+// ## Blob Value Blocks
+//
+// A blob value block is a columnar block encoding blob values. It encodes a
+// single column: a RawBytes of values. The colblk.RawBytes encoding allows
+// constant-time access to the i'th value within the block.
+//
+// ## Sparseness
+//
+// A rewrite of a blob file elides values that are no longer referenced,
+// conserving disk space. Within a value block, an absent value is represented
+// as an empty byte slice within the RawBytes column. This requires the overhead
+// of 1 additional offset within the RawBytes encoding (typically 2-4 bytes).
+//
+// If a wide swath of values are no longer referenced, entire blocks may be
+// elided. When this occurs, the index block's virtual blocks column will map
+// multiple of the original blockIDs to the same physical block.
+//
+// We expect significant locality to gaps in referenced values. Compactions will
+// remove swaths of references all at once, typically all the values of keys
+// that fall within a narrow keyspan. This locality allows us to represent most
+// sparseness using the gaps between blocks, without suffering the 2-4 bytes of
+// overhead for absent values internally within a block.
+//
+// Note: If we find this locality does not hold for some reason, we can extend
+// the blob-value block format to encode a NullBitmap. This would allow us to
+// represent missing values using 2-bits per missing value.
+//
+// ## Diagram
+//
+// +------------------------------------------------------------------------------+
+// | BLOB FILE FORMAT |
+// +------------------------------------------------------------------------------+
+// | Value Block #0 |
+// | +----------------------------------------------------------------------+ |
+// | | RawBytes[...] 
| | +// | +----------------------------------------------------------------------+ | +// | Value Block #1 | +// | +----------------------------------------------------------------------+ | +// | | RawBytes[...] | | +// | +----------------------------------------------------------------------+ | +// | ... | +// | Value Block #N | +// | +----------------------------------------------------------------------+ | +// | | RawBytes[...] | | +// | +----------------------------------------------------------------------+ | +// | | +// +------------------------------- Index Block ----------------------------------+ +// | Custom Header (4 bytes) | +// | Num virtual blocks: M | +// | +---------Virtual blocks (M)--------+ +--------Offsets(N+1)---------+ | +// | | idx block index valueIDoffset | | idx offset | | +// | | 0 0 0 | | 0 0 | | +// | | 1 0 0 | | 1 32952 | | +// | | 2 0 32 | | 2 65904 | | +// | | 3 1 0 | | 3 92522 | | +// | | 4 2 0 | | 4 125474 | | +// | | 5 3 0 | +-----------------------------+ | +// | +-----------------------------------+ | +// +----------------------------- Footer (30 bytes) ------------------------------+ +// | CRC Checksum (4 bytes) | +// | Index Block Offset (8 bytes) | +// | Index Block Length (8 bytes) | +// | Checksum Type (1 byte) | +// | Format (1 byte) | +// | Magic String (8 bytes) | +// +------------------------------------------------------------------------------+ +package blob diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/fetcher.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/fetcher.go new file mode 100644 index 0000000..aca671e --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/fetcher.go @@ -0,0 +1,323 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package blob + +import ( + "context" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider" + "github.com/cockroachdb/pebble/v2/sstable/block" +) + +const maxCachedReaders = 5 + +// A ValueReader is an interface defined over a file that can be used to read +// value blocks. +type ValueReader interface { + // IndexHandle returns the handle for the file's index block. + IndexHandle() block.Handle + + // InitReadHandle initializes a ReadHandle for the file, using the provided + // preallocated read handle to avoid an allocation. + InitReadHandle(rh *objstorageprovider.PreallocatedReadHandle) objstorage.ReadHandle + + // ReadValueBlock retrieves a value block described by the provided block + // handle from the block cache, or reads it from the blob file if it's not + // already cached. + ReadValueBlock(context.Context, block.ReadEnv, objstorage.ReadHandle, + block.Handle) (block.BufferHandle, error) + + // ReadIndexBlock retrieves the index block from the block cache, or reads + // it from the blob file if it's not already cached. + ReadIndexBlock(context.Context, block.ReadEnv, objstorage.ReadHandle) (block.BufferHandle, error) +} + +// A FileMapping defines the mapping between blob file IDs and disk file numbers. +// It's implemented by *manifest.BlobFileSet. +type FileMapping interface { + // Lookup returns the disk file number for the given blob file ID. It + // returns false for the second return value if the blob file ID is not + // present in the mapping. + Lookup(base.BlobFileID) (base.DiskFileNum, bool) +} + +// A ReaderProvider is an interface that can be used to retrieve a ValueReader +// for a given file number. +type ReaderProvider interface { + // GetValueReader returns a ValueReader for the given file number. 
+ GetValueReader(ctx context.Context, fileNum base.DiskFileNum) (r ValueReader, closeFunc func(), err error) +} + +// A ValueFetcher retrieves values stored out-of-band in separate blob files. +// The ValueFetcher caches accessed file readers to avoid redundant file cache +// and block cache lookups when performing consecutive value retrievals. +// +// A single ValueFetcher can be used to fetch values from multiple files, and it +// will internally cache readers for each file. +// +// When finished with a ValueFetcher, one must call Close to release all cached +// readers and block buffers. +type ValueFetcher struct { + fileMapping FileMapping + readerProvider ReaderProvider + env block.ReadEnv + fetchCount int + readers [maxCachedReaders]cachedReader + bufMangler invariants.BufMangler +} + +// TODO(jackson): Support setting up a read handle for compaction when relevant. + +// Assert that ValueFetcher implements the ValueFetcher interface. +var _ base.ValueFetcher = (*ValueFetcher)(nil) + +// Init initializes the ValueFetcher. +func (r *ValueFetcher) Init(fm FileMapping, rp ReaderProvider, env block.ReadEnv) { + r.fileMapping = fm + r.readerProvider = rp + r.env = env + if r.readerProvider == nil { + panic("readerProvider is nil") + } +} + +// FetchHandle returns the value, given the handle. FetchHandle must not be +// called after Close. 
+func (r *ValueFetcher) FetchHandle( + ctx context.Context, handle []byte, blobFileID base.BlobFileID, valLen uint32, buf []byte, +) (val []byte, callerOwned bool, err error) { + handleSuffix := DecodeHandleSuffix(handle) + vh := Handle{ + BlobFileID: blobFileID, + ValueLen: valLen, + BlockID: handleSuffix.BlockID, + ValueID: handleSuffix.ValueID, + } + v, err := r.retrieve(ctx, vh) + if err == nil && len(v) != int(vh.ValueLen) { + return nil, false, + errors.AssertionFailedf("value length mismatch: %d != %d", len(v), vh.ValueLen) + } + if invariants.Enabled { + v = r.bufMangler.MaybeMangleLater(v) + } + return v, false, err +} + +// Fetch is like FetchHandle, but it constructs handle and does not +// validate the value length. Fetch must not be called after Close. +func (r *ValueFetcher) Fetch( + ctx context.Context, blobFileID base.BlobFileID, blockID BlockID, valueID BlockValueID, +) (val []byte, callerOwned bool, err error) { + vh := Handle{ + BlobFileID: blobFileID, + BlockID: blockID, + ValueID: valueID, + } + v, err := r.retrieve(ctx, vh) + if invariants.Enabled { + v = r.bufMangler.MaybeMangleLater(v) + } + return v, false, err +} + +func (r *ValueFetcher) retrieve(ctx context.Context, vh Handle) (val []byte, err error) { + // Look for a cached reader for the file. Also, find the least-recently used + // reader. If we don't find a cached reader, we'll replace the + // least-recently used reader with the new one for the file indicated by + // vh.FileNum. + var cr *cachedReader + var oldestFetchIndex int + // TODO(jackson): Reconsider this O(len(readers)) scan. + for i := range r.readers { + if r.readers[i].blobFileID == vh.BlobFileID && r.readers[i].r != nil { + cr = &r.readers[i] + break + } else if r.readers[i].lastFetchCount < r.readers[oldestFetchIndex].lastFetchCount { + oldestFetchIndex = i + } + } + + if cr == nil { + // No cached reader found for the file. Get one from the file cache. 
+ cr = &r.readers[oldestFetchIndex] + // Release the previous reader, if any. + if cr.r != nil { + if err = cr.Close(); err != nil { + return nil, err + } + } + diskFileNum, ok := r.fileMapping.Lookup(vh.BlobFileID) + if !ok { + return nil, errors.AssertionFailedf("blob file %s not found", vh.BlobFileID) + } + if cr.r, cr.closeFunc, err = r.readerProvider.GetValueReader(ctx, diskFileNum); err != nil { + return nil, err + } + cr.blobFileID = vh.BlobFileID + cr.diskFileNum = diskFileNum + cr.rh = cr.r.InitReadHandle(&cr.preallocRH) + } + + if r.env.Stats != nil { + r.env.Stats.SeparatedPointValue.ValueBytesFetched += uint64(vh.ValueLen) + } + + r.fetchCount++ + cr.lastFetchCount = r.fetchCount + val, err = cr.GetUnsafeValue(ctx, vh, r.env) + return val, err +} + +// Close closes the ValueFetcher and releases all cached readers. Once Close is +// called, the ValueFetcher is no longer usable. +func (r *ValueFetcher) Close() error { + var err error + for i := range r.readers { + if r.readers[i].r != nil { + err = errors.CombineErrors(err, r.readers[i].Close()) + } + } + return err +} + +// cachedReader holds a Reader into an open file, and possibly blocks retrieved +// from the block cache. +type cachedReader struct { + blobFileID base.BlobFileID + diskFileNum base.DiskFileNum + r ValueReader + closeFunc func() + rh objstorage.ReadHandle + lastFetchCount int + // indexBlock holds the index block for the file, lazily loaded on the first + // call to GetUnsafeValue. + indexBlock struct { + // loaded indicates whether buf and dec are valid. + loaded bool + buf block.BufferHandle + dec *indexBlockDecoder + } + // currentValueBlock holds the currently loaded blob value block, if any. + currentValueBlock struct { + // loaded indicates whether a block is currently loaded. + loaded bool + // virtualID is the virtual block ID used to retrieve the block. If the + // blob file has not been rewritten, this equals the physicalIndex. 
+ virtualID BlockID + // valueIDOffset is the offset that should be added to the value ID to + // get the index of the value within the physical block for any blob + // handles encoding a block ID of virtualID. + valueIDOffset BlockValueID + // physicalIndex is the physical index of the current value block. + // physicalIndex is in the range [0, indexBlock.dec.BlockCount()). + physicalIndex int + buf block.BufferHandle + dec *blobValueBlockDecoder + } + preallocRH objstorageprovider.PreallocatedReadHandle +} + +// GetUnsafeValue retrieves the value for the given handle. The value is +// returned as a byte slice pointing directly into the block cache's data. The +// value is only guaranteed to be stable until the next call to GetUnsafeValue +// or until the cachedReader is closed. +func (cr *cachedReader) GetUnsafeValue( + ctx context.Context, vh Handle, env block.ReadEnv, +) ([]byte, error) { + valueID := vh.ValueID + + // Determine which block contains the value. + // + // If we already have a block loaded (eg, we're scanning retrieving multiple + // values), the current block might contain the value. + if !cr.currentValueBlock.loaded || cr.currentValueBlock.virtualID != vh.BlockID { + if !cr.indexBlock.loaded { + // Read the index block. + var err error + cr.indexBlock.buf, err = cr.r.ReadIndexBlock(ctx, env, cr.rh) + if err != nil { + return nil, err + } + cr.indexBlock.dec = block.CastMetadata[indexBlockDecoder](cr.indexBlock.buf.BlockMetadata()) + cr.indexBlock.loaded = true + } + + // Determine which physical block contains the value. If this blob file + // has never been rewritten, the BlockID is the physical index of the + // block containing the value. If the blob file has been rewritten, we + // need to remap the 'virtual' BlockID to the physical block index using + // the virtualBlocks column. We also retrieve a 'value ID offset' which + // should be added to the value handle's value ID to get the index of + // the value within the physical block. 
+ var physicalBlockIndex int = int(vh.BlockID) + var valueIDOffset BlockValueID + if cr.indexBlock.dec.virtualBlockCount > 0 { + physicalBlockIndex, valueIDOffset = cr.indexBlock.dec.RemapVirtualBlockID(vh.BlockID) + if valueIDOffset == virtualBlockIndexMask { + return nil, errors.AssertionFailedf("blob file indicates virtual block ID %d in %s should be unreferenced", + vh.BlockID, vh.BlobFileID) + } + } + invariants.CheckBounds(physicalBlockIndex, cr.indexBlock.dec.BlockCount()) + + // Retrieve the block's handle, and read the blob value block into + // memory. + // + // TODO(jackson): If the blob file has been rewritten, it's possible + // that we already have the physical block in-memory because we + // previously were accessing it under a different BlockID. We expect + // this case to be rare, and this is a hot path for the more common case + // of non-rewritten blob files, so we defer optimizing for now. + h := cr.indexBlock.dec.BlockHandle(physicalBlockIndex) + // Nil out the decoder before releasing the buffers to ensure the Go GC + // doesn't misinterpret the freed memory backing the decoders. + cr.currentValueBlock.dec = nil + cr.currentValueBlock.buf.Release() + cr.currentValueBlock.loaded = false + var err error + cr.currentValueBlock.buf, err = cr.r.ReadValueBlock(ctx, env, cr.rh, h) + if err != nil { + return nil, err + } + cr.currentValueBlock.dec = block.CastMetadata[blobValueBlockDecoder](cr.currentValueBlock.buf.BlockMetadata()) + cr.currentValueBlock.physicalIndex = physicalBlockIndex + cr.currentValueBlock.virtualID = vh.BlockID + cr.currentValueBlock.valueIDOffset = valueIDOffset + cr.currentValueBlock.loaded = true + } + + // Convert the ValueID to an index into the block's values. When a blob file + // is first constructed, the ValueID == the index. However when a blob file + // is rewritten, multiple blocks from the original blob file may be combined + // into the same physical block. 
 To translate the ValueID to the
+// appropriate index, we need to add the 'virtual block' valueIDOffset.
+	valueIndex := int(valueID) + int(cr.currentValueBlock.valueIDOffset)
+	invariants.CheckBounds(valueIndex, cr.currentValueBlock.dec.bd.Rows())
+	v := cr.currentValueBlock.dec.values.Slice(cr.currentValueBlock.dec.values.Offsets(valueIndex))
+	return v, nil
+}
+
+// Close releases resources associated with the reader.
+func (cfr *cachedReader) Close() (err error) {
+	if cfr.rh != nil {
+		err = cfr.rh.Close()
+	}
+	// Nil out the decoders before releasing the buffers to ensure the Go GC
+	// doesn't misinterpret the freed memory backing the decoders.
+	cfr.indexBlock.dec = nil
+	cfr.currentValueBlock.dec = nil
+	cfr.indexBlock.buf.Release()
+	cfr.currentValueBlock.buf.Release()
+	// Release the cached ValueReader. closeFunc is provided by the file cache
+	// and decrements the refcount on the open file reader.
+	cfr.closeFunc()
+	*cfr = cachedReader{}
+	return err
+}
diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/handle.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/handle.go
new file mode 100644
index 0000000..5953fa6
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/handle.go
@@ -0,0 +1,206 @@
+// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
+// of this source code is governed by a BSD-style license that can be found in
+// the LICENSE file.
+
+package blob
+
+import (
+	"encoding/binary"
+	"unsafe"
+
+	"github.com/cockroachdb/pebble/v2/internal/base"
+	"github.com/cockroachdb/redact"
+)
+
+// MaxInlineHandleLength is the maximum length of an inline blob handle.
+//
+// Handle fields are varint encoded, so maximum 5 bytes each.
+const MaxInlineHandleLength = 4 * binary.MaxVarintLen32
+
+// BlockValueID identifies a value within a block of a blob file. The
+// BlockValueID is local to the block. 
The BlockValueID is an index 0..n-1 into +// the array of values in the original blob-value block. +type BlockValueID uint32 + +// BlockID identifies a block within a blob file. If a blob file has not been +// rewritten, the block ID is simply an index of the block within the file. If +// the blob file has been rewritten to reclaim disk space, the rewritten blob +// file will contain fewer blocks than the original. The rewritten blob file's +// index block contains a column mapping the original block ID to the index of +// the block in the new blob file containing the original block's data. +type BlockID uint32 + +// Handle describes the location of a value stored within a blob file. +type Handle struct { + BlobFileID base.BlobFileID + ValueLen uint32 + // BlockID identifies the block within the blob file containing the value. + BlockID BlockID + // ValueID identifies the value within the block identified by BlockID. + ValueID BlockValueID +} + +// String implements the fmt.Stringer interface. +func (h Handle) String() string { + return redact.StringWithoutMarkers(h) +} + +// SafeFormat implements redact.SafeFormatter. +func (h Handle) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("(%s,blk%d,id%d,len%d)", h.BlobFileID, h.BlockID, h.ValueID, h.ValueLen) +} + +// TODO(jackson): Consider encoding the handle's data using columnar block +// primitives, rather than a variable-width encoding in the value column. + +// InlineHandle describes a handle as it is encoded within a sstable block. The +// inline handle does not encode the blob file number outright. Instead it +// encodes an index into the containing sstable's BlobReferences. +// +// The inline handle is composed of two parts: a preface (InlineHandlePreface) +// and a suffix (HandleSuffix). The preface is eagerly decoded from the encoded +// handle when returning an InternalValue to higher layers. The remaining bits +// (the suffix) are decoded only when the value is being fetched from the blob +// file. 
+type InlineHandle struct { + InlineHandlePreface + HandleSuffix +} + +// ReferenceID identifies a particular blob reference within a table. It's +// implemented as an index into the slice of the BlobReferences recorded in the +// manifest. +type ReferenceID uint32 + +// InlineHandlePreface is the prefix of an inline handle. It's eagerly decoded +// when returning an InternalValue to higher layers. +type InlineHandlePreface struct { + ReferenceID ReferenceID + ValueLen uint32 +} + +// HandleSuffix is the suffix of an inline handle. It's decoded only when the +// value is being fetched from the blob file. +type HandleSuffix struct { + BlockID BlockID + ValueID BlockValueID +} + +// Encode encodes the handle suffix into the provided buffer, returning the +// number of bytes encoded. +func (h HandleSuffix) Encode(b []byte) int { + n := binary.PutUvarint(b, uint64(h.BlockID)) + n += binary.PutUvarint(b[n:], uint64(h.ValueID)) + return n +} + +// String implements the fmt.Stringer interface. +func (h InlineHandle) String() string { + return redact.StringWithoutMarkers(h) +} + +// SafeFormat implements redact.SafeFormatter. +func (h InlineHandle) SafeFormat(w redact.SafePrinter, _ rune) { + w.Printf("(f%d,blk%d,id%d,len%d)", h.ReferenceID, h.BlockID, h.ValueID, h.ValueLen) +} + +// Encode encodes the inline handle into the provided buffer, returning the +// number of bytes encoded. +func (h InlineHandle) Encode(b []byte) int { + n := 0 + n += binary.PutUvarint(b[n:], uint64(h.ReferenceID)) + n += binary.PutUvarint(b[n:], uint64(h.ValueLen)) + n += h.HandleSuffix.Encode(b[n:]) + return n +} + +// DecodeInlineHandlePreface decodes the blob reference index and value length +// from the beginning of a variable-width encoded InlineHandle. 
+func DecodeInlineHandlePreface(src []byte) (InlineHandlePreface, []byte) { + ptr := unsafe.Pointer(&src[0]) + var refIdx uint32 + if a := *((*uint8)(ptr)); a < 128 { + refIdx = uint32(a) + src = src[1:] + } else if a, b := a&0x7f, *((*uint8)(unsafe.Add(ptr, 1))); b < 128 { + refIdx = uint32(b)<<7 | uint32(a) + src = src[2:] + } else if b, c := b&0x7f, *((*uint8)(unsafe.Add(ptr, 2))); c < 128 { + refIdx = uint32(c)<<14 | uint32(b)<<7 | uint32(a) + src = src[3:] + } else if c, d := c&0x7f, *((*uint8)(unsafe.Add(ptr, 3))); d < 128 { + refIdx = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) + src = src[4:] + } else { + d, e := d&0x7f, *((*uint8)(unsafe.Add(ptr, 4))) + refIdx = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) + src = src[5:] + } + + ptr = unsafe.Pointer(&src[0]) + var valueLen uint32 + if a := *((*uint8)(ptr)); a < 128 { + valueLen = uint32(a) + src = src[1:] + } else if a, b := a&0x7f, *((*uint8)(unsafe.Add(ptr, 1))); b < 128 { + valueLen = uint32(b)<<7 | uint32(a) + src = src[2:] + } else if b, c := b&0x7f, *((*uint8)(unsafe.Add(ptr, 2))); c < 128 { + valueLen = uint32(c)<<14 | uint32(b)<<7 | uint32(a) + src = src[3:] + } else if c, d := c&0x7f, *((*uint8)(unsafe.Add(ptr, 3))); d < 128 { + valueLen = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) + src = src[4:] + } else { + d, e := d&0x7f, *((*uint8)(unsafe.Add(ptr, 4))) + valueLen = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) + src = src[5:] + } + + return InlineHandlePreface{ + ReferenceID: ReferenceID(refIdx), + ValueLen: valueLen, + }, src +} + +// DecodeHandleSuffix decodes the HandleSuffix from the provided buffer. +func DecodeHandleSuffix(src []byte) HandleSuffix { + var vs HandleSuffix + ptr := unsafe.Pointer(&src[0]) + // Manually inlined uvarint decoding. Saves ~25% in benchmarks. Unrolling + // a loop for i:=0; i<2; i++, saves ~6%. 
+ var v uint32 + if a := *((*uint8)(ptr)); a < 128 { + v = uint32(a) + ptr = unsafe.Add(ptr, 1) + } else if a, b := a&0x7f, *((*uint8)(unsafe.Add(ptr, 1))); b < 128 { + v = uint32(b)<<7 | uint32(a) + ptr = unsafe.Add(ptr, 2) + } else if b, c := b&0x7f, *((*uint8)(unsafe.Add(ptr, 2))); c < 128 { + v = uint32(c)<<14 | uint32(b)<<7 | uint32(a) + ptr = unsafe.Add(ptr, 3) + } else if c, d := c&0x7f, *((*uint8)(unsafe.Add(ptr, 3))); d < 128 { + v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) + ptr = unsafe.Add(ptr, 4) + } else { + d, e := d&0x7f, *((*uint8)(unsafe.Add(ptr, 4))) + v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) + ptr = unsafe.Add(ptr, 5) + } + vs.BlockID = BlockID(v) + + if a := *((*uint8)(ptr)); a < 128 { + v = uint32(a) + } else if a, b := a&0x7f, *((*uint8)(unsafe.Add(ptr, 1))); b < 128 { + v = uint32(b)<<7 | uint32(a) + } else if b, c := b&0x7f, *((*uint8)(unsafe.Add(ptr, 2))); c < 128 { + v = uint32(c)<<14 | uint32(b)<<7 | uint32(a) + } else if c, d := c&0x7f, *((*uint8)(unsafe.Add(ptr, 3))); d < 128 { + v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) + } else { + d, e := d&0x7f, *((*uint8)(unsafe.Add(ptr, 4))) + v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) + } + vs.ValueID = BlockValueID(v) + return vs +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/rewrite.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/rewrite.go new file mode 100644 index 0000000..a9d5c7a --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/blob/rewrite.go @@ -0,0 +1,121 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package blob + +import ( + "context" + "slices" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/sstable/block" +) + +// A FileRewriter copies values from an input blob file, outputting a new blob +// file containing a subset of the original blob file's values. The original +// Handles used to access values in the original blob file will continue to work +// with the new blob file, as long as the value was copied during rewrite. +type FileRewriter struct { + fileID base.BlobFileID + w *FileWriter + f ValueFetcher +} + +// NewFileRewriter creates a new FileRewriter that will copy values from the +// input blob file to the output blob file. +func NewFileRewriter( + fileID base.BlobFileID, + inputFileNum base.DiskFileNum, + rp ReaderProvider, + readEnv block.ReadEnv, + outputFileNum base.DiskFileNum, + w objstorage.Writable, + opts FileWriterOptions, +) *FileRewriter { + rw := &FileRewriter{ + fileID: fileID, + w: NewFileWriter(outputFileNum, w, opts), + } + rw.f.Init(inputFileMapping(inputFileNum), rp, readEnv) + return rw +} + +// CopyBlock copies the values for the given blockID to the output blob file. +// CopyBlock must be called with ascending blockIDs. The totalValueSize must be +// the size of all the values indicated by valueIDs. +func (rw *FileRewriter) CopyBlock( + ctx context.Context, blockID BlockID, totalValueSize int, valueIDs []int, +) error { + slices.Sort(valueIDs) + + // Consider whether we should flush the current physical block. We know + // we'll need to add totalValueSize worth of value data, and can make a + // decision up front. All values from the same original blockID must be + // located in the same physical block. 
+	valuesInBlock := rw.w.valuesEncoder.Count()
+	if valuesInBlock > 0 {
+		currentBlockSize := rw.w.valuesEncoder.size() + block.TrailerLen
+		if rw.w.flushGov.ShouldFlush(currentBlockSize, currentBlockSize+totalValueSize) {
+			rw.w.flush()
+		}
+	}
+
+	// Record the mapping from the virtual block ID to the current physical
+	// block and offset within the block.
+	rw.w.beginNewVirtualBlock(blockID)
+
+	previousValueID := -1
+	for _, valueID := range valueIDs {
+		// Subsequent logic depends on the valueIDs being unique.
+		// TODO(jackson): This is a workaround because we don't have per-sstable
+		// liveness data. See https://github.com/cockroachdb/pebble/v2/issues/4915.
+		// If we had per-sstable liveness data, we should be able to make this
+		// an assertion failure.
+		if previousValueID == valueID {
+			continue
+		}
+		// If there is a gap in the referenced Value IDs within this block, we
+		// need to represent this sparseness as empty values within the block.
+		// We can represent sparseness at the tail of a block or between blocks
+		// more compactly, but not sparseness at the beginning of a virtual
+		// block. See the doc.go comment for more details on sparseness.
+		for missingValueID := previousValueID + 1; missingValueID < valueID; missingValueID++ {
+			rw.w.stats.ValueCount++
+			rw.w.valuesEncoder.AddValue(nil)
+		}
+
+		// Retrieve the value and copy it to the output blob file.
+		value, _, err := rw.f.Fetch(ctx, rw.fileID, blockID, BlockValueID(valueID))
+		if err != nil {
+			return err
+		}
+		// We don't know the value size, but we know it must not be empty.
+		if len(value) == 0 {
+			return errors.AssertionFailedf("value is empty")
+		}
+		rw.w.stats.ValueCount++
+		rw.w.stats.UncompressedValueBytes += uint64(len(value))
+		rw.w.valuesEncoder.AddValue(value)
+		previousValueID = valueID
+	}
+	return nil
+}
+
+// Close finishes writing the output blob file and releases resources. 
+func (rw *FileRewriter) Close() (FileWriterStats, error) { + stats, err := rw.w.Close() + return stats, errors.CombineErrors(err, rw.f.Close()) +} + +// inputFileMapping implements blob.FileMapping and always maps to itself. +type inputFileMapping base.DiskFileNum + +// Assert that (*inputFileMapping) implements blob.FileMapping. +var _ FileMapping = inputFileMapping(0) + +func (m inputFileMapping) Lookup(fileID base.BlobFileID) (base.DiskFileNum, bool) { + return base.DiskFileNum(m), true +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/blob_reference_index.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/blob_reference_index.go new file mode 100644 index 0000000..da01dea --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/blob_reference_index.go @@ -0,0 +1,166 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package sstable + +import ( + "encoding/binary" + "iter" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/sstable/blob" +) + +// blobRefValueLivenessState tracks the liveness of values within a blob value +// block via a BitmapRunLengthEncoder. +type blobRefValueLivenessState struct { + currentBlock struct { + bitmap BitmapRunLengthEncoder + refID blob.ReferenceID + blockID blob.BlockID + valuesSize uint64 + } + + finishedBlocks []byte +} + +// initNewBlock initializes the state for a new block, resetting all fields to +// their initial values. +func (s *blobRefValueLivenessState) initNewBlock(refID blob.ReferenceID, blockID blob.BlockID) { + s.currentBlock.bitmap.Init() + s.currentBlock.refID = refID + s.currentBlock.blockID = blockID + s.currentBlock.valuesSize = 0 +} + +// finishCurrentBlock writes the in-progress value liveness encoding for a blob +// value block to the encoder's buffer. 
+// The per-block encoding appended to the buffer is:
+//
+//	[<blockID> <valuesSize> <bitmapSize> <bitmap>]
+//
+// where blockID, valuesSize and bitmapSize are uvarint-encoded and the bitmap
+// is the run-length encoding produced by BitmapRunLengthEncoder.FinishAndAppend.
+// (NOTE(review): the original format line was garbled to "[]"; reconstructed
+// from the Append calls below — confirm against the package doc.)
+func (s *blobRefValueLivenessState) finishCurrentBlock() {
+	s.finishedBlocks = binary.AppendUvarint(s.finishedBlocks, uint64(s.currentBlock.blockID))
+	s.finishedBlocks = binary.AppendUvarint(s.finishedBlocks, s.currentBlock.valuesSize)
+	s.finishedBlocks = binary.AppendUvarint(s.finishedBlocks, uint64(s.currentBlock.bitmap.Size()))
+	s.finishedBlocks = s.currentBlock.bitmap.FinishAndAppend(s.finishedBlocks)
+}
+
+// blobRefValueLivenessWriter helps maintain the liveness of values in blob
+// value blocks for a sstable's blob references. It maintains refState, a slice
+// of blobRefValueLivenessState with one entry per blob reference; the index of
+// the slice corresponds to the blob.ReferenceID. Each entry tracks both the
+// in-progress liveness bitmap for the reference's current blob value block and
+// the serialized encodings (finishedBlocks) of blocks already completed.
+type blobRefValueLivenessWriter struct {
+	refState []blobRefValueLivenessState
+}
+
+// init initializes the writer's state, releasing references held by any prior
+// use so the writer can be reused across sstables.
+func (w *blobRefValueLivenessWriter) init() {
+	clear(w.refState)
+	w.refState = w.refState[:0]
+}
+
+// numReferences returns the number of references that have liveness encodings
+// that have been added to the writer.
+func (w *blobRefValueLivenessWriter) numReferences() int {
+	return len(w.refState)
+}
+
+// addLiveValue adds a live value to the state maintained by refID. If the
+// current blockID for this in-progress state is different from the provided
+// blockID, the previous block's encoding is finished (appended to the state's
+// finishedBlocks buffer) and a new in-progress block is started.
+//
+// addLiveValue adds a new state for the provided refID if one does
+// not already exist. It assumes that any new blob.ReferenceIDs are visited in
+// monotonically increasing order; jumping ahead by more than one reference ID
+// is an assertion failure.
+//
+// INVARIANT: refIDs are added densely, so refID always indexes w.refState
+// after at most one append.
+func (w *blobRefValueLivenessWriter) addLiveValue( + refID blob.ReferenceID, blockID blob.BlockID, valueID blob.BlockValueID, valueSize uint64, +) error { + // Compute the minimum expected length of the state slice in order for our + // refID to be indexable. + minLen := int(refID) + 1 + + // If we don't already have a state for this reference, we might just need + // to grow. + if len(w.refState) < minLen { + // Check if we have jumped ahead more than one reference. + if len(w.refState) < minLen && len(w.refState)+1 != minLen { + return base.AssertionFailedf("jump from greatest reference ID %d to new reference "+ + "ID %d greater than 1", len(w.refState)-1, refID) + } + + // We have a new reference. + state := blobRefValueLivenessState{} + state.initNewBlock(refID, blockID) + w.refState = append(w.refState, state) + } + + state := &w.refState[refID] + if state.currentBlock.blockID != blockID { + state.finishCurrentBlock() + state.initNewBlock(refID, blockID) + } + state.currentBlock.valuesSize += valueSize + state.currentBlock.bitmap.Set(int(valueID)) + return nil +} + +// finish finishes encoding the per-blob reference liveness encodings, and +// returns an in-order sequence of (referenceID, encoding) pairs. +func (w *blobRefValueLivenessWriter) finish() iter.Seq2[blob.ReferenceID, []byte] { + return func(yield func(blob.ReferenceID, []byte) bool) { + // N.B. `i` is equivalent to blob.ReferenceID. + for i, state := range w.refState { + state.finishCurrentBlock() + if !yield(blob.ReferenceID(i), state.finishedBlocks) { + return + } + } + } +} + +// BlobRefLivenessEncoding represents the decoded form of a blob reference +// liveness encoding. The encoding format is: +// +// [] +type BlobRefLivenessEncoding struct { + BlockID blob.BlockID + ValuesSize int + BitmapSize int + Bitmap []byte +} + +// DecodeBlobRefLivenessEncoding decodes a sequence of blob reference liveness +// encodings from the provided buffer. 
Each encoding has the format: +// [] +func DecodeBlobRefLivenessEncoding(buf []byte) []BlobRefLivenessEncoding { + var encodings []BlobRefLivenessEncoding + for len(buf) > 0 { + var enc BlobRefLivenessEncoding + var n int + + blockIDVal, n := binary.Uvarint(buf) + buf = buf[n:] + enc.BlockID = blob.BlockID(blockIDVal) + + valuesSizeVal, n := binary.Uvarint(buf) + buf = buf[n:] + enc.ValuesSize = int(valuesSizeVal) + + bitmapSizeVal, n := binary.Uvarint(buf) + buf = buf[n:] + enc.BitmapSize = int(bitmapSizeVal) + + // The bitmap takes up the remaining bitmapSize bytes for this encoding. + enc.Bitmap = buf[:enc.BitmapSize] + buf = buf[enc.BitmapSize:] + + encodings = append(encodings, enc) + } + return encodings +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/block/block.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/block.go new file mode 100644 index 0000000..224af25 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/block.go @@ -0,0 +1,725 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package block + +import ( + "context" + "encoding/binary" + "fmt" + "path/filepath" + "runtime" + "slices" + "time" + "unsafe" + + "github.com/cespare/xxhash/v2" + "github.com/cockroachdb/crlib/crtime" + "github.com/cockroachdb/crlib/fifo" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/bitflip" + "github.com/cockroachdb/pebble/v2/internal/cache" + "github.com/cockroachdb/pebble/v2/internal/crc" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/sstableinternal" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider/objiotracing" + "github.com/cockroachdb/pebble/v2/sstable/block/blockkind" +) + +// Kind is a convenience alias. +type Kind = blockkind.Kind + +// Handle is the file offset and length of a block. +type Handle struct { + // Offset identifies the offset of the block within the file. + Offset uint64 + // Length is the length of the block data (excludes the trailer). + Length uint64 +} + +// EncodeVarints encodes the block handle into dst using a variable-width +// encoding and returns the number of bytes written. +func (h Handle) EncodeVarints(dst []byte) int { + n := binary.PutUvarint(dst, h.Offset) + m := binary.PutUvarint(dst[n:], h.Length) + return n + m +} + +// String implements fmt.Stringer. +func (h Handle) String() string { + return fmt.Sprintf("(%d, %d)", h.Offset, h.Length) +} + +// HandleWithProperties is used for data blocks and first/lower level index +// blocks, since they can be annotated using BlockPropertyCollectors. +type HandleWithProperties struct { + Handle + Props []byte +} + +// EncodeVarints encodes the block handle and properties into dst using a +// variable-width encoding and returns the number of bytes written. 
+func (h HandleWithProperties) EncodeVarints(dst []byte) []byte { + n := h.Handle.EncodeVarints(dst) + dst = append(dst[:n], h.Props...) + return dst +} + +// DecodeHandle returns the block handle encoded in a variable-width encoding at +// the start of src, as well as the number of bytes it occupies. It returns zero +// if given invalid input. A block handle for a data block or a first/lower +// level index block should not be decoded using DecodeHandle since the caller +// may validate that the number of bytes decoded is equal to the length of src, +// which will be false if the properties are not decoded. In those cases the +// caller should use DecodeHandleWithProperties. +func DecodeHandle(src []byte) (Handle, int) { + offset, n := binary.Uvarint(src) + length, m := binary.Uvarint(src[n:]) + if n == 0 || m == 0 { + return Handle{}, 0 + } + return Handle{Offset: offset, Length: length}, n + m +} + +// DecodeHandleWithProperties returns the block handle and properties encoded in +// a variable-width encoding at the start of src. src needs to be exactly the +// length that was encoded. This method must be used for data block and +// first/lower level index blocks. The properties in the block handle point to +// the bytes in src. +func DecodeHandleWithProperties(src []byte) (HandleWithProperties, error) { + bh, n := DecodeHandle(src) + if n == 0 { + return HandleWithProperties{}, errors.Errorf("invalid block.Handle") + } + return HandleWithProperties{ + Handle: bh, + Props: src[n:], + }, nil +} + +// TrailerLen is the length of the trailer at the end of a block. +const TrailerLen = 5 + +// Trailer is the trailer at the end of a block, encoding the block type +// (compression) and a checksum. +type Trailer = [TrailerLen]byte + +// MakeTrailer constructs a trailer from a block type and a checksum. 
+func MakeTrailer(blockType byte, checksum uint32) (t Trailer) { + t[0] = blockType + binary.LittleEndian.PutUint32(t[1:5], checksum) + return t +} + +// ChecksumType specifies the checksum used for blocks. +type ChecksumType byte + +// The available checksum types. These values are part of the durable format and +// should not be changed. +const ( + ChecksumTypeNone ChecksumType = 0 + ChecksumTypeCRC32c ChecksumType = 1 + ChecksumTypeXXHash ChecksumType = 2 + ChecksumTypeXXHash64 ChecksumType = 3 +) + +// String implements fmt.Stringer. +func (t ChecksumType) String() string { + switch t { + case ChecksumTypeCRC32c: + return "crc32c" + case ChecksumTypeNone: + return "none" + case ChecksumTypeXXHash: + return "xxhash" + case ChecksumTypeXXHash64: + return "xxhash64" + default: + panic(errors.Newf("sstable: unknown checksum type: %d", t)) + } +} + +// A Checksummer calculates checksums for blocks. +type Checksummer struct { + Type ChecksumType + xxHasher *xxhash.Digest + blockTypeBuf [1]byte +} + +func (c *Checksummer) Init(typ ChecksumType) { + c.Type = typ +} + +// Checksum computes a checksum over the provided block and block type. +func (c *Checksummer) Checksum(block []byte, blockType byte) (checksum uint32) { + // Calculate the checksum. + c.blockTypeBuf[0] = blockType + switch c.Type { + case ChecksumTypeCRC32c: + checksum = crc.New(block).Update(c.blockTypeBuf[:]).Value() + case ChecksumTypeXXHash64: + if c.xxHasher == nil { + c.xxHasher = xxhash.New() + } else { + c.xxHasher.Reset() + } + _, _ = c.xxHasher.Write(block) + _, _ = c.xxHasher.Write(c.blockTypeBuf[:]) + checksum = uint32(c.xxHasher.Sum64()) + default: + panic(errors.Newf("unsupported checksum type: %d", c.Type)) + } + return checksum +} + +// ValidateChecksum validates the checksum of a block. 
+func ValidateChecksum(checksumType ChecksumType, b []byte, bh Handle) error { + expectedChecksum := binary.LittleEndian.Uint32(b[bh.Length+1:]) + var computedChecksum uint32 + switch checksumType { + case ChecksumTypeCRC32c: + computedChecksum = crc.New(b[:bh.Length+1]).Value() + case ChecksumTypeXXHash64: + computedChecksum = uint32(xxhash.Sum64(b[:bh.Length+1])) + default: + return errors.Errorf("unsupported checksum type: %d", checksumType) + } + if expectedChecksum != computedChecksum { + // Check if the checksum was due to a singular bit flip and report it. + data := slices.Clone(b[:bh.Length+1]) + var checksumFunction func([]byte) uint32 + switch checksumType { + case ChecksumTypeCRC32c: + checksumFunction = func(data []byte) uint32 { + return crc.New(data).Value() + } + case ChecksumTypeXXHash64: + checksumFunction = func(data []byte) uint32 { + return uint32(xxhash.Sum64(data)) + } + } + found, indexFound, bitFound := bitflip.CheckSliceForBitFlip(data, checksumFunction, expectedChecksum) + err := base.CorruptionErrorf("block %d/%d: %s checksum mismatch %x != %x", + errors.Safe(bh.Offset), errors.Safe(bh.Length), checksumType, + expectedChecksum, computedChecksum) + if found { + err = errors.WithSafeDetails(err, ". bit flip found: byte index %d. got: 0x%x. want: 0x%x.", + errors.Safe(indexFound), errors.Safe(data[indexFound]), errors.Safe(data[indexFound]^(1<= the + // given key. The function is best effort; false negatives are allowed. + // + // If IsLowerBound is true then Compare(First().UserKey, k) >= 0. + // + // If the iterator produces no keys (i.e. First() is nil), IsLowerBound can + // return true for any key. + IsLowerBound(k []byte) bool + // Invalidate invalidates the block iterator, removing references to the + // block it was initialized with. The iterator may continue to be used after + // a call to Invalidate, but all positioning methods should return false. + // Valid() must also return false. 
+ Invalidate() + // IsDataInvalidated returns true when the iterator has been invalidated + // using an Invalidate call. + // + // NB: this is different from Valid which indicates whether the current *KV* + // is valid. + IsDataInvalidated() bool +} + +// IndexBlockIterator is an interface for implementations of block iterators +// over index blocks. It's implemented by *rowblk.IndexIter and +// *colblk.IndexBlockIter. +type IndexBlockIterator interface { + // Init initializes the block iterator from the provided block. + Init(*base.Comparer, []byte, IterTransforms) error + // InitHandle initializes an iterator from the provided block handle. + // + // The iterator takes ownership of the BufferHandle and releases it when it is + // closed (or re-initialized with another handle). This happens even in error + // cases. + InitHandle(*base.Comparer, BufferHandle, IterTransforms) error + // Valid returns true if the iterator is currently positioned at a valid + // block handle. + Valid() bool + // IsDataInvalidated returns true when the iterator has been invalidated + // using an Invalidate call. + // + // NB: this is different from Valid which indicates whether the iterator is + // currently positioned over a valid block entry. + IsDataInvalidated() bool + // Invalidate invalidates the block iterator, removing references to the + // block it was initialized with. The iterator may continue to be used after + // a call to Invalidate, but all positioning methods should return false. + // Valid() must also return false. + Invalidate() + // Handle returns the underlying block buffer handle, if the iterator was + // initialized with one. + Handle() BufferHandle + // Separator returns the separator at the iterator's current position. The + // iterator must be positioned at a valid row. A Separator is a user key + // guaranteed to be greater than or equal to every key contained within the + // referenced block(s). 
+ Separator() []byte + // SeparatorLT returns true if the separator at the iterator's current + // position is strictly less than the provided key. For some + // implementations, it may be more performant to call SeparatorLT rather + // than explicitly performing Compare(Separator(), key) < 0. + SeparatorLT(key []byte) bool + // SeparatorGT returns true if the separator at the iterator's current + // position is strictly greater than (or equal, if orEqual=true) the + // provided key. For some implementations, it may be more performant to call + // SeparatorGT rather than explicitly performing a comparison using the key + // returned by Separator. + SeparatorGT(key []byte, orEqual bool) bool + // BlockHandleWithProperties decodes the block handle with any encoded + // properties at the iterator's current position. + BlockHandleWithProperties() (HandleWithProperties, error) + // SeekGE seeks the index iterator to the first block entry with a separator + // key greater or equal to the given key. If it returns true, the iterator + // is positioned over the first block that might contain the key [key], and + // following blocks have keys ≥ Separator(). It returns false if the seek + // key is greater than all index block separators. + SeekGE(key []byte) bool + // First seeks index iterator to the first block entry. It returns false if + // the index block is empty. + First() bool + // Last seeks index iterator to the last block entry. It returns false if + // the index block is empty. + Last() bool + // Next steps the index iterator to the next block entry. It returns false + // if the index block is exhausted in the forward direction. A call to Next + // while already exhausted in the forward direction is a no-op. + Next() bool + // Prev steps the index iterator to the previous block entry. It returns + // false if the index block is exhausted in the reverse direction. A call to + // Prev while already exhausted in the reverse direction is a no-op. 
+ Prev() bool + // Close closes the iterator, releasing any resources it holds. After Close, + // the iterator must be reset such that it could be reused after a call to + // Init or InitHandle. + Close() error +} + +// NoReadEnv is the empty ReadEnv which reports no stats and does not use a +// buffer pool. +var NoReadEnv = ReadEnv{} + +// ReadEnv contains arguments used when reading a block which apply to all +// the block reads performed by a higher-level operation. +type ReadEnv struct { + // stats and iterStats are slightly different. stats is a shared struct + // supplied from the outside, and represents stats for the whole iterator + // tree and can be reset from the outside (e.g. when the pebble.Iterator is + // being reused). It is currently only provided when the iterator tree is + // rooted at pebble.Iterator. iterStats contains an sstable iterator's + // private stats that are reported to a CategoryStatsCollector when this + // iterator is closed. In the important code paths, the CategoryStatsCollector + // is managed by the fileCacheContainer. + Stats *base.InternalIteratorStats + IterStats *CategoryStatsShard + + // BufferPool is not-nil if we read blocks into a buffer pool and not into the + // cache. This is used during compactions. + BufferPool *BufferPool + + // ReportCorruptionFn is called with ReportCorruptionArg and the error + // whenever an SSTable corruption is detected. The argument is used to avoid + // allocating a separate function for each object. It returns an error with + // more details. + ReportCorruptionFn func(opaque any, err error) error + ReportCorruptionArg any +} + +// BlockServedFromCache updates the stats when a block was found in the cache. 
+func (env *ReadEnv) BlockServedFromCache(blockLength uint64) { + if env.Stats != nil { + env.Stats.BlockBytes += blockLength + env.Stats.BlockBytesInCache += blockLength + } + if env.IterStats != nil { + env.IterStats.Accumulate(blockLength, blockLength, 0) + } +} + +// BlockRead updates the stats when a block had to be read. +func (env *ReadEnv) BlockRead(blockLength uint64, readDuration time.Duration) { + if env.Stats != nil { + env.Stats.BlockBytes += blockLength + env.Stats.BlockReadDuration += readDuration + } + if env.IterStats != nil { + env.IterStats.Accumulate(blockLength, 0, readDuration) + } +} + +// maybeReportCorruption calls the ReportCorruptionFn if the given error +// indicates corruption. +func (env *ReadEnv) maybeReportCorruption(err error) error { + if env.ReportCorruptionFn != nil && base.IsCorruptionError(err) { + return env.ReportCorruptionFn(env.ReportCorruptionArg, err) + } + return err +} + +// A Reader reads blocks from a single file, handling caching, checksum +// validation and decompression. +type Reader struct { + readable objstorage.Readable + opts ReaderOptions + checksumType ChecksumType +} + +// ReaderOptions configures a block reader. +type ReaderOptions struct { + // CacheOpts contains the information needed to interact with the block + // cache. + CacheOpts sstableinternal.CacheOptions + // LoadBlockSema, if set, is used to limit the number of blocks that can be + // loaded (i.e. read from the filesystem) in parallel. Each load acquires + // one unit from the semaphore for the duration of the read. + LoadBlockSema *fifo.Semaphore + // LoggerAndTracer is an optional logger and tracer. + LoggerAndTracer base.LoggerAndTracer +} + +// Init initializes the Reader to read blocks from the provided Readable. 
+func (r *Reader) Init(readable objstorage.Readable, ro ReaderOptions, checksumType ChecksumType) { + r.readable = readable + r.opts = ro + r.checksumType = checksumType +} + +// FileNum returns the file number of the file being read. +func (r *Reader) FileNum() base.DiskFileNum { + return r.opts.CacheOpts.FileNum +} + +// ChecksumType returns the checksum type used by the reader. +func (r *Reader) ChecksumType() ChecksumType { + return r.checksumType +} + +// Read reads the block referenced by the provided handle. The readHandle is +// optional. +func (r *Reader) Read( + ctx context.Context, + env ReadEnv, + readHandle objstorage.ReadHandle, + bh Handle, + kind Kind, + initBlockMetadataFn func(*Metadata, []byte) error, +) (handle BufferHandle, _ error) { + // The compaction path uses env.BufferPool, and does not coordinate read + // using a cache.ReadHandle. This is ok since only a single compaction is + // reading a block. + if r.opts.CacheOpts.CacheHandle == nil || env.BufferPool != nil { + if r.opts.CacheOpts.CacheHandle != nil { + if cv := r.opts.CacheOpts.CacheHandle.Get(r.opts.CacheOpts.FileNum, bh.Offset); cv != nil { + recordCacheHit(ctx, env, readHandle, bh) + return CacheBufferHandle(cv), nil + } + } + value, err := r.doRead(ctx, env, readHandle, bh, kind, initBlockMetadataFn) + if err != nil { + return BufferHandle{}, env.maybeReportCorruption(err) + } + return value.MakeHandle(), nil + } + + cv, crh, errorDuration, hit, err := r.opts.CacheOpts.CacheHandle.GetWithReadHandle( + ctx, r.opts.CacheOpts.FileNum, bh.Offset) + if errorDuration > 5*time.Millisecond && r.opts.LoggerAndTracer.IsTracingEnabled(ctx) { + r.opts.LoggerAndTracer.Eventf( + ctx, "waited for turn when %s time wasted by failed reads", errorDuration.String()) + } + // TODO(sumeer): consider tracing when waited longer than some duration + // for turn to do the read. + if err != nil { + // Another caller tried to read this block and failed. 
We want each caller + // to report corruption errors separately, since the ReportCorruptionArg + // could be different. In particular, we might read the same physical block + // (e.g. an index block) for two different virtual tables. + return BufferHandle{}, env.maybeReportCorruption(err) + } + + if cv != nil { + if invariants.Enabled && crh.Valid() { + panic("cache.ReadHandle must not be valid") + } + if hit { + recordCacheHit(ctx, env, readHandle, bh) + } + return CacheBufferHandle(cv), nil + } + + value, err := r.doRead(ctx, env, readHandle, bh, kind, initBlockMetadataFn) + if err != nil { + crh.SetReadError(err) + return BufferHandle{}, env.maybeReportCorruption(err) + } + crh.SetReadValue(value.v) + return value.MakeHandle(), nil +} + +func recordCacheHit(ctx context.Context, env ReadEnv, readHandle objstorage.ReadHandle, bh Handle) { + // Cache hit. + if readHandle != nil { + readHandle.RecordCacheHit(ctx, int64(bh.Offset), int64(bh.Length+TrailerLen)) + } + env.BlockServedFromCache(bh.Length) +} + +// TODO(sumeer): should the threshold be configurable. +const slowReadTracingThreshold = 5 * time.Millisecond + +// doRead is a helper for Read that does the read, checksum check, +// decompression, and returns either a Value or an error. +func (r *Reader) doRead( + ctx context.Context, + env ReadEnv, + readHandle objstorage.ReadHandle, + bh Handle, + kind Kind, + initBlockMetadataFn func(*Metadata, []byte) error, +) (Value, error) { + ctx = objiotracing.WithBlockKind(ctx, kind) + // First acquire loadBlockSema, if needed. + if sema := r.opts.LoadBlockSema; sema != nil { + if err := sema.Acquire(ctx, 1); err != nil { + // An error here can only come from the context. 
+ return Value{}, err + } + defer sema.Release(1) + } + + compressed := Alloc(int(bh.Length+TrailerLen), env.BufferPool) + readStopwatch := makeStopwatch() + var err error + if readHandle != nil { + err = readHandle.ReadAt(ctx, compressed.BlockData(), int64(bh.Offset)) + } else { + err = r.readable.ReadAt(ctx, compressed.BlockData(), int64(bh.Offset)) + } + readDuration := readStopwatch.stop() + // Call IsTracingEnabled to avoid the allocations of boxing integers into an + // interface{}, unless necessary. + if readDuration >= slowReadTracingThreshold && r.opts.LoggerAndTracer.IsTracingEnabled(ctx) { + _, file1, line1, _ := runtime.Caller(1) + _, file2, line2, _ := runtime.Caller(2) + r.opts.LoggerAndTracer.Eventf(ctx, "reading block of %d bytes took %s (fileNum=%s; %s/%s:%d -> %s/%s:%d)", + int(bh.Length+TrailerLen), readDuration.String(), + r.opts.CacheOpts.FileNum, + filepath.Base(filepath.Dir(file2)), filepath.Base(file2), line2, + filepath.Base(filepath.Dir(file1)), filepath.Base(file1), line1) + } + if err != nil { + compressed.Release() + return Value{}, err + } + env.BlockRead(bh.Length, readDuration) + if err = ValidateChecksum(r.checksumType, compressed.BlockData(), bh); err != nil { + compressed.Release() + err = errors.Wrapf(err, "pebble: file %s", r.opts.CacheOpts.FileNum) + return Value{}, err + } + typ := CompressionIndicator(compressed.BlockData()[bh.Length]) + compressed.Truncate(int(bh.Length)) + var decompressed Value + if typ == NoCompressionIndicator { + decompressed = compressed + } else { + // Decode the length of the decompressed value. 
+ decodedLen, err := DecompressedLen(typ, compressed.BlockData()) + if err != nil { + compressed.Release() + return Value{}, err + } + decompressed = Alloc(decodedLen, env.BufferPool) + err = DecompressInto(typ, compressed.BlockData(), decompressed.BlockData()) + compressed.Release() + if err != nil { + decompressed.Release() + return Value{}, err + } + } + if err = initBlockMetadataFn(decompressed.BlockMetadata(), decompressed.BlockData()); err != nil { + decompressed.Release() + return Value{}, err + } + return decompressed, nil +} + +// Readable returns the underlying objstorage.Readable. +// +// Users should avoid accessing the underlying Readable if it can be avoided. +func (r *Reader) Readable() objstorage.Readable { + return r.readable +} + +// GetFromCache retrieves the block from the cache, if it is present. +// +// Users should prefer using Read, which handles reading from object storage on +// a cache miss. +func (r *Reader) GetFromCache(bh Handle) *cache.Value { + return r.opts.CacheOpts.CacheHandle.Get(r.opts.CacheOpts.FileNum, bh.Offset) +} + +// UsePreallocatedReadHandle returns a ReadHandle that reads from the reader and +// uses the provided preallocated read handle to back the read handle, avoiding +// an unnecessary allocation. +func (r *Reader) UsePreallocatedReadHandle( + readBeforeSize objstorage.ReadBeforeSize, rh *objstorageprovider.PreallocatedReadHandle, +) objstorage.ReadHandle { + return objstorageprovider.UsePreallocatedReadHandle(r.readable, readBeforeSize, rh) +} + +// Close releases resources associated with the Reader. +func (r *Reader) Close() error { + var err error + if r.readable != nil { + err = r.readable.Close() + r.readable = nil + } + return err +} + +// ReadRaw reads len(buf) bytes from the provided Readable at the given offset +// into buf. It's used to read the footer of a table. 
+func ReadRaw( + ctx context.Context, + f objstorage.Readable, + readHandle objstorage.ReadHandle, + logger base.LoggerAndTracer, + fileNum base.DiskFileNum, + buf []byte, + off int64, +) ([]byte, error) { + size := f.Size() + if size < int64(len(buf)) { + return nil, base.CorruptionErrorf("pebble: invalid file %s (file size is too small)", errors.Safe(fileNum)) + } + + readStopwatch := makeStopwatch() + var err error + if readHandle != nil { + err = readHandle.ReadAt(ctx, buf, off) + } else { + err = f.ReadAt(ctx, buf, off) + } + readDuration := readStopwatch.stop() + // Call IsTracingEnabled to avoid the allocations of boxing integers into an + // interface{}, unless necessary. + if readDuration >= slowReadTracingThreshold && logger.IsTracingEnabled(ctx) { + logger.Eventf(ctx, "reading footer of %d bytes took %s", + len(buf), readDuration.String()) + } + if err != nil { + return nil, errors.Wrap(err, "pebble: invalid file (could not read footer)") + } + return buf, nil +} + +// DeterministicReadBlockDurationForTesting is for tests that want a +// deterministic value of the time to read a block (that is not in the cache). +// The return value is a function that must be called before the test exits. 
+func DeterministicReadBlockDurationForTesting() func() {
+	drbdForTesting := deterministicReadBlockDurationForTesting
+	deterministicReadBlockDurationForTesting = true
+	return func() {
+		// Restore the previous value when the test cleanup runs.
+		deterministicReadBlockDurationForTesting = drbdForTesting
+	}
+}
+
+var deterministicReadBlockDurationForTesting = false
+
+// deterministicStopwatchForTesting measures elapsed time for a block read,
+// optionally returning a fixed duration when the testing knob is enabled.
+type deterministicStopwatchForTesting struct {
+	startTime crtime.Mono
+}
+
+// makeStopwatch starts a stopwatch at the current monotonic time.
+func makeStopwatch() deterministicStopwatchForTesting {
+	return deterministicStopwatchForTesting{startTime: crtime.NowMono()}
+}
+
+// stop returns the elapsed duration since the stopwatch was created, or the
+// fixed slowReadTracingThreshold when deterministic timing is enabled.
+func (w deterministicStopwatchForTesting) stop() time.Duration {
+	dur := w.startTime.Elapsed()
+	if deterministicReadBlockDurationForTesting {
+		dur = slowReadTracingThreshold
+	}
+	return dur
+}
diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/block/blockkind/kind.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/blockkind/kind.go
new file mode 100644
index 0000000..212c570
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/blockkind/kind.go
@@ -0,0 +1,54 @@
+// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
+// of this source code is governed by a BSD-style license that can be found in
+// the LICENSE file.
+
+package blockkind
+
+import "iter"
+
+// Kind identifies the type of block.
+type Kind uint8
+
+const (
+	Unknown Kind = iota
+	SSTableData
+	SSTableIndex
+	SSTableValue
+	BlobValue
+	BlobReferenceValueLivenessIndex
+	Index
+	Filter
+	RangeDel
+	RangeKey
+	Metadata
+
+	NumKinds
+)
+
+var kindString = [...]string{
+	Unknown:                         "unknown",
+	SSTableData:                     "data",
+	SSTableValue:                    "sstval",
+	SSTableIndex:                    "index",
+	BlobValue:                       "blobval",
+	BlobReferenceValueLivenessIndex: "blobrefval",
+	// Index previously had no entry, so Index.String() returned "".
+	// NOTE(review): "index" duplicates SSTableIndex's display string —
+	// confirm the intended name for this kind.
+	Index:    "index",
+	Filter:   "filter",
+	RangeDel: "rangedel",
+	RangeKey: "rangekey",
+	Metadata: "metadata",
+}
+
+// String implements fmt.Stringer.
+func (k Kind) String() string {
+	return kindString[k]
+}
+
+// All returns all block kinds.
+func All() iter.Seq[Kind] { + return func(yield func(Kind) bool) { + for i := Kind(1); i < NumKinds; i++ { + if !yield(i) { + break + } + } + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/block/buffer_pool.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/buffer_pool.go new file mode 100644 index 0000000..5390553 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/buffer_pool.go @@ -0,0 +1,244 @@ +// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package block + +import ( + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/cache" +) + +// Alloc allocates a new Value for a block of length n (excluding the block +// trailer, but including an associated Metadata). If bufferPool is non-nil, +// Alloc allocates the buffer from the pool. Otherwise it allocates it from the +// block cache. +func Alloc(n int, p *BufferPool) Value { + if p != nil { + return Value{buf: p.Alloc(MetadataSize + n)} + } + return Value{v: cache.Alloc(MetadataSize + n)} +} + +// Value is a block buffer, either backed by the block cache or a BufferPool. +type Value struct { + // buf.Valid() returns true if backed by a BufferPool. + buf Buf + // v is non-nil if backed by the block cache. + v *cache.Value +} + +// getInternalBuf gets the underlying buffer which includes the Metadata and the +// block. +func (b Value) getInternalBuf() []byte { + if b.buf.Valid() { + return b.buf.p.pool[b.buf.i].b + } + return b.v.RawBuffer() +} + +// BlockData returns the byte slice for the block data. +func (b Value) BlockData() []byte { + return b.getInternalBuf()[MetadataSize:] +} + +// BlockMetadata returns the block metadata. 
+func (b Value) BlockMetadata() *Metadata { + return (*Metadata)(b.getInternalBuf()) +} + +// MakeHandle constructs a BufferHandle from the Value. +func (b Value) MakeHandle() BufferHandle { + if b.buf.Valid() { + return BufferHandle{b: b.buf} + } + return BufferHandle{cv: b.v} +} + +func (b *Value) SetInCacheForTesting(h *cache.Handle, fileNum base.DiskFileNum, offset uint64) { + if b.buf.Valid() { + panic("block value must be backed by a cache.Value") + } + h.Set(fileNum, offset, b.v) + b.v.Release() + b.v = nil +} + +// Release releases the handle. +func (b Value) Release() { + if b.buf.Valid() { + b.buf.Release() + } else { + cache.Free(b.v) + } +} + +// Truncate truncates the block to n bytes. +func (b Value) Truncate(n int) { + n += MetadataSize + if b.buf.Valid() { + b.buf.p.pool[b.buf.i].b = b.buf.p.pool[b.buf.i].b[:n] + } else { + b.v.Truncate(n) + } +} + +// A BufferHandle is a handle to manually-managed memory. The handle may point +// to a block in the block cache (h.cv != nil), or a buffer that exists outside +// the block cache allocated from a BufferPool (b.Valid()). +type BufferHandle struct { + cv *cache.Value + b Buf +} + +// CacheBufferHandle constructs a BufferHandle from a block cache Handle. +func CacheBufferHandle(cv *cache.Value) BufferHandle { + return BufferHandle{cv: cv} +} + +// Valid returns true if the BufferHandle holds a value. +func (bh BufferHandle) Valid() bool { + return bh.cv != nil || bh.b.Valid() +} + +func (bh BufferHandle) rawBuffer() []byte { + if bh.cv != nil { + return bh.cv.RawBuffer() + } + return bh.b.p.pool[bh.b.i].b +} + +// BlockMetadata returns the buffer for the block metadata. +func (bh BufferHandle) BlockMetadata() *Metadata { + return (*Metadata)(bh.rawBuffer()) +} + +// BlockData retrieves the buffer for the block data. +func (bh BufferHandle) BlockData() []byte { + return (bh.rawBuffer())[MetadataSize:] +} + +// Release releases the buffer, either back to the block cache or BufferPool. 
It +// is okay to call Release on a zero-value BufferHandle (to no effect). +func (bh BufferHandle) Release() { + bh.cv.Release() + bh.b.Release() +} + +// A BufferPool holds a pool of buffers for holding sstable blocks. An initial +// size of the pool is provided on Init, but a BufferPool will grow to meet the +// largest working set size. It'll never shrink. When a buffer is released, the +// BufferPool recycles the buffer for future allocations. +// +// A BufferPool should only be used for short-lived allocations with +// well-understood working set sizes to avoid excessive memory consumption. +// +// BufferPool is not thread-safe. +type BufferPool struct { + // pool contains all the buffers held by the pool, including buffers that + // are in-use. For every i < len(pool): pool[i].v is non-nil. + pool []AllocedBuffer +} + +// AllocedBuffer is an allocated memory buffer. +type AllocedBuffer struct { + v *cache.Value + // b holds the current byte slice. It's backed by v, but may be a subslice + // of v's memory while the buffer is in-use [ len(b) ≤ len(v.RawBuffer()) ]. + // + // If the buffer is not currently in-use, b is nil. When being recycled, the + // BufferPool.Alloc will reset b to be a subslice of v.RawBuffer(). + b []byte +} + +// Init initializes the pool with an initial working set buffer size of +// `initialSize`. +func (p *BufferPool) Init(initialSize int) { + *p = BufferPool{ + pool: make([]AllocedBuffer, 0, initialSize), + } +} + +// InitPreallocated is like Init but for internal sstable package use in +// instances where a pre-allocated slice of []allocedBuffer already exists. It's +// used to avoid an extra allocation initializing BufferPool.pool. +func (p *BufferPool) InitPreallocated(pool []AllocedBuffer) { + *p = BufferPool{ + pool: pool[:0], + } +} + +// Release releases all buffers held by the pool and resets the pool to an +// uninitialized state. 
// Release frees every pooled buffer back to the block cache and resets the
// pool to an empty (but still usable) state. Panics if any buffer is still
// in use (pool[i].b != nil), since freeing it would leave a dangling Buf.
func (p *BufferPool) Release() {
	for i := range p.pool {
		if p.pool[i].b != nil {
			panic(errors.AssertionFailedf("Release called on a BufferPool with in-use buffers"))
		}
		v := p.pool[i].v
		p.pool[i].v = nil
		cache.Free(v)
	}
	p.pool = p.pool[:0]
}

// Alloc allocates a new buffer of size n. If the pool already holds a buffer at
// least as large as n, the pooled buffer is used instead.
//
// Alloc is O(MAX(N,M)) where N is the largest number of concurrently in-use
// buffers allocated and M is the initialSize passed to Init.
func (p *BufferPool) Alloc(n int) Buf {
	// First pass: look for a free slot (b == nil) whose allocation is already
	// large enough; remember the last free-but-too-small slot as a fallback.
	unusableBufferIdx := -1
	for i := 0; i < len(p.pool); i++ {
		if p.pool[i].b == nil {
			if len(p.pool[i].v.RawBuffer()) >= n {
				p.pool[i].b = p.pool[i].v.RawBuffer()[:n]
				return Buf{p: p, i: i}
			}
			unusableBufferIdx = i
		}
	}

	// If we would need to grow the size of the pool to allocate another buffer,
	// but there was a slot available occupied by a buffer that's just too
	// small, replace the too-small buffer.
	if len(p.pool) == cap(p.pool) && unusableBufferIdx >= 0 {
		i := unusableBufferIdx
		cache.Free(p.pool[i].v)
		p.pool[i].v = cache.Alloc(n)
		p.pool[i].b = p.pool[i].v.RawBuffer()
		return Buf{p: p, i: i}
	}

	// Allocate a new buffer.
	v := cache.Alloc(n)
	p.pool = append(p.pool, AllocedBuffer{v: v, b: v.RawBuffer()[:n]})
	return Buf{p: p, i: len(p.pool) - 1}
}

// A Buf holds a reference to a manually-managed, pooled byte buffer.
type Buf struct {
	p *BufferPool
	// i holds the index into p.pool where the buffer may be found. This scheme
	// avoids needing to allocate the handle to the buffer on the heap at the
	// cost of copying two words instead of one.
	i int
}

// Valid returns true if the buf holds a valid buffer.
func (b Buf) Valid() bool {
	return b.p != nil
}

// Release releases the buffer back to the pool. Safe to call on a zero-value
// Buf; idempotent because it nils out b.p.
func (b *Buf) Release() {
	if b.p == nil {
		return
	}
	// Clear the allocedBuffer's byte slice.
This signals the allocated buffer + // is no longer in use and a future call to BufferPool.Alloc may reuse this + // buffer. + b.p.pool[b.i].b = nil + b.p = nil +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/block/category_stats.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/category_stats.go new file mode 100644 index 0000000..abe7cb9 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/category_stats.go @@ -0,0 +1,272 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package block + +import ( + "cmp" + "runtime" + "slices" + "sync" + "sync/atomic" + "time" + "unsafe" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/redact" +) + +// Category is a user-understandable string, where stats are aggregated for +// each category. The cardinality of this should be low, say < 20. The prefix +// "pebble-" is reserved for internal Pebble categories. +// +// Examples of categories that can be useful in the CockroachDB context are: +// sql-user, sql-stats, raft, rangefeed, mvcc-gc, range-snapshot. +type Category uint8 + +// CategoryUnknown is the unknown category. It has the latency-sensitive QoS +// level. +const CategoryUnknown Category = 0 + +// CategoryMax is the maximum value of a category, and is also the maximum +// number of categories that can be registered. +const CategoryMax = 30 + +// shardPadding pads each shard to 64 bytes so they don't share a cache line. +const shardPadding = 64 - unsafe.Sizeof(CategoryStatsShard{}) + +// paddedCategoryStatsShard is a single shard of a category's statistics. +type paddedCategoryStatsShard struct { + CategoryStatsShard + _ [shardPadding]byte +} + +func (c Category) String() string { + return categories[c].name +} + +// QoSLevel returns the QoSLevel associated with this Category. 
+func (c Category) QoSLevel() QoSLevel { + return categories[c].qosLevel +} + +// SafeFormat implements the redact.SafeFormatter interface. +func (c Category) SafeFormat(p redact.SafePrinter, verb rune) { + p.SafeString(redact.SafeString(c.String())) +} + +// RegisterCategory registers a new category. Each category has a name and an +// associated QoS level. The category name must be unique. +// +// Only CategoryMax categories can be registered in total. +func RegisterCategory(name string, qosLevel QoSLevel) Category { + if categoriesList != nil { + panic("ReigsterCategory called after Categories()") + } + c := Category(numRegisteredCategories.Add(1)) + if c > CategoryMax { + panic("too many categories") + } + categories[c].name = name + categories[c].qosLevel = qosLevel + return c +} + +// Categories returns all registered categories, including CategoryUnknown. +// +// Can only be called after all categories have been registered. Calling +// RegisterCategory() after Categories() will result in a panic. +func Categories() []Category { + categoriesListOnce.Do(func() { + categoriesList = make([]Category, numRegisteredCategories.Load()+1) + for i := range categoriesList { + categoriesList[i] = Category(i) + } + }) + return categoriesList +} + +var categories = [CategoryMax + 1]struct { + name string + qosLevel QoSLevel +}{ + CategoryUnknown: {name: "unknown", qosLevel: LatencySensitiveQoSLevel}, +} + +var numRegisteredCategories atomic.Uint32 + +var categoriesList []Category +var categoriesListOnce sync.Once + +// StringToCategoryForTesting returns the Category for the string, or panics if +// the string is not known. +func StringToCategoryForTesting(s string) Category { + for i := range categories { + if categories[i].name == s { + return Category(i) + } + } + panic(errors.AssertionFailedf("unknown Category %s", s)) +} + +// QoSLevel describes whether the read is latency-sensitive or not. Each +// category must map to a single QoSLevel. 
While category strings are opaque +// to Pebble, the QoSLevel may be internally utilized in Pebble to better +// optimize future reads. +type QoSLevel uint8 + +const ( + // LatencySensitiveQoSLevel is the default when QoSLevel is not specified, + // and represents reads that are latency-sensitive. + LatencySensitiveQoSLevel QoSLevel = iota + // NonLatencySensitiveQoSLevel represents reads that are not + // latency-sensitive. + NonLatencySensitiveQoSLevel +) + +// SafeFormat implements the redact.SafeFormatter interface. +func (q QoSLevel) SafeFormat(p redact.SafePrinter, verb rune) { + switch q { + case LatencySensitiveQoSLevel: + p.Printf("latency") + case NonLatencySensitiveQoSLevel: + p.Printf("non-latency") + default: + p.Printf("") + } +} + +// StringToQoSForTesting returns the QoSLevel for the string, or panics if the +// string is not known. +func StringToQoSForTesting(s string) QoSLevel { + switch s { + case "latency": + return LatencySensitiveQoSLevel + case "non-latency": + return NonLatencySensitiveQoSLevel + } + panic(errors.AssertionFailedf("unknown QoS %s", s)) +} + +// CategoryStats provides stats about a category of reads. +type CategoryStats struct { + // BlockBytes is the bytes in the loaded blocks. If the block was + // compressed, this is the compressed bytes. Currently, only the index + // blocks, data blocks containing points, and filter blocks are included. + // Additionally, value blocks read after the corresponding iterator is + // closed are not included. + BlockBytes uint64 + // BlockBytesInCache is the subset of BlockBytes that were in the block + // cache. + BlockBytesInCache uint64 + // BlockReadDuration is the total duration to read the bytes not in the + // cache, i.e., BlockBytes-BlockBytesInCache. 
+ BlockReadDuration time.Duration +} + +func (s *CategoryStats) aggregate( + blockBytes, blockBytesInCache uint64, blockReadDuration time.Duration, +) { + s.BlockBytes += blockBytes + s.BlockBytesInCache += blockBytesInCache + s.BlockReadDuration += blockReadDuration +} + +// CategoryStatsAggregate is the aggregate for the given category. +type CategoryStatsAggregate struct { + Category Category + CategoryStats CategoryStats +} + +// numCategoryStatsShards must be a power of 2. We initialize it to GOMAXPROCS +// (rounded up to the nearest power of 2) or 16, whichever is larger. +var numCategoryStatsShards = func() int { + p := runtime.GOMAXPROCS(0) + n := 16 + for n < p { + n *= 2 + } + return n +}() + +// CategoryStatsShard holds CategoryStats with a mutex +// to ensure safe access. +type CategoryStatsShard struct { + mu struct { + sync.Mutex + stats CategoryStats + } +} + +// Accumulate implements the IterStatsAccumulator interface. +func (c *CategoryStatsShard) Accumulate( + blockBytes, blockBytesInCache uint64, blockReadDuration time.Duration, +) { + c.mu.Lock() + c.mu.stats.aggregate(blockBytes, blockBytesInCache, blockReadDuration) + c.mu.Unlock() +} + +// CategoryStatsCollector collects and aggregates the stats per category. +type CategoryStatsCollector struct { + // mu protects additions to statsMap. + mu sync.Mutex + // Category => *shardedCategoryStats. + statsMap sync.Map +} + +// shardedCategoryStats accumulates stats for a category, splitting its stats +// across multiple shards to prevent mutex contention. In high-read workloads, +// contention on the category stats mutex has been observed. +type shardedCategoryStats struct { + Category Category + shards []paddedCategoryStatsShard +} + +// getStats retrieves the aggregated stats for the category, summing across all +// shards. 
func (s *shardedCategoryStats) getStats() CategoryStatsAggregate {
	agg := CategoryStatsAggregate{
		Category: s.Category,
	}
	// Sum every shard under its own lock; the result is a consistent-enough
	// snapshot (shards are not locked simultaneously).
	for i := range s.shards {
		s.shards[i].mu.Lock()
		agg.CategoryStats.aggregate(s.shards[i].mu.stats.BlockBytes, s.shards[i].mu.stats.BlockBytesInCache, s.shards[i].mu.stats.BlockReadDuration)
		s.shards[i].mu.Unlock()
	}
	return agg
}

// Accumulator returns a stats accumulator for the given category. The provided
// p is used to determine which shard to write stats to.
func (c *CategoryStatsCollector) Accumulator(p uint64, category Category) *CategoryStatsShard {
	v, ok := c.statsMap.Load(category)
	if !ok {
		// c.mu serializes the slow path; LoadOrStore still guarantees only one
		// shardedCategoryStats wins, the mutex just limits duplicate
		// allocations of the shards slice under contention.
		c.mu.Lock()
		v, _ = c.statsMap.LoadOrStore(category, &shardedCategoryStats{
			Category: category,
			shards:   make([]paddedCategoryStatsShard, numCategoryStatsShards),
		})
		c.mu.Unlock()
	}
	s := v.(*shardedCategoryStats)
	// This equation is taken from:
	// https://en.wikipedia.org/wiki/Linear_congruential_generator#Parameters_in_common_use
	// numCategoryStatsShards is a power of 2, so the mask selects a shard.
	shard := ((p * 25214903917) >> 32) & uint64(numCategoryStatsShards-1)
	return &s.shards[shard].CategoryStatsShard
}

// GetStats returns the aggregated stats, sorted by Category.
func (c *CategoryStatsCollector) GetStats() []CategoryStatsAggregate {
	var stats []CategoryStatsAggregate
	c.statsMap.Range(func(_, v any) bool {
		s := v.(*shardedCategoryStats).getStats()
		stats = append(stats, s)
		return true
	})
	slices.SortFunc(stats, func(a, b CategoryStatsAggregate) int {
		return cmp.Compare(a.Category, b.Category)
	})
	return stats
}
diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/block/compression.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/compression.go
new file mode 100644
index 0000000..9c43afc
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/compression.go
@@ -0,0 +1,452 @@
// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved.
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package block + +import ( + "runtime" + "slices" + "strings" + "sync" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/bytealloc" + "github.com/cockroachdb/pebble/v2/internal/compression" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/objstorage" +) + +// CompressionProfile contains the parameters for compressing blocks in an +// sstable or blob file. +// +// CompressionProfile is a more advanced successor to Compression. +type CompressionProfile struct { + Name string + + // DataBlocks applies to sstable data blocks. + // ValueBlocks applies to sstable value blocks and blob file value blocks. + // OtherBlocks applies to all other blocks (such as index, filter, metadata + // blocks). + // + // Some blocks (like rangedel) never use compression; this is at the + // discretion of the sstable or blob file writer. + // + // Note that MinLZ is only supported with table formats v6+. Older formats + // fall back to Snappy. + DataBlocks compression.Setting + ValueBlocks compression.Setting + OtherBlocks compression.Setting + + // Blocks that are reduced by less than this percentage are stored + // uncompressed. + MinReductionPercent uint8 + + // AdaptiveReductionCutoffPercent (when set to a non-zero value) enables + // adaptive compressors for data and value blocks which fall back to the + // OtherBlocks setting. The OtherBlocks setting is used when the + // DataBlocks/ValueBlocks setting cannot achieve a further data reduction of + // at least AdaptiveReductionCutoffPercent%. + AdaptiveReductionCutoffPercent uint8 +} + +// UsesMinLZ returns true if the profile uses the MinLZ compression algorithm +// (for any block kind). 
+func (p *CompressionProfile) UsesMinLZ() bool { + return p.DataBlocks.Algorithm == compression.MinLZ || + p.ValueBlocks.Algorithm == compression.MinLZ || + p.OtherBlocks.Algorithm == compression.MinLZ +} + +var ( + NoCompression = simpleCompressionProfile("NoCompression", compression.None) + SnappyCompression = simpleCompressionProfile("Snappy", compression.Snappy) + ZstdCompression = simpleCompressionProfile("ZSTD", compression.ZstdLevel3) + MinLZCompression = simpleCompressionProfile("MinLZ", compression.MinLZFastest) + + DefaultCompression = SnappyCompression + FastestCompression = simpleCompressionProfile("Fastest", fastestCompression) + + FastCompression = registerCompressionProfile(CompressionProfile{ + Name: "Fast", + DataBlocks: fastestCompression, + ValueBlocks: compression.ZstdLevel1, + OtherBlocks: fastestCompression, + MinReductionPercent: 10, + AdaptiveReductionCutoffPercent: 30, + }) + + BalancedCompression = registerCompressionProfile(CompressionProfile{ + Name: "Balanced", + DataBlocks: compression.ZstdLevel1, + ValueBlocks: compression.ZstdLevel1, + OtherBlocks: fastestCompression, + MinReductionPercent: 5, + AdaptiveReductionCutoffPercent: 20, + }) + + GoodCompression = registerCompressionProfile(CompressionProfile{ + Name: "Good", + DataBlocks: compression.ZstdLevel3, + ValueBlocks: compression.ZstdLevel3, + OtherBlocks: fastestCompression, + MinReductionPercent: 5, + AdaptiveReductionCutoffPercent: 10, + }) +) + +var fastestCompression = func() compression.Setting { + if runtime.GOARCH == "arm64" { + // MinLZ is generally faster and better than Snappy except for arm64: Snappy + // has an arm64 assembly implementation and MinLZ does not. + return compression.Snappy + } + return compression.MinLZFastest +}() + +// simpleCompressionProfile returns a CompressionProfile that uses the same +// compression setting for all blocks and which uses the uncompressed block if +// compression reduces it by less than 12%. 
This is similar to older Pebble +// versions which used Compression. +// +// It should only be used during global initialization. +func simpleCompressionProfile(name string, setting compression.Setting) *CompressionProfile { + return registerCompressionProfile(CompressionProfile{ + Name: name, + DataBlocks: setting, + ValueBlocks: setting, + OtherBlocks: setting, + MinReductionPercent: 12, + }) +} + +// CompressionProfileByName returns the built-in compression profile with the +// given name, or nil if there is no such profile. It is case-insensitive. +// +// The caller must gracefully handle the nil return case as an unknown +// (user-defined or deprecated) profile. +func CompressionProfileByName(name string) *CompressionProfile { + return compressionProfileMap[strings.ToLower(name)] +} + +var compressionProfileMap = make(map[string]*CompressionProfile) + +func registerCompressionProfile(p CompressionProfile) *CompressionProfile { + key := strings.ToLower(p.Name) + if _, ok := compressionProfileMap[key]; ok { + panic(errors.AssertionFailedf("duplicate compression profile: %s", p.Name)) + } + compressionProfileMap[key] = &p + return &p +} + +// CompressionIndicator is the byte stored physically within the block.Trailer +// to indicate the compression type. +// +// TODO(jackson): Avoid exporting once all compression and decompression is +// delegated to the block package. +type CompressionIndicator byte + +// The block type gives the per-block compression format. +// These constants are part of the file format and should not be changed. +// They are different from the Compression constants because the latter +// are designed so that the zero value of the Compression type means to +// use the default compression (which is snappy). +// Not all compression types listed here are supported. 
+const ( + NoCompressionIndicator CompressionIndicator = 0 + SnappyCompressionIndicator CompressionIndicator = 1 + ZlibCompressionIndicator CompressionIndicator = 2 + Bzip2CompressionIndicator CompressionIndicator = 3 + Lz4CompressionIndicator CompressionIndicator = 4 + Lz4hcCompressionIndicator CompressionIndicator = 5 + XpressCompressionIndicator CompressionIndicator = 6 + ZstdCompressionIndicator CompressionIndicator = 7 + MinLZCompressionIndicator CompressionIndicator = 8 +) + +// String implements fmt.Stringer. +func (i CompressionIndicator) String() string { + switch i { + case 0: + return "none" + case 1: + return "snappy" + case 2: + return "zlib" + case 3: + return "bzip2" + case 4: + return "lz4" + case 5: + return "lz4hc" + case 6: + return "xpress" + case 7: + return "zstd" + case 8: + return "minlz" + default: + panic(errors.Newf("sstable: unknown block type: %d", i)) + } +} + +func (i CompressionIndicator) Algorithm() compression.Algorithm { + switch i { + case NoCompressionIndicator: + return compression.NoCompression + case SnappyCompressionIndicator: + return compression.SnappyAlgorithm + case ZstdCompressionIndicator: + return compression.Zstd + case MinLZCompressionIndicator: + return compression.MinLZ + default: + panic("Invalid compression type.") + } +} + +func compressionIndicatorFromAlgorithm(algo compression.Algorithm) CompressionIndicator { + switch algo { + case compression.NoCompression: + return NoCompressionIndicator + case compression.SnappyAlgorithm: + return SnappyCompressionIndicator + case compression.Zstd: + return ZstdCompressionIndicator + case compression.MinLZ: + return MinLZCompressionIndicator + default: + panic("invalid algorithm") + } +} + +// DecompressedLen returns the length of the provided block once decompressed, +// allowing the caller to allocate a buffer exactly sized to the decompressed +// payload. 
+func DecompressedLen(ci CompressionIndicator, b []byte) (decompressedLen int, err error) { + decompressor := GetDecompressor(ci) + defer decompressor.Close() + return decompressor.DecompressedLen(b) +} + +// DecompressInto decompresses compressed into buf. The buf slice must have the +// exact size as the decompressed value. Callers may use DecompressedLen to +// determine the correct size. +func DecompressInto(ci CompressionIndicator, compressed []byte, buf []byte) error { + decompressor := GetDecompressor(ci) + defer decompressor.Close() + err := decompressor.DecompressInto(buf, compressed) + if err != nil { + return base.MarkCorruptionError(err) + } + return nil +} + +// PhysicalBlock represents a block (possibly compressed) as it is stored +// physically on disk, including its trailer. +type PhysicalBlock struct { + // data contains the possibly compressed block data. + data []byte + trailer Trailer +} + +// NewPhysicalBlock returns a new PhysicalBlock with the provided block +// data. The trailer is set from the last TrailerLen bytes of the +// block. The data could be compressed. +func NewPhysicalBlock(data []byte) PhysicalBlock { + trailer := Trailer(data[len(data)-TrailerLen:]) + data = data[:len(data)-TrailerLen] + return PhysicalBlock{data: data, trailer: trailer} +} + +// LengthWithTrailer returns the length of the data block, including the trailer. +func (b *PhysicalBlock) LengthWithTrailer() int { + return len(b.data) + TrailerLen +} + +// LengthWithoutTrailer returns the length of the data block, excluding the trailer. +func (b *PhysicalBlock) LengthWithoutTrailer() int { + return len(b.data) +} + +// CloneWithByteAlloc returns a deep copy of the block, using the provided +// bytealloc.A to allocate memory for the new copy. 
+func (b *PhysicalBlock) CloneWithByteAlloc(a *bytealloc.A) PhysicalBlock { + var data []byte + *a, data = (*a).Alloc(len(b.data)) + copy(data, b.data) + return PhysicalBlock{ + data: data, + trailer: b.trailer, + } +} + +// Clone returns a deep copy of the block. +func (b PhysicalBlock) Clone() PhysicalBlock { + data := make([]byte, len(b.data)) + copy(data, b.data) + return PhysicalBlock{data: data, trailer: b.trailer} +} + +// WriteTo writes the block (including its trailer) to the provided Writable. If +// err == nil, n is the number of bytes successfully written to the Writable. +// +// WriteTo might mangle the block data. +func (b *PhysicalBlock) WriteTo(w objstorage.Writable) (n int, err error) { + if err := w.Write(b.data); err != nil { + return 0, err + } + if err := w.Write(b.trailer[:]); err != nil { + return 0, err + } + + // WriteTo is allowed to mangle the data. Mangle it ourselves some of the time + // in invariant builds to catch callers that don't handle this. + if invariants.Enabled && invariants.Sometimes(1) { + for i := range b.data { + b.data[i] = 0xFF + } + } + return len(b.data) + len(b.trailer), nil +} + +// CompressAndChecksum compresses and checksums the provided block, returning +// the compressed block and its trailer. The result is appended to the dst +// argument. +func CompressAndChecksum( + dst *[]byte, blockData []byte, blockKind Kind, compressor *Compressor, checksummer *Checksummer, +) PhysicalBlock { + buf := (*dst)[:0] + ci, buf := compressor.Compress(buf, blockData, blockKind) + *dst = buf + + // Calculate the checksum. + pb := PhysicalBlock{data: buf} + checksum := checksummer.Checksum(buf, byte(ci)) + pb.trailer = MakeTrailer(byte(ci), checksum) + return pb +} + +// CopyAndChecksum copies the provided block (without compressing it) and +// checksums it, returning the physical block. The result is appended to the dst +// argument. 
+// +// Note that we still need to provide a Compressor so we can inform it of the +// uncompressed block (for statistics). +func CopyAndChecksum( + dst *[]byte, blockData []byte, blockKind Kind, compressor *Compressor, checksummer *Checksummer, +) PhysicalBlock { + buf := *dst + buf = append(buf[:0], blockData...) + *dst = buf + + // Calculate the checksum. + pb := PhysicalBlock{data: buf} + checksum := checksummer.Checksum(buf, byte(NoCompressionIndicator)) + pb.trailer = MakeTrailer(byte(NoCompressionIndicator), checksum) + compressor.UncompressedBlock(len(blockData), blockKind) + return pb +} + +// CompressAndChecksumToTempBuffer compresses and checksums the provided block +// into a TempBuffer. The caller should Release() the TempBuffer once it is no +// longer necessary. +func CompressAndChecksumToTempBuffer( + blockData []byte, blockKind Kind, compressor *Compressor, checksummer *Checksummer, +) (PhysicalBlock, *TempBuffer) { + // Grab a buffer to use as the destination for compression. + compressedBuf := NewTempBuffer() + pb := CompressAndChecksum(&compressedBuf.b, blockData, blockKind, compressor, checksummer) + return pb, compressedBuf +} + +// CopyAndChecksumToTempBuffer copies (without compressing) and checksums +// the provided block into a TempBuffer. The caller should Release() the +// TempBuffer once it is no longer necessary. +func CopyAndChecksumToTempBuffer( + blockData []byte, blockKind Kind, compressor *Compressor, checksummer *Checksummer, +) (PhysicalBlock, *TempBuffer) { + // Grab a buffer to use as the destination for compression. + compressedBuf := NewTempBuffer() + pb := CopyAndChecksum(&compressedBuf.b, blockData, blockKind, compressor, checksummer) + return pb, compressedBuf +} + +// TempBuffer is a buffer that is used temporarily and is released back to a +// pool for reuse. +type TempBuffer struct { + b []byte +} + +// NewTempBuffer returns a TempBuffer from the pool. 
The buffer will have zero +// size and length and arbitrary capacity. +func NewTempBuffer() *TempBuffer { + tb := tempBufferPool.Get().(*TempBuffer) + if invariants.Enabled && len(tb.b) > 0 { + panic("NewTempBuffer length not 0") + } + return tb +} + +// Data returns the byte slice currently backing the Buffer. +func (tb *TempBuffer) Data() []byte { + return tb.b +} + +// Size returns the current size of the buffer. +func (tb *TempBuffer) Size() int { + return len(tb.b) +} + +// Append appends the contents of v to the buffer, growing the buffer if +// necessary. Returns the offset at which it was appended. +func (tb *TempBuffer) Append(v []byte) (startOffset int) { + startOffset = len(tb.b) + tb.b = append(tb.b, v...) + return startOffset +} + +// Resize resizes the buffer to the specified length, allocating if necessary. +// If the length is longer than the current length, the values of the new bytes +// are arbitrary. +func (tb *TempBuffer) Resize(length int) { + if length > cap(tb.b) { + tb.b = slices.Grow(tb.b, length-len(tb.b)) + } + tb.b = tb.b[:length] +} + +// Reset is equivalent to Resize(0). +func (tb *TempBuffer) Reset() { + tb.b = tb.b[:0] +} + +// Release releases the buffer back to the pool for reuse. +func (tb *TempBuffer) Release() { + // Note we avoid releasing buffers that are larger than the configured + // maximum to the pool. This avoids holding on to occasional large buffers + // necessary for e.g. singular large values. + if tb.b != nil && len(tb.b) < tempBufferMaxReusedSize { + if invariants.Sometimes(20) { + // Mangle the buffer data. + for i := range tb.b { + tb.b[i] = 0xCC + } + } + tb.b = tb.b[:0] + tempBufferPool.Put(tb) + } +} + +// tempBufferPool is a pool of buffers that are used to temporarily hold either +// compressed or uncompressed block data. 
+var tempBufferPool = sync.Pool{ + New: func() any { + return &TempBuffer{b: make([]byte, 0, tempBufferInitialSize)} + }, +} + +const tempBufferInitialSize = 32 * 1024 +const tempBufferMaxReusedSize = 256 * 1024 diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/block/compressor.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/compressor.go new file mode 100644 index 0000000..baa4f81 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/compressor.go @@ -0,0 +1,185 @@ +package block + +import ( + "fmt" + "iter" + "math/rand" + "strings" + + "github.com/cockroachdb/pebble/v2/internal/compression" + "github.com/cockroachdb/pebble/v2/sstable/block/blockkind" +) + +// Compressor is used to compress blocks. Typical usage: +// +// c := MakeCompressor(profile) +// .. = c.Compress(..) +// .. = c.Compress(..) +// c.Close() +type Compressor struct { + minReductionPercent uint8 + dataBlocksCompressor compression.Compressor + valueBlocksCompressor compression.Compressor + otherBlocksCompressor compression.Compressor + + stats CompressionStats +} + +// MakeCompressor returns a Compressor that applies the given compression +// profile. Close must be called when the compressor is no longer needed. 
+func MakeCompressor(profile *CompressionProfile) Compressor { + c := Compressor{ + minReductionPercent: profile.MinReductionPercent, + } + + c.dataBlocksCompressor = maybeAdaptiveCompressor(profile, profile.DataBlocks) + c.valueBlocksCompressor = maybeAdaptiveCompressor(profile, profile.ValueBlocks) + c.otherBlocksCompressor = compression.GetCompressor(profile.OtherBlocks) + return c +} + +func maybeAdaptiveCompressor( + profile *CompressionProfile, setting compression.Setting, +) compression.Compressor { + if profile.AdaptiveReductionCutoffPercent != 0 && setting != profile.OtherBlocks { + params := compression.AdaptiveCompressorParams{ + Slow: setting, + Fast: profile.OtherBlocks, + ReductionCutoff: float64(profile.AdaptiveReductionCutoffPercent) * 0.01, + SampleEvery: 10, + SampleHalfLife: 256 * 1024, // 256 KB + SamplingSeed: rand.Uint64(), + } + return compression.NewAdaptiveCompressor(params) + } + return compression.GetCompressor(setting) +} + +// Close must be called when the Compressor is no longer needed. +// After Close is called, the Compressor must not be used again. +func (c *Compressor) Close() { + c.dataBlocksCompressor.Close() + c.valueBlocksCompressor.Close() + c.otherBlocksCompressor.Close() + *c = Compressor{} +} + +// Compress a block, appending the compressed data to dst[:0]. +// +// In addition to the buffer, returns the algorithm that was used. 
+func (c *Compressor) Compress(dst, src []byte, kind Kind) (CompressionIndicator, []byte) { + var compressor compression.Compressor + switch kind { + case blockkind.SSTableData: + compressor = c.dataBlocksCompressor + case blockkind.SSTableValue, blockkind.BlobValue: + compressor = c.valueBlocksCompressor + default: + compressor = c.otherBlocksCompressor + } + + out, setting := compressor.Compress(dst, src) + + // Return the original data uncompressed if the reduction is less than the + // minimum, i.e.: + // + // after * 100 + // ----------- > 100 - MinReductionPercent + // before + if setting.Algorithm != compression.NoCompression && + int64(len(out))*100 > int64(len(src))*int64(100-c.minReductionPercent) { + c.stats.add(compression.None, uint64(len(src)), uint64(len(src))) + return NoCompressionIndicator, append(out[:0], src...) + } + c.stats.add(setting, uint64(len(src)), uint64(len(out))) + return compressionIndicatorFromAlgorithm(setting.Algorithm), out +} + +// UncompressedBlock informs the compressor that a block of the given size and +// kind was written uncompressed. This is used so that the final statistics are +// complete. +func (c *Compressor) UncompressedBlock(size int, kind Kind) { + c.stats.add(compression.None, uint64(size), uint64(size)) +} + +// Stats returns the compression stats. The result can only be used until the +// next call to the Compressor. +func (c *Compressor) Stats() *CompressionStats { + return &c.stats +} + +// CompressionStats collects compression statistics for a single file - the +// total compressed and uncompressed sizes for each distinct compression.Setting +// used. +type CompressionStats struct { + n int + // Compression profiles have three settings (data, value, other) and + // NoCompression can also be used for data that didn't compress. 
+ buf [4]CompressionStatsForSetting +} + +type CompressionStatsForSetting struct { + Setting compression.Setting + UncompressedBytes uint64 + CompressedBytes uint64 +} + +// add updates the stats to reflect a block that was compressed with the given setting. +func (c *CompressionStats) add( + setting compression.Setting, sizeUncompressed, sizeCompressed uint64, +) { + for i := 0; i < c.n; i++ { + if c.buf[i].Setting == setting { + c.buf[i].UncompressedBytes += sizeUncompressed + c.buf[i].CompressedBytes += sizeCompressed + return + } + } + if c.n >= len(c.buf) { + panic("too many compression settings") + } + c.buf[c.n] = CompressionStatsForSetting{ + Setting: setting, + UncompressedBytes: sizeUncompressed, + CompressedBytes: sizeCompressed, + } + c.n++ +} + +// MergeWith updates the receiver stats to include the other stats. +func (c *CompressionStats) MergeWith(other *CompressionStats) { + for i := 0; i < other.n; i++ { + c.add(other.buf[i].Setting, other.buf[i].UncompressedBytes, other.buf[i].CompressedBytes) + } +} + +// All returns an iterator over the collected stats, in arbitrary order. +func (c CompressionStats) All() iter.Seq[CompressionStatsForSetting] { + return func(yield func(cs CompressionStatsForSetting) bool) { + for i := 0; i < c.n; i++ { + if !yield(c.buf[i]) { + return + } + } + } +} + +// String returns a string representation of the stats, in the format: +// ":/,:/,..." 
+func (c CompressionStats) String() string { + var buf strings.Builder + buf.Grow(c.n * 64) + for i := 0; i < c.n; i++ { + if i > 0 { + buf.WriteString(",") + } + fmt.Fprintf(&buf, "%s:%d/%d", c.buf[i].Setting.String(), c.buf[i].CompressedBytes, c.buf[i].UncompressedBytes) + } + return buf.String() +} + +type Decompressor = compression.Decompressor + +func GetDecompressor(c CompressionIndicator) Decompressor { + return compression.GetDecompressor(c.Algorithm()) +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/block/flush_governor.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/flush_governor.go new file mode 100644 index 0000000..a6ca5ac --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/flush_governor.go @@ -0,0 +1,146 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package block + +import ( + "fmt" + "slices" + + "github.com/cockroachdb/pebble/v2/internal/cache" +) + +// FlushGovernor is used to decide when to flush a block. It takes into +// consideration a target block size and (optionally) allocation size classes. +// +// When allocation size classes are used, we use the allocation class that is +// closest to the target block size. We also take into account the next +// allocation class and use it if it reduces internal fragmentation. +type FlushGovernor struct { + // We always add another KV to a block if its initial size is below + // lowWatermark (even if the block is very large after adding the KV). This is + // a safeguard to avoid very small blocks in the presence of large KVs. + lowWatermark int + // We never add another KV to a block if its existing size exceeds + // highWatermark (unless its initial size is < lowWatermark). + // + // When using allocation classes, the high watermark corresponds to the + // allocation size class that follows the target class. 
Otherwise, it + // corresponds to the target block size. + highWatermark int + // targetBoundary corresponds to the size class we are targeting; if we are + // not using allocation size classes, targetBoundary equals highWatermark. + targetBoundary int +} + +// This value is the amount of extra bytes we allocate together with the block +// data. This must be taken into account when taking allocator size classes into +// consideration. +// +// For instance, we may have a block of size 1020B that by itself would fit +// within a 1024B class. However, when loaded into the block cache we also +// allocate space for the cache entry metadata. The new allocation may now only +// fit within a 2048B class, which increases internal fragmentation. +const blockAllocationOverhead = cache.ValueMetadataSize + MetadataSize + +// MakeFlushGovernor initializes a flush controller. +// +// There are two cases: +// +// 1. No allocation classes. If we don't have any allocatorSizeClasses, or +// targetBlockSize doesn't fit between two allocation classes, then we flush +// right before the block would exceed targetBlockSize (except if the block size +// would be smaller than blockSizeThreshold percent of the target, in which case +// we flush right after the target block size is exceeded). +// +// 2. With allocation classes. We take into account allocation size classes no +// smaller than sizeClassAwareThreshold percent of the target block size and up +// to the first class that fits the target block size. We flush near allocation +// class boundaries to minimize wasted memory space in the block cache (internal +// fragmentation). +// +// The FlushGovernor is immutable and can be copied by value. 
+func MakeFlushGovernor( + targetBlockSize int, + blockSizeThreshold int, + sizeClassAwareThreshold int, + allocatorSizeClasses []int, +) FlushGovernor { + if len(allocatorSizeClasses) == 0 { + return makeFlushGovernorNoSizeClasses(targetBlockSize, blockSizeThreshold) + } + targetSizeWithOverhead := targetBlockSize + blockAllocationOverhead + classIdx := findClosestClass(allocatorSizeClasses, targetSizeWithOverhead) + if classIdx == 0 || classIdx == len(allocatorSizeClasses)-1 { + // Safeguard if our target isn't inside the known classes. + return makeFlushGovernorNoSizeClasses(targetBlockSize, blockSizeThreshold) + } + + var fg FlushGovernor + fg.lowWatermark = (targetBlockSize*sizeClassAwareThreshold + 99) / 100 + fg.targetBoundary = allocatorSizeClasses[classIdx] - blockAllocationOverhead + fg.highWatermark = allocatorSizeClasses[classIdx+1] - blockAllocationOverhead + // Safeguard, in case the threshold is very close to 100. + fg.lowWatermark = min(fg.lowWatermark, fg.targetBoundary) + + return fg +} + +func makeFlushGovernorNoSizeClasses(targetBlockSize int, blockSizeThreshold int) FlushGovernor { + return FlushGovernor{ + lowWatermark: (targetBlockSize*blockSizeThreshold + 99) / 100, + highWatermark: targetBlockSize, + targetBoundary: targetBlockSize, + } +} + +// LowWatermark returns the minimum size of a block that could be flushed. +// ShouldFlush will never return true if sizeBefore is below the low watermark. +// +// This can be used in a "fast path" check that uses an easy-to-compute +// overestimation of the block size. +func (fg *FlushGovernor) LowWatermark() int { + return fg.lowWatermark +} + +// ShouldFlush returns true if we should flush the current block of sizeBefore +// instead of adding another KV that would increase the block to sizeAfter. 
+func (fg *FlushGovernor) ShouldFlush(sizeBefore int, sizeAfter int) bool { + // In rare cases it's possible for the size to stay the same (or even + // decrease) when we add a KV to the block; tolerate this by always accepting + // the new KV. + if sizeBefore >= sizeAfter { + return false + } + if sizeBefore < fg.lowWatermark { + return false + } + if sizeAfter > fg.highWatermark { + return true + } + if sizeAfter > fg.targetBoundary { + // Flush, unless we're already past the boundary or the KV is large enough + // that we would waste less space in the next class. + if sizeBefore <= fg.targetBoundary && fg.highWatermark-sizeAfter > fg.targetBoundary-sizeBefore { + return true + } + } + return false +} + +func (fg FlushGovernor) String() string { + return fmt.Sprintf("low watermark: %d\nhigh watermark: %d\ntargetBoundary: %v\n", + fg.lowWatermark, fg.highWatermark, fg.targetBoundary) +} + +// findClosestClass returns the index of the allocation class that is closest to +// target. It can be either larger or smaller. +func findClosestClass(allocatorSizeClasses []int, target int) int { + // Find the first class >= target. + i, _ := slices.BinarySearch(allocatorSizeClasses, target) + if i == len(allocatorSizeClasses) || (i > 0 && target-allocatorSizeClasses[i-1] < allocatorSizeClasses[i]-target) { + i-- + } + return i +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/block/kv.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/kv.go new file mode 100644 index 0000000..0cd450c --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/kv.go @@ -0,0 +1,103 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package block + +import "github.com/cockroachdb/pebble/v2/internal/base" + +// ValuePrefix is the single byte prefix in values indicating either an in-place +// value or a value encoding a valueHandle. It encodes multiple kinds of +// information (see below). +type ValuePrefix byte + +const ( + // 2 most-significant bits of valuePrefix encodes the value-kind. + valueKindMask ValuePrefix = 0xC0 + valueKindIsValueBlockHandle ValuePrefix = 0x80 + valueKindIsBlobHandle ValuePrefix = 0x40 + valueKindIsInPlaceValue ValuePrefix = 0x00 + + // 1 bit indicates SET has same key prefix as immediately preceding key that + // is also a SET. If the immediately preceding key in the same block is a + // SET, AND this bit is 0, the prefix must have changed. + // + // Note that the current policy of only storing older MVCC versions in value + // blocks means that valueKindIsValueHandle => SET has same prefix. But no + // code should rely on this behavior. Also, SET has same prefix does *not* + // imply valueKindIsValueHandle. + setHasSameKeyPrefixMask ValuePrefix = 0x20 + + // 3 least-significant bits for the user-defined base.ShortAttribute. + // Undefined for valueKindIsInPlaceValue. + userDefinedShortAttributeMask ValuePrefix = 0x07 +) + +// IsInPlaceValue returns true if the ValuePrefix is for an in-place value. +func (vp ValuePrefix) IsInPlaceValue() bool { + return vp&valueKindMask == valueKindIsInPlaceValue +} + +// IsValueBlockHandle returns true if the ValuePrefix is for a valblk.Handle. +func (vp ValuePrefix) IsValueBlockHandle() bool { + return vp&valueKindMask == valueKindIsValueBlockHandle +} + +// IsBlobValueHandle returns true if the ValuePrefix is for a blob. +func (vp ValuePrefix) IsBlobValueHandle() bool { + return vp&valueKindMask == valueKindIsBlobHandle +} + +// SetHasSamePrefix returns true if the ValuePrefix encodes that the key is a +// set with the same prefix as the preceding key which also is a set. 
+func (vp ValuePrefix) SetHasSamePrefix() bool { + return vp&setHasSameKeyPrefixMask == setHasSameKeyPrefixMask +} + +// ShortAttribute returns the user-defined base.ShortAttribute encoded in the +// ValuePrefix. +// +// REQUIRES: !IsInPlaceValue() +func (vp ValuePrefix) ShortAttribute() base.ShortAttribute { + return base.ShortAttribute(vp & userDefinedShortAttributeMask) +} + +// ValueBlockHandlePrefix returns the ValuePrefix for a valblk.Handle. +func ValueBlockHandlePrefix(setHasSameKeyPrefix bool, attribute base.ShortAttribute) ValuePrefix { + prefix := valueKindIsValueBlockHandle | ValuePrefix(attribute) + if setHasSameKeyPrefix { + prefix = prefix | setHasSameKeyPrefixMask + } + return prefix +} + +// InPlaceValuePrefix returns the ValuePrefix for an in-place value. +func InPlaceValuePrefix(setHasSameKeyPrefix bool) ValuePrefix { + prefix := valueKindIsInPlaceValue + if setHasSameKeyPrefix { + prefix = prefix | setHasSameKeyPrefixMask + } + return prefix +} + +// BlobValueHandlePrefix returns the ValuePrefix for a blob. +func BlobValueHandlePrefix(setHasSameKeyPrefix bool, attr base.ShortAttribute) ValuePrefix { + prefix := valueKindIsBlobHandle | ValuePrefix(attr) + if setHasSameKeyPrefix { + prefix = prefix | setHasSameKeyPrefixMask + } + return prefix +} + +// GetInternalValueForPrefixAndValueHandler is an interface for getting an +// InternalValue from a value prefix and value. +type GetInternalValueForPrefixAndValueHandler interface { + // GetInternalValueForPrefixAndValueHandle returns a InternalValue for the + // given value prefix and value. + // + // The result is only valid until the next call to + // GetInternalValueForPrefixAndValueHandle. Use InternalValue.Clone if the + // lifetime of the InternalValue needs to be extended. For more details, see + // the "memory management" comment where LazyValue is declared. 
+ GetInternalValueForPrefixAndValueHandle(handle []byte) base.InternalValue +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/block/transforms.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/transforms.go new file mode 100644 index 0000000..03e6ced --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/block/transforms.go @@ -0,0 +1,248 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package block + +import ( + "bytes" + "fmt" + "unsafe" + + "github.com/cockroachdb/pebble/v2/internal/base" +) + +// IterTransforms allow on-the-fly transformation of data at iteration time. +// +// These transformations could in principle be implemented as block transforms +// (at least for non-virtual sstables), but applying them during iteration is +// preferable. +type IterTransforms struct { + // SyntheticSeqNum, if set, overrides the sequence number in all keys. It is + // set if the sstable was ingested or it is foreign. + SyntheticSeqNum SyntheticSeqNum + // HideObsoletePoints, if true, skips over obsolete points during iteration. + // This is the norm when the sstable is foreign or the largest sequence number + // of the sstable is below the one we are reading. + HideObsoletePoints bool + + SyntheticPrefixAndSuffix SyntheticPrefixAndSuffix +} + +// NoTransforms is the default value for IterTransforms. +var NoTransforms = IterTransforms{} + +// NoTransforms returns true if there are no transforms enabled. 
+func (t *IterTransforms) NoTransforms() bool {
+	return t.SyntheticSeqNum == 0 &&
+		!t.HideObsoletePoints &&
+		t.SyntheticPrefixAndSuffix.IsUnset()
+}
+
+func (t *IterTransforms) HasSyntheticPrefix() bool {
+	return t.SyntheticPrefixAndSuffix.HasPrefix()
+}
+
+func (t *IterTransforms) SyntheticPrefix() []byte {
+	return t.SyntheticPrefixAndSuffix.Prefix()
+}
+
+func (t *IterTransforms) HasSyntheticSuffix() bool {
+	return t.SyntheticPrefixAndSuffix.HasSuffix()
+}
+
+func (t *IterTransforms) SyntheticSuffix() []byte {
+	return t.SyntheticPrefixAndSuffix.Suffix()
+}
+
+// FragmentIterTransforms allow on-the-fly transformation of range deletion or
+// range key data at iteration time.
+type FragmentIterTransforms struct {
+	SyntheticSeqNum          SyntheticSeqNum
+	SyntheticPrefixAndSuffix SyntheticPrefixAndSuffix
+}
+
+// NoTransforms returns true if there are no transforms enabled.
+func (t *FragmentIterTransforms) NoTransforms() bool {
+	// Note: unlike IterTransforms, there is no HideObsoletePoints field here.
+	return t.SyntheticSeqNum == 0 && t.SyntheticPrefixAndSuffix.IsUnset()
+}
+
+func (t *FragmentIterTransforms) HasSyntheticPrefix() bool {
+	return t.SyntheticPrefixAndSuffix.HasPrefix()
+}
+
+func (t *FragmentIterTransforms) SyntheticPrefix() []byte {
+	return t.SyntheticPrefixAndSuffix.Prefix()
+}
+
+func (t *FragmentIterTransforms) HasSyntheticSuffix() bool {
+	return t.SyntheticPrefixAndSuffix.HasSuffix()
+}
+
+func (t *FragmentIterTransforms) SyntheticSuffix() []byte {
+	return t.SyntheticPrefixAndSuffix.Suffix()
+}
+
+// NoFragmentTransforms is the default value for FragmentIterTransforms.
+var NoFragmentTransforms = FragmentIterTransforms{}
+
+// SyntheticSeqNum is used to override all sequence numbers in a table. It is
+// set to a non-zero value when the table was created externally and ingested
+// whole.
+type SyntheticSeqNum base.SeqNum
+
+// NoSyntheticSeqNum is the default zero value for SyntheticSeqNum, which
+// disables overriding the sequence number.
+const NoSyntheticSeqNum SyntheticSeqNum = 0 + +// SyntheticSuffix will replace every suffix of every point key surfaced during +// block iteration. A synthetic suffix can be used if: +// 1. no two keys in the sst share the same prefix; and +// 2. pebble.Compare(prefix + replacementSuffix, prefix + originalSuffix) < 0, +// for all keys in the backing sst which have a suffix (i.e. originalSuffix +// is not empty). +// +// Range dels are not supported when synthetic suffix is used. +// +// For range keys, the synthetic suffix applies to the suffix that is part of +// RangeKeySet - if it is non-empty, it is replaced with the SyntheticSuffix. +// RangeKeyUnset keys are not supported when a synthetic suffix is used. +type SyntheticSuffix []byte + +// IsSet returns true if the synthetic suffix is not empty. +func (ss SyntheticSuffix) IsSet() bool { + return len(ss) > 0 +} + +// SyntheticPrefix represents a byte slice that is implicitly prepended to every +// key in a file being read or accessed by a reader. Note that since the byte +// slice is prepended to every KV rather than replacing a byte prefix, the +// result of prepending the synthetic prefix must be a full, valid key while the +// partial key physically stored within the sstable need not be a valid key +// according to user key semantics. +// +// Note that elsewhere we use the language of 'prefix' to describe the user key +// portion of a MVCC key, as defined by the Comparer's base.Split method. The +// SyntheticPrefix is related only in that it's a byte prefix that is +// incorporated into the logical MVCC prefix. +// +// The table's bloom filters are constructed only on the partial keys physically +// stored in the table, but interactions with the file including seeks and +// reads will all behave as if the file had been constructed from keys that +// include the synthetic prefix. 
Note that all Compare operations will act on a
+// partial key (before any prepending), so the Comparer must support comparing
+// these partial keys.
+//
+// The synthetic prefix will never modify key metadata stored in the key suffix.
+//
+// NB: Since this transformation currently only applies to point keys, a block
+// with range keys cannot be iterated over with a synthetic prefix.
+type SyntheticPrefix []byte
+
+// IsSet returns true if the synthetic prefix is not empty.
+func (sp SyntheticPrefix) IsSet() bool {
+	return len(sp) > 0
+}
+
+// Apply prepends the synthetic prefix to a key.
+func (sp SyntheticPrefix) Apply(key []byte) []byte {
+	res := make([]byte, 0, len(sp)+len(key))
+	res = append(res, sp...)
+	res = append(res, key...)
+	return res
+}
+
+// Invert removes the synthetic prefix from a key.
+func (sp SyntheticPrefix) Invert(key []byte) []byte {
+	res, ok := bytes.CutPrefix(key, sp)
+	if !ok {
+		panic(fmt.Sprintf("unexpected prefix: %s", key))
+	}
+	return res
+}
+
+// SyntheticPrefixAndSuffix is a more compact way of representing both a
+// synthetic prefix and a synthetic suffix. See SyntheticPrefix and
+// SyntheticSuffix.
+//
+// The zero value is valid, representing no synthetic prefix or suffix.
+type SyntheticPrefixAndSuffix struct {
+	prefixLen uint32
+	suffixLen uint32
+	// buf is either nil (iff prefixLen=suffixLen=0) or a pointer to a buffer
+	// containing the prefix followed by the suffix.
+	buf unsafe.Pointer
+}
+
+// MakeSyntheticPrefixAndSuffix returns a SyntheticPrefixAndSuffix with the
+// given prefix and suffix.
+func MakeSyntheticPrefixAndSuffix(
+	prefix SyntheticPrefix, suffix SyntheticSuffix,
+) SyntheticPrefixAndSuffix {
+	if !prefix.IsSet() && !suffix.IsSet() {
+		return SyntheticPrefixAndSuffix{}
+	}
+	buf := make([]byte, len(prefix)+len(suffix))
+	copy(buf, prefix)
+	copy(buf[len(prefix):], suffix)
+	return SyntheticPrefixAndSuffix{
+		prefixLen: uint32(len(prefix)),
+		suffixLen: uint32(len(suffix)),
+		buf:       unsafe.Pointer(&buf[0]),
+	}
+}
+
+// IsUnset returns true if HasPrefix() and HasSuffix() both return false.
+func (ps SyntheticPrefixAndSuffix) IsUnset() bool {
+	return ps.buf == nil
+}
+
+// HasPrefix returns true if ps contains a non-empty synthetic prefix.
+func (ps SyntheticPrefixAndSuffix) HasPrefix() bool {
+	return ps.prefixLen != 0
+}
+
+// PrefixLen returns the length of the synthetic prefix, or 0 if it is not set.
+func (ps SyntheticPrefixAndSuffix) PrefixLen() uint32 {
+	return ps.prefixLen
+}
+
+// Prefix returns the synthetic prefix.
+func (ps SyntheticPrefixAndSuffix) Prefix() SyntheticPrefix {
+	if ps.prefixLen == 0 {
+		return nil
+	}
+	return unsafe.Slice((*byte)(ps.buf), ps.prefixLen)
+}
+
+// HasSuffix returns true if ps contains a non-empty synthetic suffix.
+func (ps SyntheticPrefixAndSuffix) HasSuffix() bool {
+	return ps.suffixLen != 0
+}
+
+// SuffixLen returns the length of the synthetic suffix, or 0 if it is not set.
+func (ps SyntheticPrefixAndSuffix) SuffixLen() uint32 {
+	return ps.suffixLen
+}
+
+// Suffix returns the synthetic suffix.
+func (ps SyntheticPrefixAndSuffix) Suffix() SyntheticSuffix {
+	if ps.suffixLen == 0 {
+		return nil
+	}
+	return unsafe.Slice((*byte)(unsafe.Pointer(uintptr(ps.buf)+uintptr(ps.prefixLen))), ps.suffixLen)
+}
+
+// RemoveSuffix returns a SyntheticPrefixAndSuffix that has the same prefix as
+// the receiver but no suffix.
+func (ps SyntheticPrefixAndSuffix) RemoveSuffix() SyntheticPrefixAndSuffix { + if ps.prefixLen == 0 { + return SyntheticPrefixAndSuffix{} + } + return SyntheticPrefixAndSuffix{ + prefixLen: ps.prefixLen, + suffixLen: 0, + buf: ps.buf, + } +} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/block_property.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/block_property.go similarity index 66% rename from vendor/github.com/cockroachdb/pebble/sstable/block_property.go rename to vendor/github.com/cockroachdb/pebble/v2/sstable/block_property.go index d85a2b2..1983945 100644 --- a/vendor/github.com/cockroachdb/pebble/sstable/block_property.go +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/block_property.go @@ -11,8 +11,8 @@ import ( "sync" "unsafe" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/rangekey" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" ) // Block properties are an optional user-facing feature that can be used to @@ -98,51 +98,61 @@ import ( type BlockPropertyCollector interface { // Name returns the name of the block property collector. Name() string - // Add is called with each new entry added to a data block in the sstable. - // The callee can assume that these are in sorted order. - Add(key InternalKey, value []byte) error + + // AddPointKey is called with each new key added to a data block in the + // sstable. The callee can assume that these are in sorted order. + AddPointKey(key InternalKey, value []byte) error + + // AddRangeKeys is called for each range span added to the sstable. The range + // key properties are stored separately and don't contribute to data block + // properties. They are only used when FinishTable is called. + // TODO(radu): clean up this subtle semantic. 
+ AddRangeKeys(span Span) error + + // AddCollectedWithSuffixReplacement adds previously collected property data + // and updates it to reflect a change of suffix on all keys: the old property + // data is assumed to be constructed from keys that all have the same + // oldSuffix and is recalculated to reflect the same keys but with newSuffix. + // + // A collector which supports this method must be able to derive its updated + // value from its old value and the change being made to the suffix, without + // needing to be passed each updated K/V. + // + // For example, a collector that only inspects values can simply copy its + // previously computed property as-is, since key-suffix replacement does not + // change values, while a collector that depends only on key suffixes, like + // one which collected mvcc-timestamp bounds from timestamp-suffixed keys, can + // just set its new bounds from the new suffix, as it is common to all keys, + // without needing to recompute it from every key. + // + // This method is optional (if it is not implemented, it always returns an + // error). SupportsSuffixReplacement() can be used to check if this method is + // implemented. + AddCollectedWithSuffixReplacement(oldProp []byte, oldSuffix, newSuffix []byte) error + + // SupportsSuffixReplacement returns whether the collector supports the + // AddCollectedWithSuffixReplacement method. + SupportsSuffixReplacement() bool + // FinishDataBlock is called when all the entries have been added to a // data block. Subsequent Add calls will be for the next data block. It // returns the property value for the finished block. FinishDataBlock(buf []byte) ([]byte, error) + // AddPrevDataBlockToIndexBlock adds the entry corresponding to the // previous FinishDataBlock to the current index block. AddPrevDataBlockToIndexBlock() + // FinishIndexBlock is called when an index block, containing all the // key-value pairs since the last FinishIndexBlock, will no longer see new // entries. 
It returns the property value for the index block. FinishIndexBlock(buf []byte) ([]byte, error) + // FinishTable is called when the sstable is finished, and returns the // property value for the sstable. FinishTable(buf []byte) ([]byte, error) } -// SuffixReplaceableBlockCollector is an extension to the BlockPropertyCollector -// interface that allows a block property collector to indicate that it supports -// being *updated* during suffix replacement, i.e. when an existing SST in which -// all keys have the same key suffix is updated to have a new suffix. -// -// A collector which supports being updated in such cases must be able to derive -// its updated value from its old value and the change being made to the suffix, -// without needing to be passed each updated K/V. -// -// For example, a collector that only inspects values would can simply copy its -// previously computed property as-is, since key-suffix replacement does not -// change values, while a collector that depends only on key suffixes, like one -// which collected mvcc-timestamp bounds from timestamp-suffixed keys, can just -// set its new bounds from the new suffix, as it is common to all keys, without -// needing to recompute it from every key. -// -// An implementation of DataBlockIntervalCollector can also implement this -// interface, in which case the BlockPropertyCollector returned by passing it to -// NewBlockIntervalCollector will also implement this interface automatically. -type SuffixReplaceableBlockCollector interface { - // UpdateKeySuffixes is called when a block is updated to change the suffix of - // all keys in the block, and is passed the old value for that prop, if any, - // for that block as well as the old and new suffix. - UpdateKeySuffixes(oldProp []byte, oldSuffix, newSuffix []byte) error -} - // BlockPropertyFilter is used in an Iterator to filter sstables and blocks // within the sstable. It should not maintain any per-sstable state, and must // be thread-safe. 
@@ -218,29 +228,28 @@ type BoundLimitedBlockPropertyFilter interface { // independent instances, rather than references to the same collector, as point // and range keys are tracked independently. type BlockIntervalCollector struct { - name string - points DataBlockIntervalCollector - ranges DataBlockIntervalCollector + name string + mapper IntervalMapper + suffixReplacer BlockIntervalSuffixReplacer - blockInterval interval - indexInterval interval - tableInterval interval + blockInterval BlockInterval + indexInterval BlockInterval + tableInterval BlockInterval } var _ BlockPropertyCollector = &BlockIntervalCollector{} -// DataBlockIntervalCollector is the interface used by BlockIntervalCollector -// that contains the actual logic pertaining to the property. It only -// maintains state for the current data block, and resets that state in -// FinishDataBlock. This interface can be used to reduce parsing costs. -type DataBlockIntervalCollector interface { - // Add is called with each new entry added to a data block in the sstable. - // The callee can assume that these are in sorted order. - Add(key InternalKey, value []byte) error - // FinishDataBlock is called when all the entries have been added to a - // data block. Subsequent Add calls will be for the next data block. It - // returns the [lower, upper) for the finished block. - FinishDataBlock() (lower uint64, upper uint64, err error) +// IntervalMapper is an interface through which a user can define the mapping +// between keys and intervals. The interval for any collection of keys (e.g. a +// data block, a table) is the union of intervals for all keys. +type IntervalMapper interface { + // MapPointKey maps a point key to an interval. The interval can be empty, which + // means that this key will effectively be ignored. + MapPointKey(key InternalKey, value []byte) (BlockInterval, error) + + // MapRangeKeys maps a range key span to an interval. 
The interval can be + // empty, which means that this span will effectively be ignored. + MapRangeKeys(span Span) (BlockInterval, error) } // NewBlockIntervalCollector constructs a BlockIntervalCollector with the given @@ -256,161 +265,174 @@ type DataBlockIntervalCollector interface { // If both point and range keys are to be tracked, two independent collectors // should be provided, rather than the same collector passed in twice (see the // comment on BlockIntervalCollector for more detail) +// XXX update func NewBlockIntervalCollector( - name string, pointCollector, rangeCollector DataBlockIntervalCollector, + name string, mapper IntervalMapper, suffixReplacer BlockIntervalSuffixReplacer, ) BlockPropertyCollector { - if pointCollector == nil && rangeCollector == nil { - panic("sstable: at least one interval collector must be provided") + if mapper == nil { + panic("mapper must be provided") } - bic := BlockIntervalCollector{ - name: name, - points: pointCollector, - ranges: rangeCollector, - } - if _, ok := pointCollector.(SuffixReplaceableBlockCollector); ok { - return &suffixReplacementBlockCollectorWrapper{bic} + return &BlockIntervalCollector{ + name: name, + mapper: mapper, + suffixReplacer: suffixReplacer, } - return &bic } -// Name implements the BlockPropertyCollector interface. +// Name is part of the BlockPropertyCollector interface. func (b *BlockIntervalCollector) Name() string { return b.name } -// Add implements the BlockPropertyCollector interface. -func (b *BlockIntervalCollector) Add(key InternalKey, value []byte) error { - if rangekey.IsRangeKey(key.Kind()) { - if b.ranges != nil { - return b.ranges.Add(key, value) - } - } else if b.points != nil { - return b.points.Add(key, value) +// AddPointKey is part of the BlockPropertyCollector interface. 
+func (b *BlockIntervalCollector) AddPointKey(key InternalKey, value []byte) error { + interval, err := b.mapper.MapPointKey(key, value) + if err != nil { + return err } + b.blockInterval.UnionWith(interval) return nil } -// FinishDataBlock implements the BlockPropertyCollector interface. -func (b *BlockIntervalCollector) FinishDataBlock(buf []byte) ([]byte, error) { - if b.points == nil { - return buf, nil +// AddRangeKeys is part of the BlockPropertyCollector interface. +func (b *BlockIntervalCollector) AddRangeKeys(span Span) error { + if span.Empty() { + return nil } - var err error - b.blockInterval.lower, b.blockInterval.upper, err = b.points.FinishDataBlock() + interval, err := b.mapper.MapRangeKeys(span) + if err != nil { + return err + } + // Range keys are not included in block or index intervals; they just apply + // directly to the table interval. + b.tableInterval.UnionWith(interval) + return nil +} + +// AddCollectedWithSuffixReplacement is part of the BlockPropertyCollector interface. +func (b *BlockIntervalCollector) AddCollectedWithSuffixReplacement( + oldProp []byte, oldSuffix, newSuffix []byte, +) error { + i, err := decodeBlockInterval(oldProp) + if err != nil { + return err + } + i, err = b.suffixReplacer.ApplySuffixReplacement(i, newSuffix) if err != nil { - return buf, err + return err } - buf = b.blockInterval.encode(buf) - b.tableInterval.union(b.blockInterval) + b.blockInterval.UnionWith(i) + return nil +} + +// SupportsSuffixReplacement is part of the BlockPropertyCollector interface. +func (b *BlockIntervalCollector) SupportsSuffixReplacement() bool { + return b.suffixReplacer != nil +} + +// FinishDataBlock is part of the BlockPropertyCollector interface. +func (b *BlockIntervalCollector) FinishDataBlock(buf []byte) ([]byte, error) { + buf = encodeBlockInterval(b.blockInterval, buf) + b.tableInterval.UnionWith(b.blockInterval) return buf, nil } // AddPrevDataBlockToIndexBlock implements the BlockPropertyCollector // interface. 
func (b *BlockIntervalCollector) AddPrevDataBlockToIndexBlock() { - b.indexInterval.union(b.blockInterval) - b.blockInterval = interval{} + b.indexInterval.UnionWith(b.blockInterval) + b.blockInterval = BlockInterval{} } // FinishIndexBlock implements the BlockPropertyCollector interface. func (b *BlockIntervalCollector) FinishIndexBlock(buf []byte) ([]byte, error) { - buf = b.indexInterval.encode(buf) - b.indexInterval = interval{} + buf = encodeBlockInterval(b.indexInterval, buf) + b.indexInterval = BlockInterval{} return buf, nil } // FinishTable implements the BlockPropertyCollector interface. func (b *BlockIntervalCollector) FinishTable(buf []byte) ([]byte, error) { - // If the collector is tracking range keys, the range key interval is union-ed - // with the point key interval for the table. - if b.ranges != nil { - var rangeInterval interval - var err error - rangeInterval.lower, rangeInterval.upper, err = b.ranges.FinishDataBlock() - if err != nil { - return buf, err - } - b.tableInterval.union(rangeInterval) - } - return b.tableInterval.encode(buf), nil + return encodeBlockInterval(b.tableInterval, buf), nil +} + +// BlockInterval represents the [Lower, Upper) interval of 64-bit values +// corresponding to a set of keys. The meaning of the values themselves is +// opaque to the BlockIntervalCollector. +// +// If Lower >= Upper, the interval is the empty set. +type BlockInterval struct { + Lower uint64 + Upper uint64 +} + +// IsEmpty returns true if the interval is empty. +func (i BlockInterval) IsEmpty() bool { + return i.Lower >= i.Upper } -type interval struct { - lower uint64 - upper uint64 +// Intersects returns true if the two intervals intersect. 
+func (i BlockInterval) Intersects(other BlockInterval) bool { + return !i.IsEmpty() && !other.IsEmpty() && i.Upper > other.Lower && i.Lower < other.Upper } -func (i interval) encode(buf []byte) []byte { - if i.lower < i.upper { - var encoded [binary.MaxVarintLen64 * 2]byte - n := binary.PutUvarint(encoded[:], i.lower) - n += binary.PutUvarint(encoded[n:], i.upper-i.lower) - buf = append(buf, encoded[:n]...) +// UnionWith extends the receiver to include another interval. +func (i *BlockInterval) UnionWith(other BlockInterval) { + switch { + case other.IsEmpty(): + case i.IsEmpty(): + *i = other + default: + i.Lower = min(i.Lower, other.Lower) + i.Upper = max(i.Upper, other.Upper) } - return buf } -func (i *interval) decode(buf []byte) error { +func encodeBlockInterval(i BlockInterval, buf []byte) []byte { + if i.IsEmpty() { + return buf + } + + var encoded [binary.MaxVarintLen64 * 2]byte + n := binary.PutUvarint(encoded[:], i.Lower) + n += binary.PutUvarint(encoded[n:], i.Upper-i.Lower) + return append(buf, encoded[:n]...) +} + +func decodeBlockInterval(buf []byte) (BlockInterval, error) { if len(buf) == 0 { - *i = interval{} - return nil + return BlockInterval{}, nil } + var i BlockInterval var n int - i.lower, n = binary.Uvarint(buf) + i.Lower, n = binary.Uvarint(buf) if n <= 0 || n >= len(buf) { - return base.CorruptionErrorf("cannot decode interval from buf %x", buf) + return BlockInterval{}, base.CorruptionErrorf("cannot decode interval from buf %x", buf) } pos := n - i.upper, n = binary.Uvarint(buf[pos:]) + i.Upper, n = binary.Uvarint(buf[pos:]) pos += n if pos != len(buf) || n <= 0 { - return base.CorruptionErrorf("cannot decode interval from buf %x", buf) + return BlockInterval{}, base.CorruptionErrorf("cannot decode interval from buf %x", buf) } // Delta decode. 
- i.upper += i.lower - if i.upper < i.lower { - return base.CorruptionErrorf("unexpected overflow, upper %d < lower %d", i.upper, i.lower) - } - return nil -} - -func (i *interval) union(x interval) { - if x.lower >= x.upper { - // x is the empty set. - return - } - if i.lower >= i.upper { - // i is the empty set. - *i = x - return - } - // Both sets are non-empty. - if x.lower < i.lower { - i.lower = x.lower - } - if x.upper > i.upper { - i.upper = x.upper + i.Upper += i.Lower + if i.Upper < i.Lower { + return BlockInterval{}, base.CorruptionErrorf("unexpected overflow, upper %d < lower %d", i.Upper, i.Lower) } + return i, nil } -func (i interval) intersects(x interval) bool { - if i.lower >= i.upper || x.lower >= x.upper { - // At least one of the sets is empty. - return false - } - // Neither set is empty. - return i.upper > x.lower && i.lower < x.upper -} - -type suffixReplacementBlockCollectorWrapper struct { - BlockIntervalCollector -} - -// UpdateKeySuffixes implements the SuffixReplaceableBlockCollector interface. -func (w *suffixReplacementBlockCollectorWrapper) UpdateKeySuffixes( - oldProp []byte, from, to []byte, -) error { - return w.BlockIntervalCollector.points.(SuffixReplaceableBlockCollector).UpdateKeySuffixes(oldProp, from, to) +// BlockIntervalSuffixReplacer provides methods to conduct just in time +// adjustments of a passed in block prop interval before filtering. +type BlockIntervalSuffixReplacer interface { + // ApplySuffixReplacement recalculates a previously calculated interval (which + // corresponds to an arbitrary collection of keys) under the assumption + // that those keys are rewritten with a new prefix. + // + // Such a transformation is possible when the intervals depend only on the + // suffixes. 
+ ApplySuffixReplacement(interval BlockInterval, newSuffix []byte) (BlockInterval, error) } // BlockIntervalFilter is an implementation of BlockPropertyFilter when the @@ -418,7 +440,8 @@ func (w *suffixReplacementBlockCollectorWrapper) UpdateKeySuffixes( // the form [lower, upper). type BlockIntervalFilter struct { name string - filterInterval interval + filterInterval BlockInterval + suffixReplacer BlockIntervalSuffixReplacer } var _ BlockPropertyFilter = (*BlockIntervalFilter)(nil) @@ -427,9 +450,11 @@ var _ BlockPropertyFilter = (*BlockIntervalFilter)(nil) // based on an interval property collected by BlockIntervalCollector and the // given [lower, upper) bounds. The given name specifies the // BlockIntervalCollector's properties to read. -func NewBlockIntervalFilter(name string, lower uint64, upper uint64) *BlockIntervalFilter { +func NewBlockIntervalFilter( + name string, lower uint64, upper uint64, suffixReplacer BlockIntervalSuffixReplacer, +) *BlockIntervalFilter { b := new(BlockIntervalFilter) - b.Init(name, lower, upper) + b.Init(name, lower, upper, suffixReplacer) return b } @@ -437,10 +462,13 @@ func NewBlockIntervalFilter(name string, lower uint64, upper uint64) *BlockInter // BLockPropertyFilter to filter blocks based on an interval property collected // by BlockIntervalCollector and the given [lower, upper) bounds. The given name // specifies the BlockIntervalCollector's properties to read. -func (b *BlockIntervalFilter) Init(name string, lower, upper uint64) { +func (b *BlockIntervalFilter) Init( + name string, lower, upper uint64, suffixReplacer BlockIntervalSuffixReplacer, +) { *b = BlockIntervalFilter{ name: name, - filterInterval: interval{lower: lower, upper: upper}, + filterInterval: BlockInterval{Lower: lower, Upper: upper}, + suffixReplacer: suffixReplacer, } } @@ -451,11 +479,28 @@ func (b *BlockIntervalFilter) Name() string { // Intersects implements the BlockPropertyFilter interface. 
func (b *BlockIntervalFilter) Intersects(prop []byte) (bool, error) { - var i interval - if err := i.decode(prop); err != nil { + i, err := decodeBlockInterval(prop) + if err != nil { + return false, err + } + return i.Intersects(b.filterInterval), nil +} + +// SyntheticSuffixIntersects implements the BlockPropertyFilter interface. +func (b *BlockIntervalFilter) SyntheticSuffixIntersects(prop []byte, suffix []byte) (bool, error) { + if b.suffixReplacer == nil { + return false, base.AssertionFailedf("missing SuffixReplacer for SyntheticSuffixIntersects()") + } + i, err := decodeBlockInterval(prop) + if err != nil { + return false, err + } + + newInterval, err := b.suffixReplacer.ApplySuffixReplacement(i, suffix) + if err != nil { return false, err } - return i.intersects(b.filterInterval), nil + return newInterval.Intersects(b.filterInterval), nil } // SetInterval adjusts the [lower, upper) bounds used by the filter. It is not @@ -463,14 +508,31 @@ func (b *BlockIntervalFilter) Intersects(prop []byte) (bool, error) { // implementation of BlockPropertyFilterMask.SetSuffix used for range-key // masking. func (b *BlockIntervalFilter) SetInterval(lower, upper uint64) { - b.filterInterval = interval{lower: lower, upper: upper} + b.filterInterval = BlockInterval{Lower: lower, Upper: upper} +} + +// When encoding block properties for each block, we cannot afford to encode the +// name. Instead, the name is mapped to a shortID, in the scope of that sstable, +// and the shortID is encoded as a single byte (which imposes a limit of of 256 +// block property collectors per sstable). +// Note that the in-memory type is int16 to avoid overflows (e.g. in loops) and +// to allow special values like -1 in code. 
+type shortID int16 + +const invalidShortID shortID = -1 +const maxShortID shortID = math.MaxUint8 +const maxPropertyCollectors = int(maxShortID) + 1 + +func (id shortID) IsValid() bool { + return id >= 0 && id <= maxShortID } -// When encoding block properties for each block, we cannot afford to encode -// the name. Instead, the name is mapped to a shortID, in the scope of that -// sstable, and the shortID is encoded. Since we use a uint8, there is a limit -// of 256 block property collectors per sstable. -type shortID uint8 +func (id shortID) ToByte() byte { + if invariants.Enabled && !id.IsValid() { + panic(fmt.Sprintf("inavlid id %d", id)) + } + return byte(id) +} type blockPropertiesEncoder struct { propsBuf []byte @@ -486,6 +548,11 @@ func (e *blockPropertiesEncoder) resetProps() { } func (e *blockPropertiesEncoder) addProp(id shortID, scratch []byte) { + if len(scratch) == 0 { + // We omit empty properties. The decoder will know that any missing IDs had + // empty values. + return + } const lenID = 1 lenProp := uvarintLen(uint32(len(scratch))) n := lenID + lenProp + len(scratch) @@ -500,7 +567,7 @@ func (e *blockPropertiesEncoder) addProp(id shortID, scratch []byte) { } pos := len(e.propsBuf) b := e.propsBuf[pos : pos+lenID] - b[0] = byte(id) + b[0] = id.ToByte() pos += lenID b = e.propsBuf[pos : pos+lenProp] n = binary.PutUvarint(b, uint64(len(scratch))) @@ -524,16 +591,41 @@ func (e *blockPropertiesEncoder) props() []byte { type blockPropertiesDecoder struct { props []byte + + // numCollectedProps is the number of collectors that were used when writing + // these properties. The encoded properties contain values for shortIDs 0 + // through numCollectedProps-1, in order (with empty properties omitted). 
+ numCollectedProps int + nextID shortID +} + +func makeBlockPropertiesDecoder(numCollectedProps int, propsBuf []byte) blockPropertiesDecoder { + return blockPropertiesDecoder{ + props: propsBuf, + numCollectedProps: numCollectedProps, + } } -func (d *blockPropertiesDecoder) done() bool { - return len(d.props) == 0 +func (d *blockPropertiesDecoder) Done() bool { + return int(d.nextID) >= d.numCollectedProps } -// REQUIRES: !done() -func (d *blockPropertiesDecoder) next() (id shortID, prop []byte, err error) { +// Next returns the property for each shortID between 0 and numCollectedProps-1, in order. +// Note that some properties might be empty. +// REQUIRES: !Done() +func (d *blockPropertiesDecoder) Next() (id shortID, prop []byte, err error) { + id = d.nextID + d.nextID++ + + if len(d.props) == 0 || shortID(d.props[0]) != id { + if invariants.Enabled && len(d.props) > 0 && shortID(d.props[0]) < id { + panic("shortIDs are not in order") + } + // This property was omitted because it was empty. + return id, nil, nil + } + const lenID = 1 - id = shortID(d.props[0]) propLen, m := binary.Uvarint(d.props[lenID:]) n := lenID + m if m <= 0 || propLen == 0 || (n+int(propLen)) > len(d.props) { @@ -580,6 +672,8 @@ type BlockPropertiesFilterer struct { // collected when the table was built. boundLimitedFilter BoundLimitedBlockPropertyFilter boundLimitedShortID int + + syntheticSuffix SyntheticSuffix } var blockPropertiesFiltererPool = sync.Pool{ @@ -591,7 +685,9 @@ var blockPropertiesFiltererPool = sync.Pool{ // newBlockPropertiesFilterer returns a partially initialized filterer. To complete // initialization, call IntersectsUserPropsAndFinishInit. 
func newBlockPropertiesFilterer( - filters []BlockPropertyFilter, limited BoundLimitedBlockPropertyFilter, + filters []BlockPropertyFilter, + limited BoundLimitedBlockPropertyFilter, + syntheticSuffix SyntheticSuffix, ) *BlockPropertiesFilterer { filterer := blockPropertiesFiltererPool.Get().(*BlockPropertiesFilterer) *filterer = BlockPropertiesFilterer{ @@ -599,6 +695,7 @@ func newBlockPropertiesFilterer( shortIDToFiltersIndex: filterer.shortIDToFiltersIndex[:0], boundLimitedFilter: limited, boundLimitedShortID: -1, + syntheticSuffix: syntheticSuffix, } return filterer } @@ -620,8 +717,9 @@ func IntersectsTable( filters []BlockPropertyFilter, limited BoundLimitedBlockPropertyFilter, userProperties map[string]string, + syntheticSuffix SyntheticSuffix, ) (*BlockPropertiesFilterer, error) { - f := newBlockPropertiesFilterer(filters, limited) + f := newBlockPropertiesFilterer(filters, limited, syntheticSuffix) ok, err := f.intersectsUserPropsAndFinishInit(userProperties) if !ok || err != nil { releaseBlockPropertiesFilterer(f) @@ -655,7 +753,13 @@ func (f *BlockPropertiesFilterer) intersectsUserPropsAndFinishInit( // Note that unsafe.StringData only works if the string is not empty // (which we already checked). 
byteProps := unsafe.Slice(unsafe.StringData(props), len(props)) - intersects, err := f.filters[i].Intersects(byteProps[1:]) + var intersects bool + var err error + if len(f.syntheticSuffix) == 0 { + intersects, err = f.filters[i].Intersects(byteProps[1:]) + } else { + intersects, err = f.filters[i].SyntheticSuffixIntersects(byteProps[1:], f.syntheticSuffix) + } if err != nil || !intersects { return false, err } @@ -741,59 +845,40 @@ const ( blockMaybeExcluded ) -func (f *BlockPropertiesFilterer) intersects(props []byte) (ret intersectsResult, err error) { - i := 0 - decoder := blockPropertiesDecoder{props: props} - ret = blockIntersects - for i < len(f.shortIDToFiltersIndex) { - var id int - var prop []byte - if !decoder.done() { - var shortID shortID - var err error - shortID, prop, err = decoder.next() - if err != nil { - return ret, err - } - id = int(shortID) - } else { - id = math.MaxUint8 + 1 - } - for i < len(f.shortIDToFiltersIndex) && id > i { - // The property for this id is not encoded for this block, but there - // may still be a filter for this id. - if intersects, err := f.intersectsFilter(i, nil); err != nil { - return ret, err - } else if intersects == blockExcluded { - return blockExcluded, nil - } else if intersects == blockMaybeExcluded { - ret = blockMaybeExcluded - } - i++ - } - if i >= len(f.shortIDToFiltersIndex) { - return ret, nil - } - // INVARIANT: id <= i. And since i is always incremented by 1, id==i. 
- if id != i { - panic(fmt.Sprintf("%d != %d", id, i)) +func (f *BlockPropertiesFilterer) intersects(props []byte) (intersectsResult, error) { + decoder := makeBlockPropertiesDecoder(len(f.shortIDToFiltersIndex), props) + ret := blockIntersects + for !decoder.Done() { + id, prop, err := decoder.Next() + if err != nil { + return ret, err } - if intersects, err := f.intersectsFilter(i, prop); err != nil { + intersects, err := f.intersectsFilter(id, prop) + if err != nil { return ret, err - } else if intersects == blockExcluded { + } + if intersects == blockExcluded { return blockExcluded, nil - } else if intersects == blockMaybeExcluded { + } + if intersects == blockMaybeExcluded { ret = blockMaybeExcluded } - i++ } - // ret == blockIntersects || ret == blockMaybeExcluded + // ret is either blockIntersects or blockMaybeExcluded. return ret, nil } -func (f *BlockPropertiesFilterer) intersectsFilter(i int, prop []byte) (intersectsResult, error) { - if f.shortIDToFiltersIndex[i] >= 0 { - intersects, err := f.filters[f.shortIDToFiltersIndex[i]].Intersects(prop) +func (f *BlockPropertiesFilterer) intersectsFilter( + id shortID, prop []byte, +) (intersectsResult, error) { + var intersects bool + var err error + if filterIdx := f.shortIDToFiltersIndex[id]; filterIdx >= 0 { + if !f.syntheticSuffix.IsSet() { + intersects, err = f.filters[filterIdx].Intersects(prop) + } else { + intersects, err = f.filters[filterIdx].SyntheticSuffixIntersects(prop, f.syntheticSuffix) + } if err != nil { return blockIntersects, err } @@ -801,7 +886,7 @@ func (f *BlockPropertiesFilterer) intersectsFilter(i int, prop []byte) (intersec return blockExcluded, nil } } - if i == f.boundLimitedShortID { + if int(id) == f.boundLimitedShortID { // The bound-limited filter uses this id. // // The bound-limited filter only applies within a keyspan interval. 
We @@ -809,7 +894,11 @@ func (f *BlockPropertiesFilterer) intersectsFilter(i int, prop []byte) (intersec // Intersects determines that there is no intersection, we return // `blockMaybeExcluded` if no other bpf unconditionally excludes the // block. - intersects, err := f.boundLimitedFilter.Intersects(prop) + if !f.syntheticSuffix.IsSet() { + intersects, err = f.boundLimitedFilter.Intersects(prop) + } else { + intersects, err = f.boundLimitedFilter.SyntheticSuffixIntersects(prop, f.syntheticSuffix) + } if err != nil { return blockIntersects, err } else if !intersects { @@ -818,3 +907,12 @@ func (f *BlockPropertiesFilterer) intersectsFilter(i int, prop []byte) (intersec } return blockIntersects, nil } + +func uvarintLen(v uint32) int { + i := 0 + for v >= 0x80 { + v >>= 7 + i++ + } + return i + 1 +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/block_property_obsolete.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/block_property_obsolete.go new file mode 100644 index 0000000..3646745 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/block_property_obsolete.go @@ -0,0 +1,159 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package sstable + +import ( + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" +) + +// obsoleteKeyBlockPropertyCollector is a block property collector used to +// implement obsoleteKeyBlockPropertyFilter - a filter that excludes blocks +// which contain only obsolete keys. +// +// For an explanation of obsolete keys, see the comment for TableFormatPebblev4 +// which explains obsolete keys. 
+type obsoleteKeyBlockPropertyCollector struct { + blockIsNonObsolete bool + indexIsNonObsolete bool + tableIsNonObsolete bool +} + +var _ BlockPropertyCollector = (*obsoleteKeyBlockPropertyCollector)(nil) + +// Name is part of the BlockPropertyCollector interface. +func (o *obsoleteKeyBlockPropertyCollector) Name() string { + return "obsolete-key" +} + +// AddPointKey is part of the BlockPropertyCollector interface. +func (o *obsoleteKeyBlockPropertyCollector) AddPointKey(key InternalKey, value []byte) error { + // Ignore. + return nil +} + +// AddRangeKeys is part of the BlockPropertyCollector interface. +func (o *obsoleteKeyBlockPropertyCollector) AddRangeKeys(span Span) error { + // Ignore. + return nil +} + +// AddPoint is an out-of-band method that is specific to this collector. +func (o *obsoleteKeyBlockPropertyCollector) AddPoint(isObsolete bool) { + o.blockIsNonObsolete = o.blockIsNonObsolete || !isObsolete +} + +// FinishDataBlock is part of the BlockPropertyCollector interface. +func (o *obsoleteKeyBlockPropertyCollector) FinishDataBlock(buf []byte) ([]byte, error) { + o.tableIsNonObsolete = o.tableIsNonObsolete || o.blockIsNonObsolete + return obsoleteKeyBlockPropertyEncode(!o.blockIsNonObsolete, buf), nil +} + +// AddPrevDataBlockToIndexBlock is part of the BlockPropertyCollector interface. +func (o *obsoleteKeyBlockPropertyCollector) AddPrevDataBlockToIndexBlock() { + o.indexIsNonObsolete = o.indexIsNonObsolete || o.blockIsNonObsolete + o.blockIsNonObsolete = false +} + +// FinishIndexBlock is part of the BlockPropertyCollector interface. +func (o *obsoleteKeyBlockPropertyCollector) FinishIndexBlock(buf []byte) ([]byte, error) { + buf = obsoleteKeyBlockPropertyEncode(!o.indexIsNonObsolete, buf) + o.indexIsNonObsolete = false + return buf, nil +} + +// FinishTable is part of the BlockPropertyCollector interface. 
+func (o *obsoleteKeyBlockPropertyCollector) FinishTable(buf []byte) ([]byte, error) { + return obsoleteKeyBlockPropertyEncode(!o.tableIsNonObsolete, buf), nil +} + +// AddCollectedWithSuffixReplacement is part of the BlockPropertyCollector interface. +func (o *obsoleteKeyBlockPropertyCollector) AddCollectedWithSuffixReplacement( + oldProp []byte, oldSuffix, newSuffix []byte, +) error { + // Verify the property is valid. + _, err := obsoleteKeyBlockPropertyDecode(oldProp) + if err != nil { + return err + } + // Suffix rewriting currently loses the obsolete bit. + o.blockIsNonObsolete = true + return nil +} + +// SupportsSuffixReplacement is part of the BlockPropertyCollector interface. +func (o *obsoleteKeyBlockPropertyCollector) SupportsSuffixReplacement() bool { + return true +} + +// obsoleteKeyBlockPropertyFilter implements the filter that excludes blocks +// that only contain obsolete keys. It pairs with +// obsoleteKeyBlockPropertyCollector. +// +// Note that we filter data blocks as well as first-level index blocks. +// +// For an explanation of obsolete keys, see the comment for TableFormatPebblev4 +// which explains obsolete keys. +// +// NB: obsoleteKeyBlockPropertyFilter is stateless. This aspect of the filter +// is used in table_cache.go for in-place modification of a filters slice. +type obsoleteKeyBlockPropertyFilter struct{} + +var _ BlockPropertyFilter = obsoleteKeyBlockPropertyFilter{} + +// Name is part of the BlockPropertyFilter interface. It must match +// obsoleteKeyBlockPropertyCollector.Name. +func (o obsoleteKeyBlockPropertyFilter) Name() string { + return "obsolete-key" +} + +// Intersects is part of the BlockPropertyFilter interface. It returns true +// if the block may contain non-obsolete keys. 
+func (o obsoleteKeyBlockPropertyFilter) Intersects(prop []byte) (bool, error) { + isObsolete, err := obsoleteKeyBlockPropertyDecode(prop) + if err != nil { + return false, err + } + return !isObsolete, nil +} + +// SyntheticSuffixIntersects is part of the BlockPropertyFilter interface. It +// expects that synthetic suffix is never used with tables that contain obsolete +// keys. +func (o obsoleteKeyBlockPropertyFilter) SyntheticSuffixIntersects( + prop []byte, suffix []byte, +) (bool, error) { + isObsolete, err := obsoleteKeyBlockPropertyDecode(prop) + if err != nil { + return false, err + } + // A block with suffix replacement should never be obsolete. + if isObsolete { + return false, base.AssertionFailedf("block with synthetic suffix is obsolete") + } + return true, nil +} + +// Encodes the information of whether the block contains only obsolete keys. We +// use the empty encoding for the common case of a block not being obsolete. +func obsoleteKeyBlockPropertyEncode(isObsolete bool, buf []byte) []byte { + if isObsolete { + return append(buf, 't') + } + return buf +} + +// Decodes the information of whether the block contains only obsolete keys (the +// inverse of obsoleteKeyBlockPropertyEncode). 
+func obsoleteKeyBlockPropertyDecode(prop []byte) (isObsolete bool, _ error) { + switch { + case len(prop) == 0: + return false, nil + case len(prop) == 1 && prop[0] == 't': + return true, nil + default: + return false, errors.Errorf("invalid obsolete block property %x", prop) + } +} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/block_property_test_utils.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/block_property_test_utils.go similarity index 52% rename from vendor/github.com/cockroachdb/pebble/sstable/block_property_test_utils.go rename to vendor/github.com/cockroachdb/pebble/v2/sstable/block_property_test_utils.go index 0ade68f..acd3ee0 100644 --- a/vendor/github.com/cockroachdb/pebble/sstable/block_property_test_utils.go +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/block_property_test_utils.go @@ -5,10 +5,10 @@ package sstable import ( + "fmt" "math" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/testkeys" + "github.com/cockroachdb/pebble/v2/internal/testkeys" ) // Code in this file contains utils for testing. It implements interval block @@ -22,7 +22,7 @@ const testKeysBlockPropertyName = `pebble.internal.testkeys.suffixes` func NewTestKeysBlockPropertyCollector() BlockPropertyCollector { return NewBlockIntervalCollector( testKeysBlockPropertyName, - &testKeysSuffixIntervalCollector{}, + &testKeysSuffixIntervalMapper{}, nil) } @@ -35,7 +35,27 @@ func NewTestKeysBlockPropertyCollector() BlockPropertyCollector { // and keys with suffixes within the range [filterMin, filterMax). For keys with // suffixes outside the range, iteration is nondeterministic. 
func NewTestKeysBlockPropertyFilter(filterMin, filterMax uint64) *BlockIntervalFilter { - return NewBlockIntervalFilter(testKeysBlockPropertyName, filterMin, filterMax) + return NewBlockIntervalFilter(testKeysBlockPropertyName, filterMin, filterMax, testKeysBlockIntervalSyntheticReplacer{}) +} + +var _ BlockIntervalSuffixReplacer = testKeysBlockIntervalSyntheticReplacer{} + +type testKeysBlockIntervalSyntheticReplacer struct{} + +// ApplySuffixReplacement implements BlockIntervalSyntheticReplacer. +func (sr testKeysBlockIntervalSyntheticReplacer) ApplySuffixReplacement( + interval BlockInterval, newSuffix []byte, +) (BlockInterval, error) { + decoded, err := testkeys.ParseSuffix(newSuffix) + if err != nil { + return BlockInterval{}, err + } + // The testKeysSuffixIntervalMapper below maps keys with no suffix to + // [0, MaxUint64); ignore that. + if interval.Upper != math.MaxUint64 && uint64(decoded) < interval.Upper { + panic(fmt.Sprintf("the synthetic suffix %d is less than the property upper bound %d", decoded, interval.Upper)) + } + return BlockInterval{uint64(decoded), uint64(decoded) + 1}, nil } // NewTestKeysMaskingFilter constructs a TestKeysMaskingFilter that implements @@ -68,50 +88,52 @@ func (f TestKeysMaskingFilter) Intersects(prop []byte) (bool, error) { return f.BlockIntervalFilter.Intersects(prop) } -var _ DataBlockIntervalCollector = (*testKeysSuffixIntervalCollector)(nil) +// SyntheticSuffixIntersects implements the BlockPropertyFilter interface. +func (f TestKeysMaskingFilter) SyntheticSuffixIntersects(prop []byte, suffix []byte) (bool, error) { + return f.BlockIntervalFilter.SyntheticSuffixIntersects(prop, suffix) +} -// testKeysSuffixIntervalCollector maintains an interval over the timestamps in -// MVCC-like suffixes for keys (e.g. foo@123). 
-type testKeysSuffixIntervalCollector struct { - initialized bool - lower, upper uint64 +// testKeysSuffixIntervalMapper maps keys to intervals according to the +// timestamps in MVCC-like suffixes for keys (e.g. "foo@123" -> 123). +type testKeysSuffixIntervalMapper struct { + ignorePoints bool + ignoreRangeKeys bool } -// Add implements DataBlockIntervalCollector by adding the timestamp(s) in the -// suffix(es) of this record to the current interval. -// -// Note that range sets and unsets may have multiple suffixes. Range key deletes -// do not have a suffix. All other point keys have a single suffix. -func (c *testKeysSuffixIntervalCollector) Add(key base.InternalKey, value []byte) error { - i := testkeys.Comparer.Split(key.UserKey) - if i == len(key.UserKey) { - c.initialized = true - c.lower, c.upper = 0, math.MaxUint64 - return nil - } - ts, err := testkeys.ParseSuffix(key.UserKey[i:]) - if err != nil { - return err - } - uts := uint64(ts) - if !c.initialized { - c.lower, c.upper = uts, uts+1 - c.initialized = true - return nil +var _ IntervalMapper = &testKeysSuffixIntervalMapper{} + +// MapPointKey is part of the IntervalMapper interface. +func (c *testKeysSuffixIntervalMapper) MapPointKey( + key InternalKey, value []byte, +) (BlockInterval, error) { + if c.ignorePoints { + return BlockInterval{}, nil } - if uts < c.lower { - c.lower = uts + n := testkeys.Comparer.Split(key.UserKey) + return testKeysSuffixToInterval(key.UserKey[n:]), nil +} + +// MapRangeKeys is part of the IntervalMapper interface. +func (c *testKeysSuffixIntervalMapper) MapRangeKeys(span Span) (BlockInterval, error) { + if c.ignoreRangeKeys { + return BlockInterval{}, nil } - if uts >= c.upper { - c.upper = uts + 1 + var result BlockInterval + for _, k := range span.Keys { + if len(k.Suffix) > 0 { + result.UnionWith(testKeysSuffixToInterval(k.Suffix)) + } } - return nil + return result, nil } -// FinishDataBlock implements DataBlockIntervalCollector. 
-func (c *testKeysSuffixIntervalCollector) FinishDataBlock() (lower, upper uint64, err error) { - l, u := c.lower, c.upper - c.lower, c.upper = 0, 0 - c.initialized = false - return l, u, nil +func testKeysSuffixToInterval(suffix []byte) BlockInterval { + if len(suffix) == 0 { + return BlockInterval{0, math.MaxUint64} + } + n, err := testkeys.ParseSuffix(suffix) + if err != nil { + panic(err) + } + return BlockInterval{uint64(n), uint64(n) + 1} } diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/base.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/base.go new file mode 100644 index 0000000..bc9087a --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/base.go @@ -0,0 +1,60 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package colblk + +import ( + "unsafe" + + "golang.org/x/exp/constraints" +) + +// align returns the next value greater than or equal to offset that's divisible +// by val. +func align[T constraints.Integer](offset, val T) T { + return (offset + val - 1) & ^(val - 1) +} + +// alignWithZeroes aligns the provided offset to val, and writing zeroes to any +// bytes in buf between the old offset and new aligned offset. This provides +// determinism when reusing memory that has not been zeroed. +func alignWithZeroes[T constraints.Integer](buf []byte, offset, val T) T { + aligned := align[T](offset, val) + for i := offset; i < aligned; i++ { + buf[i] = 0 + } + return aligned +} + +const ( + align16 = 2 + align32 = 4 + align64 = 8 +) + +// When multiplying or dividing by align{16,32,64} using signed integers, it's +// faster to shift to the left to multiply or shift to the right to divide. (The +// compiler optimization is limited to unsigned integers.) The below constants +// define the shift amounts corresponding to the above align constants. 
(eg, +// alignNShift = log(alignN)). +// +// TODO(jackson): Consider updating usages to use uints? They can be less +// ergonomic. +const ( + align16Shift = 1 + align32Shift = 2 + align64Shift = 3 +) + +// TODO(jackson): A subsequent Go release will remove the ability to call these +// runtime functions. We should consider asm implementations that we maintain +// within the crlib repo. +// +// See https://github.com/golang/go/issues/67401 + +//go:linkname memmove runtime.memmove +func memmove(to, from unsafe.Pointer, n uintptr) + +//go:linkname mallocgc runtime.mallocgc +func mallocgc(size uintptr, typ unsafe.Pointer, needzero bool) unsafe.Pointer diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/bitmap.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/bitmap.go new file mode 100644 index 0000000..bde81b9 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/bitmap.go @@ -0,0 +1,501 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package colblk + +import ( + "fmt" + "io" + "math" + "math/bits" + "strings" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/binfmt" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +// Bitmap is a bitmap structure built on a []uint64. A bitmap utilizes ~1 +// physical bit/logical bit (~0.125 bytes/row). The bitmap is encoded into an +// 8-byte aligned array of 64-bit words which is (nRows+63)/64 words in length. +// +// A summary bitmap is stored after the primary bitmap in which each bit in the +// summary bitmap corresponds to 1 word in the primary bitmap. A bit is set in +// the summary bitmap if the corresponding word in the primary bitmap is +// non-zero. The summary bitmap accelerates predecessor and successor +// operations. 
+type Bitmap struct { + // data contains the bitmap data, according to defaultBitmapEncoding, or it + // is nil if the bitmap is all zeros. + data unsafeUint64Decoder + bitCount int +} + +// Assert that Bitmap implements Array[bool]. +var _ Array[bool] = Bitmap{} + +// DecodeBitmap decodes the structure of a Bitmap and returns a Bitmap that +// reads from b supporting bitCount logical bits. No bounds checking is +// performed, so the caller must guarantee the bitmap is appropriately sized and +// the provided bitCount correctly identifies the number of bits in the bitmap. +func DecodeBitmap(b []byte, off uint32, bitCount int) (bitmap Bitmap, endOffset uint32) { + encoding := bitmapEncoding(b[off]) + off++ + if encoding == zeroBitmapEncoding { + return Bitmap{bitCount: bitCount}, off + } + off = align(off, align64) + sz := bitmapRequiredSize(bitCount) + if len(b) < int(off)+sz { + panic(errors.AssertionFailedf("bitmap of %d bits requires at least %d bytes; provided with %d-byte slice", + bitCount, bitmapRequiredSize(bitCount), len(b[off:]))) + } + return Bitmap{ + data: makeUnsafeUint64Decoder(b[off:], sz>>align64Shift), + bitCount: bitCount, + }, off + uint32(sz) +} + +// Assert that DecodeBitmap implements DecodeFunc. +var _ DecodeFunc[Bitmap] = DecodeBitmap + +// At returns true if the bit at position i is set and false otherwise. +// +//gcassert:inline +func (b Bitmap) At(i int) bool { + if b.data.ptr == nil { + // zero bitmap case. + return false + } + invariants.CheckBounds(i, b.bitCount) + val := b.data.At(int(uint(i) >> 6)) // aka At(i/64) + return val&(1<<(uint(i)&63)) != 0 +} + +// SeekSetBitGE returns the next bit greater than or equal to i set in the bitmap. +// The i parameter must be ≥ 0. Returns the number of bits +// represented by the bitmap if no next bit is set (or if i >= bitCount). +func (b Bitmap) SeekSetBitGE(i int) int { + if b.data.ptr == nil || i >= b.bitCount { + // Zero bitmap case. 
+ return b.bitCount + } + + wordIdx := i >> 6 // i/64 + // Fast path for common case of reasonably dense bitmaps; if the there's a + // bit ≥ i set in the same word, return it. + if next := nextBitInWord(b.data.At(wordIdx), uint(i)&63); next < 64 { + return wordIdx<<6 + next + } + + // Consult summary structure to find the next word with a set bit. The word + // we just checked (wordIdx) is represented by the wordIdx%64'th bit in the + // wordIdx/64'th summary word. We want to know if any of the other later + // words that are summarized together have a set bit. We call [nextInWord] + // on the summary word to get the index of which word has a set bit, if any. + summaryTableOffset, summaryTableEnd := b.summaryTableBounds() + summaryWordIdx := summaryTableOffset + wordIdx>>6 + if invariants.Enabled { + sz := bitmapRequiredSize(b.bitCount) + invariants.CheckBounds(summaryTableEnd-1, sz) + } + summaryNext := nextBitInWord(b.data.At(summaryWordIdx), uint(wordIdx%64)+1) + // If [summaryNext] == 64, then there are no set bits in any of the earlier + // words represented by the summary word at [summaryWordIdx]. In that case, + // we need to keep scanning the summary table forwards. + if summaryNext == 64 { + for summaryWordIdx++; ; summaryWordIdx++ { + // When we fall off the end of the summary table, we've determined + // there are no set bits after i across the entirety of the bitmap. + if summaryWordIdx >= summaryTableEnd { + return b.bitCount + } + if summaryWord := b.data.At(summaryWordIdx); summaryWord != 0 { + summaryNext = bits.TrailingZeros64(summaryWord) + break + } + } + } + // The summary word index and the summaryNext together tell us which word + // has a set bit. The number of leading zeros in the word itself tell us + // which bit is set. 
+ wordIdx = ((summaryWordIdx - summaryTableOffset) << 6) + summaryNext + return (wordIdx << 6) + bits.TrailingZeros64(b.data.At(wordIdx)) +} + +// SeekSetBitLE returns the previous bit less than or equal to i set in the +// bitmap. The i parameter must be in [0, bitCount). Returns -1 if no previous +// bit is set. +func (b Bitmap) SeekSetBitLE(i int) int { + if b.data.ptr == nil { + // Zero bitmap case. + return -1 + } + wordIdx := i >> 6 // i/64 + // Fast path for common case of reasonably dense bitmaps; if the there's a + // bit ≤ i set in the same word, return it. + if prev := prevBitInWord(b.data.At(wordIdx), uint(i)&63); prev >= 0 { + return (wordIdx << 6) + prev + } + + // Consult summary structure to find the next word with a set bit. The word + // we just checked (wordIdx) is represented by the wordIdx%64'th bit in the + // wordIdx/64'th summary word. We want to know if any of other earlier words + // that are summarized together have a set bit. We call [prevInWord] on the + // summary word to get the index of which word has a set bit, if any. + summaryTableOffset, _ := b.summaryTableBounds() + summaryWordIdx := summaryTableOffset + wordIdx>>6 + summaryPrev := prevBitInWord(b.data.At(summaryWordIdx), uint(wordIdx%64)-1) + // If [summaryPrev] is negative, then there are no set bits in any of the + // earlier words represented by the summary word at [summaryWordIdx]. In + // that case, we need to keep scanning the summary table backwards. + if summaryPrev < 0 { + for summaryWordIdx--; ; summaryWordIdx-- { + // When we fall below the beginning of the summary table, we've + // determined there are no set bits before i across the entirety of + // the bitmap. + if summaryWordIdx < summaryTableOffset { + return -1 + } + if summaryWord := b.data.At(summaryWordIdx); summaryWord != 0 { + summaryPrev = 63 - bits.LeadingZeros64(summaryWord) + break + } + } + } + // The summary word index and the summary prev together tell us which word + // has a set bit. 
The number of trailing zeros in the word itself tell us + // which bit is set. + wordIdx = ((summaryWordIdx - summaryTableOffset) << 6) + summaryPrev + return (wordIdx << 6) + 63 - bits.LeadingZeros64(b.data.At(wordIdx)) +} + +// SeekUnsetBitGE returns the next bit greater than or equal to i that is unset +// in the bitmap. The i parameter must be in [0, bitCount). Returns the number +// of bits represented by the bitmap if no next bit is unset. +func (b Bitmap) SeekUnsetBitGE(i int) int { + invariants.CheckBounds(i, b.bitCount) + if b.data.ptr == nil { + // Zero bitmap case. + return i + } + wordIdx := i >> 6 // i/64 + // If the there's a bit ≥ i unset in the same word, return it. + if next := nextBitInWord(^b.data.At(wordIdx), uint(i)&63); next < 64 { + return wordIdx<<6 + next + } + numWords := (b.bitCount + 63) >> 6 + var word uint64 + for wordIdx++; ; wordIdx++ { + if wordIdx >= numWords { + return b.bitCount + } + word = b.data.At(wordIdx) + if word != math.MaxUint64 { + break + } + } + return wordIdx<<6 + bits.TrailingZeros64(^word) +} + +// SeekUnsetBitLE returns the previous bit less than or equal to i set in the +// bitmap. The i parameter must be in [0, bitCount). Returns -1 if no previous +// bit is unset. +func (b Bitmap) SeekUnsetBitLE(i int) int { + invariants.CheckBounds(i, b.bitCount) + if b.data.ptr == nil { + // Zero bitmap case. + return i + } + + wordIdx := i >> 6 // i/64 + // If there's a bit ≤ i unset in the same word, return it. + if prev := prevBitInWord(^b.data.At(wordIdx), uint(i)&63); prev >= 0 { + return (wordIdx << 6) + prev + } + var word uint64 + for wordIdx--; ; wordIdx-- { + if wordIdx < 0 { + return -1 + } + word = b.data.At(wordIdx) + if word != math.MaxUint64 { + break + } + } + return (wordIdx << 6) + 63 - bits.LeadingZeros64(^word) +} + +// summaryTableBounds returns the indexes of the bitmap words containing the +// summary table. The summary table's words lie within [startOffset, endOffset). 
+func (b Bitmap) summaryTableBounds() (startOffset, endOffset int) { + startOffset = (b.bitCount + 63) >> 6 + endOffset = startOffset + (startOffset+63)>>6 + return startOffset, endOffset +} + +// String returns a string representation of the entire bitmap. +func (b Bitmap) String() string { + var sb strings.Builder + for w := 0; w < (b.bitCount+63)/64; w++ { + fmt.Fprintf(&sb, "%064b", b.data.At(w)) + } + return sb.String() +} + +// BitmapBuilder constructs a Bitmap. Bits default to false. +type BitmapBuilder struct { + words []uint64 + // minNonZeroRowCount is the row count at which the bitmap should begin to + // use the defaultBitmapEncoding (as opposed to the zeroBitmapEncoding). + // It's updated on the first call to Set and defaults to zero. + minNonZeroRowCount int +} + +type bitmapEncoding uint8 + +const ( + // defaultBitmapEncoding encodes the bitmap using ⌈n/64⌉ words followed by + // ⌈⌈n/64⌉/64⌉ summary words. + defaultBitmapEncoding bitmapEncoding = iota + // zeroBitmapEncoding is used for the special case when the bitmap is empty. + zeroBitmapEncoding +) + +// Assert that BitmapBuilder implements ColumnWriter. +var _ ColumnWriter = (*BitmapBuilder)(nil) + +// bitmapRequiredSize returns the size of an encoded bitmap in bytes, using the +// defaultBitmapEncoding. +func bitmapRequiredSize(total int) int { + nWords := (total + 63) >> 6 // divide by 64 + nSummaryWords := (nWords + 63) >> 6 // divide by 64 + return (nWords + nSummaryWords) << 3 // multiply by 8 +} + +// Set sets the bit at position i to true. +func (b *BitmapBuilder) Set(i int) { + // Update minNonZeroRowCount if necessary. This is used to determine whether + // the bitmap should be encoded using the all-zeros encoding. + if b.isZero(i + 1) { + b.minNonZeroRowCount = i + 1 + } + w := i >> 6 // divide by 64 + for len(b.words) <= w { + // We append zeros because if b.words has additional capacity, it has + // not been zeroed. 
+ b.words = append(b.words, 0) + } + b.words[w] |= 1 << uint(i&63) +} + +// isZero returns true if no bits are set and Invert was not called. +// +//gcassert:inline +func (b *BitmapBuilder) isZero(rows int) bool { + return b.minNonZeroRowCount == 0 || rows < b.minNonZeroRowCount +} + +// Reset resets the bitmap to the empty state. +func (b *BitmapBuilder) Reset() { + if invariants.Sometimes(50) { + // Sometimes trash the bitmap with all ones to catch bugs that assume + // b.words is zeroed. + for i := 0; i < len(b.words); i++ { + b.words[i] = ^uint64(0) + } + } + + // NB: We don't zero the contents of b.words. When the BitmapBuilder reuses + // b.words, it must ensure it zeroes the contents as necessary. + b.words = b.words[:0] + b.minNonZeroRowCount = 0 +} + +// NumColumns implements the ColumnWriter interface. +func (b *BitmapBuilder) NumColumns() int { return 1 } + +// DataType implements the ColumnWriter interface. +func (b *BitmapBuilder) DataType(int) DataType { return DataTypeBool } + +// Size implements the ColumnWriter interface. +func (b *BitmapBuilder) Size(rows int, offset uint32) uint32 { + // First byte will be the encoding type. + offset++ + if b.isZero(rows) { + return offset + } + offset = align(offset, align64) + return offset + uint32(bitmapRequiredSize(rows)) +} + +// InvertedSize returns the size of the encoded bitmap, assuming Invert will be called. +func (b *BitmapBuilder) InvertedSize(rows int, offset uint32) uint32 { + // First byte will be the encoding type. + offset++ + // An inverted bitmap will never use all-zeros encoding (even if it happens to + // be all zero). + offset = align(offset, align64) + return offset + uint32(bitmapRequiredSize(rows)) +} + +// Invert inverts the bitmap, setting all bits that are not set and clearing all +// bits that are set. If the bitmap's tail is sparse and is not large enough to +// represent nRows rows, it's first materialized. +// +// Note that Invert can affect the Size of the bitmap. 
Use InvertedSize() if you +// intend to invert the bitmap before finishing. +func (b *BitmapBuilder) Invert(nRows int) { + // Inverted bitmaps never use the all-zero encoding, so we set + // rowCountIncludingFirstSetBit to 1 so that as long as the bitmap is + // finished encoding any rows at all, it uses the default encoding. + b.minNonZeroRowCount = 1 + // If the tail of b is sparse, fill in zeroes before inverting. + nBitmapWords := (nRows + 63) >> 6 + for len(b.words) < nBitmapWords { + // We append zeros because if b.words has additional capacity, it has + // not been zeroed. + b.words = append(b.words, 0) + } + b.words = b.words[:nBitmapWords] + for i := range b.words { + b.words[i] = ^b.words[i] + } +} + +// Finish finalizes the bitmap, computing the per-word summary bitmap and +// writing the resulting data to buf at offset. +func (b *BitmapBuilder) Finish(col, nRows int, offset uint32, buf []byte) uint32 { + if b.isZero(nRows) { + buf[offset] = byte(zeroBitmapEncoding) + return offset + 1 + } + buf[offset] = byte(defaultBitmapEncoding) + offset++ + offset = alignWithZeroes(buf, offset, align64) + + nBitmapWords := (nRows + 63) >> 6 + // Truncate the bitmap to the number of words required to represent nRows. + // The caller may have written more bits than nRows and no longer cares to + // write them out. + if len(b.words) > nBitmapWords { + b.words = b.words[:nBitmapWords] + } + // Ensure the last word of the bitmap does not contain any set bits beyond + // the last row. This is not just for determinism but also to ensure that + // the summary bitmap is correct (which is necessary for Bitmap.SeekSetBitGE + // correctness). + if i := nRows % 64; len(b.words) >= nBitmapWords && i != 0 { + b.words[nBitmapWords-1] &= (1 << i) - 1 + } + + nSummaryWords := (nBitmapWords + 63) >> 6 + dest := makeUintsEncoder[uint64](buf[offset:], nBitmapWords+nSummaryWords) + // Copy all the words of the bitmap into the destination buffer. 
+ dest.CopyFrom(0, b.words) + offset += uint32(len(b.words)) << align64Shift + + // The caller may have written fewer than nRows rows if the tail is all + // zeroes, relying on these bits being implicitly zero. If the tail of b is + // sparse, fill in zeroes. + for i := len(b.words); i < nBitmapWords; i++ { + dest.UnsafeSet(i, 0) + offset += align64 + } + + // Add the summary bitmap. + for i := 0; i < nSummaryWords; i++ { + wordsOff := (i << 6) // i*64 + nWords := min(64, len(b.words)-wordsOff) + var summaryWord uint64 + for j := 0; j < nWords; j++ { + if (b.words)[wordsOff+j] != 0 { + summaryWord |= 1 << j + } + } + dest.UnsafeSet(nBitmapWords+i, summaryWord) + } + dest.Finish() + return offset + uint32(nSummaryWords)<= the key begin. Readers look up the key start index +// for the start boundary (s) and the end boundary (e). Any keys within indexes +// [s,e) have the corresponding bounds. +// +// Both range deletions and range keys are encoded with the same schema. Range +// deletion keyspan.Keys never contain suffixes or values. When one of these +// columns is encoded, the RawBytes encoding uses uintEncodingAllZero to avoid +// encoding N offsets. Each of these empty columns is encoded in just 1 byte of +// column data. +package colblk + +import ( + "cmp" + "encoding/binary" + "fmt" + "unsafe" + + "github.com/cockroachdb/crlib/crbytes" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/binfmt" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +// Version indicates the version of the columnar block format encoded. The +// version byte is always the first byte within the block. This ensures that +// readers can switch on the version before reading the rest of the block. +type Version uint8 + +const ( + // Version1 is the first version of the columnar block format. 
+ Version1 Version = 0x01 +) + +const blockHeaderBaseSize = 7 +const columnHeaderSize = 5 +const maxBlockRetainedSize = 256 << 10 + +// Header holds the metadata extracted from a columnar block's header. +type Header struct { + Version Version + // Columns holds the number of columns encoded within the block. + Columns uint16 + // Rows holds the number of rows encoded within the block. + Rows uint32 +} + +// String implements the fmt.Stringer interface, returning a human-readable +// representation of the block header. +func (h Header) String() string { + return fmt.Sprintf("Version=%v; Columns=%d; Rows=%d", h.Version, h.Columns, h.Rows) +} + +// Encode encodes the header to the provided buf. The length of buf must be at +// least 7 bytes. +func (h Header) Encode(buf []byte) { + buf[0] = byte(h.Version) + binary.LittleEndian.PutUint16(buf[1:], h.Columns) + binary.LittleEndian.PutUint32(buf[1+align16:], h.Rows) +} + +// HeaderSize returns the size of the block header, including column +// headers, for a block with the specified number of columns and optionally a +// custom header size. +func HeaderSize(cols int, customHeaderSize uint32) uint32 { + // Each column has a 1-byte DataType and a 4-byte offset into the block. + return uint32(blockHeaderBaseSize+cols*columnHeaderSize) + customHeaderSize +} + +// DecodeHeader reads the block header from the provided serialized columnar +// block. +func DecodeHeader(data []byte) Header { + return Header{ + Version: Version(data[0]), + Columns: uint16(binary.LittleEndian.Uint16(data[1:])), + Rows: uint32(binary.LittleEndian.Uint32(data[1+align16:])), + } +} + +// A BlockEncoder encodes a columnar block and handles encoding the block's +// header, including individual column headers. +type BlockEncoder struct { + buf []byte + headerOffset uint32 + pageOffset uint32 +} + +// Reset resets an encoder for reuse. 
+func (e *BlockEncoder) Reset() { + if cap(e.buf) > maxBlockRetainedSize { + e.buf = nil + } + e.headerOffset = 0 + e.pageOffset = 0 +} + +// Init initializes the block encoder with a buffer of the specified size and +// header. +func (e *BlockEncoder) Init(size int, h Header, customHeaderSize uint32) { + if cap(e.buf) < size { + e.buf = crbytes.AllocAligned(size) + } else { + e.buf = e.buf[:size] + } + e.headerOffset = uint32(customHeaderSize) + blockHeaderBaseSize + e.pageOffset = HeaderSize(int(h.Columns), customHeaderSize) + h.Encode(e.buf[customHeaderSize:]) +} + +// Data returns the underlying buffer. +func (e *BlockEncoder) Data() []byte { + return e.buf +} + +// Encode encodes w's columns to the block. +func (e *BlockEncoder) Encode(rows int, w ColumnWriter) { + for i := 0; i < w.NumColumns(); i++ { + e.buf[e.headerOffset] = byte(w.DataType(i)) + binary.LittleEndian.PutUint32(e.buf[e.headerOffset+1:], e.pageOffset) + e.headerOffset += columnHeaderSize + e.pageOffset = w.Finish(i, rows, e.pageOffset, e.buf) + } +} + +// Finish finalizes the block encoding, returning the encoded block. The +// returned byte slice points to the encoder's buffer, so if the encoder is +// reused the returned slice will be invalidated. +func (e *BlockEncoder) Finish() []byte { + e.buf[e.pageOffset] = 0x00 // Padding byte + e.pageOffset++ + if e.pageOffset != uint32(len(e.buf)) { + panic(errors.AssertionFailedf("expected pageOffset=%d to equal size=%d", e.pageOffset, len(e.buf))) + } + return e.buf +} + +// FinishBlock writes the columnar block to a heap-allocated byte slice. +// FinishBlock assumes all columns have the same number of rows. If that's not +// the case, the caller should manually construct their own block. +func FinishBlock(rows int, writers []ColumnWriter) []byte { + size := HeaderSize(len(writers), 0) + nCols := uint16(0) + for _, cw := range writers { + size = cw.Size(rows, size) + nCols += uint16(cw.NumColumns()) + } + size++ // +1 for the trailing version byte. 
+ + var enc BlockEncoder + enc.Init(int(size), Header{ + Version: Version1, + Columns: nCols, + Rows: uint32(rows), + }, 0) + for _, cw := range writers { + enc.Encode(rows, cw) + } + return enc.Finish() +} + +// DecodeColumn decodes the col'th column of the provided reader's block as a +// column of dataType using decodeFunc. +func DecodeColumn[V any]( + d *BlockDecoder, col int, rows int, dataType DataType, decodeFunc DecodeFunc[V], +) V { + if uint16(col) >= d.header.Columns { + panic(errors.AssertionFailedf("column %d is out of range [0, %d)", col, d.header.Columns)) + } + if dt := d.dataType(col); dt != dataType { + panic(errors.AssertionFailedf("column %d is type %s; not %s", col, dt, dataType)) + } + v, endOffset := decodeFunc(d.data, d.pageStart(col), rows) + if nextColumnOff := d.pageStart(col + 1); endOffset != nextColumnOff { + panic(errors.AssertionFailedf("column %d decoded to offset %d; expected %d", col, endOffset, nextColumnOff)) + } + return v +} + +// A BlockDecoder holds metadata for accessing the columns of a columnar block. +type BlockDecoder struct { + data []byte + header Header + customHeaderSize uint32 +} + +// DecodeBlock decodes the header of the provided columnar block and returns a +// new BlockDecoder configured to read from the block. The caller must ensure +// that the data is formatted as to the block layout specification. +func DecodeBlock(data []byte, customHeaderSize uint32) BlockDecoder { + d := BlockDecoder{} + d.Init(data, customHeaderSize) + return d +} + +// Init initializes a BlockDecoder with the data contained in the provided block. +func (d *BlockDecoder) Init(data []byte, customHeaderSize uint32) { + *d = BlockDecoder{ + data: data, + header: DecodeHeader(data[customHeaderSize:]), + customHeaderSize: customHeaderSize, + } +} + +// Rows returns the number of rows in the block, as indicated by the block header. 
+func (d *BlockDecoder) Rows() int { + return int(d.header.Rows) +} + +// DataType returns the data type of the col'th column. Every column's data type +// is encoded within the block header. +func (d *BlockDecoder) DataType(col int) DataType { + if uint16(col) >= d.header.Columns { + panic(errors.AssertionFailedf("column %d is out of range [0, %d)", col, d.header.Columns)) + } + return d.dataType(col) +} + +func (d *BlockDecoder) dataType(col int) DataType { + return DataType(*(*uint8)(d.pointer(d.customHeaderSize + blockHeaderBaseSize + columnHeaderSize*uint32(col)))) +} + +// Bitmap retrieves the col'th column as a bitmap. The column must be of type +// DataTypeBool. +func (d *BlockDecoder) Bitmap(col int) Bitmap { + return DecodeColumn(d, col, int(d.header.Rows), DataTypeBool, DecodeBitmap) +} + +// RawBytes retrieves the col'th column as a column of byte slices. The column +// must be of type DataTypeBytes. +func (d *BlockDecoder) RawBytes(col int) RawBytes { + return DecodeColumn(d, col, int(d.header.Rows), DataTypeBytes, DecodeRawBytes) +} + +// PrefixBytes retrieves the col'th column as a prefix-compressed byte slice column. The column +// must be of type DataTypePrefixBytes. +func (d *BlockDecoder) PrefixBytes(col int) PrefixBytes { + return DecodeColumn(d, col, int(d.header.Rows), DataTypePrefixBytes, DecodePrefixBytes) +} + +// Uints retrieves the col'th column as a column of uints. The column must be +// of type DataTypeUint. +func (d *BlockDecoder) Uints(col int) UnsafeUints { + return DecodeColumn(d, col, int(d.header.Rows), DataTypeUint, DecodeUnsafeUints) +} + +// Data returns the underlying buffer. 
+func (d *BlockDecoder) Data() []byte { + return d.data +} + +func (d *BlockDecoder) pageStart(col int) uint32 { + if uint16(col) >= d.header.Columns { + // -1 for the trailing version byte + return uint32(len(d.data) - 1) + } + return binary.LittleEndian.Uint32( + unsafe.Slice((*byte)(d.pointer(d.customHeaderSize+uint32(blockHeaderBaseSize+columnHeaderSize*col+1))), 4)) +} + +func (d *BlockDecoder) pointer(offset uint32) unsafe.Pointer { + return unsafe.Pointer(uintptr(unsafe.Pointer(&d.data[0])) + uintptr(offset)) +} + +// FormattedString returns a formatted representation of the block's binary +// data. +func (d *BlockDecoder) FormattedString() string { + f := binfmt.New(d.data) + tp := treeprinter.New() + n := tp.Child("block") + d.HeaderToBinFormatter(f, n) + for i := 0; i < int(d.header.Columns); i++ { + d.ColumnToBinFormatter(f, n, i, int(d.header.Rows)) + } + f.HexBytesln(1, "block trailer padding") + f.ToTreePrinter(n) + return tp.String() +} + +// HeaderToBinFormatter formats the block header to f and tp. +func (d *BlockDecoder) HeaderToBinFormatter(f *binfmt.Formatter, tp treeprinter.Node) { + f.HexBytesln(1, "version %v", Version(f.PeekUint(1))) + f.HexBytesln(2, "%d columns", d.header.Columns) + f.HexBytesln(4, "%d rows", d.header.Rows) + for i := 0; i < int(d.header.Columns); i++ { + f.Byte("col %d: %s", i, d.DataType(i)) + f.HexBytesln(4, "col %d: page start %d", i, d.pageStart(i)) + } + f.ToTreePrinter(tp.Child("columnar block header")) +} + +func (d *BlockDecoder) formatColumn( + f *binfmt.Formatter, + tp treeprinter.Node, + col int, + fn func(*binfmt.Formatter, treeprinter.Node, DataType), +) { + dataType := d.DataType(col) + colSize := d.pageStart(col+1) - d.pageStart(col) + endOff := f.Offset() + int(colSize) + fn(f, tp, dataType) + + // We expect formatting the column data to have consumed all the bytes + // between the column's pageOffset and the next column's pageOffset. 
+ switch v := endOff - f.Offset(); cmp.Compare[int](v, 0) { + case +1: + panic(fmt.Sprintf("expected f.Offset() = %d, but found %d; did column %s format too few bytes?", endOff, f.Offset(), dataType)) + case 0: + case -1: + panic(fmt.Sprintf("expected f.Offset() = %d, but found %d; did column %s format too many bytes?", endOff, f.Offset(), dataType)) + } +} + +// ColumnToBinFormatter formats the col'th column to f and tp. +func (d *BlockDecoder) ColumnToBinFormatter( + f *binfmt.Formatter, tp treeprinter.Node, col, rows int, +) { + d.formatColumn(f, tp, col, func(f *binfmt.Formatter, tp treeprinter.Node, dataType DataType) { + n := tp.Childf("data for column %d (%s)", col, dataType) + switch dataType { + case DataTypeBool: + bitmapToBinFormatter(f, n, rows) + case DataTypeUint: + uintsToBinFormatter(f, n, rows, nil) + case DataTypePrefixBytes: + prefixBytesToBinFormatter(f, n, rows, nil) + case DataTypeBytes: + rawBytesToBinFormatter(f, n, rows, nil) + default: + panic("unimplemented") + } + }) + +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/column.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/column.go new file mode 100644 index 0000000..9267e18 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/column.go @@ -0,0 +1,110 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package colblk + +import "io" + +// DataType describes the logical type of a column's values. Some data types +// have multiple possible physical representations. Encoding a column may choose +// between possible physical representations depending on the distribution of +// values and the size of the resulting physical representation. +type DataType uint8 + +const ( + // DataTypeInvalid represents an unset or invalid data type. 
+ DataTypeInvalid DataType = 0 + // DataTypeBool is a data type encoding a bool per row. + DataTypeBool DataType = 1 + // DataTypeUint is a data type encoding a fixed 8 bits per row. + DataTypeUint DataType = 2 + // DataTypeBytes is a data type encoding a variable-length byte string per + // row. + DataTypeBytes DataType = 3 + // DataTypePrefixBytes is a data type encoding variable-length, + // lexicographically-sorted byte strings, with prefix compression. + DataTypePrefixBytes DataType = 4 + + dataTypesCount DataType = 5 +) + +var dataTypeName [dataTypesCount]string = [dataTypesCount]string{ + DataTypeInvalid: "invalid", + DataTypeBool: "bool", + DataTypeUint: "uint", + DataTypeBytes: "bytes", + DataTypePrefixBytes: "prefixbytes", +} + +// String returns a human-readable string representation of the data type. +func (t DataType) String() string { + return dataTypeName[t] +} + +// ColumnWriter is an interface implemented by column encoders that accumulate a +// column's values and then serialize them. +type ColumnWriter interface { + Encoder + // NumColumns returns the number of columns the ColumnWriter will encode. + NumColumns() int + // DataType returns the data type of the col'th column. + DataType(col int) DataType + // Finish serializes the column at the specified index, writing the column's + // data to buf at offset, and returning the offset at which the next column + // should be encoded. + // + // The supplied buf must have enough space at the provided offset to fit the + // column. The caller may use Size() to calculate the exact size required. + // The caller passes the number of rows they want to serialize. All + // implementations of Finish must support cases where rows is the number of + // rows the caller has set, or one less. Some implementations may be more + // permissive. + // + // The provided column index must be less than NumColumns(). Finish is + // called for each index < NumColumns() in order. 
+	//
+	// The provided buf must be word-aligned (at offset 0). If a column writer
+	// requires a particular alignment, it's responsible for padding offset
+	// appropriately first.
+	Finish(col, rows int, offset uint32, buf []byte) (nextOffset uint32)
+}
+
+// Encoder is an interface implemented by column encoders.
+type Encoder interface {
+	// Reset clears the ColumnWriter's internal state, preparing it for reuse.
+	Reset()
+	// Size returns the size required to encode the column's current values.
+	//
+	// The `rows` argument must be the current number of logical rows in the
+	// column. Some implementations support defaults, and these implementations
+	// rely on the caller to inform them the current number of logical rows. The
+	// provided `rows` must be greater than or equal to the largest row set + 1.
+	// In other words, Size does not support determining the size of a column's
+	// earlier state before additional rows were added.
+	Size(rows int, offset uint32) uint32
+	// WriteDebug writes a human-readable description of the current column
+	// state to the provided writer.
+	WriteDebug(w io.Writer, rows int)
+}
+
+// A DecodeFunc decodes a data structure from a byte slice, returning an
+// accessor for the data and the offset of the first byte after the structure.
+// The rows argument must be the number of logical rows encoded within the data
+// structure.
+type DecodeFunc[T any] func(buf []byte, offset uint32, rows int) (decoded T, nextOffset uint32)
+
+// An Array provides indexed access to an array of values.
+type Array[V any] interface {
+	// At returns the i'th value in the array.
+	At(i int) V
+}
+
+// Clone clones the first n elements of the array a.
+func Clone[V any](a Array[V], n int) []V { + c := make([]V, n) + for i := 0; i < n; i++ { + c[i] = a.At(i) + } + return c +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/data_block.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/data_block.go new file mode 100644 index 0000000..330dfcb --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/data_block.go @@ -0,0 +1,1554 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package colblk + +import ( + "bytes" + "cmp" + "context" + "encoding/binary" + "fmt" + "io" + "math" + "unsafe" + + "github.com/cockroachdb/crlib/crbytes" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/binfmt" + "github.com/cockroachdb/pebble/v2/internal/bytealloc" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" + "github.com/cockroachdb/pebble/v2/sstable/block" +) + +// KeySchema defines the schema of a user key, as defined by the user's +// application. +// +// TODO(jackson): Consider making this KVSchema. It feels like there's an +// opportunity to generalize the ShortAttribute so that when a value is stored +// out-of-band, the DataBlockEncoder calls user-provided code to store the short +// attributes inlined within the data block. For inlined-values, the +// user-defined value columns would be implicitly null. +type KeySchema struct { + Name string + // KeySchema implementations can optionally make use a fixed-sized custom + // header inside each block. + HeaderSize uint32 + ColumnTypes []DataType + NewKeyWriter func() KeyWriter + + // InitKeySeekerMetadata initializes the provided KeySeekerMetadata. 
This + // happens once when a block enters the block cache and can be used to save + // computation in NewKeySeeker. + InitKeySeekerMetadata func(meta *KeySeekerMetadata, d *DataBlockDecoder) + + // KeySeeker returns a KeySeeker using metadata that was previously + // initialized with InitKeySeekerMetadata. The returned key seeker can be an + // unsafe cast of the metadata itself. + KeySeeker func(meta *KeySeekerMetadata) KeySeeker +} + +// KeySeekerMetadata is an in-memory buffer that stores metadata for a block. It +// is allocated together with the buffer storing the block and is initialized +// once when the block is read from disk. It is always 8-byte aligned. +// +// Portions of this buffer can be cast to the structures we need (through +// unsafe.Pointer), but note that any pointers in these structures will be +// invisible to the GC. Pointers to the block's data buffer are ok, since the +// metadata and the data have the same lifetime (sharing the underlying +// allocation). +// +// KeySeekerMetadata is stored inside block.Metadata. +type KeySeekerMetadata [KeySeekerMetadataSize]byte + +// KeySeekerMetadataSize is chosen to fit the CockroachDB key seeker +// implementation. +const KeySeekerMetadataSize = 176 + +// A KeyWriter maintains ColumnWriters for a data block for writing user keys +// into the database-specific key schema. Users may define their own key schema +// and implement KeyWriter to encode keys into custom columns that are aware of +// the structure of user keys. +type KeyWriter interface { + ColumnWriter + // ComparePrev compares the provided user to the previously-written user + // key. The returned KeyComparison's UserKeyComparison field is equivalent + // to Compare(key, prevKey) where prevKey is the last key passed to + // WriteKey. + // + // If no key has been written yet, ComparePrev returns a KeyComparison with + // PrefixLen set and UserKeyComparison=1. 
+	ComparePrev(key []byte) KeyComparison
+	// WriteKey writes a user key into the KeyWriter's columns. The
+	// keyPrefixLenSharedWithPrev parameter takes the number of bytes prefixing
+	// the key's logical prefix (as defined by (base.Comparer).Split) that the
+	// previously-written key's prefix shares.
+	//
+	// WriteKey is guaranteed to be called sequentially with increasing row
+	// indexes, beginning at zero.
+	WriteKey(row int, key []byte, keyPrefixLen, keyPrefixLenSharedWithPrev int32)
+	// MaterializeKey appends the zero-indexed row'th key written to dst,
+	// returning the result.
+	MaterializeKey(dst []byte, row int) []byte
+	// FinishHeader serializes an internal header of exactly KeySchema.HeaderSize bytes.
+	FinishHeader(dst []byte)
+}
+
+// AssertKeyCompare compares two keys using the provided comparer, ensuring the
+// provided KeyComparison accurately describes the result. Panics if the
+// assertion does not hold.
+func AssertKeyCompare(comparer *base.Comparer, a, b []byte, kcmp KeyComparison) {
+	bi := comparer.Split(b)
+	var recomputed KeyComparison
+	recomputed.PrefixLen = int32(comparer.Split(a))
+	recomputed.CommonPrefixLen = int32(crbytes.CommonPrefix(a[:recomputed.PrefixLen], b[:bi]))
+	recomputed.UserKeyComparison = int32(comparer.Compare(a, b))
+	if recomputed.PrefixEqual() != bytes.Equal(a[:recomputed.PrefixLen], b[:bi]) {
+		panic(errors.AssertionFailedf("PrefixEqual()=%t doesn't hold: %q, %q", kcmp.PrefixEqual(), a, b))
+	}
+	if recomputed != kcmp {
+		panic(errors.AssertionFailedf("KeyComparison of (%q, %q) = %s, ComparePrev gave %s",
+			a, b, recomputed, kcmp))
+	}
+}
+
+// KeyComparison holds information about a key and its comparison to another
+// key.
+type KeyComparison struct {
+	// PrefixLen is the length of the prefix of the key. It's the outcome of
+	// calling base.Split on the key.
+ PrefixLen int32 + // CommonPrefixLen is the length of the physical (byte-wise) prefix of the + // logical prefix that is shared with the other key. For example, for + // "apple@1" and "applied@3" the value is 4 (the length of "appl"). For + // "apple@1" and "apple@10" the value is 5 (the length of "apple"), because + // the shared bytes within the suffix are not included. + CommonPrefixLen int32 + // UserKeyComparison is the comparison of the user keys of the two keys. + // Should be equivalent to + // + // Compare(key, otherKey) + UserKeyComparison int32 +} + +// String returns a string representation of the KeyComparison. +func (kcmp KeyComparison) String() string { + return fmt.Sprintf("(prefix={%d,common=%d} cmp=%d)", + kcmp.PrefixLen, kcmp.CommonPrefixLen, kcmp.UserKeyComparison) +} + +// PrefixEqual returns true if the key comparison determined that the keys have +// equal prefixes. +func (kcmp KeyComparison) PrefixEqual() bool { return kcmp.PrefixLen == kcmp.CommonPrefixLen } + +// KeySeeker iterates over the keys in a columnar data block. +// +// Users of Pebble who define their own key schema must implement KeySeeker to +// seek over their decomposed keys. +// +// KeySeeker implementations must be safe for concurrent use by multiple +// goroutines. In practice, multiple DataBlockIterators may use the same +// KeySeeker. +type KeySeeker interface { + // IsLowerBound returns true if all keys in the data block (after suffix + // replacement if syntheticSuffix is not empty) are >= the given key. If the + // data block contains no keys, returns true. + IsLowerBound(k []byte, syntheticSuffix []byte) bool + // SeekGE returns the index of the first row with a key greater than or equal + // to [key], and whether that row has the same prefix as [key]. + // + // If the caller externally knows a bound on where the key is located, it + // may indicate it through [boundRow] and [searchDir]. 
A [searchDir] value + // of -1 indicates that the sought row must be at an index ≤ [boundRow]. A + // [searchDir] value of +1 indicates that the sought row must be at an index + // ≥ [boundRow]. Implementations may use this information to constrain the + // search. See (base.SeekGEFlags).TrySeekUsingNext for context on when this + // may be set in practice. + SeekGE(key []byte, boundRow int, searchDir int8) (row int, equalPrefix bool) + // MaterializeUserKey materializes the user key of the specified row, + // returning a slice of the materialized user key. + // + // The provided keyIter must have a buffer large enough to hold the key. + // + // The prevRow parameter is the row MaterializeUserKey was last invoked with + // (or a negative number if not applicable). Implementations may take + // advantage of that knowledge to reduce work. + MaterializeUserKey(keyIter *PrefixBytesIter, prevRow, row int) []byte + // MaterializeUserKeyWithSyntheticSuffix is a variant of MaterializeUserKey + // where the suffix is replaced. + // + // The provided keyIter must have a buffer large enough to hold the key after + // suffix replacement. + // + // The prevRow parameter is the row MaterializeUserKeyWithSyntheticSuffix was + // last invoked with (or a negative number if not applicable). Implementations + // may take advantage of that knowledge to reduce work. + MaterializeUserKeyWithSyntheticSuffix( + keyIter *PrefixBytesIter, syntheticSuffix []byte, prevRow, row int, + ) []byte +} + +const ( + defaultKeySchemaColumnPrefix int = iota + defaultKeySchemaColumnSuffix +) + +var defaultSchemaColumnTypes = []DataType{ + defaultKeySchemaColumnPrefix: DataTypePrefixBytes, + defaultKeySchemaColumnSuffix: DataTypeBytes, +} + +// DefaultKeySchema returns the default key schema that decomposes a user key +// into its prefix and suffix. Prefixes are sorted in lexicographical order. 
+func DefaultKeySchema(comparer *base.Comparer, prefixBundleSize int) KeySchema {
+	return KeySchema{
+		Name:        fmt.Sprintf("DefaultKeySchema(%s,%d)", comparer.Name, prefixBundleSize),
+		HeaderSize:  0,
+		ColumnTypes: defaultSchemaColumnTypes,
+		NewKeyWriter: func() KeyWriter {
+			kw := &defaultKeyWriter{comparer: comparer}
+			kw.prefixes.Init(prefixBundleSize)
+			kw.suffixes.Init()
+			return kw
+		},
+		InitKeySeekerMetadata: func(meta *KeySeekerMetadata, d *DataBlockDecoder) {
+			ks := (*defaultKeySeeker)(unsafe.Pointer(&meta[0]))
+			ks.comparer = comparer
+			ks.init(d)
+		},
+		KeySeeker: func(meta *KeySeekerMetadata) KeySeeker {
+			ks := (*defaultKeySeeker)(unsafe.Pointer(&meta[0]))
+			return ks
+		},
+	}
+}
+
+// Assert that *defaultKeyWriter implements the KeyWriter interface.
+var _ KeyWriter = (*defaultKeyWriter)(nil)
+
+type defaultKeyWriter struct {
+	comparer *base.Comparer
+	prefixes PrefixBytesBuilder
+	suffixes RawBytesBuilder
+}
+
+func (w *defaultKeyWriter) ComparePrev(key []byte) KeyComparison {
+	var cmpv KeyComparison
+	cmpv.PrefixLen = int32(w.comparer.Split(key))
+	if w.prefixes.nKeys == 0 {
+		// The first key has no previous key to compare to.
+		cmpv.UserKeyComparison = 1
+		return cmpv
+	}
+	lp := w.prefixes.UnsafeGet(w.prefixes.nKeys - 1)
+	cmpv.CommonPrefixLen = int32(crbytes.CommonPrefix(lp, key[:cmpv.PrefixLen]))
+
+	// Keys are written in order and prefixes must be sorted lexicographically,
+	// so CommonPrefixLen == PrefixLen implies that the keys share the same
+	// logical prefix. (If the previous key had a prefix longer than
+	// CommonPrefixLen, it would sort after [key].)
+	if cmpv.CommonPrefixLen == cmpv.PrefixLen {
+		// The keys share the same MVCC prefix. Compare the suffixes.
+ cmpv.UserKeyComparison = int32(w.comparer.ComparePointSuffixes(key[cmpv.PrefixLen:], + w.suffixes.UnsafeGet(w.suffixes.rows-1))) + if invariants.Enabled { + if !w.comparer.Equal(lp, key[:cmpv.PrefixLen]) { + panic(errors.AssertionFailedf("keys have different logical prefixes: %q != %q", lp, key[:cmpv.PrefixLen])) + } + } + return cmpv + } + + // The keys have different MVCC prefixes. We haven't determined which is + // greater, but we know the index at which they diverge. The base.Comparer + // contract dictates that prefixes must be lexicographically ordered. + if len(lp) == int(cmpv.CommonPrefixLen) { + // cmpv.PrefixLen > cmpv.PrefixLenShared; key is greater. + cmpv.UserKeyComparison = +1 + } else { + // Both keys have at least 1 additional byte at which they diverge. + // Compare the diverging byte. + cmpv.UserKeyComparison = int32(cmp.Compare(key[cmpv.CommonPrefixLen], lp[cmpv.CommonPrefixLen])) + } + if invariants.Enabled { + // In this case we've determined that the keys have different prefixes, + // so the UserKeyComparison should be equal to the result of comparing + // the prefixes and nonzero. + if cmpv.UserKeyComparison == 0 { + panic(errors.AssertionFailedf("user keys should not be equal: %q+%q, %q", + lp, w.suffixes.UnsafeGet(w.suffixes.rows-1), key)) + } + if v := w.comparer.Compare(key, lp); v != int(cmpv.UserKeyComparison) { + panic(errors.AssertionFailedf("user key comparison mismatch: Compare(%q, %q) = %d ≠ %d", + key, lp, v, cmpv.UserKeyComparison)) + } + } + return cmpv +} + +func (w *defaultKeyWriter) WriteKey( + row int, key []byte, keyPrefixLen, keyPrefixLenSharedWithPrev int32, +) { + w.prefixes.Put(key[:keyPrefixLen], int(keyPrefixLenSharedWithPrev)) + w.suffixes.Put(key[keyPrefixLen:]) +} + +func (w *defaultKeyWriter) MaterializeKey(dst []byte, row int) []byte { + dst = append(dst, w.prefixes.UnsafeGet(row)...) + dst = append(dst, w.suffixes.UnsafeGet(row)...) 
+	return dst
+}
+
+func (w *defaultKeyWriter) NumColumns() int {
+	return 2
+}
+
+func (w *defaultKeyWriter) DataType(col int) DataType {
+	return defaultSchemaColumnTypes[col]
+}
+
+func (w *defaultKeyWriter) Reset() {
+	w.prefixes.Reset()
+	w.suffixes.Reset()
+}
+
+func (w *defaultKeyWriter) WriteDebug(dst io.Writer, rows int) {
+	fmt.Fprint(dst, "0: prefixes: ")
+	w.prefixes.WriteDebug(dst, rows)
+	fmt.Fprintln(dst)
+	fmt.Fprint(dst, "1: suffixes: ")
+	w.suffixes.WriteDebug(dst, rows)
+	fmt.Fprintln(dst)
+}
+
+func (w *defaultKeyWriter) Size(rows int, offset uint32) uint32 {
+	offset = w.prefixes.Size(rows, offset)
+	offset = w.suffixes.Size(rows, offset)
+	return offset
+}
+
+func (w *defaultKeyWriter) FinishHeader([]byte) {}
+
+func (w *defaultKeyWriter) Finish(col, rows int, offset uint32, buf []byte) (nextOffset uint32) {
+	switch col {
+	case defaultKeySchemaColumnPrefix:
+		return w.prefixes.Finish(0, rows, offset, buf)
+	case defaultKeySchemaColumnSuffix:
+		return w.suffixes.Finish(0, rows, offset, buf)
+	default:
+		panic(fmt.Sprintf("unknown default key column: %d", col))
+	}
+}
+
+// Assert that *defaultKeySeeker implements KeySeeker.
+var _ KeySeeker = (*defaultKeySeeker)(nil)
+
+// Assert that the metadata fits the default key seeker.
+var _ uint = KeySeekerMetadataSize - uint(unsafe.Sizeof(defaultKeySeeker{}))
+
+type defaultKeySeeker struct {
+	comparer     *base.Comparer
+	decoder      *DataBlockDecoder
+	prefixes     PrefixBytes
+	suffixes     RawBytes
+	sharedPrefix []byte
+}
+
+func (ks *defaultKeySeeker) init(d *DataBlockDecoder) {
+	ks.decoder = d
+	ks.prefixes = d.d.PrefixBytes(defaultKeySchemaColumnPrefix)
+	ks.suffixes = d.d.RawBytes(defaultKeySchemaColumnSuffix)
+	ks.sharedPrefix = ks.prefixes.SharedPrefix()
+}
+
+// IsLowerBound is part of the KeySeeker interface.
+func (ks *defaultKeySeeker) IsLowerBound(k []byte, syntheticSuffix []byte) bool { + si := ks.comparer.Split(k) + if v := ks.comparer.Compare(ks.prefixes.UnsafeFirstSlice(), k[:si]); v != 0 { + return v > 0 + } + suffix := syntheticSuffix + if len(suffix) == 0 { + suffix = ks.suffixes.At(0) + } + return ks.comparer.Compare(suffix, k[si:]) >= 0 +} + +// SeekGE is part of the KeySeeker interface. +func (ks *defaultKeySeeker) SeekGE( + key []byte, boundRow int, searchDir int8, +) (row int, equalPrefix bool) { + si := ks.comparer.Split(key) + row, eq := ks.prefixes.Search(key[:si]) + if eq { + return ks.seekGEOnSuffix(row, key[si:]), true + } + return row, false +} + +// seekGEOnSuffix is a helper function for SeekGE when a seek key's prefix +// exactly matches a row. seekGEOnSuffix finds the first row at index or later +// with the same prefix as index and a suffix greater than or equal to [suffix], +// or if no such row exists, the next row with a different prefix. +func (ks *defaultKeySeeker) seekGEOnSuffix(index int, suffix []byte) (row int) { + // The search key's prefix exactly matches the prefix of the row at index. + // If the row at index has a suffix >= [suffix], then return the row. + if ks.comparer.ComparePointSuffixes(ks.suffixes.At(index), suffix) >= 0 { + return index + } + // Otherwise, the row at [index] sorts before the search key and we need to + // search forward. Binary search between [index+1, prefixChanged.SeekSetBitGE(index+1)]. + // + // Define f(l-1) == false and f(u) == true. + // Invariant: f(l-1) == false, f(u) == true. + l := index + 1 + u := ks.decoder.prefixChanged.SeekSetBitGE(index + 1) + for l < u { + h := int(uint(l+u) >> 1) // avoid overflow when computing h + // l ≤ h < u + if ks.comparer.ComparePointSuffixes(ks.suffixes.At(h), suffix) >= 0 { + u = h // preserves f(u) == true + } else { + l = h + 1 // preserves f(l-1) == false + } + } + return l +} + +// MaterializeUserKey is part of the colblk.KeySeeker interface. 
+func (ks *defaultKeySeeker) MaterializeUserKey(keyIter *PrefixBytesIter, prevRow, row int) []byte { + if row == prevRow+1 && prevRow >= 0 { + ks.prefixes.SetNext(keyIter) + } else { + ks.prefixes.SetAt(keyIter, row) + } + suffix := ks.suffixes.At(row) + res := keyIter.Buf[:len(keyIter.Buf)+len(suffix)] + memmove( + unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(keyIter.Buf)))+uintptr(len(keyIter.Buf))), + unsafe.Pointer(unsafe.SliceData(suffix)), + uintptr(len(suffix)), + ) + return res +} + +// MaterializeUserKeyWithSyntheticSuffix is part of the colblk.KeySeeker interface. +func (ks *defaultKeySeeker) MaterializeUserKeyWithSyntheticSuffix( + keyIter *PrefixBytesIter, suffix []byte, prevRow, row int, +) []byte { + if row == prevRow+1 && prevRow >= 0 { + ks.prefixes.SetNext(keyIter) + } else { + ks.prefixes.SetAt(keyIter, row) + } + res := keyIter.Buf[:len(keyIter.Buf)+len(suffix)] + memmove( + unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(keyIter.Buf)))+uintptr(len(keyIter.Buf))), + unsafe.Pointer(unsafe.SliceData(suffix)), + uintptr(len(suffix)), + ) + return res +} + +// DataBlockEncoder encodes columnar data blocks using a user-defined schema. +type DataBlockEncoder struct { + Schema *KeySchema + KeyWriter KeyWriter + // trailers is the column writer for InternalKey uint64 trailers. + trailers UintBuilder + // prefixSame is the column writer for the prefix-changed bitmap that + // indicates when a new key prefix begins. During block building, the bitmap + // represents when the prefix stays the same, which is expected to be a + // rarer case. Before Finish-ing the column, we invert the bitmap. + prefixSame BitmapBuilder + // values is the column writer for values. Iff the isValueExternal bitmap + // indicates a value is external, the value is prefixed with a ValuePrefix + // byte. 
+ values RawBytesBuilder + // isValueExternal is the column writer for the is-value-external bitmap + // that indicates when a value is stored out-of-band in a value block. + isValueExternal BitmapBuilder + // isObsolete is the column writer for the is-obsolete bitmap that indicates + // when a key is known to be obsolete/non-live (i.e., shadowed by another + // identical point key or range deletion with a higher sequence number). + isObsolete BitmapBuilder + + enc BlockEncoder + rows int + maximumKeyLength int + valuePrefixTmp [1]byte + lastUserKeyTmp []byte +} + +const ( + dataBlockColumnTrailer = iota + dataBlockColumnPrefixChanged + dataBlockColumnValue + dataBlockColumnIsValueExternal + dataBlockColumnIsObsolete + dataBlockColumnMax +) + +// The data block header is a 4-byte uint32 encoding the maximum length of a key +// contained within the block. This is used by iterators to avoid the need to +// grow key buffers while iterating over the block, ensuring that the key buffer +// is always sufficiently large. +// This is serialized immediately after the KeySchema specific header. +const dataBlockCustomHeaderSize = 4 + +// Init initializes the data block writer. +func (w *DataBlockEncoder) Init(schema *KeySchema) { + w.Schema = schema + w.KeyWriter = schema.NewKeyWriter() + w.trailers.Init() + w.prefixSame.Reset() + w.values.Init() + w.isValueExternal.Reset() + w.isObsolete.Reset() + w.rows = 0 + w.maximumKeyLength = 0 + w.lastUserKeyTmp = w.lastUserKeyTmp[:0] + w.enc.Reset() +} + +// Reset resets the data block writer to its initial state, retaining buffers. +func (w *DataBlockEncoder) Reset() { + w.KeyWriter.Reset() + w.trailers.Reset() + w.prefixSame.Reset() + w.values.Reset() + w.isValueExternal.Reset() + w.isObsolete.Reset() + w.rows = 0 + w.maximumKeyLength = 0 + w.lastUserKeyTmp = w.lastUserKeyTmp[:0] + w.enc.Reset() +} + +// String outputs a human-readable summary of internal DataBlockEncoder state. 
+func (w *DataBlockEncoder) String() string { + var buf bytes.Buffer + size := uint32(w.Size()) + fmt.Fprintf(&buf, "size=%d:\n", size) + w.KeyWriter.WriteDebug(&buf, w.rows) + + fmt.Fprintf(&buf, "%d: trailers: ", len(w.Schema.ColumnTypes)+dataBlockColumnTrailer) + w.trailers.WriteDebug(&buf, w.rows) + fmt.Fprintln(&buf) + + fmt.Fprintf(&buf, "%d: prefix changed: ", len(w.Schema.ColumnTypes)+dataBlockColumnPrefixChanged) + w.prefixSame.WriteDebug(&buf, w.rows) + fmt.Fprintln(&buf) + + fmt.Fprintf(&buf, "%d: values: ", len(w.Schema.ColumnTypes)+dataBlockColumnValue) + w.values.WriteDebug(&buf, w.rows) + fmt.Fprintln(&buf) + + fmt.Fprintf(&buf, "%d: is-value-ext: ", len(w.Schema.ColumnTypes)+dataBlockColumnIsValueExternal) + w.isValueExternal.WriteDebug(&buf, w.rows) + fmt.Fprintln(&buf) + + fmt.Fprintf(&buf, "%d: is-obsolete: ", len(w.Schema.ColumnTypes)+dataBlockColumnIsObsolete) + w.isObsolete.WriteDebug(&buf, w.rows) + fmt.Fprintln(&buf) + + return buf.String() +} + +// Add adds the provided key to the data block. Keys must be added in order. The +// caller must supply a KeyComparison containing the comparison of the key to +// the previously-added key, obtainable through +// +// KeyWriter.ComparePrev(ikey.UserKey) +// +// The caller is required to pass this in because in expected use cases, the +// caller will also require the same information. +func (w *DataBlockEncoder) Add( + ikey base.InternalKey, + value []byte, + valuePrefix block.ValuePrefix, + kcmp KeyComparison, + isObsolete bool, +) { + w.KeyWriter.WriteKey(w.rows, ikey.UserKey, kcmp.PrefixLen, kcmp.CommonPrefixLen) + if kcmp.PrefixEqual() { + w.prefixSame.Set(w.rows) + } + if isObsolete { + w.isObsolete.Set(w.rows) + } + w.trailers.Set(w.rows, uint64(ikey.Trailer)) + if !valuePrefix.IsInPlaceValue() { + w.isValueExternal.Set(w.rows) + // Write the value with the value prefix byte preceding the value. 
+ w.valuePrefixTmp[0] = byte(valuePrefix) + w.values.PutConcat(w.valuePrefixTmp[:], value) + } else { + // Elide the value prefix. Readers will examine the isValueExternal + // bitmap and know there is no value prefix byte if !isValueExternal. + w.values.Put(value) + } + if len(ikey.UserKey) > int(w.maximumKeyLength) { + w.maximumKeyLength = len(ikey.UserKey) + } + w.rows++ +} + +// Rows returns the number of rows in the current pending data block. +func (w *DataBlockEncoder) Rows() int { + return w.rows +} + +// Size returns the size of the current pending data block. +func (w *DataBlockEncoder) Size() int { + off := HeaderSize(len(w.Schema.ColumnTypes)+dataBlockColumnMax, dataBlockCustomHeaderSize+w.Schema.HeaderSize) + off = w.KeyWriter.Size(w.rows, off) + off = w.trailers.Size(w.rows, off) + off = w.prefixSame.InvertedSize(w.rows, off) + off = w.values.Size(w.rows, off) + off = w.isValueExternal.Size(w.rows, off) + off = w.isObsolete.Size(w.rows, off) + off++ // trailer padding byte + return int(off) +} + +// MaterializeLastUserKey materializes the last added user key. +func (w *DataBlockEncoder) MaterializeLastUserKey(appendTo []byte) []byte { + return w.KeyWriter.MaterializeKey(appendTo, w.rows-1) +} + +// Finish serializes the pending data block, including the first [rows] rows. +// The value of [rows] must be Rows() or Rows()-1. The provided size must be the +// size of the data block with the provided row count (i.e., the return value of +// [Size] when DataBlockEncoder.Rows() = [rows]). +// +// Finish the returns the serialized, uncompressed data block and the +// InternalKey of the last key contained within the data block. The memory of +// the lastKey's UserKey is owned by the DataBlockEncoder. The caller must +// copy it if they require it to outlive a Reset of the writer. 
+func (w *DataBlockEncoder) Finish(rows, size int) (finished []byte, lastKey base.InternalKey) { + if invariants.Enabled && rows != w.rows && rows != w.rows-1 { + panic(errors.AssertionFailedf("data block has %d rows; asked to finish %d", w.rows, rows)) + } + + cols := len(w.Schema.ColumnTypes) + dataBlockColumnMax + h := Header{ + Version: Version1, + Columns: uint16(cols), + Rows: uint32(rows), + } + + // Invert the prefix-same bitmap before writing it out, because we want it + // to represent when the prefix changes. + w.prefixSame.Invert(rows) + + w.enc.Init(size, h, dataBlockCustomHeaderSize+w.Schema.HeaderSize) + + // Write the key schema custom header. + w.KeyWriter.FinishHeader(w.enc.Data()[:w.Schema.HeaderSize]) + // Write the max key length in the data block custom header. + binary.LittleEndian.PutUint32(w.enc.Data()[w.Schema.HeaderSize:w.Schema.HeaderSize+dataBlockCustomHeaderSize], uint32(w.maximumKeyLength)) + w.enc.Encode(rows, w.KeyWriter) + w.enc.Encode(rows, &w.trailers) + w.enc.Encode(rows, &w.prefixSame) + w.enc.Encode(rows, &w.values) + w.enc.Encode(rows, &w.isValueExternal) + w.enc.Encode(rows, &w.isObsolete) + finished = w.enc.Finish() + + w.lastUserKeyTmp = w.lastUserKeyTmp[:0] + w.lastUserKeyTmp = w.KeyWriter.MaterializeKey(w.lastUserKeyTmp[:0], rows-1) + lastKey = base.InternalKey{ + UserKey: w.lastUserKeyTmp, + Trailer: base.InternalKeyTrailer(w.trailers.Get(rows - 1)), + } + return finished, lastKey +} + +// DataBlockRewriter rewrites data blocks. See RewriteSuffixes. +type DataBlockRewriter struct { + KeySchema *KeySchema + + encoder DataBlockEncoder + decoder DataBlockDecoder + iter DataBlockIter + keySeeker KeySeeker + comparer *base.Comparer + keyBuf []byte + // keyAlloc grown throughout the lifetime of the rewriter. + keyAlloc bytealloc.A + prefixBytesIter PrefixBytesIter + initialized bool +} + +// NewDataBlockRewriter creates a block rewriter. 
+func NewDataBlockRewriter(keySchema *KeySchema, comparer *base.Comparer) *DataBlockRewriter { + return &DataBlockRewriter{ + KeySchema: keySchema, + comparer: comparer, + } +} + +type assertNoExternalValues struct{} + +var _ block.GetInternalValueForPrefixAndValueHandler = assertNoExternalValues{} + +func (assertNoExternalValues) GetInternalValueForPrefixAndValueHandle( + value []byte, +) base.InternalValue { + panic(errors.AssertionFailedf("pebble: sstable contains values in value blocks")) +} + +// RewriteSuffixes rewrites the input block. It expects the input block to only +// contain keys with the suffix `from`. It rewrites the block to contain the +// same keys with the suffix `to`. +// +// RewriteSuffixes returns the start and end keys of the rewritten block, and +// the finished rewritten block. The returned start and end keys have indefinite +// lifetimes. The returned rewritten block is owned by the DataBlockRewriter. If +// it must be retained beyond the next call to RewriteSuffixes, the caller +// should make a copy. +// +// Note that the input slice must be 8-byte aligned. +func (rw *DataBlockRewriter) RewriteSuffixes( + input []byte, from []byte, to []byte, +) (start, end base.InternalKey, rewritten []byte, err error) { + if !rw.initialized { + rw.iter.InitOnce(rw.KeySchema, rw.comparer, assertNoExternalValues{}) + rw.encoder.Init(rw.KeySchema) + rw.initialized = true + } + + // TODO(jackson): RewriteSuffixes performs a naïve rewrite of the block, + // iterating over the input block while building a new block, KV-by-KV. + // Since key columns are stored separately from other data, we could copy + // columns that are unchanged (at least all the non-key columns, and in + // practice a PrefixBytes column) wholesale without retrieving rows + // one-by-one. In practice, there a few obstacles to making this work: + // + // - Only the beginning of a data block is assumed to be aligned. 
Columns + // then add padding as necessary to align data that needs to be aligned. + // If we copy a column, we have no guarantee that the alignment of the + // column start in the old block matches the alignment in the new block. + // We'd have to add padding to between columns to match the original + // alignment. It's a bit subtle. + // - We still need to read all the key columns in order to synthesize + // [start] and [end]. + // + // The columnar format is designed to support fast IterTransforms at read + // time, including IterTransforms.SyntheticSuffix. Our effort might be + // better spent dropping support for the physical rewriting of data blocks + // we're performing here and instead use a read-time IterTransform. + + rw.decoder.Init(rw.KeySchema, input) + meta := &KeySeekerMetadata{} + rw.KeySchema.InitKeySeekerMetadata(meta, &rw.decoder) + rw.keySeeker = rw.KeySchema.KeySeeker(meta) + rw.encoder.Reset() + if err = rw.iter.Init(&rw.decoder, block.IterTransforms{}); err != nil { + return base.InternalKey{}, base.InternalKey{}, nil, err + } + + // Allocate a keyIter buffer that's large enough to hold the largest user + // key in the block with 1 byte to spare (so that pointer arithmetic is + // never pointing beyond the allocation, which would violate Go rules). + if cap(rw.prefixBytesIter.Buf) < int(rw.decoder.maximumKeyLength)+1 { + rw.prefixBytesIter.Buf = make([]byte, rw.decoder.maximumKeyLength+1) + } + if newMax := int(rw.decoder.maximumKeyLength) - len(from) + len(to) + 1; cap(rw.keyBuf) < newMax { + rw.keyBuf = make([]byte, newMax) + } + + // Rewrite each key-value pair one-by-one. 
+ for i, kv := 0, rw.iter.First(); kv != nil; i, kv = i+1, rw.iter.Next() { + value := kv.V.LazyValue().ValueOrHandle + valuePrefix := block.InPlaceValuePrefix(false /* setHasSamePrefix (unused) */) + isValueExternal := rw.decoder.isValueExternal.At(i) + if isValueExternal { + valuePrefix = block.ValuePrefix(value[0]) + value = value[1:] + } + kcmp := rw.encoder.KeyWriter.ComparePrev(kv.K.UserKey) + if !bytes.Equal(kv.K.UserKey[kcmp.PrefixLen:], from) { + return base.InternalKey{}, base.InternalKey{}, nil, + errors.Newf("key %s has suffix 0x%x; require 0x%x", kv.K, kv.K.UserKey[kcmp.PrefixLen:], from) + } + rw.keyBuf = append(rw.keyBuf[:0], kv.K.UserKey[:kcmp.PrefixLen]...) + rw.keyBuf = append(rw.keyBuf, to...) + if i == 0 { + start.UserKey, rw.keyAlloc = rw.keyAlloc.Copy(rw.keyBuf) + start.Trailer = kv.K.Trailer + } + k := base.InternalKey{UserKey: rw.keyBuf, Trailer: kv.K.Trailer} + rw.encoder.Add(k, value, valuePrefix, kcmp, rw.decoder.isObsolete.At(i)) + } + rewritten, end = rw.encoder.Finish(int(rw.decoder.d.header.Rows), rw.encoder.Size()) + end.UserKey, rw.keyAlloc = rw.keyAlloc.Copy(end.UserKey) + return start, end, rewritten, nil +} + +// dataBlockDecoderSize is the size of DataBlockDecoder, round up to 8 bytes. +const dataBlockDecoderSize = (unsafe.Sizeof(DataBlockDecoder{}) + 7) &^ 7 + +// Assert that dataBlockDecoderSize is a multiple of 8 bytes (so that +// KeySeekerMetadata is also aligned). +const _ uint = uint(-(dataBlockDecoderSize % 8)) + +// Assert that a DataBlockDecoder and a KeySeekerMetadata can fit inside block.Metadata. +const _ uint = block.MetadataSize - uint(dataBlockDecoderSize) - KeySeekerMetadataSize + +// InitDataBlockMetadata initializes the metadata for a data block. +func InitDataBlockMetadata(schema *KeySchema, md *block.Metadata, data []byte) (err error) { + type blockDecoderAndKeySeekerMetadata struct { + d DataBlockDecoder + // Pad to ensure KeySeekerMetadata is 8-byte aligned. 
+ _ [dataBlockDecoderSize - unsafe.Sizeof(DataBlockDecoder{})]byte + keySchemaMeta KeySeekerMetadata + } + metadatas := block.CastMetadataZero[blockDecoderAndKeySeekerMetadata](md) + // Initialization can panic; convert panics to corruption errors (so higher + // layers can add file number and offset information). + defer func() { + if r := recover(); r != nil { + err = base.CorruptionErrorf("error initializing data block metadata: %v", r) + } + }() + metadatas.d.Init(schema, data) + schema.InitKeySeekerMetadata(&metadatas.keySchemaMeta, &metadatas.d) + return nil +} + +// Assert that an IndexBlockDecoder can fit inside block.Metadata. +const _ uint = block.MetadataSize - uint(unsafe.Sizeof(IndexBlockDecoder{})) + +// InitIndexBlockMetadata initializes the metadata for an index block. +func InitIndexBlockMetadata(md *block.Metadata, data []byte) (err error) { + d := block.CastMetadataZero[IndexBlockDecoder](md) + // Initialization can panic; convert panics to corruption errors (so higher + // layers can add file number and offset information). + defer func() { + if r := recover(); r != nil { + err = base.CorruptionErrorf("error initializing index block metadata: %v", r) + } + }() + d.Init(data) + return nil +} + +// Assert that a IndexBlockDecoder can fit inside block.Metadata. +const _ uint = block.MetadataSize - uint(unsafe.Sizeof(KeyspanDecoder{})) + +// InitKeyspanBlockMetadata initializes the metadata for a rangedel or range key block. +func InitKeyspanBlockMetadata(md *block.Metadata, data []byte) (err error) { + d := block.CastMetadataZero[KeyspanDecoder](md) + // Initialization can panic; convert panics to corruption errors (so higher + // layers can add file number and offset information). + defer func() { + if r := recover(); r != nil { + err = base.CorruptionErrorf("error initializing keyspan block metadata: %v", r) + } + }() + d.Init(data) + return nil +} + +// A DataBlockDecoder holds state for interpreting a columnar data block. 
+// It may be shared among multiple DataBlockIters.
+type DataBlockDecoder struct {
+	// d is the generic column-block decoder underlying this data block.
+	d BlockDecoder
+	// trailers holds an array of the InternalKey trailers, encoding the key
+	// kind and sequence number of each key.
+	trailers UnsafeUints
+	// prefixChanged is a bitmap indicating when the prefix (as defined by
+	// Split) of a key changes, relative to the preceding key. This is used to
+	// bound seeks within a prefix, and to optimize NextPrefix.
+	prefixChanged Bitmap
+	// values is the column reader for values. If the isValueExternal bitmap
+	// indicates a value is external, the value is prefixed with a ValuePrefix
+	// byte.
+	values RawBytes
+	// isValueExternal is the column reader for the is-value-external bitmap
+	// that indicates whether a value is stored out-of-band in a value block. If
+	// true, the value contains a ValuePrefix byte followed by an encoded value
+	// handle indicating the value's location within the value block(s).
+	isValueExternal Bitmap
+	// isObsolete is the column reader for the is-obsolete bitmap
+	// that indicates whether a key is obsolete/non-live.
+	isObsolete Bitmap
+	// maximumKeyLength is the maximum length of a user key in the block.
+	// Iterators may use it to allocate a sufficiently large buffer up front,
+	// and elide size checks during iteration. Note that iterators should add +1
+	// to the key length to ensure pointer arithmetic that computes a pointer
+	// to the tail of the key does not point to memory beyond the allocation
+	// (prohibited by Go pointer rules).
+	maximumKeyLength uint32
+}
+
+// BlockDecoder returns a pointer to the underlying BlockDecoder.
+func (d *DataBlockDecoder) BlockDecoder() *BlockDecoder {
+	return &d.d
+}
+
+// PrefixChanged returns the prefix-changed bitmap.
+func (d *DataBlockDecoder) PrefixChanged() Bitmap {
+	return d.prefixChanged
+}
+
+// KeySchemaHeader returns the KeySchema-specific header.
+func (d *DataBlockDecoder) KeySchemaHeader() []byte { + return d.d.data[:d.d.customHeaderSize-dataBlockCustomHeaderSize] +} + +// Init initializes the data block reader with the given serialized data block. +func (d *DataBlockDecoder) Init(schema *KeySchema, data []byte) { + if uintptr(unsafe.Pointer(unsafe.SliceData(data)))&7 != 0 { + panic("data buffer not 8-byte aligned") + } + d.d.Init(data, dataBlockCustomHeaderSize+schema.HeaderSize) + d.trailers = d.d.Uints(len(schema.ColumnTypes) + dataBlockColumnTrailer) + d.prefixChanged = d.d.Bitmap(len(schema.ColumnTypes) + dataBlockColumnPrefixChanged) + d.values = d.d.RawBytes(len(schema.ColumnTypes) + dataBlockColumnValue) + d.isValueExternal = d.d.Bitmap(len(schema.ColumnTypes) + dataBlockColumnIsValueExternal) + d.isObsolete = d.d.Bitmap(len(schema.ColumnTypes) + dataBlockColumnIsObsolete) + d.maximumKeyLength = binary.LittleEndian.Uint32(data[schema.HeaderSize:]) +} + +// Describe descirbes the binary format of the data block, assuming f.Offset() +// is positioned at the beginning of the same data block described by d. +func (d *DataBlockDecoder) Describe(f *binfmt.Formatter, tp treeprinter.Node) { + // Set the relative offset. When loaded into memory, the beginning of blocks + // are aligned. Padding that ensures alignment is done relative to the + // current offset. Setting the relative offset ensures that if we're + // describing this block within a larger structure (eg, f.Offset()>0), we + // compute padding appropriately assuming the current byte f.Offset() is + // aligned. 
+ f.SetAnchorOffset() + + n := tp.Child("data block header") + if keySchemaHeaderSize := int(d.d.customHeaderSize - 4); keySchemaHeaderSize > 0 { + f.HexBytesln(keySchemaHeaderSize, "key schema header") + } + f.HexBytesln(4, "maximum key length: %d", d.maximumKeyLength) + d.d.HeaderToBinFormatter(f, n) + for i := 0; i < int(d.d.header.Columns); i++ { + d.d.ColumnToBinFormatter(f, n, i, int(d.d.header.Rows)) + } + f.HexBytesln(1, "block padding byte") + f.ToTreePrinter(n) +} + +// A DataBlockValidator validates invariants that should hold across all data +// blocks. It may be used multiple times and will reuse allocations across +// Validate invocations when possible. +type DataBlockValidator struct { + dec DataBlockDecoder + keySeekerMeta KeySeekerMetadata + curKeyIter PrefixBytesIter + prevUserKeyBuf []byte +} + +// Validate validates the provided block. It returns an error if the block is +// invalid. +func (v *DataBlockValidator) Validate( + data []byte, comparer *base.Comparer, keySchema *KeySchema, +) error { + v.dec.Init(keySchema, data) + n := v.dec.d.header.Rows + keySchema.InitKeySeekerMetadata(&v.keySeekerMeta, &v.dec) + keySeeker := keySchema.KeySeeker(&v.keySeekerMeta) + + if cap(v.prevUserKeyBuf) < int(v.dec.maximumKeyLength)+1 { + v.prevUserKeyBuf = make([]byte, 0, v.dec.maximumKeyLength+1) + } + prevKey := base.InternalKey{UserKey: v.prevUserKeyBuf[:0]} + v.curKeyIter.Init(int(v.dec.maximumKeyLength), nil) + + for i := 0; i < int(n); i++ { + k := base.InternalKey{ + UserKey: keySeeker.MaterializeUserKey(&v.curKeyIter, i-1, i), + Trailer: base.InternalKeyTrailer(v.dec.trailers.At(i)), + } + // Ensure the keys are ordered. + ucmp := comparer.Compare(k.UserKey, prevKey.UserKey) + if ucmp < 0 || (ucmp == 0 && k.Trailer >= prevKey.Trailer) { + return errors.AssertionFailedf("key %s (row %d) and key %s (row %d) are out of order", + prevKey, i-1, k, i) + } + // Ensure the obsolete bit is set if the key is definitively obsolete. 
+ // Not all sources of obsolescence are evident with only a data block + // available (range deletions or point keys in previous blocks may cause + // a key to be obsolete). + if ucmp == 0 && prevKey.Kind() != base.InternalKeyKindMerge && !v.dec.isObsolete.At(i) { + return errors.AssertionFailedf("key %s (row %d) is shadowed by previous key %s but is not marked as obsolete", + k, i, prevKey) + } + // Ensure that the prefix-changed bit is set correctly. + if i > 0 { + currPrefix := comparer.Split.Prefix(k.UserKey) + prevPrefix := comparer.Split.Prefix(prevKey.UserKey) + prefixChanged := !bytes.Equal(prevPrefix, currPrefix) + if prefixChanged != v.dec.prefixChanged.At(i) { + return errors.AssertionFailedf("prefix changed bit for key %q (row %d) is %t, expected %t [prev key was %q]", + k.UserKey, i, v.dec.prefixChanged.At(i), prefixChanged, prevKey.UserKey) + } + } + + prevKey.CopyFrom(k) + } + return nil +} + +// Assert that *DataBlockIter implements block.DataBlockIterator. +var _ block.DataBlockIterator = (*DataBlockIter)(nil) + +// DataBlockIter iterates over a columnar data block. +type DataBlockIter struct { + // -- Fields that are initialized once -- + // For any changes to these fields, InitOnce should be updated. + + // keySchema configures the DataBlockIterConfig to use the provided + // KeySchema when initializing the DataBlockIter for iteration over a new + // block. + keySchema *KeySchema + suffixCmp base.ComparePointSuffixes + split base.Split + // getLazyValuer configures the DataBlockIterConfig to initialize the + // DataBlockIter to use the provided handler for retrieving lazy values. + getLazyValuer block.GetInternalValueForPrefixAndValueHandler + + // -- Fields that are initialized for each block -- + // For any changes to these fields, InitHandle should be updated. 
+ + d *DataBlockDecoder + h block.BufferHandle + maxRow int + transforms block.IterTransforms + noTransforms bool + keySeeker KeySeeker + + // -- State -- + // For any changes to these fields, InitHandle (which resets them) should be + // updated. + + keyIter PrefixBytesIter + row int + kv base.InternalKV + kvRow int // the row currently held in kv + + // nextObsoletePoint is the row index of the first obsolete point after i.row. + // It is used to optimize skipping of obsolete points during forward + // iteration. + nextObsoletePoint int +} + +// InitOnce configures the data block iterator's key schema and lazy value +// handler. The iterator must be initialized with a block before it can be used. +// It may be reinitialized with new blocks without calling InitOnce again. +func (i *DataBlockIter) InitOnce( + keySchema *KeySchema, + comparer *base.Comparer, + getLazyValuer block.GetInternalValueForPrefixAndValueHandler, +) { + i.keySchema = keySchema + i.suffixCmp = comparer.ComparePointSuffixes + i.split = comparer.Split + i.getLazyValuer = getLazyValuer +} + +// Init initializes the data block iterator, configuring it to read from the +// provided decoder. +func (i *DataBlockIter) Init(d *DataBlockDecoder, transforms block.IterTransforms) error { + i.d = d + // Leave i.h unchanged. + numRows := int(d.d.header.Rows) + i.maxRow = numRows - 1 + i.transforms = transforms + if i.transforms.HideObsoletePoints && d.isObsolete.SeekSetBitGE(0) == numRows { + // There are no obsolete points in the block; don't bother checking. + i.transforms.HideObsoletePoints = false + } + i.noTransforms = i.transforms.NoTransforms() + + // TODO(radu): see if this allocation can be a problem for the suffix rewriter. + meta := &KeySeekerMetadata{} + i.keySchema.InitKeySeekerMetadata(meta, d) + i.keySeeker = i.keySchema.KeySeeker(meta) + + // The worst case is when the largest key in the block has no suffix. 
+ maxKeyLength := int(i.transforms.SyntheticPrefixAndSuffix.PrefixLen() + d.maximumKeyLength + i.transforms.SyntheticPrefixAndSuffix.SuffixLen()) + i.keyIter.Init(maxKeyLength, i.transforms.SyntheticPrefix()) + i.row = -1 + i.kv = base.InternalKV{} + i.kvRow = math.MinInt + i.nextObsoletePoint = 0 + return nil +} + +// InitHandle initializes the block from the provided buffer handle. InitHandle +// assumes that the block's metadata was initialized using +// InitDataBlockMetadata(). +func (i *DataBlockIter) InitHandle( + comparer *base.Comparer, h block.BufferHandle, transforms block.IterTransforms, +) error { + i.suffixCmp = comparer.ComparePointSuffixes + i.split = comparer.Split + blockMeta := h.BlockMetadata() + i.d = (*DataBlockDecoder)(unsafe.Pointer(blockMeta)) + keySeekerMeta := (*KeySeekerMetadata)(blockMeta[unsafe.Sizeof(DataBlockDecoder{}):]) + i.h.Release() + i.h = h + + numRows := int(i.d.d.header.Rows) + i.maxRow = numRows - 1 + + i.transforms = transforms + if i.transforms.HideObsoletePoints && i.d.isObsolete.SeekSetBitGE(0) == numRows { + // There are no obsolete points in the block; don't bother checking. + i.transforms.HideObsoletePoints = false + } + i.noTransforms = i.transforms.NoTransforms() + + // The worst case is when the largest key in the block has no suffix. + maxKeyLength := int(i.transforms.SyntheticPrefixAndSuffix.PrefixLen() + i.d.maximumKeyLength + i.transforms.SyntheticPrefixAndSuffix.SuffixLen()) + i.keyIter.Init(maxKeyLength, i.transforms.SyntheticPrefix()) + i.row = -1 + i.kv = base.InternalKV{} + i.kvRow = math.MinInt + i.nextObsoletePoint = 0 + i.keySeeker = i.keySchema.KeySeeker(keySeekerMeta) + return nil +} + +// Handle returns the handle to the block. +func (i *DataBlockIter) Handle() block.BufferHandle { + return i.h +} + +// Valid returns true if the iterator is currently positioned at a valid KV. 
+func (i *DataBlockIter) Valid() bool {
+	// A valid position is a row within [0, maxRow] on a non-invalidated block.
+	return i.row >= 0 && i.row <= i.maxRow && !i.IsDataInvalidated()
+}
+
+// KV returns the key-value pair at the current iterator position. The
+// iterator must be positioned over a valid KV.
+func (i *DataBlockIter) KV() *base.InternalKV {
+	return &i.kv
+}
+
+// Invalidate invalidates the block iterator, removing references to the block
+// it was initialized with. The iterator may continue to be used after
+// a call to Invalidate, but all positioning methods should return false.
+// Valid() must also return false.
+func (i *DataBlockIter) Invalidate() {
+	// Clearing the decoder pointer is the invalidation sentinel; every
+	// positioning method checks i.d == nil before doing any work.
+	i.d = nil
+}
+
+// IsDataInvalidated returns true when the iterator has been invalidated
+// using an Invalidate call.
+func (i *DataBlockIter) IsDataInvalidated() bool {
+	return i.d == nil
+}
+
+// IsLowerBound implements the block.DataBlockIterator interface.
+func (i *DataBlockIter) IsLowerBound(k []byte) bool {
+	if i.transforms.HasSyntheticPrefix() {
+		// Strip the synthetic prefix from k before consulting the key seeker;
+		// if k's prefix orders differently from the synthetic prefix, the
+		// answer is known without examining the block's keys.
+		var keyPrefix []byte
+		keyPrefix, k = splitKey(k, len(i.transforms.SyntheticPrefix()))
+		if cmp := bytes.Compare(keyPrefix, i.transforms.SyntheticPrefix()); cmp != 0 {
+			return cmp < 0
+		}
+	}
+	// If we are hiding obsolete points, it is possible that all points < k are
+	// hidden.
+	// Note: we ignore HideObsoletePoints, but false negatives are allowed.
+	return i.keySeeker.IsLowerBound(k, i.transforms.SyntheticSuffix())
+}
+
+// splitKey splits a key into k[:at] and k[at:]. If len(k) <= at, the entire
+// key is returned as before and after is nil.
+func splitKey(k []byte, at int) (before, after []byte) {
+	if len(k) <= at {
+		return k, nil
+	}
+	return k[:at], k[at:]
+}
+
+// seekGEInternal is a wrapper around keySeeker.SeekGE which takes into account
+// the synthetic prefix and suffix.
+func (i *DataBlockIter) seekGEInternal(key []byte, boundRow int, searchDir int8) (row int) { + if i.transforms.HasSyntheticPrefix() { + var keyPrefix []byte + keyPrefix, key = splitKey(key, len(i.transforms.SyntheticPrefix())) + if cmp := bytes.Compare(keyPrefix, i.transforms.SyntheticPrefix()); cmp != 0 { + if cmp < 0 { + return 0 + } + return i.maxRow + 1 + } + } + if i.transforms.HasSyntheticSuffix() { + n := i.split(key) + row, eq := i.keySeeker.SeekGE(key[:n], boundRow, searchDir) + if eq && i.suffixCmp(key[n:], i.transforms.SyntheticSuffix()) > 0 { + row = i.d.prefixChanged.SeekSetBitGE(row + 1) + } + return row + } + row, _ = i.keySeeker.SeekGE(key, boundRow, searchDir) + return row +} + +// SeekGE implements the base.InternalIterator interface. +func (i *DataBlockIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV { + if i.d == nil { + return nil + } + searchDir := int8(0) + if flags.TrySeekUsingNext() { + searchDir = +1 + } + if i.noTransforms { + // Fast path. + i.row, _ = i.keySeeker.SeekGE(key, i.row, searchDir) + return i.decodeRow() + } + i.row = i.seekGEInternal(key, i.row, searchDir) + if i.transforms.HideObsoletePoints { + i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(i.row) + if i.atObsoletePointForward() { + i.skipObsoletePointsForward() + if i.row > i.maxRow { + return nil + } + } + } + return i.decodeRow() +} + +// SeekPrefixGE implements the base.InternalIterator interface. +func (i *DataBlockIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV { + // This should never be called as prefix iteration is handled by + // sstable.Iterator. + + // TODO(jackson): We can implement this and avoid propagating keys without + // the prefix up to the merging iterator. It will avoid unnecessary key + // comparisons fixing up the merging iterator heap. We can also short + // circuit the search if the prefix isn't found within the prefix column. 
+ // There's some subtlety around ensuring we continue to benefit from the + // TrySeekUsingNext optimization. + panic("pebble: SeekPrefixGE unimplemented") +} + +// SeekLT implements the base.InternalIterator interface. +func (i *DataBlockIter) SeekLT(key []byte, _ base.SeekLTFlags) *base.InternalKV { + if i.d == nil { + return nil + } + i.row = i.seekGEInternal(key, i.row, 0 /* searchDir */) - 1 + if i.transforms.HideObsoletePoints { + i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(max(i.row, 0)) + if i.atObsoletePointBackward() { + i.skipObsoletePointsBackward() + if i.row < 0 { + return nil + } + } + } + return i.decodeRow() +} + +// First implements the base.InternalIterator interface. +func (i *DataBlockIter) First() *base.InternalKV { + if i.d == nil { + return nil + } + i.row = 0 + if i.transforms.HideObsoletePoints { + i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(0) + if i.atObsoletePointForward() { + i.skipObsoletePointsForward() + if i.row > i.maxRow { + return nil + } + } + } + return i.decodeRow() +} + +// Last implements the base.InternalIterator interface. +func (i *DataBlockIter) Last() *base.InternalKV { + if i.d == nil { + return nil + } + i.row = i.maxRow + if i.transforms.HideObsoletePoints { + i.nextObsoletePoint = i.maxRow + 1 + if i.atObsoletePointBackward() { + i.skipObsoletePointsBackward() + if i.row < 0 { + return nil + } + } + } + return i.decodeRow() +} + +// Next advances to the next KV pair in the block. +func (i *DataBlockIter) Next() *base.InternalKV { + if i.d == nil { + return nil + } + // Inline decodeRow, but avoiding unnecessary checks against i.row. + if i.row >= i.maxRow { + i.row = i.maxRow + 1 + return nil + } + i.row++ + // Inline decodeKey(), adding obsolete points logic. + if i.noTransforms { + // Fast path. 
+ i.kv.K = base.InternalKey{ + UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row), + Trailer: base.InternalKeyTrailer(i.d.trailers.At(i.row)), + } + } else { + if i.transforms.HideObsoletePoints && i.atObsoletePointForward() { + i.skipObsoletePointsForward() + if i.row > i.maxRow { + return nil + } + } + if i.transforms.HasSyntheticSuffix() { + i.kv.K.UserKey = i.keySeeker.MaterializeUserKeyWithSyntheticSuffix( + &i.keyIter, i.transforms.SyntheticSuffix(), i.kvRow, i.row, + ) + } else { + i.kv.K.UserKey = i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row) + } + i.kv.K.Trailer = base.InternalKeyTrailer(i.d.trailers.At(i.row)) + if n := i.transforms.SyntheticSeqNum; n != 0 { + i.kv.K.SetSeqNum(base.SeqNum(n)) + } + } + invariants.CheckBounds(i.row, i.d.values.slices) + // Inline i.d.values.At(row). + v := i.d.values.Slice(i.d.values.offsets.At2(i.row)) + if i.d.isValueExternal.At(i.row) { + i.kv.V = i.getLazyValuer.GetInternalValueForPrefixAndValueHandle(v) + } else { + i.kv.V = base.MakeInPlaceValue(v) + } + i.kvRow = i.row + return &i.kv +} + +// NextPrefix moves the iterator to the next row with a different prefix than +// the key at the current iterator position. +// +// The columnar block implementation uses a newPrefix bitmap to identify the +// next row with a differing prefix from the current row's key. If newPrefix[i] +// is set then row's i key prefix is different that row i-1. The bitmap is +// organized as a slice of 64-bit words. If a 64-bit word in the bitmap is zero +// then all of the rows corresponding to the bits in that word have the same +// prefix and we can skip ahead. If a row is non-zero a small bit of bit +// shifting and masking combined with bits.TrailingZeros64 can identify the +// next bit that is set after the current row. The bitmap uses 1 bit/row (0.125 +// bytes/row). A 32KB block containing 1300 rows (25 bytes/row) would need a +// bitmap of 21 64-bit words. 
Even in the worst case where every word is 0 this +// bitmap can be scanned in ~20 ns (1 ns/word) leading to a total NextPrefix +// time of ~30 ns if a row is found and decodeRow are called. In more normal +// cases, NextPrefix takes ~15% longer that a single Next call. +// +// For comparison, the rowblk nextPrefixV3 optimizations work by setting a bit +// in the value prefix byte that indicates that the current key has the same +// prefix as the previous key. Additionally, a bit is stolen from the restart +// table entries indicating whether a restart table entry has the same key +// prefix as the previous entry. Checking the value prefix byte bit requires +// locating that byte which requires decoding 3 varints per key/value pair. +func (i *DataBlockIter) NextPrefix(_ []byte) *base.InternalKV { + if i.d == nil { + return nil + } + i.row = i.d.prefixChanged.SeekSetBitGE(i.row + 1) + if i.transforms.HideObsoletePoints { + i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(i.row) + if i.atObsoletePointForward() { + i.skipObsoletePointsForward() + } + } + + return i.decodeRow() +} + +// Prev moves the iterator to the previous KV pair in the block. +func (i *DataBlockIter) Prev() *base.InternalKV { + if i.d == nil { + return nil + } + i.row-- + if i.transforms.HideObsoletePoints && i.atObsoletePointBackward() { + i.skipObsoletePointsBackward() + if i.row < 0 { + return nil + } + } + return i.decodeRow() +} + +// atObsoletePointForward returns true if i.row is an obsolete point. It is +// separate from skipObsoletePointsForward() because that method does not +// inline. It can only be used during forward iteration (i.e. i.row was +// incremented). 
+// +//gcassert:inline +func (i *DataBlockIter) atObsoletePointForward() bool { + if invariants.Enabled && i.row > i.nextObsoletePoint { + panic("invalid nextObsoletePoint") + } + return i.row == i.nextObsoletePoint && i.row <= i.maxRow +} + +func (i *DataBlockIter) skipObsoletePointsForward() { + if invariants.Enabled { + i.atObsoletePointCheck() + } + i.row = i.d.isObsolete.SeekUnsetBitGE(i.row) + i.nextObsoletePoint = i.d.isObsolete.SeekSetBitGE(i.row) +} + +// atObsoletePointBackward returns true if i.row is an obsolete point. It is +// separate from skipObsoletePointsBackward() because that method does not +// inline. It can only be used during reverse iteration (i.e. i.row was +// decremented). +// +//gcassert:inline +func (i *DataBlockIter) atObsoletePointBackward() bool { + return i.row >= 0 && i.d.isObsolete.At(i.row) +} + +func (i *DataBlockIter) skipObsoletePointsBackward() { + if invariants.Enabled { + i.atObsoletePointCheck() + } + i.row = i.d.isObsolete.SeekUnsetBitLE(i.row) + i.nextObsoletePoint = i.row + 1 +} + +func (i *DataBlockIter) atObsoletePointCheck() { + // We extract this code into a separate function to avoid getting a spurious + // error from GCAssert about At not being inlined because it is compiled out + // altogether in non-invariant builds. + if !i.transforms.HideObsoletePoints || !i.d.isObsolete.At(i.row) { + panic("expected obsolete point") + } +} + +// Error implements the base.InternalIterator interface. A DataBlockIter is +// infallible and always returns a nil error. +func (i *DataBlockIter) Error() error { + return nil // infallible +} + +// SetBounds implements the base.InternalIterator interface. +func (i *DataBlockIter) SetBounds(lower, upper []byte) { + // This should never be called as bounds are handled by sstable.Iterator. + panic("pebble: SetBounds unimplemented") +} + +// SetContext implements the base.InternalIterator interface. 
+func (i *DataBlockIter) SetContext(_ context.Context) {} + +var dataBlockTypeString string = fmt.Sprintf("%T", (*DataBlockIter)(nil)) + +// String implements the base.InternalIterator interface. +func (i *DataBlockIter) String() string { + return dataBlockTypeString +} + +// DebugTree is part of the InternalIterator interface. +func (i *DataBlockIter) DebugTree(tp treeprinter.Node) { + tp.Childf("%T(%p)", i, i) +} + +// decodeRow decodes i.row into i.kv. If i.row is invalid, it returns nil. +func (i *DataBlockIter) decodeRow() *base.InternalKV { + switch { + case i.row < 0 || i.row > i.maxRow: + return nil + case i.kvRow == i.row: + // Already synthesized the kv at row. + return &i.kv + default: + // Inline decodeKey(). + if i.noTransforms { + // Fast path. + i.kv.K = base.InternalKey{ + UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row), + Trailer: base.InternalKeyTrailer(i.d.trailers.At(i.row)), + } + } else { + if i.transforms.HasSyntheticSuffix() { + i.kv.K.UserKey = i.keySeeker.MaterializeUserKeyWithSyntheticSuffix( + &i.keyIter, i.transforms.SyntheticSuffix(), i.kvRow, i.row, + ) + } else { + i.kv.K.UserKey = i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row) + } + i.kv.K.Trailer = base.InternalKeyTrailer(i.d.trailers.At(i.row)) + if n := i.transforms.SyntheticSeqNum; n != 0 { + i.kv.K.SetSeqNum(base.SeqNum(n)) + } + } + invariants.CheckBounds(i.row, i.d.values.slices) + // Inline i.d.values.At(row). + v := i.d.values.Slice(i.d.values.offsets.At2(i.row)) + invariants.CheckBounds(i.row, i.d.values.slices) + if i.d.isValueExternal.At(i.row) { + i.kv.V = i.getLazyValuer.GetInternalValueForPrefixAndValueHandle(v) + } else { + i.kv.V = base.MakeInPlaceValue(v) + } + i.kvRow = i.row + return &i.kv + } +} + +// decodeKey updates i.kv.K to the key for i.row (which must be valid). +// This function does not inline, so we copy its code verbatim. For any updates +// to this code, all code preceded by "Inline decodeKey" must be updated. 
+func (i *DataBlockIter) decodeKey() { + if i.noTransforms { + // Fast path. + i.kv.K = base.InternalKey{ + UserKey: i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row), + Trailer: base.InternalKeyTrailer(i.d.trailers.At(i.row)), + } + } else { + if i.transforms.HasSyntheticSuffix() { + i.kv.K.UserKey = i.keySeeker.MaterializeUserKeyWithSyntheticSuffix( + &i.keyIter, i.transforms.SyntheticSuffix(), i.kvRow, i.row, + ) + } else { + i.kv.K.UserKey = i.keySeeker.MaterializeUserKey(&i.keyIter, i.kvRow, i.row) + } + i.kv.K.Trailer = base.InternalKeyTrailer(i.d.trailers.At(i.row)) + if n := i.transforms.SyntheticSeqNum; n != 0 { + i.kv.K.SetSeqNum(base.SeqNum(n)) + } + } +} + +var _ = (*DataBlockIter).decodeKey + +// Close implements the base.InternalIterator interface. +func (i *DataBlockIter) Close() error { + i.keySeeker = nil + i.d = nil + i.h.Release() + i.h = block.BufferHandle{} + i.transforms = block.IterTransforms{} + i.kv = base.InternalKV{} + return nil +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/endian.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/endian.go new file mode 100644 index 0000000..2492040 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/endian.go @@ -0,0 +1,67 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package colblk + +import ( + "math/bits" + "unsafe" +) + +// ReverseBytes16 calls bits.ReverseBytes16 on each element of the input slice. +func ReverseBytes16(s []uint16) { + if len(s) >= 4 { + // We convert the slice (up to the tail) to a slice of [4]uint16. This helps + // the compiler elide bound checks. 
+ quads := unsafe.Slice((*[4]uint16)(unsafe.Pointer(unsafe.SliceData(s))), len(s)>>2) + for i := range quads { + quads[i][0] = bits.ReverseBytes16(quads[i][0]) //gcassert:bce + quads[i][1] = bits.ReverseBytes16(quads[i][1]) //gcassert:bce + quads[i][2] = bits.ReverseBytes16(quads[i][2]) //gcassert:bce + quads[i][3] = bits.ReverseBytes16(quads[i][3]) //gcassert:bce + } + } + tail := s[len(s)&^3:] + for i := range tail { + tail[i] = bits.ReverseBytes16(tail[i]) //gcassert:bce + } +} + +// ReverseBytes32 calls bits.ReverseBytes32 on each element of the input slice. +func ReverseBytes32(s []uint32) { + if len(s) >= 4 { + // We convert the slice (up to the tail) to a slice of [4]uint32. This helps + // the compiler elide bound checks. + quads := unsafe.Slice((*[4]uint32)(unsafe.Pointer(unsafe.SliceData(s))), len(s)>>2) + for i := range quads { + quads[i][0] = bits.ReverseBytes32(quads[i][0]) //gcassert:bce + quads[i][1] = bits.ReverseBytes32(quads[i][1]) //gcassert:bce + quads[i][2] = bits.ReverseBytes32(quads[i][2]) //gcassert:bce + quads[i][3] = bits.ReverseBytes32(quads[i][3]) //gcassert:bce + } + } + tail := s[len(s)&^3:] + for i := range tail { + tail[i] = bits.ReverseBytes32(tail[i]) //gcassert:bce + } +} + +// ReverseBytes64 calls bits.ReverseBytes64 on each element of the input slice. +func ReverseBytes64(s []uint64) { + if len(s) >= 4 { + // We convert the slice (up to the tail) to a slice of [4]uint64. This helps + // the compiler elide bound checks. 
+ quads := unsafe.Slice((*[4]uint64)(unsafe.Pointer(unsafe.SliceData(s))), len(s)>>2) + for i := range quads { + quads[i][0] = bits.ReverseBytes64(quads[i][0]) //gcassert:bce + quads[i][1] = bits.ReverseBytes64(quads[i][1]) //gcassert:bce + quads[i][2] = bits.ReverseBytes64(quads[i][2]) //gcassert:bce + quads[i][3] = bits.ReverseBytes64(quads[i][3]) //gcassert:bce + } + } + tail := s[len(s)&^3:] + for i := range tail { + tail[i] = bits.ReverseBytes64(tail[i]) //gcassert:bce + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/endian_big.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/endian_big.go new file mode 100644 index 0000000..e70033f --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/endian_big.go @@ -0,0 +1,89 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// NB: this list of tags is taken from encoding/binary/native_endian_big.go +//go:build armbe || arm64be || m68k || mips || mips64 || mips64p32 || ppc || ppc64 || s390 || s390x || shbe || sparc || sparc64 + +package colblk + +import ( + "math/bits" + "unsafe" +) + +// BigEndian is true if the target platform is big endian. +const BigEndian = true + +//gcassert:inline +func (s unsafeUint64Decoder) At(idx int) uint64 { + return bits.ReverseBytes64(*(*uint64)(unsafe.Add(s.ptr, uintptr(idx)<> 16 + } + if s.width <= 1 { + if s.width == 0 { + return 0, 0 + } + v := *(*uint16)(unsafe.Add(s.ptr, uintptr(i))) + // No need to ReverseBytes16, we can just return in the correct order. 
+ return uint32(v >> 8), uint32(v & 0xFF) + } + v := *(*uint64)(unsafe.Add(s.ptr, uintptr(i)<> 32) +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/endian_little.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/endian_little.go new file mode 100644 index 0000000..f536651 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/endian_little.go @@ -0,0 +1,81 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +// NB: this list of tags is taken from encoding/binary/native_endian_little.go +//go:build 386 || amd64 || amd64p32 || alpha || arm || arm64 || loong64 || mipsle || mips64le || mips64p32le || nios2 || ppc64le || riscv || riscv64 || sh || wasm + +package colblk + +import "unsafe" + +// BigEndian is true if the target platform is big endian. +const BigEndian = false + +//gcassert:inline +func (s unsafeUint64Decoder) At(idx int) uint64 { + return *(*uint64)(unsafe.Add(s.ptr, uintptr(idx)<> 16 + } + if s.width <= 1 { + if s.width == 0 { + return 0, 0 + } + v := *(*uint16)(unsafe.Add(s.ptr, uintptr(i))) + return uint32(v & 0xFF), uint32(v >> 8) + } + v := *(*uint64)(unsafe.Add(s.ptr, uintptr(i)<> 32) +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/index_block.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/index_block.go new file mode 100644 index 0000000..8de583e --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/index_block.go @@ -0,0 +1,391 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package colblk + +import ( + "slices" + "unsafe" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/binfmt" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" + "github.com/cockroachdb/pebble/v2/sstable/block" +) + +const indexBlockCustomHeaderSize = 0 + +// IndexBlockWriter writes columnar index blocks. The writer is used for both +// first-level and second-level index blocks. The index block schema consists of +// three primary columns: +// - Separators: a user key that is ≥ the largest user key in the +// corresponding entry, and ≤ the smallest user key in the next entry. +// Note that this allows consecutive separators to be equal. This is +// possible when snapshots required we preserve duplicate user keys at +// different sequence numbers. +// - Offsets: the offset of the end of the corresponding block. +// - Lengths: the length in bytes of the corresponding block. +// - Block properties: a slice encoding arbitrary user-defined block +// properties. +// +// TODO(jackson): Consider splitting separators into prefixes and suffixes (even +// without user-defined columns). This would allow us to use prefix compression +// for the prefix. Separators should typically be suffixless unless two KVs with +// the same prefix straddle a block boundary. We would need to use a buffer to +// materialize the separator key when we need to use it outside the context of +// seeking within the block. +type IndexBlockWriter struct { + separators RawBytesBuilder + offsets UintBuilder + lengths UintBuilder + blockProperties RawBytesBuilder + rows int + enc BlockEncoder +} + +const ( + indexBlockColumnSeparator = iota + indexBlockColumnOffsets + indexBlockColumnLengths + indexBlockColumnBlockProperties + indexBlockColumnCount +) + +// Init initializes the index block writer. 
+func (w *IndexBlockWriter) Init() { + w.separators.Init() + w.offsets.Init() + w.lengths.Init() + w.blockProperties.Init() + w.rows = 0 +} + +// Reset resets the index block writer to its initial state, retaining buffers. +func (w *IndexBlockWriter) Reset() { + w.separators.Reset() + w.offsets.Reset() + w.lengths.Reset() + w.blockProperties.Reset() + w.rows = 0 + w.enc.Reset() +} + +// Rows returns the number of entries in the index block so far. +func (w *IndexBlockWriter) Rows() int { + return w.rows +} + +// AddBlockHandle adds a new separator and end offset of a data block to the +// index block. Add returns the index of the row. +// +// AddBlockHandle should only be used for first-level index blocks. +func (w *IndexBlockWriter) AddBlockHandle( + separator []byte, handle block.Handle, blockProperties []byte, +) int { + idx := w.rows + w.separators.Put(separator) + w.offsets.Set(w.rows, handle.Offset) + w.lengths.Set(w.rows, handle.Length) + w.blockProperties.Put(blockProperties) + w.rows++ + return idx +} + +// UnsafeSeparator returns the separator of the i'th entry. +func (w *IndexBlockWriter) UnsafeSeparator(i int) []byte { + return w.separators.UnsafeGet(i) +} + +// Size returns the size of the pending index block. +func (w *IndexBlockWriter) Size() int { + return w.size(w.rows) +} + +func (w *IndexBlockWriter) size(rows int) int { + off := HeaderSize(indexBlockColumnCount, indexBlockCustomHeaderSize) + off = w.separators.Size(rows, off) + off = w.offsets.Size(rows, off) + off = w.lengths.Size(rows, off) + off = w.blockProperties.Size(rows, off) + off++ + return int(off) +} + +// Finish serializes the pending index block, including the first [rows] rows. +// The value of [rows] must be Rows() or Rows()-1. 
+func (w *IndexBlockWriter) Finish(rows int) []byte { + if invariants.Enabled && rows != w.rows && rows != w.rows-1 { + panic(errors.AssertionFailedf("index block has %d rows; asked to finish %d", w.rows, rows)) + } + + w.enc.Init(w.size(rows), Header{ + Version: Version1, + Columns: indexBlockColumnCount, + Rows: uint32(rows), + }, indexBlockCustomHeaderSize) + w.enc.Encode(rows, &w.separators) + w.enc.Encode(rows, &w.offsets) + w.enc.Encode(rows, &w.lengths) + w.enc.Encode(rows, &w.blockProperties) + return w.enc.Finish() +} + +// An IndexBlockDecoder reads columnar index blocks. +type IndexBlockDecoder struct { + separators RawBytes + offsets UnsafeUints + lengths UnsafeUints // only used for second-level index blocks + blockProps RawBytes + bd BlockDecoder +} + +// Init initializes the index block decoder with the given serialized index +// block. +func (r *IndexBlockDecoder) Init(data []byte) { + r.bd.Init(data, indexBlockCustomHeaderSize) + r.separators = r.bd.RawBytes(indexBlockColumnSeparator) + r.offsets = r.bd.Uints(indexBlockColumnOffsets) + r.lengths = r.bd.Uints(indexBlockColumnLengths) + r.blockProps = r.bd.RawBytes(indexBlockColumnBlockProperties) +} + +// DebugString prints a human-readable explanation of the keyspan block's binary +// representation. +func (r *IndexBlockDecoder) DebugString() string { + f := binfmt.New(r.bd.data).LineWidth(20) + tp := treeprinter.New() + r.Describe(f, tp.Child("index-block-decoder")) + return tp.String() +} + +// Describe describes the binary format of the index block, assuming f.Offset() +// is positioned at the beginning of the same index block described by r. +func (r *IndexBlockDecoder) Describe(f *binfmt.Formatter, tp treeprinter.Node) { + // Set the relative offset. When loaded into memory, the beginning of blocks + // are aligned. Padding that ensures alignment is done relative to the + // current offset. 
Setting the relative offset ensures that if we're + // describing this block within a larger structure (eg, f.Offset()>0), we + // compute padding appropriately assuming the current byte f.Offset() is + // aligned. + f.SetAnchorOffset() + + n := tp.Child("index block header") + r.bd.HeaderToBinFormatter(f, n) + for i := 0; i < indexBlockColumnCount; i++ { + r.bd.ColumnToBinFormatter(f, n, i, int(r.bd.header.Rows)) + } + f.HexBytesln(1, "block padding byte") + f.ToTreePrinter(n) +} + +// IndexIter is an iterator over the block entries in an index block. +type IndexIter struct { + compare base.Compare + split base.Split + d *IndexBlockDecoder + n int + row int + + syntheticPrefixAndSuffix block.SyntheticPrefixAndSuffix + + h block.BufferHandle + // TODO(radu): remove allocDecoder and require any Init callers to provide the + // decoder. + allocDecoder IndexBlockDecoder + keyBuf []byte +} + +// Assert that IndexIter satisfies the block.IndexBlockIterator interface. +var _ block.IndexBlockIterator = (*IndexIter)(nil) + +// InitWithDecoder initializes an index iterator from the provided decoder. +func (i *IndexIter) InitWithDecoder( + comparer *base.Comparer, d *IndexBlockDecoder, transforms block.IterTransforms, +) { + i.compare = comparer.Compare + i.split = comparer.Split + i.d = d + i.n = int(d.bd.header.Rows) + i.row = -1 + i.syntheticPrefixAndSuffix = transforms.SyntheticPrefixAndSuffix + // Leave h, allocDecoder, keyBuf unchanged. +} + +// Init initializes an iterator from the provided block data slice. +func (i *IndexIter) Init( + comparer *base.Comparer, blk []byte, transforms block.IterTransforms, +) error { + i.h.Release() + i.h = block.BufferHandle{} + i.allocDecoder.Init(blk) + i.InitWithDecoder(comparer, &i.allocDecoder, transforms) + return nil +} + +// InitHandle initializes an iterator from the provided block handle. 
+func (i *IndexIter) InitHandle(
+	comparer *base.Comparer, blk block.BufferHandle, transforms block.IterTransforms,
+) error {
+	i.h.Release()
+	i.h = blk
+	d := (*IndexBlockDecoder)(unsafe.Pointer(blk.BlockMetadata()))
+	i.InitWithDecoder(comparer, d, transforms)
+	return nil
+}
+
+// RowIndex returns the index of the block entry at the iterator's current
+// position.
+func (i *IndexIter) RowIndex() int {
+	return i.row
+}
+
+// Valid returns true if the iterator is currently positioned at a valid block
+// handle.
+func (i *IndexIter) Valid() bool {
+	return 0 <= i.row && i.row < i.n
+}
+
+// Invalidate invalidates the block iterator, removing references to the block
+// it was initialized with.
+func (i *IndexIter) Invalidate() {
+	i.d = nil
+	i.n = 0
+}
+
+// IsDataInvalidated returns true when the iterator has been invalidated
+// using an Invalidate call. NB: this is different from Valid.
+func (i *IndexIter) IsDataInvalidated() bool {
+	return i.d == nil
+}
+
+// Handle returns the underlying block buffer handle, if the iterator was
+// initialized with one.
+func (i *IndexIter) Handle() block.BufferHandle {
+	return i.h
+}
+
+// Separator returns the separator at the iterator's current position. The
+// iterator must be positioned at a valid row.
+func (i *IndexIter) Separator() []byte {
+	key := i.d.separators.At(i.row)
+	if i.syntheticPrefixAndSuffix.IsUnset() {
+		return key
+	}
+	return i.applyTransforms(key)
+}
+
+// SeparatorLT returns true if the separator at the iterator's current
+// position is strictly less than the provided key.
+func (i *IndexIter) SeparatorLT(key []byte) bool {
+	return i.compare(i.Separator(), key) < 0
+}
+
+// SeparatorGT returns true if the separator at the iterator's current position
+// is strictly greater than (or equal, if inclusively=true) the provided key.
+func (i *IndexIter) SeparatorGT(key []byte, inclusively bool) bool { + cmp := i.compare(i.Separator(), key) + return cmp > 0 || (cmp == 0 && inclusively) +} + +func (i *IndexIter) applyTransforms(key []byte) []byte { + syntheticPrefix := i.syntheticPrefixAndSuffix.Prefix() + syntheticSuffix := i.syntheticPrefixAndSuffix.Suffix() + if syntheticSuffix.IsSet() { + key = key[:i.split(key)] + } + i.keyBuf = slices.Grow(i.keyBuf[:0], len(syntheticPrefix)+len(key)+len(syntheticSuffix)) + i.keyBuf = append(i.keyBuf, syntheticPrefix...) + i.keyBuf = append(i.keyBuf, key...) + i.keyBuf = append(i.keyBuf, syntheticSuffix...) + return i.keyBuf +} + +// BlockHandleWithProperties decodes the block handle with any encoded +// properties at the iterator's current position. +func (i *IndexIter) BlockHandleWithProperties() (block.HandleWithProperties, error) { + if invariants.Enabled && !i.Valid() { + panic(errors.AssertionFailedf("invalid row %d (n=%d)", i.row, i.n)) + } + return block.HandleWithProperties{ + Handle: block.Handle{ + Offset: i.d.offsets.At(i.row), + Length: i.d.lengths.At(i.row), + }, + Props: i.d.blockProps.At(i.row), + }, nil +} + +// SeekGE seeks the index iterator to the first block entry with a separator key +// greater or equal to the given key. It returns false if the seek key is +// greater than all index block separators. +func (i *IndexIter) SeekGE(key []byte) bool { + // Define f(-1) == false and f(upper) == true. + // Invariant: f(index-1) == false, f(upper) == true. + index, upper := 0, i.n + for index < upper { + h := int(uint(index+upper) >> 1) // avoid overflow when computing h + // index ≤ h < upper + + // TODO(jackson): Is Bytes.At or Bytes.Slice(Bytes.Offset(h), + // Bytes.Offset(h+1)) faster in this code? + separator := i.d.separators.At(h) + if !i.syntheticPrefixAndSuffix.IsUnset() { + // TODO(radu): compare without materializing the transformed key. 
+ separator = i.applyTransforms(separator) + } + // TODO(radu): experiment with splitting the separator prefix and suffix in + // separate columns and using bytes.Compare() on the prefix in the hot path. + c := i.compare(key, separator) + if c > 0 { + index = h + 1 // preserves f(index-1) == false + } else { + upper = h // preserves f(upper) == true + } + } + // index == upper, f(index-1) == false, and f(upper) (= f(index)) == true => answer is index. + i.row = index + return index < i.n +} + +// First seeks index iterator to the first block entry. It returns false if the +// index block is empty. +func (i *IndexIter) First() bool { + i.row = 0 + return i.n > 0 +} + +// Last seeks index iterator to the last block entry. It returns false if the +// index block is empty. +func (i *IndexIter) Last() bool { + i.row = i.n - 1 + return i.n > 0 +} + +// Next steps the index iterator to the next block entry. It returns false if +// the index block is exhausted in the forward direction. A call to Next while +// already exhausted in the forward direction is a no-op. +func (i *IndexIter) Next() bool { + i.row = min(i.n, i.row+1) + return i.row < i.n +} + +// Prev steps the index iterator to the previous block entry. It returns false +// if the index block is exhausted in the reverse direction. A call to Prev +// while already exhausted in the reverse direction is a no-op. +func (i *IndexIter) Prev() bool { + i.row = max(-1, i.row-1) + return i.row >= 0 && i.row < i.n +} + +// Close closes the iterator, releasing any resources it holds. 
+func (i *IndexIter) Close() error {
+	i.h.Release()
+	i.h = block.BufferHandle{}
+	i.d = nil
+	i.n = 0
+	i.syntheticPrefixAndSuffix = block.SyntheticPrefixAndSuffix{}
+	return nil
+}
diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/key_value_block.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/key_value_block.go
new file mode 100644
index 0000000..d68d3c3
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/key_value_block.go
@@ -0,0 +1,141 @@
+// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
+// of this source code is governed by a BSD-style license that can be found in
+// the LICENSE file.
+
+package colblk
+
+import (
+	"iter"
+
+	"github.com/cockroachdb/pebble/v2/internal/binfmt"
+	"github.com/cockroachdb/pebble/v2/internal/treeprinter"
+)
+
+const keyValueBlockCustomHeaderSize = 0
+
+// KeyValueBlockWriter writes key value blocks. The writer is used as a
+// drop-in replacement for the metaindex and properties blocks.
+// The key value block schema consists of two primary columns:
+//   - Key: represented by RawBytes
+//   - Value: represented by RawBytes
+type KeyValueBlockWriter struct {
+	keys   RawBytesBuilder
+	values RawBytesBuilder
+	rows   int
+	enc    BlockEncoder
+}
+
+const (
+	keyValueBlockColumnKey = iota
+	keyValueBlockColumnValue
+	keyValueBlockColumnCount
+)
+
+// Init initializes the key value block writer.
+func (w *KeyValueBlockWriter) Init() {
+	w.keys.Init()
+	w.values.Init()
+	w.rows = 0
+}
+
+// Rows returns the number of entries in the key value block so far.
+func (w *KeyValueBlockWriter) Rows() int {
+	return w.rows
+}
+
+// AddKV adds a new key and value of a block to the key value block.
+// Note: unlike IndexBlockWriter.AddBlockHandle, it does not return the row index.
+func (w *KeyValueBlockWriter) AddKV(key []byte, value []byte) { + w.keys.Put(key) + w.values.Put(value) + w.rows++ +} + +func (w *KeyValueBlockWriter) size(rows int) int { + off := HeaderSize(keyValueBlockColumnCount, keyValueBlockCustomHeaderSize) + off = w.keys.Size(rows, off) + off = w.values.Size(rows, off) + // Add a padding byte at the end to allow the block's end to be represented + // as a pointer to allocated memory. + off++ + return int(off) +} + +// Finish serializes the pending key value block. +func (w *KeyValueBlockWriter) Finish(rows int) []byte { + w.enc.Init(w.size(rows), Header{ + Version: Version1, + Columns: keyValueBlockColumnCount, + Rows: uint32(rows), + }, indexBlockCustomHeaderSize) + w.enc.Encode(rows, &w.keys) + w.enc.Encode(rows, &w.values) + return w.enc.Finish() +} + +// KeyValueBlockDecoder reads columnar key value blocks. +type KeyValueBlockDecoder struct { + keys RawBytes + values RawBytes + bd BlockDecoder +} + +// Init initializes the key value block decoder with the given serialized block. +func (r *KeyValueBlockDecoder) Init(data []byte) { + r.bd.Init(data, keyValueBlockCustomHeaderSize) + r.keys = r.bd.RawBytes(keyValueBlockColumnKey) + r.values = r.bd.RawBytes(keyValueBlockColumnValue) +} + +// DebugString prints a human-readable explanation of the block's binary +// representation. +func (r *KeyValueBlockDecoder) DebugString() string { + f := binfmt.New(r.bd.data).LineWidth(20) + tp := treeprinter.New() + r.Describe(f, tp.Child("key-value-block-decoder")) + return tp.String() +} + +// Describe describes the binary format of the key value block, assuming +// f.Offset() is positioned at the beginning of the same key value block +// described by r. +func (r *KeyValueBlockDecoder) Describe(f *binfmt.Formatter, tp treeprinter.Node) { + // Set the relative offset. When loaded into memory, the beginning of blocks + // are aligned. Padding that ensures alignment is done relative to the + // current offset. 
Setting the relative offset ensures that if we're + // describing this block within a larger structure (eg, f.Offset()>0), we + // compute padding appropriately assuming the current byte f.Offset() is + // aligned. + f.SetAnchorOffset() + + n := tp.Child("key value block header") + r.bd.HeaderToBinFormatter(f, n) + for i := 0; i < keyValueBlockColumnCount; i++ { + r.bd.ColumnToBinFormatter(f, n, i, int(r.bd.header.Rows)) + } + f.HexBytesln(1, "block padding byte") + f.ToTreePrinter(n) +} + +func (r *KeyValueBlockDecoder) BlockDecoder() *BlockDecoder { + return &r.bd +} + +func (r *KeyValueBlockDecoder) KeyAt(i int) []byte { + return r.keys.At(i) +} + +func (r *KeyValueBlockDecoder) ValueAt(i int) []byte { + return r.values.At(i) +} + +// All returns an iterator that ranges over all key-value pairs in the block. +func (r *KeyValueBlockDecoder) All() iter.Seq2[[]byte, []byte] { + return func(yield func([]byte, []byte) bool) { + for i := 0; i < r.BlockDecoder().Rows(); i++ { + if !yield(r.KeyAt(i), r.ValueAt(i)) { + return + } + } + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/keyspan.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/keyspan.go new file mode 100644 index 0000000..9b7c4d2 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/keyspan.go @@ -0,0 +1,612 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package colblk + +import ( + "bytes" + "context" + "encoding/binary" + "fmt" + "os" + "sync" + "unsafe" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/binfmt" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" + "github.com/cockroachdb/pebble/v2/sstable/block" +) + +// the keyspan header encodes a 32-bit count of the number of unique boundary +// user keys in the block. +const keyspanHeaderSize = 4 + +// keyspan block column indexes +const ( + // Columns with 1 row per unique boundary user key contained within the + // block (with the count indicated via the keyspan custom block header). + keyspanColBoundaryUserKeys = 0 + keyspanColBoundaryKeyIndices = 1 + // Columns with 1 row per keyspan.Key (with the count indicated via the + // columnar header's row count). + keyspanColTrailers = 2 + keyspanColSuffixes = 3 + keyspanColValues = 4 + keyspanColumnCount = 5 +) + +// A KeyspanBlockWriter writes keyspan blocks. See the colblk package +// documentation for more details on the schema. +type KeyspanBlockWriter struct { + equal base.Equal + + // boundary columns + boundaryUserKeys RawBytesBuilder + boundaryKeyIndexes UintBuilder + + // keyspan.Key columns + trailers UintBuilder + suffixes RawBytesBuilder + values RawBytesBuilder + + enc BlockEncoder + keyCount int + unsafeLastUserKey []byte +} + +// Init initializes a keyspan block writer. +func (w *KeyspanBlockWriter) Init(equal base.Equal) { + w.equal = equal + w.boundaryUserKeys.Init() + w.boundaryKeyIndexes.Init() + w.trailers.Init() + w.suffixes.Init() + w.values.Init() + w.keyCount = 0 + w.unsafeLastUserKey = nil +} + +// Reset resets the keyspan block writer to an empty state, retaining memory for +// reuse. 
+func (w *KeyspanBlockWriter) Reset() { + w.boundaryUserKeys.Reset() + w.boundaryKeyIndexes.Reset() + w.trailers.Reset() + w.suffixes.Reset() + w.values.Reset() + w.enc.Reset() + w.keyCount = 0 + w.unsafeLastUserKey = nil +} + +// AddSpan appends a new Span to the pending block. Spans must already be +// fragmented (non-overlapping) and added in sorted order. +func (w *KeyspanBlockWriter) AddSpan(s keyspan.Span) { + // When keyspans are fragmented, abutting spans share a user key. One span's + // end key is the next span's start key. Check if the previous user key + // equals this span's start key, and avoid encoding it again if so. + if w.unsafeLastUserKey == nil || !w.equal(w.unsafeLastUserKey, s.Start) { + w.boundaryKeyIndexes.Set(w.boundaryUserKeys.rows, uint64(w.keyCount)) + w.boundaryUserKeys.Put(s.Start) + } + // The end key must be strictly greater than the start key and spans are + // already sorted, so the end key is guaranteed to not be present in the + // column yet. We need to encode it. + w.boundaryKeyIndexes.Set(w.boundaryUserKeys.rows, uint64(w.keyCount+len(s.Keys))) + w.boundaryUserKeys.Put(s.End) + + // Hold on to a slice of the copy of s.End we just added to the bytes + // builder so that we can compare it to the next span's start key. + w.unsafeLastUserKey = w.boundaryUserKeys.data[len(w.boundaryUserKeys.data)-len(s.End):] + + // Encode each keyspan.Key in the span. + for i := range s.Keys { + w.trailers.Set(w.keyCount, uint64(s.Keys[i].Trailer)) + w.suffixes.Put(s.Keys[i].Suffix) + w.values.Put(s.Keys[i].Value) + w.keyCount++ + } +} + +// KeyCount returns the count of keyspan.Keys written to the writer. +func (w *KeyspanBlockWriter) KeyCount() int { + return w.keyCount +} + +// UnsafeBoundaryKeys returns the smallest and largest keys written to the +// keyspan block so far. The returned internal keys have user keys that point +// directly into the block writer's memory and must not be mutated. 
+func (w *KeyspanBlockWriter) UnsafeBoundaryKeys() (smallest, largest base.InternalKey) { + if w.keyCount == 0 { + return smallest, largest + } + smallest.UserKey = w.boundaryUserKeys.UnsafeGet(0) + smallest.Trailer = base.InternalKeyTrailer(w.trailers.Get(0)) + largest.UserKey = w.boundaryUserKeys.UnsafeGet(w.boundaryUserKeys.rows - 1) + largest.Trailer = base.MakeTrailer(base.SeqNumMax, + base.InternalKeyTrailer(w.trailers.Get(w.keyCount-1)).Kind()) + return smallest, largest +} + +// UnsafeLastSpan returns the start and end user keys of the last span written +// to the block and the trailer of its largest key. The returned keys point +// directly into the block writer's memory and must not be mutated. +func (w *KeyspanBlockWriter) UnsafeLastSpan() ( + start, end []byte, + largestTrailer base.InternalKeyTrailer, +) { + if w.keyCount == 0 { + return nil, nil, 0 + } + return w.boundaryUserKeys.UnsafeGet(w.boundaryUserKeys.rows - 2), + w.boundaryUserKeys.UnsafeGet(w.boundaryUserKeys.rows - 1), + base.InternalKeyTrailer(w.trailers.Get(w.keyCount - 1)) +} + +// Size returns the size of the pending block. +func (w *KeyspanBlockWriter) Size() int { + off := HeaderSize(keyspanColumnCount, keyspanHeaderSize) + // Span boundary columns (with userKeyCount elements). + off = w.boundaryUserKeys.Size(w.boundaryUserKeys.rows, off) + off = w.boundaryKeyIndexes.Size(w.boundaryUserKeys.rows, off) + + // keyspan.Key columns (with keyCount elements). + off = w.trailers.Size(w.keyCount, off) + off = w.suffixes.Size(w.keyCount, off) + off = w.values.Size(w.keyCount, off) + off++ // trailing padding + return int(off) +} + +// Finish finalizes the pending block and returns the encoded block. 
+func (w *KeyspanBlockWriter) Finish() []byte { + w.enc.Init(w.Size(), Header{ + Version: Version1, + Columns: keyspanColumnCount, + Rows: uint32(w.keyCount), + }, keyspanHeaderSize) + + // The keyspan block has a 4-byte custom header used to encode the number of + // user keys encoded within the user key and start indices columns. All + // other columns have the number of rows indicated by the shared columnar + // block header. + binary.LittleEndian.PutUint32(w.enc.Data()[:keyspanHeaderSize], uint32(w.boundaryUserKeys.rows)) + + // Columns with userKeyCount elements. + w.enc.Encode(w.boundaryUserKeys.rows, &w.boundaryUserKeys) + w.enc.Encode(w.boundaryUserKeys.rows, &w.boundaryKeyIndexes) + // Columns with keyCount elements. + w.enc.Encode(w.keyCount, &w.trailers) + w.enc.Encode(w.keyCount, &w.suffixes) + w.enc.Encode(w.keyCount, &w.values) + return w.enc.Finish() +} + +// String returns a string representation of the pending block's state. +func (w *KeyspanBlockWriter) String() string { + var buf bytes.Buffer + size := uint32(w.Size()) + fmt.Fprintf(&buf, "size=%d:\n", size) + + fmt.Fprint(&buf, "0: user keys: ") + w.boundaryUserKeys.WriteDebug(&buf, w.boundaryUserKeys.rows) + fmt.Fprintln(&buf) + fmt.Fprint(&buf, "1: start indices: ") + w.boundaryKeyIndexes.WriteDebug(&buf, w.boundaryUserKeys.rows) + fmt.Fprintln(&buf) + + fmt.Fprint(&buf, "2: trailers: ") + w.trailers.WriteDebug(&buf, w.keyCount) + fmt.Fprintln(&buf) + fmt.Fprint(&buf, "3: suffixes: ") + w.suffixes.WriteDebug(&buf, w.keyCount) + fmt.Fprintln(&buf) + fmt.Fprint(&buf, "4: values: ") + w.values.WriteDebug(&buf, w.keyCount) + fmt.Fprintln(&buf) + + return buf.String() +} + +// A KeyspanDecoder exposes facilities for decoding a keyspan block. A +// KeyspanDecoder is safe for concurrent use after initialization. +type KeyspanDecoder struct { + blockDecoder BlockDecoder + // Span boundary columns with boundaryKeysCount elements. 
+ boundaryKeysCount uint32 + boundaryKeys RawBytes + boundaryKeyIndices UnsafeUints + + // keyspan.Key columns with blockDecoder.header.Rows elements. + trailers UnsafeUints + suffixes RawBytes + values RawBytes +} + +// Init initializes the keyspan decoder with the given block data. +func (d *KeyspanDecoder) Init(data []byte) { + d.boundaryKeysCount = binary.LittleEndian.Uint32(data[:4]) + d.blockDecoder.Init(data, keyspanHeaderSize) + // The boundary key columns have a different number of rows than the other + // columns, so we call DecodeColumn directly, taking care to pass in + // rows=r.boundaryKeysCount. + d.boundaryKeys = DecodeColumn(&d.blockDecoder, keyspanColBoundaryUserKeys, + int(d.boundaryKeysCount), DataTypeBytes, DecodeRawBytes) + d.boundaryKeyIndices = DecodeColumn(&d.blockDecoder, keyspanColBoundaryKeyIndices, + int(d.boundaryKeysCount), DataTypeUint, DecodeUnsafeUints) + + d.trailers = d.blockDecoder.Uints(keyspanColTrailers) + d.suffixes = d.blockDecoder.RawBytes(keyspanColSuffixes) + d.values = d.blockDecoder.RawBytes(keyspanColValues) +} + +// DebugString prints a human-readable explanation of the keyspan block's binary +// representation. +func (d *KeyspanDecoder) DebugString() string { + f := binfmt.New(d.blockDecoder.data).LineWidth(20) + tp := treeprinter.New() + d.Describe(f, tp.Child("keyspan-decoder")) + return tp.String() +} + +// Describe describes the binary format of the keyspan block, assuming +// f.Offset() is positioned at the beginning of the same keyspan block described +// by r. +func (d *KeyspanDecoder) Describe(f *binfmt.Formatter, tp treeprinter.Node) { + // Set the relative offset. When loaded into memory, the beginning of blocks + // are aligned. Padding that ensures alignment is done relative to the + // current offset. 
Setting the relative offset ensures that if we're + // describing this block within a larger structure (eg, f.Offset()>0), we + // compute padding appropriately assuming the current byte f.Offset() is + // aligned. + f.SetAnchorOffset() + + n := tp.Child("keyspan block header") + f.HexBytesln(4, "user key count: %d", d.boundaryKeysCount) + f.ToTreePrinter(n) + d.blockDecoder.HeaderToBinFormatter(f, n) + + for i := 0; i < keyspanColumnCount; i++ { + // Not all columns in a keyspan block have the same number of rows; the + // boundary columns columns are different (and their lengths are held in + // the keyspan block header that precedes the ordinary columnar block + // header). + rows := int(d.blockDecoder.header.Rows) + if i == keyspanColBoundaryUserKeys || i == keyspanColBoundaryKeyIndices { + rows = int(d.boundaryKeysCount) + } + d.blockDecoder.ColumnToBinFormatter(f, n, i, rows) + } + f.HexBytesln(1, "block padding byte") + f.ToTreePrinter(n) +} + +// searchBoundaryKeys returns the index of the first boundary key greater than +// or equal to key and whether or not the key was found exactly. +func (d *KeyspanDecoder) searchBoundaryKeysWithSyntheticPrefix( + cmp base.Compare, key []byte, syntheticPrefix block.SyntheticPrefix, +) (index int, equal bool) { + if syntheticPrefix.IsSet() { + // The seek key must have the synthetic prefix, otherwise it falls entirely + // before or after the block's boundary keys. 
+ var keyPrefix []byte + keyPrefix, key = splitKey(key, len(syntheticPrefix)) + if cmp := bytes.Compare(keyPrefix, syntheticPrefix); cmp != 0 { + if cmp < 0 { + return 0, false + } + return int(d.boundaryKeysCount), false + } + } + + i, j := 0, int(d.boundaryKeysCount) + for i < j { + h := int(uint(i+j) >> 1) // avoid overflow when computing h + // i ≤ h < j + switch cmp(key, d.boundaryKeys.At(h)) { + case +1: + i = h + 1 + case 0: + return h, true + default: + // -1 + j = h + } + } + return i, false +} + +// NewKeyspanIter constructs a new iterator over a keyspan columnar block. +func NewKeyspanIter( + cmp base.Compare, h block.BufferHandle, transforms block.FragmentIterTransforms, +) *KeyspanIter { + i := keyspanIterPool.Get().(*KeyspanIter) + i.closeCheck = invariants.CloseChecker{} + i.handle = h + d := (*KeyspanDecoder)(unsafe.Pointer(h.BlockMetadata())) + i.init(cmp, d, transforms) + return i +} + +var keyspanIterPool = sync.Pool{ + New: func() interface{} { + i := &KeyspanIter{} + if invariants.UseFinalizers { + invariants.SetFinalizer(i, func(obj interface{}) { + if i := obj.(*KeyspanIter); i.handle.Valid() { + fmt.Fprintf(os.Stderr, "KeyspanIter.handle is not nil: %#v\n", i.handle) + os.Exit(1) + } + }) + } + return i + }, +} + +// A KeyspanIter is an iterator over a columnar keyspan block. It implements the +// keyspan.FragmentIterator interface. +type KeyspanIter struct { + keyspanIter + handle block.BufferHandle + + closeCheck invariants.CloseChecker +} + +// Close closes the iterator. +func (i *KeyspanIter) Close() { + i.handle.Release() + i.handle = block.BufferHandle{} + + if invariants.Sometimes(25) { + // In invariants mode, sometimes don't add the object to the pool so + // that we can check for double closes that take longer than the object + // stays in the pool. + return + } + + i.keyspanIter.Close() + i.closeCheck.Close() + keyspanIterPool.Put(i) +} + +// A keyspanIter is an iterator over a keyspan block. 
It implements the +// keyspan.FragmentIterator interface. +type keyspanIter struct { + r *KeyspanDecoder + cmp base.Compare + transforms block.FragmentIterTransforms + noTransforms bool + span keyspan.Span + // When positioned, the current span's start key is the user key at + // i.r.userKeys.At(i.startBoundIndex) + // and the current span's end key is the user key at + // i.r.userKeys.At(i.startBoundIndex+1) + startBoundIndex int + keyBuf [2]keyspan.Key + // startKeyBuf and endKeyBuf are used when transforms.SyntheticPrefix is + // set. + startKeyBuf []byte + endKeyBuf []byte +} + +// Assert that KeyspanIter implements the FragmentIterator interface. +var _ keyspan.FragmentIterator = (*keyspanIter)(nil) + +// init initializes the iterator with the given comparison function and keyspan +// decoder. +func (i *keyspanIter) init( + cmp base.Compare, r *KeyspanDecoder, transforms block.FragmentIterTransforms, +) { + i.r = r + i.cmp = cmp + i.transforms = transforms + i.noTransforms = transforms.NoTransforms() + i.span.Start, i.span.End = nil, nil + i.startBoundIndex = -1 + if i.span.Keys == nil { + i.span.Keys = i.keyBuf[:0] + } + i.startKeyBuf = i.startKeyBuf[:0] + i.endKeyBuf = i.endKeyBuf[:0] + if transforms.HasSyntheticPrefix() { + i.startKeyBuf = append(i.startKeyBuf, transforms.SyntheticPrefix()...) + i.endKeyBuf = append(i.endKeyBuf, transforms.SyntheticPrefix()...) + } +} + +// SeekGE moves the iterator to the first span covering a key greater than +// or equal to the given key. This is equivalent to seeking to the first +// span with an end key greater than the given key. +func (i *keyspanIter) SeekGE(key []byte) (*keyspan.Span, error) { + // Seek among the boundary keys. + j, eq := i.r.searchBoundaryKeysWithSyntheticPrefix(i.cmp, key, i.transforms.SyntheticPrefix()) + // If the found boundary key does not exactly equal the given key, it's + // strictly greater than key. We need to back up one to consider the span + // that ends at the this boundary key. 
+ if !eq { + j = max(j-1, 0) + } + return i.gatherKeysForward(j), nil +} + +// SeekLT moves the iterator to the last span covering a key less than the +// given key. This is equivalent to seeking to the last span with a start +// key less than the given key. +func (i *keyspanIter) SeekLT(key []byte) (*keyspan.Span, error) { + j, _ := i.r.searchBoundaryKeysWithSyntheticPrefix(i.cmp, key, i.transforms.SyntheticPrefix()) + // searchBoundaryKeys seeks to the first boundary key greater than or equal + // to key. The span beginning at the boundary key j necessarily does NOT + // cover any key less < key (it only contains keys ≥ key). Back up one to + // the first span that begins before [key], or to -1 if there is no such + // span. + j-- + + // If all boundaries are less than [key], or only the last boundary is + // greater than the key, then we want the last span so we clamp the index to + // the second to last boundary. + return i.gatherKeysBackward(min(j, int(i.r.boundaryKeysCount)-2)), nil +} + +// First moves the iterator to the first span. +func (i *keyspanIter) First() (*keyspan.Span, error) { + return i.gatherKeysForward(0), nil +} + +// Last moves the iterator to the last span. +func (i *keyspanIter) Last() (*keyspan.Span, error) { + return i.gatherKeysBackward(int(i.r.boundaryKeysCount) - 2), nil +} + +// Next moves the iterator to the next span. +func (i *keyspanIter) Next() (*keyspan.Span, error) { + return i.gatherKeysForward(i.startBoundIndex + 1), nil +} + +// Prev moves the iterator to the previous span. +func (i *keyspanIter) Prev() (*keyspan.Span, error) { + return i.gatherKeysBackward(max(i.startBoundIndex-1, -1)), nil +} + +// gatherKeysForward returns the first non-empty Span in the forward direction, +// starting with the span formed by using the boundary key at index +// [startBoundIndex] as the span's start boundary. 
+func (i *keyspanIter) gatherKeysForward(startBoundIndex int) *keyspan.Span { + if invariants.Enabled && startBoundIndex < 0 { + panic(errors.AssertionFailedf("out of bounds: i.startBoundIndex=%d", startBoundIndex)) + } + i.startBoundIndex = startBoundIndex + if i.startBoundIndex >= int(i.r.boundaryKeysCount)-1 { + return nil + } + if !i.isNonemptySpan(i.startBoundIndex) { + if i.startBoundIndex == int(i.r.boundaryKeysCount)-2 { + // Corruption error + panic(base.CorruptionErrorf("keyspan block has empty span at end")) + } + i.startBoundIndex++ + if !i.isNonemptySpan(i.startBoundIndex) { + panic(base.CorruptionErrorf("keyspan block has consecutive empty spans")) + } + } + return i.materializeSpan() +} + +// gatherKeysBackward returns the first non-empty Span in the backward direction, +// starting with the span formed by using the boundary key at index +// [startBoundIndex] as the span's start boundary. +func (i *keyspanIter) gatherKeysBackward(startBoundIndex int) *keyspan.Span { + i.startBoundIndex = startBoundIndex + if i.startBoundIndex < 0 { + return nil + } + if invariants.Enabled && i.startBoundIndex >= int(i.r.boundaryKeysCount)-1 { + panic(errors.AssertionFailedf("out of bounds: i.startBoundIndex=%d, i.r.boundaryKeysCount=%d", + i.startBoundIndex, i.r.boundaryKeysCount)) + } + if !i.isNonemptySpan(i.startBoundIndex) { + if i.startBoundIndex == 0 { + // Corruption error + panic(base.CorruptionErrorf("keyspan block has empty span at beginning")) + } + i.startBoundIndex-- + if !i.isNonemptySpan(i.startBoundIndex) { + panic(base.CorruptionErrorf("keyspan block has consecutive empty spans")) + } + } + return i.materializeSpan() +} + +// isNonemptySpan returns true if the span starting at i.startBoundIndex +// contains keys. 
+func (i *keyspanIter) isNonemptySpan(startBoundIndex int) bool { + return i.r.boundaryKeyIndices.At(startBoundIndex) < i.r.boundaryKeyIndices.At(startBoundIndex+1) +} + +// materializeSpan constructs the current span from i.startBoundIndex and +// i.{start,end}KeyIndex. +func (i *keyspanIter) materializeSpan() *keyspan.Span { + i.span = keyspan.Span{ + Start: i.r.boundaryKeys.At(i.startBoundIndex), + End: i.r.boundaryKeys.At(i.startBoundIndex + 1), + Keys: i.span.Keys[:0], + } + startIndex := i.r.boundaryKeyIndices.At(i.startBoundIndex) + endIndex := i.r.boundaryKeyIndices.At(i.startBoundIndex + 1) + if cap(i.span.Keys) < int(endIndex-startIndex) { + i.span.Keys = make([]keyspan.Key, 0, int(endIndex-startIndex)) + } + for j := startIndex; j < endIndex; j++ { + i.span.Keys = append(i.span.Keys, keyspan.Key{ + Trailer: base.InternalKeyTrailer(i.r.trailers.At(int(j))), + Suffix: i.r.suffixes.At(int(j)), + Value: i.r.values.At(int(j)), + }) + } + if i.noTransforms { + return &i.span + } + if i.transforms.SyntheticSeqNum != block.NoSyntheticSeqNum { + for j := range i.span.Keys { + i.span.Keys[j].Trailer = base.MakeTrailer( + base.SeqNum(i.transforms.SyntheticSeqNum), i.span.Keys[j].Trailer.Kind()) + } + } + if i.transforms.HasSyntheticSuffix() { + for j := range i.span.Keys { + k := &i.span.Keys[j] + switch k.Kind() { + case base.InternalKeyKindRangeKeySet: + if len(k.Suffix) > 0 { + // TODO(jackson): Assert synthetic suffix is >= k.Suffix. + k.Suffix = i.transforms.SyntheticSuffix() + } + case base.InternalKeyKindRangeKeyDelete: + // Nothing to do. 
+ default: + panic(base.AssertionFailedf("synthetic suffix not supported with key kind %s", k.Kind())) + } + } + } + if i.transforms.HasSyntheticPrefix() || invariants.Sometimes(10) { + syntheticPrefix := i.transforms.SyntheticPrefix() + i.startKeyBuf = i.startKeyBuf[:len(syntheticPrefix)] + i.endKeyBuf = i.endKeyBuf[:len(syntheticPrefix)] + if invariants.Enabled { + if !bytes.Equal(i.startKeyBuf, syntheticPrefix) { + panic(errors.AssertionFailedf("keyspanIter: synthetic prefix mismatch %q, %q", + i.startKeyBuf, syntheticPrefix)) + } + if !bytes.Equal(i.endKeyBuf, syntheticPrefix) { + panic(errors.AssertionFailedf("keyspanIter: synthetic prefix mismatch %q, %q", + i.endKeyBuf, syntheticPrefix)) + } + } + i.startKeyBuf = append(i.startKeyBuf, i.span.Start...) + i.endKeyBuf = append(i.endKeyBuf, i.span.End...) + i.span.Start = i.startKeyBuf + i.span.End = i.endKeyBuf + } + + return &i.span +} + +// Close closes the iterator. +func (i *keyspanIter) Close() { + *i = keyspanIter{} +} + +// SetContext implements keyspan.FragmentIterator. +func (i *keyspanIter) SetContext(context.Context) {} + +// WrapChildren implements keyspan.FragmentIterator. +func (i *keyspanIter) WrapChildren(keyspan.WrapFn) {} + +// DebugTree is part of the FragmentIterator interface. +func (i *keyspanIter) DebugTree(tp treeprinter.Node) { + tp.Childf("%T(%p)", i, i) +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/prefix_bytes.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/prefix_bytes.go new file mode 100644 index 0000000..d0fdea5 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/prefix_bytes.go @@ -0,0 +1,1162 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package colblk + +import ( + "bytes" + "fmt" + "io" + "math/bits" + "slices" + "strings" + "unsafe" + + "github.com/cockroachdb/crlib/crbytes" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/binfmt" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" + "github.com/cockroachdb/pebble/v2/sstable/block" +) + +// PrefixBytes holds an array of lexicographically ordered byte slices. It +// provides prefix compression. Prefix compression applies strongly to two cases +// in CockroachDB: removal of the "[/tenantID]/tableID/indexID" prefix that is +// present on all table data keys, and multiple versions of a key that are +// distinguished only by different timestamp suffixes. With columnar blocks +// enabling the timestamp to be placed in a separate column, the multiple +// version problem becomes one of efficiently handling exact duplicate keys. +// PrefixBytes builds off of the RawBytes encoding, introducing additional +// slices for encoding (n+bundleSize-1)/bundleSize bundle prefixes and 1 +// block-level shared prefix for the column. +// +// Unlike the original prefix compression performed by rowblk (inherited from +// LevelDB and RocksDB), PrefixBytes does not perform all prefix compression +// relative to the previous key. Rather it performs prefix compression relative +// to the first key of a key's bundle. This can result in less compression, but +// simplifies reverse iteration and allows iteration to be largely stateless. +// +// To understand the PrefixBytes layout, we'll work through an example using +// these 15 keys: +// +// 0123456789 +// 0 aaabbbc +// 1 aaabbbcc +// 2 aaabbbcde +// 3 aaabbbce +// 4 aaabbbdee +// 5 aaabbbdee +// 6 aaabbbdee +// 7 aaabbbeff +// 8 aaabbe +// 9 aaabbeef +// 10 aaabbeef +// 11 aaabc +// 12 aabcceef +// 13 aabcceef +// 14 aabcceef +// +// The total length of these keys is 119 bytes. 
There are 3 keys which occur +// multiple times (rows 4-6, 9-10, 12-14) which models multiple versions of the +// same MVCC key in CockroachDB. There is a shared prefix to all of the keys +// which models the "[/tenantID]/tableID/indexID" present on CockroachDB table +// data keys. There are other shared prefixes which model identical values in +// table key columns. +// +// The table below shows the components of the KeyBytes encoding for these 15 +// keys when using a bundle size of 4 which results in 4 bundles. The 15 keys +// are encoded into 20 slices: 1 block prefix, 4 bundle prefixes, and 15 +// suffixes. The first slice in the table is the block prefix that is shared by +// all keys in the block. The first slice in each bundle is the bundle prefix +// which is shared by all keys in the bundle. +// +// idx | row | end offset | data +// -------+-------+------------+---------- +// 0 | | 2 | aa +// 1 | | 7 | ..abbbc +// 2 | 0 | 7 | ....... +// 3 | 1 | 8 | .......c +// 4 | 2 | 10 | .......de +// 5 | 3 | 11 | .......e +// 6 | | 15 | ..abbb +// 7 | 4 | 18 | ......dee +// 8 | 5 | 18 | ......... +// 9 | 6 | 18 | ......... +// 10 | 7 | 21 | ......eff +// 11 | | 23 | ..ab +// 12 | 8 | 25 | ....be +// 13 | 9 | 29 | ....beef +// 14 | 10 | 29 | ........ +// 15 | 11 | 30 | ....c +// 16 | | 36 | ..bcceef +// 17 | 12 | 36 | ........ +// 18 | 13 | 36 | ........ +// 19 | 14 | 36 | ........ +// +// The 'end offset' column in the table encodes the exclusive offset within the +// string data section where each of the slices end. Each slice starts at the +// previous slice's end offset. The first slice (the block prefix)'s start +// offset is implicitly zero. Note that this differs from the plain RawBytes +// encoding which always stores a zero offset at the beginning of the offsets +// array to avoid special-casing the first slice. The block prefix already +// requires special-casing, so materializing the zero start offset is not +// needed. 
+// +// The table above defines 20 slices: the 1 block key prefix, the 4 bundle key +// prefixes and the 15 key suffixes. Offset[0] is the length of the first slice +// which is always anchored at data[0]. The data columns display the portion of +// the data array the slice covers. For row slices, an empty suffix column +// indicates that the slice is identical to the slice at the previous index +// which is indicated by the slice's offset being equal to the previous slice's +// offset. Due to the lexicographic sorting, the key at row i can't be a prefix +// of the key at row i-1 or it would have sorted before the key at row i-1. And +// if the key differs then only the differing bytes will be part of the suffix +// and not contained in the bundle prefix. +// +// The end result of this encoding is that we can store the 119 bytes of the 15 +// keys plus their start and end offsets (which would naively consume 15*4=60 +// bytes for at least the key lengths) in 61 bytes (36 bytes of data + 4 bytes +// of offset constant + 20 bytes of offset delta data + 1 byte of bundle size). +// +// # Physical representation +// +// +==================================================================+ +// | Bundle size (1 byte) | +// | | +// | The bundle size indicates how many keys prefix compression may | +// | apply across. Every bundleSize keys, prefix compression restarts.| +// | The bundleSize is required to be a power of two, and this 1- | +// | byte prefix stores log2(bundleSize). | +// +==================================================================+ +// | RawBytes | +// | | +// | A modified RawBytes encoding is used to store the data slices. A | +// | PrefixBytes column storing n keys will encode | +// | | +// | 1 block prefix | +// | + | +// | (n + bundleSize-1)/bundleSize bundle prefixes | +// | + | +// | n row suffixes | +// | | +// | slices. Unlike the RawBytes encoding, the first offset encoded | +// | is not guaranteed to be zero. 
In the PrefixBytes encoding, the | +// | first offset encodes the length of the column-wide prefix. The | +// | column-wide prefix is stored in slice(0, offset(0)). | +// | | +// | +------------------------------------------------------------+ | +// | | Offset table | | +// | | | | +// | | A Uint32 column encoding offsets into the string data, | | +// | | possibly delta8 or delta16 encoded. When a delta encoding | | +// | | is used, the base constant is always zero. | | +// | +------------------------------------------------------------+ | +// | | offsetDelta[0] | offsetDelta[1] | ... | offsetDelta[m] | | +// | +------------------------------------------------------------+ | +// | | prefix-compressed string data | | +// | | ... | | +// | +------------------------------------------------------------+ | +// +==================================================================+ +// +// TODO(jackson): Consider stealing the low bit of the offset for a flag +// indicating that a key is a duplicate and then using the remaining bits to +// encode the relative index of the duplicated key's end offset. This would +// avoid the O(bundle size) scan in the case of duplicate keys, but at the cost +// of complicating logic to look up a bundle prefix (which may need to follow a +// duplicate key's relative index to uncover the start offset of the bundle +// prefix). +// +// # Reads +// +// This encoding provides O(1) access to any row by calculating the bundle for +// the row (see bundleOffsetIndexForRow), then the per-row's suffix (see +// rowSuffixIndex). If the per-row suffix's end offset equals the previous +// offset, then the row is a duplicate key and we need to step backward until we +// find a non-empty slice or the start of the bundle (a variable number of +// steps, but bounded by the bundle size). +// +// Forward iteration can easily reuse the previous row's key with a check on +// whether the row's slice is empty. 
Reverse iteration within a run of equal +// keys can reuse the next row's key. When reverse iteration steps backward from +// a non-empty slice onto an empty slice, it must continue backward until a +// non-empty slice is found (just as in absolute positioning) to discover the +// row suffix that is duplicated. +// +// The Seek{GE,LT} routines first binary search on the first key of each bundle +// which can be retrieved without data movement because the bundle prefix is +// immediately adjacent to it in the data array. We can slightly optimize the +// binary search by skipping over all of the keys in the bundle on prefix +// mismatches. +type PrefixBytes struct { + bundleCalc + rows int + sharedPrefixLen int + rawBytes RawBytes +} + +// Assert that PrefixBytes implements Array[[]byte]. +var _ Array[[]byte] = PrefixBytes{} + +// DecodePrefixBytes decodes the structure of a PrefixBytes, constructing an +// accessor for an array of lexicographically sorted byte slices constructed by +// PrefixBytesBuilder. Count must be the number of logical slices within the +// array. +func DecodePrefixBytes( + b []byte, offset uint32, count int, +) (prefixBytes PrefixBytes, endOffset uint32) { + if count == 0 { + panic(errors.AssertionFailedf("empty PrefixBytes")) + } + // The first byte of a PrefixBytes-encoded column is the bundle size + // expressed as log2 of the bundle size (the bundle size must always be a + // power of two) + bundleShift := uint32(*((*uint8)(unsafe.Pointer(&b[offset])))) + calc := makeBundleCalc(bundleShift) + nBundles := int(calc.bundleCount(count)) + + rb, endOffset := DecodeRawBytes(b, offset+1, count+nBundles) + pb := PrefixBytes{ + bundleCalc: calc, + rows: count, + rawBytes: rb, + } + pb.sharedPrefixLen = int(pb.rawBytes.offsets.At(0)) + return pb, endOffset +} + +// Assert that DecodePrefixBytes implements DecodeFunc. +var _ DecodeFunc[PrefixBytes] = DecodePrefixBytes + +// At returns the i'th []byte slice in the PrefixBytes. 
At must allocate, so +// callers should prefer accessing a slice's constituent components through +// SharedPrefix, BundlePrefix and RowSuffix. +func (b PrefixBytes) At(i int) []byte { + return slices.Concat(b.SharedPrefix(), b.RowBundlePrefix(i), b.RowSuffix(i)) +} + +// UnsafeFirstSlice returns first slice in the PrefixBytes. The returned slice +// points directly into the PrefixBytes buffer and must not be mutated. +func (b *PrefixBytes) UnsafeFirstSlice() []byte { + return b.rawBytes.Slice(0, b.rawBytes.offsets.At(2)) +} + +// PrefixBytesIter is an iterator and associated buffers for PrefixBytes. It +// provides a means for efficiently iterating over the []byte slices contained +// within a PrefixBytes, avoiding unnecessary copying when portions of slices +// are shared. +type PrefixBytesIter struct { + // Buf is used for materializing a user key. It is preallocated to the maximum + // key length in the data block. + Buf []byte + syntheticPrefixLen uint32 + sharedAndBundlePrefixLen uint32 + offsetIndex int + nextBundleOffsetIndex int +} + +// Init initializes the prefix bytes iterator; maxKeyLength must be +// large enough to fit any key in the block after applying any synthetic prefix +// and/or suffix. +func (i *PrefixBytesIter) Init(maxKeyLength int, syntheticPrefix block.SyntheticPrefix) { + // Allocate a buffer that's large enough to hold the largest user key in the + // block with 1 byte to spare (so that pointer arithmetic is never pointing + // beyond the allocation, which would violate Go rules). + n := maxKeyLength + 1 + if cap(i.Buf) < n { + ptr := mallocgc(uintptr(n), nil, false) + i.Buf = unsafe.Slice((*byte)(ptr), n) + } + i.Buf = i.Buf[:0] + i.syntheticPrefixLen = uint32(len(syntheticPrefix)) + if syntheticPrefix.IsSet() { + i.Buf = append(i.Buf, syntheticPrefix...) + } +} + +// SetAt updates the provided PrefixBytesIter to hold the i'th []byte slice in +// the PrefixBytes. 
The PrefixBytesIter's buffer must be sufficiently large to +// hold the i'th []byte slice, and the caller is required to statically ensure +// this. +func (b *PrefixBytes) SetAt(it *PrefixBytesIter, i int) { + // Determine the offset and length of the bundle prefix. + bundleOffsetIndex := b.bundleOffsetIndexForRow(i) + invariants.CheckBounds(bundleOffsetIndex, b.rawBytes.slices) + bundleOffsetStart, bundleOffsetEnd := b.rawBytes.offsets.At2(bundleOffsetIndex) + bundlePrefixLen := bundleOffsetEnd - bundleOffsetStart + + // Determine the offset and length of the row's individual suffix. + it.offsetIndex = b.rowSuffixIndex(i) + // TODO(jackson): rowSuffixOffsets will recompute bundleOffsetIndexForRow in + // the case that the row is a duplicate key. Is it worth optimizing to avoid + // this recomputation? The expected case is non-duplicate keys, so it may + // not be worthwhile. + rowSuffixStart, rowSuffixEnd := b.rowSuffixOffsets(i, it.offsetIndex) + rowSuffixLen := rowSuffixEnd - rowSuffixStart + + it.sharedAndBundlePrefixLen = it.syntheticPrefixLen + uint32(b.sharedPrefixLen) + bundlePrefixLen + it.Buf = it.Buf[:it.sharedAndBundlePrefixLen+rowSuffixLen] + + ptr := unsafe.Pointer(unsafe.SliceData(it.Buf)) + ptr = unsafe.Add(ptr, it.syntheticPrefixLen) + // Copy the shared key prefix. + memmove(ptr, b.rawBytes.data, uintptr(b.sharedPrefixLen)) + // Copy the bundle prefix. + ptr = unsafe.Add(ptr, b.sharedPrefixLen) + memmove( + ptr, + unsafe.Pointer(uintptr(b.rawBytes.data)+uintptr(bundleOffsetStart)), + uintptr(bundlePrefixLen)) + + // Copy the per-row suffix. + ptr = unsafe.Add(ptr, bundlePrefixLen) + memmove( + ptr, + unsafe.Pointer(uintptr(b.rawBytes.data)+uintptr(rowSuffixStart)), + uintptr(rowSuffixLen)) + // Set nextBundleOffsetIndex so that a call to SetNext can cheaply determine + // whether the next row is in the same bundle. 
+ it.nextBundleOffsetIndex = bundleOffsetIndex + (1 << b.bundleShift) + 1 +} + +// SetNext updates the provided PrefixBytesIter to hold the next []byte slice in +// the PrefixBytes. SetNext requires the provided iter to currently hold a slice +// and for a subsequent slice to exist within the PrefixBytes. The +// PrefixBytesIter's buffer must be sufficiently large to hold the next []byte +// slice, and the caller is required to statically ensure this. +func (b *PrefixBytes) SetNext(it *PrefixBytesIter) { + it.offsetIndex++ + // If the next row is in the same bundle, we can take a fast path of only + // updating the per-row suffix. + if it.offsetIndex < it.nextBundleOffsetIndex { + invariants.CheckBounds(it.offsetIndex, b.rawBytes.slices) + rowSuffixStart, rowSuffixEnd := b.rawBytes.offsets.At2(it.offsetIndex) + rowSuffixLen := rowSuffixEnd - rowSuffixStart + if rowSuffixLen == 0 { + // The start and end offsets are equal, indicating that the key is a + // duplicate. Since it's identical to the previous key, there's + // nothing left to do, we can leave buf as-is. + return + } + it.Buf = it.Buf[:it.sharedAndBundlePrefixLen+rowSuffixLen] + // Copy in the per-row suffix. + ptr := unsafe.Pointer(unsafe.SliceData(it.Buf)) + memmove( + unsafe.Add(ptr, it.sharedAndBundlePrefixLen), + unsafe.Pointer(uintptr(b.rawBytes.data)+uintptr(rowSuffixStart)), + uintptr(rowSuffixLen)) + return + } + + // We've reached the end of the bundle. We need to update the bundle prefix. + // The offsetIndex is currently pointing to the start of the new bundle + // prefix. Increment it to point at the start of the new row suffix. + it.offsetIndex++ + invariants.CheckBounds(it.offsetIndex, b.rawBytes.slices) + rowSuffixStart, rowSuffixEnd := b.rawBytes.offsets.At2(it.offsetIndex) + rowSuffixLen := rowSuffixEnd - rowSuffixStart + + // Read the offsets of the new bundle prefix and update the index of the + // next bundle. 
+ bundlePrefixStart := b.rawBytes.offsets.At(it.nextBundleOffsetIndex) + bundlePrefixLen := rowSuffixStart - bundlePrefixStart + it.nextBundleOffsetIndex = it.offsetIndex + (1 << b.bundleShift) + + it.sharedAndBundlePrefixLen = it.syntheticPrefixLen + uint32(b.sharedPrefixLen) + bundlePrefixLen + it.Buf = it.Buf[:it.sharedAndBundlePrefixLen+rowSuffixLen] + // Copy in the new bundle suffix. + ptr := unsafe.Pointer(unsafe.SliceData(it.Buf)) + ptr = unsafe.Add(ptr, it.syntheticPrefixLen) + ptr = unsafe.Add(ptr, b.sharedPrefixLen) + memmove( + ptr, + unsafe.Pointer(uintptr(b.rawBytes.data)+uintptr(bundlePrefixStart)), + uintptr(bundlePrefixLen)) + // Copy in the per-row suffix. + ptr = unsafe.Add(ptr, bundlePrefixLen) + memmove( + ptr, + unsafe.Pointer(uintptr(b.rawBytes.data)+uintptr(rowSuffixStart)), + uintptr(rowSuffixLen)) +} + +// SharedPrefix return a []byte of the shared prefix that was extracted from +// all of the values in the Bytes vector. The returned slice should not be +// mutated. +func (b *PrefixBytes) SharedPrefix() []byte { + // The very first slice is the prefix for the entire column. + return b.rawBytes.Slice(0, b.rawBytes.offsets.At(0)) +} + +// RowBundlePrefix takes a row index and returns a []byte of the prefix shared +// among all the keys in the row's bundle, but without the block-level shared +// prefix for the column. The returned slice should not be mutated. +func (b *PrefixBytes) RowBundlePrefix(row int) []byte { + i := b.bundleOffsetIndexForRow(row) + invariants.CheckBounds(i, b.rawBytes.slices) + return b.rawBytes.Slice(b.rawBytes.offsets.At2(i)) +} + +// BundlePrefix returns the prefix of the i-th bundle in the column. The +// provided i must be in the range [0, BundleCount()). The returned slice should +// not be mutated. 
+func (b *PrefixBytes) BundlePrefix(i int) []byte { + j := b.offsetIndexByBundleIndex(i) + invariants.CheckBounds(j, b.rawBytes.slices) + return b.rawBytes.Slice(b.rawBytes.offsets.At2(j)) +} + +// RowSuffix returns a []byte of the suffix unique to the row. A row's full key +// is the result of concatenating SharedPrefix(), BundlePrefix() and +// RowSuffix(). +// +// The returned slice should not be mutated. +func (b *PrefixBytes) RowSuffix(row int) []byte { + return b.rawBytes.Slice(b.rowSuffixOffsets(row, b.rowSuffixIndex(row))) +} + +// rowSuffixOffsets finds the start and end offsets of the row's suffix slice, +// accounting for duplicate keys. It takes the index of the row, and the value +// of rowSuffixIndex(row). +func (b *PrefixBytes) rowSuffixOffsets(row, i int) (low uint32, high uint32) { + invariants.CheckBounds(i, b.rawBytes.slices) + // Retrieve the low and high offsets indicating the start and end of the + // row's suffix slice. + low, high = b.rawBytes.offsets.At2(i) + // If there's a non-empty slice for the row, this row is different than its + // predecessor. + if low != high { + return low, high + } + // Otherwise, an empty slice indicates a duplicate key. We need to find the + // first non-empty predecessor within the bundle, or if all the rows are + // empty, return arbitrary equal low and high. + // + // Compute the index of the first row in the bundle so we know when to stop. + firstIndex := 1 + b.bundleOffsetIndexForRow(row) + for i > firstIndex { + // Step back a row, and check if the slice is non-empty. + i-- + high = low + low = b.rawBytes.offsets.At(i) + if low != high { + return low, high + } + } + // All the rows in the bundle are empty. + return low, high +} + +// Rows returns the count of rows whose keys are encoded within the PrefixBytes. +func (b *PrefixBytes) Rows() int { + return b.rows +} + +// BundleCount returns the count of bundles within the PrefixBytes. 
+func (b *PrefixBytes) BundleCount() int { + return b.bundleCount(b.rows) +} + +// Search searches for the first key in the PrefixBytes that is greater than or +// equal to k, returning the index of the key and whether an equal key was +// found. If multiple keys are equal, the index of the first such key is +// returned. If all keys are < k, Search returns Rows() for the row index. +func (b *PrefixBytes) Search(k []byte) (rowIndex int, isEqual bool) { + // First compare to the block-level shared prefix. + n := min(len(k), b.sharedPrefixLen) + c := bytes.Compare(k[:n], unsafe.Slice((*byte)(b.rawBytes.data), b.sharedPrefixLen)) + // Note that c cannot be 0 when n < b.sharedPrefixLen. + if c != 0 { + if c < 0 { + // Search key is less than any prefix in the block. + return 0, false + } + // Search key is greater than any key in the block. + return b.rows, false + } + // Trim the block-level shared prefix from the search key. + k = k[b.sharedPrefixLen:] + + // Binary search among the first keys of each bundle. + // + // Define f(-1) == false and f(upper) == true. + // Invariant: f(bi-1) == false, f(upper) == true. + nBundles := b.BundleCount() + bi, upper := 0, nBundles + upperEqual := false + for bi < upper { + h := int(uint(bi+upper) >> 1) // avoid overflow when computing h + // bi ≤ h < upper + + // Retrieve the first key in the h-th (zero-indexed) bundle. 
We take + // advantage of the fact that the first row is stored contiguously in + // the data array (modulo the block prefix) to slice the entirety of the + // first key: + // + // b u n d l e p r e f i x f i r s t k e y r e m a i n d e r + // ^ ^ ^ + // offset(j) offset(j+1) offset(j+2) + // + j := b.offsetIndexByBundleIndex(h) + invariants.CheckBounds(j+1, b.rawBytes.slices) + bundleFirstKey := b.rawBytes.Slice(b.rawBytes.offsets.At(j), b.rawBytes.offsets.At(j+2)) + c = bytes.Compare(k, bundleFirstKey) + switch { + case c > 0: + bi = h + 1 // preserves f(bi-1) == false + case c < 0: + upper = h // preserves f(upper) == true + upperEqual = false + default: + // c == 0 + upper = h // preserves f(upper) == true + upperEqual = true + } + } + if bi == 0 { + // The very first key is ≥ k. Return it. + return 0, upperEqual + } + // The first key of the bundle bi is ≥ k, but any of the keys in the + // previous bundle besides the first could also be ≥ k. We can binary search + // among them, but if the seek key doesn't share the previous bundle's + // prefix there's no need. + j := b.offsetIndexByBundleIndex(bi - 1) + invariants.CheckBounds(j, b.rawBytes.slices) + bundlePrefix := b.rawBytes.Slice(b.rawBytes.offsets.At2(j)) + + // The row we are looking for might still be in the previous bundle even + // though the seek key is greater than the first key. This is possible only + // if the search key shares the first bundle's prefix (eg, the search key + // equals a row in the previous bundle or falls between two rows within the + // previous bundle). + if len(bundlePrefix) > len(k) || !bytes.Equal(k[:len(bundlePrefix)], bundlePrefix) { + // The search key doesn't share the previous bundle's prefix, so all of + // the keys in the previous bundle must be less than k. We know the + // first key of bi is ≥ k, so return it. 
+ if bi >= nBundles { + return b.rows, false + } + return bi << b.bundleShift, upperEqual + } + // Binary search among bundle bi-1's key remainders after stripping bundle + // bi-1's prefix. + // + // Define f(l-1) == false and f(u) == true. + // Invariant: f(l-1) == false, f(u) == true. + k = k[len(bundlePrefix):] + l := 1 + u := min(1<> 1) // avoid overflow when computing h + // l ≤ h < u + + // j is currently the index of the offset of bundle bi-i's prefix. + // + // b u n d l e p r e f i x f i r s t k e y s e c o n d k e y + // ^ ^ ^ + // offset(j) offset(j+1) offset(j+2) + // + // The beginning of the zero-indexed i-th key of the bundle is at + // offset(j+i+1). + // + invariants.CheckBounds(j+h+1, b.rawBytes.slices) + hStart, hEnd := b.rawBytes.offsets.At2(j + h + 1) + // There's a complication with duplicate keys. When keys are repeated, + // the PrefixBytes encoding avoids re-encoding the duplicate key, + // instead encoding an empty slice. While binary searching, if we land + // on an empty slice, we need to back up until we find a non-empty slice + // which is the key at index h. We iterate with p. If we eventually find + // the duplicated key at index p < h and determine f(p) == true, then we + // can set u=p (rather than h). If we determine f(p)==false, then we + // know f(h)==false too and set l=h+1. + p := h + if hStart == hEnd { + // Back up looking for an empty slice. + for hStart == hEnd && p >= l { + p-- + hEnd = hStart + hStart = b.rawBytes.offsets.At(j + p + 1) + } + // If we backed up to l-1, then all the rows in indexes [l, h] have + // the same keys as index l-1. We know f(l-1) == false [see the + // invariants above], so we can move l to h+1 and continue the loop + // without performing any key comparisons. 
+ if p < l { + l = h + 1 + continue + } + } + rem := b.rawBytes.Slice(hStart, hEnd) + c = bytes.Compare(k, rem) + switch { + case c > 0: + l = h + 1 // preserves f(l-1) == false + case c < 0: + u = p // preserves f(u) == true + upperEqual = false + default: + // c == 0 + u = p // preserves f(u) == true + upperEqual = true + } + } + i := (bi-1)<>pb.bundleShift + count + startOff := blockPrefixLen + prevLen := blockPrefixLen + + // Use dots to indicate string data that's elided because it falls within + // the block or bundle prefix. + dots := strings.Repeat(".", int(blockPrefixLen)) + // Iterate through all the slices in the data section, annotating bundle + // prefixes and using dots to indicate elided data. + for i := 0; i < k-1; i++ { + endOff := pb.rawBytes.offsets.At(i + 1) + if i%(1+(1< 0 && (bundleSize&(bundleSize-1)) != 0 { + panic(errors.AssertionFailedf("prefixbytes bundle size %d is not a power of 2", bundleSize)) + } + *b = PrefixBytesBuilder{ + bundleCalc: makeBundleCalc(uint32(bits.TrailingZeros32(uint32(bundleSize)))), + data: b.data[:0], + bundleSize: bundleSize, + offsets: b.offsets, + maxShared: (1 << 16) - 1, + } + b.offsets.count = 0 +} + +// NumColumns implements ColumnWriter. +func (b *PrefixBytesBuilder) NumColumns() int { return 1 } + +// DataType implements ColumnWriter. +func (b *PrefixBytesBuilder) DataType(int) DataType { return DataTypePrefixBytes } + +// Reset resets the builder to an empty state, preserving the existing bundle +// size. +func (b *PrefixBytesBuilder) Reset() { + const maxRetainedData = 512 << 10 // 512 KB + *b = PrefixBytesBuilder{ + bundleCalc: b.bundleCalc, + data: b.data[:0], + bundleSize: b.bundleSize, + offsets: b.offsets, + maxShared: b.maxShared, + sizings: [2]prefixBytesSizing{}, + } + b.offsets.count = 0 + if len(b.data) > maxRetainedData { + b.data = nil + } +} + +// Rows returns the number of keys added to the builder. 
+func (b *PrefixBytesBuilder) Rows() int { return b.nKeys } + +// prefixBytesSizing maintains metadata about the size of the accumulated data +// and its encoded size. Every key addition computes a new prefixBytesSizing +// struct. The PrefixBytesBuilder maintains two prefixBytesSizing structs, one +// for the state after the most recent key addition, and one for the state after +// the second most recent key addition. +type prefixBytesSizing struct { + lastKeyOff int // the offset in data where the last key added begins + offsetCount int // the count of offsets required to encode the data + blockPrefixLen int // the length of the block prefix + currentBundleDistinctLen int // the length of the "current" bundle's distinct keys + currentBundleDistinctKeys int // the number of distinct keys in the "current" bundle + // currentBundlePrefixLen is the length of the "current" bundle's prefix. + // The current bundle holds all keys that are not included within + // PrefixBytesBuilder.completedBundleLen. If the addition of a key causes + // the creation of a new bundle, the previous bundle's size is incorporated + // into completedBundleLen and currentBundlePrefixLen is updated to the + // length of the new bundle key. This ensures that there's always at least 1 + // key in the "current" bundle allowing Finish to accept rows = nKeys-1. + // + // Note that currentBundlePrefixLen is inclusive of the blockPrefixLen. 
+ // + // INVARIANT: currentBundlePrefixLen >= blockPrefixLen + currentBundlePrefixLen int // the length of the "current" bundle's prefix + currentBundlePrefixOffset int // the index of the offset of the "current" bundle's prefix + compressedDataLen int // the compressed, encoded size of data + offsetEncoding UintEncoding // the encoding necessary to encode the offsets +} + +func (sz *prefixBytesSizing) String() string { + return fmt.Sprintf("lastKeyOff:%d offsetCount:%d blockPrefixLen:%d\n"+ + "currentBundleDistinct{Len,Keys}: (%d,%d)\n"+ + "currentBundlePrefix{Len,Offset}: (%d,%d)\n"+ + "compressedDataLen:%d offsetEncoding:%s", + sz.lastKeyOff, sz.offsetCount, sz.blockPrefixLen, sz.currentBundleDistinctLen, + sz.currentBundleDistinctKeys, sz.currentBundlePrefixLen, sz.currentBundlePrefixOffset, + sz.compressedDataLen, sz.offsetEncoding) +} + +// Put adds the provided key to the column. The provided key must be +// lexicographically greater than or equal to the previous key added to the +// builder. +// +// The provided bytesSharedWithPrev must be the length of the byte prefix the +// provided key shares with the previous key. The caller is required to provide +// this because in the primary expected use, the caller will already need to +// compute it for the purpose of determining whether successive keys share the +// same prefix. 
func (b *PrefixBytesBuilder) Put(key []byte, bytesSharedWithPrev int) {
	currIdx := b.nKeys & 1 // %2
	curr := &b.sizings[currIdx]
	prev := &b.sizings[currIdx^1]

	if invariants.Enabled {
		if len(key) == 0 {
			panic(errors.AssertionFailedf("key must be non-empty"))
		}
		if b.maxShared == 0 {
			panic(errors.AssertionFailedf("maxShared must be positive"))
		}
		if b.nKeys > 0 {
			if bytes.Compare(key, b.data[prev.lastKeyOff:]) < 0 {
				panic(errors.AssertionFailedf("keys must be added in order: %q < %q", key, b.data[prev.lastKeyOff:]))
			}
			if bytesSharedWithPrev != crbytes.CommonPrefix(key, b.data[prev.lastKeyOff:]) {
				panic(errors.AssertionFailedf("bytesSharedWithPrev %d != %d", bytesSharedWithPrev,
					crbytes.CommonPrefix(key, b.data[prev.lastKeyOff:])))
			}
		}
	}

	// Check if this is the first key in a bundle.
	if b.nKeys&(b.bundleSize-1) == 0 {
		if b.nKeys == 0 {
			// We're adding the first key to the block.
			// Set a placeholder offset for the block prefix length.
			b.addOffset(0)
			// Set a placeholder offset for the bundle prefix length.
			b.addOffset(0)
			b.nKeys++
			b.data = append(b.data, key...)
			b.addOffset(uint32(len(b.data)))
			*curr = prefixBytesSizing{
				lastKeyOff:                0,
				offsetCount:               b.offsets.count,
				blockPrefixLen:            min(len(key), int(b.maxShared)),
				currentBundleDistinctLen:  len(key),
				currentBundleDistinctKeys: 1,
				currentBundlePrefixLen:    min(len(key), int(b.maxShared)),
				currentBundlePrefixOffset: 1,
				compressedDataLen:         len(key),
				offsetEncoding:            DetermineUintEncodingNoDelta(uint64(len(key))),
			}
			return
		}
		// We're starting a new bundle.

		// Set the bundle prefix length of the previous bundle.
		unsafeSetUint32(
			b.offsets.elems, prev.currentBundlePrefixOffset,
			unsafeGetUint32(b.offsets.elems, prev.currentBundlePrefixOffset-1)+uint32(prev.currentBundlePrefixLen),
		)

		// Finalize the encoded size of the previous bundle: its distinct key
		// bytes, minus the bundle prefix elided from all but one key.
		bundleSizeJustCompleted := prev.currentBundleDistinctLen - (prev.currentBundleDistinctKeys-1)*prev.currentBundlePrefixLen
		b.completedBundleLen += bundleSizeJustCompleted

		// Update the block prefix length if necessary. The caller tells us how
		// many bytes of prefix this key shares with the previous key. The block
		// prefix can only shrink if the bytes shared with the previous key are
		// less than the block prefix length, in which case the new block prefix
		// is the number of bytes shared with the previous key.
		blockPrefixLen := min(prev.blockPrefixLen, bytesSharedWithPrev)
		b.nKeys++
		*curr = prefixBytesSizing{
			lastKeyOff:     len(b.data),
			offsetCount:    b.offsets.count + 2,
			blockPrefixLen: blockPrefixLen,
			// We're adding the first key to the current bundle. Initialize
			// the current bundle prefix.
			currentBundlePrefixOffset: b.offsets.count,
			currentBundlePrefixLen:    min(len(key), int(b.maxShared)),
			currentBundleDistinctLen:  len(key),
			currentBundleDistinctKeys: 1,
			compressedDataLen:         b.completedBundleLen + len(key) - (b.bundleCount(b.nKeys)-1)*blockPrefixLen,
		}
		curr.offsetEncoding = DetermineUintEncodingNoDelta(uint64(curr.compressedDataLen))
		b.data = append(b.data, key...)
		b.addOffset(0) // Placeholder for bundle prefix.
		b.addOffset(uint32(len(b.data)))
		return
	}
	// We're adding a new key to an existing bundle.
	b.nKeys++

	if bytesSharedWithPrev == len(key) {
		// Duplicate key; don't add it to the data slice and don't adjust
		// currentBundleDistinct{Len,Keys}.
		*curr = *prev
		curr.offsetCount++
		// Repeat the previous offset so the duplicate encodes as an empty
		// slice.
		b.addOffset(unsafeGetUint32(b.offsets.elems, b.offsets.count-1))
		return
	}

	// Update the bundle prefix length. Note that the shared prefix length
	// can only shrink as new values are added. During construction, the
	// bundle prefix value is stored contiguously in the data array so even
	// if the bundle prefix length changes no adjustment is needed to that
	// value or to the first key in the bundle.
	*curr = prefixBytesSizing{
		lastKeyOff:                len(b.data),
		offsetCount:               prev.offsetCount + 1,
		blockPrefixLen:            min(prev.blockPrefixLen, bytesSharedWithPrev),
		currentBundleDistinctLen:  prev.currentBundleDistinctLen + len(key),
		currentBundleDistinctKeys: prev.currentBundleDistinctKeys + 1,
		currentBundlePrefixLen:    min(prev.currentBundlePrefixLen, bytesSharedWithPrev),
		currentBundlePrefixOffset: prev.currentBundlePrefixOffset,
	}
	// Compute the correct compressedDataLen.
	curr.compressedDataLen = b.completedBundleLen +
		curr.currentBundleDistinctLen -
		(curr.currentBundleDistinctKeys-1)*curr.currentBundlePrefixLen
	// Currently compressedDataLen is correct, except that it includes the block
	// prefix length for all bundle prefixes. Adjust the length to account for
	// the block prefix being stripped from every bundle except the first one.
	curr.compressedDataLen -= (b.bundleCount(b.nKeys) - 1) * curr.blockPrefixLen
	// The compressedDataLen is the largest offset we'll need to encode in the
	// offset table.
	curr.offsetEncoding = DetermineUintEncodingNoDelta(uint64(curr.compressedDataLen))
	b.data = append(b.data, key...)
	b.addOffset(uint32(len(b.data)))
}

// UnsafeGet returns the zero-indexed i'th key added to the builder through Put.
// UnsafeGet may only be used to retrieve the Rows()-1'th or Rows()-2'th keys.
// If called with a different i value, UnsafeGet panics. The keys returned by
// UnsafeGet are guaranteed to be stable until Finish or Reset is called. The
// caller must not mutate the returned slice.
func (b *PrefixBytesBuilder) UnsafeGet(i int) []byte {
	switch i {
	case b.nKeys - 1:
		lastKeyOff := b.sizings[i&1].lastKeyOff
		return b.data[lastKeyOff:]
	case b.nKeys - 2:
		lastKeyOff := b.sizings[(i^1)&1].lastKeyOff
		secondLastKeyOff := b.sizings[i&1].lastKeyOff
		if secondLastKeyOff == lastKeyOff {
			// The last key is a duplicate of the second-to-last key.
			return b.data[secondLastKeyOff:]
		}
		return b.data[secondLastKeyOff:lastKeyOff]
	default:
		panic(errors.AssertionFailedf("UnsafeGet(%d) called on PrefixBytes with %d keys", i, b.nKeys))
	}
}

// addOffset adds an offset to the offsets table. If necessary, addOffset will
// grow the offset table to accommodate the new offset.
func (b *PrefixBytesBuilder) addOffset(offset uint32) {
	if b.offsets.count == len(b.offsets.elems) {
		// Double the size of the allocated array, or initialize it to at least
		// 64 rows if this is the first allocation.
		n2 := max(len(b.offsets.elems)<<1, 64)
		newSlice := make([]uint32, n2)
		copy(newSlice, b.offsets.elems)
		b.offsets.elems = newSlice
	}
	unsafeSetUint32(b.offsets.elems, b.offsets.count, offset)
	b.offsets.count++
}

// writePrefixCompressed writes the provided builder's first [rows] rows with
// prefix-compression applied. It writes offsets and string data in tandem,
// writing offsets of width T into [offsetDeltas] and compressed string data
// into [buf]. The builder's internal state is not modified by
// writePrefixCompressed. writePrefixCompressed is generic in terms of the type
// T of the offset deltas.
//
// The caller must have correctly constructed [offsetDeltas] such that writing
// [sizing.offsetCount] offsets of size T does not overwrite the beginning of
// [buf]:
//
//	+-------------------------------------+ <- offsetDeltas.ptr
//	| offsetDeltas[0]                     |
//	+-------------------------------------+
//	| offsetDeltas[1]                     |
//	+-------------------------------------+
//	| ...                                 |
//	+-------------------------------------+
//	| offsetDeltas[sizing.offsetCount-1]  |
//	+-------------------------------------+ <- &buf[0]
//	| buf (string data)                   |
//	| ...                                 |
//	+-------------------------------------+
func writePrefixCompressed[T Uint](
	b *PrefixBytesBuilder, rows int, sz *prefixBytesSizing, offsetDeltas uintsEncoder[T], buf []byte,
) {
	if invariants.Enabled && offsetDeltas.Len() != sz.offsetCount {
		panic("incorrect offsetDeltas length")
	}
	if rows <= 1 {
		if rows == 1 {
			// If there's just 1 row, no prefix compression is necessary and we can
			// just encode the first key as the entire block prefix and first bundle
			// prefix.
			e := b.offsets.elems[2]
			offsetDeltas.UnsafeSet(0, T(e))
			offsetDeltas.UnsafeSet(1, T(e))
			offsetDeltas.UnsafeSet(2, T(e))
			copy(buf[:e], b.data[:e])
		}
		return
	}

	// The offset at index 0 is the block prefix length.
	copy(buf[:sz.blockPrefixLen], b.data[:sz.blockPrefixLen])
	destOffset := T(sz.blockPrefixLen)
	offsetDeltas.UnsafeSet(0, destOffset)
	var lastRowOffset uint32
	var shared int

	// Loop over the slices starting at the bundle prefix of the first bundle.
	// If the slice is a bundle prefix, carve off the suffix that excludes the
	// block prefix. Otherwise, carve off the suffix that excludes the block
	// prefix + bundle prefix.
	for i := 1; i < sz.offsetCount; i++ {
		off := unsafeGetUint32(b.offsets.elems, i)
		var suffix []byte
		if (i-1)%(b.bundleSize+1) == 0 {
			// This is a bundle prefix.
			if i == sz.currentBundlePrefixOffset {
				suffix = b.data[lastRowOffset+uint32(sz.blockPrefixLen) : lastRowOffset+uint32(sz.currentBundlePrefixLen)]
			} else {
				suffix = b.data[lastRowOffset+uint32(sz.blockPrefixLen) : off]
			}
			shared = sz.blockPrefixLen + len(suffix)
			// We don't update lastRowOffset here because the bundle prefix
			// was never actually stored separately in the data array.
		} else {
			// If the offset of this key is the same as the offset of the
			// previous key, then the key is a duplicate. All we need to do is
			// set the same offset in the destination.
			if off == lastRowOffset {
				offsetDeltas.UnsafeSet(i, destOffset)
				continue
			}
			suffix = b.data[lastRowOffset+uint32(shared) : off]
			// Update lastRowOffset for the next iteration of this loop.
			lastRowOffset = off
		}
		if invariants.Enabled && len(buf) < int(destOffset)+len(suffix) {
			panic(errors.AssertionFailedf("buf is too small: %d < %d", len(buf[destOffset:]), len(suffix)))
		}
		memmove(
			unsafe.Add(unsafe.Pointer(unsafe.SliceData(buf)), destOffset),
			unsafe.Pointer(unsafe.SliceData(suffix)),
			uintptr(len(suffix)),
		)
		destOffset += T(len(suffix))
		offsetDeltas.UnsafeSet(i, destOffset)
	}
	if destOffset != T(sz.compressedDataLen) {
		panic(errors.AssertionFailedf("wrote %d, expected %d", destOffset, sz.compressedDataLen))
	}
}

// Finish writes the serialized byte slices to buf starting at offset. The buf
// slice must be sufficiently large to store the serialized output. The caller
// should use [Size] to size buf appropriately before calling Finish.
//
// Finish only supports values of [rows] equal to the number of keys set on the
// builder, or one less.
func (b *PrefixBytesBuilder) Finish(
	col int, rows int, offset uint32, buf []byte,
) (endOffset uint32) {
	if rows < b.nKeys-1 || rows > b.nKeys {
		panic(errors.AssertionFailedf("PrefixBytes has accumulated %d keys, asked to Finish %d", b.nKeys, rows))
	}
	if rows == 0 {
		return offset
	}
	// Encode the bundle shift.
	buf[offset] = byte(b.bundleShift)
	offset++

	// Select the sizing struct matching the requested row count.
	sz := &b.sizings[rows&1^1]
	stringDataOffset := uintColumnSize(uint32(sz.offsetCount), offset, sz.offsetEncoding)
	if sz.offsetEncoding.IsDelta() {
		panic(errors.AssertionFailedf("offsets never need delta encoding"))
	}

	width := uint32(sz.offsetEncoding.Width())
	buf[offset] = byte(sz.offsetEncoding)
	offset++
	offset = alignWithZeroes(buf, offset, width)
	switch width {
	case 1:
		offsetDest := makeUintsEncoder[uint8](buf[offset:], sz.offsetCount)
		writePrefixCompressed[uint8](b, rows, sz, offsetDest, buf[stringDataOffset:])
		offsetDest.Finish()
	case align16:
		offsetDest := makeUintsEncoder[uint16](buf[offset:], sz.offsetCount)
		writePrefixCompressed[uint16](b, rows, sz, offsetDest, buf[stringDataOffset:])
		offsetDest.Finish()
	case align32:
		offsetDest := makeUintsEncoder[uint32](buf[offset:], sz.offsetCount)
		writePrefixCompressed[uint32](b, rows, sz, offsetDest, buf[stringDataOffset:])
		offsetDest.Finish()
	default:
		panic("unreachable")
	}
	return stringDataOffset + uint32(sz.compressedDataLen)
}

// Size computes the size required to encode the byte slices beginning at the
// provided offset. The offset is required to ensure proper alignment. The
// returned uint32 is the offset of the first byte after the end of the encoded
// data. To compute the size in bytes, subtract the [offset] passed into Size
// from the returned offset.
func (b *PrefixBytesBuilder) Size(rows int, offset uint32) uint32 {
	if rows == 0 {
		return offset
	} else if rows != b.nKeys && rows != b.nKeys-1 {
		panic(errors.AssertionFailedf("PrefixBytes has accumulated %d keys, asked to Size %d", b.nKeys, rows))
	}
	sz := &b.sizings[rows&1^1]
	// The 1-byte bundleSize.
	offset++
	// Compute the size of the offsets table.
	offset = uintColumnSize(uint32(sz.offsetCount), offset, sz.offsetEncoding)
	return offset + uint32(sz.compressedDataLen)
}

// WriteDebug implements the Encoder interface.
+func (b *PrefixBytesBuilder) WriteDebug(w io.Writer, rows int) { + fmt.Fprintf(w, "prefixbytes(%d): %d keys", b.bundleSize, b.nKeys) +} + +// bundleCalc provides facilities for computing indexes and offsets within a +// PrefixBytes structure. +type bundleCalc struct { + bundleShift uint32 // log2(bundleSize) + // bundleMask is a mask with 1s across the high bits that indicate the + // bundle and 0s for the bits that indicate the position within the bundle. + bundleMask uint32 +} + +func makeBundleCalc(bundleShift uint32) bundleCalc { + return bundleCalc{ + bundleShift: bundleShift, + bundleMask: ^((1 << bundleShift) - 1), + } +} + +// rowSuffixIndex computes the index of the offset encoding the start of a row's +// suffix. Example usage of retrieving the row's suffix: +// +// i := b.rowSuffixIndex(row) +// l := b.rawBytes.offsets.At(i) +// h := b.rawBytes.offsets.At(i + 1) +// suffix := b.rawBytes.slice(l, h) +func (b bundleCalc) rowSuffixIndex(row int) int { + return 1 + (row >> b.bundleShift) + row +} + +// bundleOffsetIndexForRow computes the index of the offset encoding the start +// of a bundle's prefix. +func (b bundleCalc) bundleOffsetIndexForRow(row int) int { + // AND-ing the row with the bundle mask removes the least significant bits + // of the row, which encode the row's index within the bundle. + return int((uint32(row) >> b.bundleShift) + (uint32(row) & b.bundleMask)) +} + +// offsetIndexByBundleIndex computes the index of the offset encoding the start +// of a bundle's prefix when given the bundle's index (an index in +// [0,Rows/BundleSize)). 
+func (b bundleCalc) offsetIndexByBundleIndex(bi int) int { + return bi<>b.bundleShift +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/raw_bytes.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/raw_bytes.go new file mode 100644 index 0000000..226cc56 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/raw_bytes.go @@ -0,0 +1,232 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package colblk + +import ( + "bytes" + "fmt" + "io" + "unsafe" + + "github.com/cockroachdb/pebble/v2/internal/binfmt" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" +) + +// RawBytes holds an array of byte slices, stored as a concatenated data section +// and a series of offsets for each slice. Byte slices within RawBytes are +// stored in their entirety without any compression, ensuring stability without +// copying. +// +// # Representation +// +// An array of N byte slices encodes N+1 offsets. The beginning of the data +// representation holds an offsets table, in the same encoding as a +// DataTypeUint32 column. The integer offsets may be encoded using smaller width +// integers to save space if all offsets fit within an 8-bit or 16-bit uint. +// Each offset is relative to the beginning of the string data section (after +// the offset table). +// +// The use of UintEncoding conserves space in the common case. In the context of +// CockroachDB, the vast majority of offsets will fit in 16-bits when using 32 +// KiB blocks (the size in use by CockroachDB). However, a single value larger +// than 65535 bytes requires an offset too large to fit within 16 bits, in which +// case offsets will be encoded as 32-bit integers. 
+// +// +-------------------------------------------------------------------+ +// | a uint offsets table, usually encoded with 16-bits, | +// | possibly padded for alignment | +// | (see UintEncoding) | +// +-------------------------------------------------------------------+ +// | String Data | +// | abcabcada.... | +// +-------------------------------------------------------------------+ +// +// The UintEncoding bits of the ColumnEncoding for a RawBytes column describes +// the encoding of the offset table. +type RawBytes struct { + slices int + offsets UnsafeOffsets + start unsafe.Pointer + data unsafe.Pointer +} + +// Assert that RawBytes implements Array[[]byte]. +var _ Array[[]byte] = RawBytes{} + +// DecodeRawBytes decodes the structure of a RawBytes, constructing an accessor +// for an array of byte slices constructed by RawBytesBuilder. Count must be the +// number of byte slices within the array. +func DecodeRawBytes(b []byte, offset uint32, count int) (rawBytes RawBytes, endOffset uint32) { + if count == 0 { + return RawBytes{}, offset + } + offsets, dataOff := DecodeUnsafeOffsets(b, offset, count+1 /* +1 offset */) + return RawBytes{ + slices: count, + offsets: offsets, + start: unsafe.Pointer(&b[offset]), + data: unsafe.Pointer(&b[dataOff]), + }, dataOff + offsets.At(count) +} + +// Assert that DecodeRawBytes implements DecodeFunc. 
+var _ DecodeFunc[RawBytes] = DecodeRawBytes + +func defaultSliceFormatter(x []byte) string { + if bytes.ContainsFunc(x, func(r rune) bool { return r < 32 || r > 126 }) { + return fmt.Sprintf("%q", x) + } + return string(x) +} + +func rawBytesToBinFormatter( + f *binfmt.Formatter, tp treeprinter.Node, count int, sliceFormatter func([]byte) string, +) { + if count == 0 { + return + } + if sliceFormatter == nil { + sliceFormatter = defaultSliceFormatter + } + + rb, _ := DecodeRawBytes(f.RelativeData(), uint32(f.RelativeOffset()), count) + dataOffset := uint64(f.RelativeOffset()) + uint64(uintptr(rb.data)-uintptr(rb.start)) + n := tp.Child("offsets table") + uintsToBinFormatter(f, n, count+1, func(offset, base uint64) string { + // NB: base is always zero for RawBytes columns. + return fmt.Sprintf("%d [%d overall]", offset+base, offset+base+dataOffset) + }) + n = tp.Child("data") + for i := 0; i < rb.slices; i++ { + s := rb.At(i) + f.HexBytesln(len(s), "data[%d]: %s", i, sliceFormatter(s)) + } + f.ToTreePrinter(n) +} + +//gcassert:inline +func (b *RawBytes) ptr(offset uint32) unsafe.Pointer { + return unsafe.Pointer(uintptr(b.data) + uintptr(offset)) +} + +//gcassert:inline +func (b *RawBytes) Slice(start, end uint32) []byte { + return unsafe.Slice((*byte)(b.ptr(start)), end-start) +} + +//gcassert:inline +func (b *RawBytes) Offsets(i int) (start, end uint32) { + invariants.CheckBounds(i, b.slices) + return b.offsets.At2(i) +} + +// At returns the []byte at index i. The returned slice should not be mutated. +func (b RawBytes) At(i int) []byte { + invariants.CheckBounds(i, b.slices) + return b.Slice(b.offsets.At2(i)) +} + +// Slices returns the number of []byte slices encoded within the RawBytes. +func (b *RawBytes) Slices() int { + return b.slices +} + +// RawBytesBuilder encodes a column of byte slices. +type RawBytesBuilder struct { + rows int + data []byte + offsets UintBuilder +} + +// Assert that *RawBytesBuilder implements ColumnWriter. 
+var _ ColumnWriter = (*RawBytesBuilder)(nil) + +// Init initializes the builder for first-time use. +func (b *RawBytesBuilder) Init() { + b.offsets.Init() + b.Reset() +} + +// Reset resets the builder to an empty state. +func (b *RawBytesBuilder) Reset() { + b.rows = 0 + b.data = b.data[:0] + b.offsets.Reset() + // Add an initial offset of zero to streamline the logic in RawBytes.At() to + // avoid needing a special case for row 0. + b.offsets.Set(0, 0) +} + +// NumColumns implements ColumnWriter. +func (b *RawBytesBuilder) NumColumns() int { return 1 } + +// DataType implements ColumnWriter. +func (b *RawBytesBuilder) DataType(int) DataType { return DataTypeBytes } + +// Put appends the provided byte slice to the builder. +func (b *RawBytesBuilder) Put(s []byte) { + b.data = append(b.data, s...) + b.rows++ + b.offsets.Set(b.rows, uint64(len(b.data))) +} + +// PutConcat appends a single byte slice formed by the concatenation of the two +// byte slice arguments. +func (b *RawBytesBuilder) PutConcat(s1, s2 []byte) { + b.data = append(append(b.data, s1...), s2...) + b.rows++ + b.offsets.Set(b.rows, uint64(len(b.data))) +} + +// Rows returns the count of slices that have been added to the builder. +func (b *RawBytesBuilder) Rows() int { + return b.rows +} + +// UnsafeGet returns the i'th slice added to the builder. The returned slice is +// owned by the builder and must not be mutated. +func (b *RawBytesBuilder) UnsafeGet(i int) []byte { + if b.rows == 0 { + return nil + } + start := unsafeGetUint64(b.offsets.elems, i) + end := unsafeGetUint64(b.offsets.elems, i+1) + return b.data[start:end] +} + +// Finish writes the serialized byte slices to buf starting at offset. The buf +// slice must be sufficiently large to store the serialized output. The caller +// should use [Size] to size buf appropriately before calling Finish. 
+func (b *RawBytesBuilder) Finish(col, rows int, offset uint32, buf []byte) uint32 { + if rows == 0 { + return offset + } + dataLen := b.offsets.Get(rows) + offset = b.offsets.Finish(0, rows+1, offset, buf) + // Copy the data section. + return offset + uint32(copy(buf[offset:], b.data[:dataLen])) +} + +// Size computes the size required to encode the byte slices beginning in a +// buffer at the provided offset. The offset is required to ensure proper +// alignment. The returned uint32 is the offset of the first byte after the end +// of the encoded data. To compute the size in bytes, subtract the [offset] +// passed into Size from the returned offset. +func (b *RawBytesBuilder) Size(rows int, offset uint32) uint32 { + if rows == 0 { + return offset + } + // Get the size needed to encode the rows+1 offsets. + offset = b.offsets.Size(rows+1, offset) + // Add the value of offset[rows] since that is the accumulated size of the + // first [rows] slices. + return offset + uint32(b.offsets.Get(rows)) +} + +// WriteDebug implements Encoder. +func (b *RawBytesBuilder) WriteDebug(w io.Writer, rows int) { + fmt.Fprintf(w, "bytes: %d rows set; %d bytes in data", b.rows, len(b.data)) +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/uints.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/uints.go new file mode 100644 index 0000000..9077bc4 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/uints.go @@ -0,0 +1,518 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package colblk + +import ( + "encoding/binary" + "fmt" + "io" + "math" + "math/bits" + "unsafe" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/binfmt" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" + "golang.org/x/exp/constraints" +) + +// Uint is a constraint that permits any unsigned integer type with an +// explicit size. +type Uint interface { + ~uint8 | ~uint16 | ~uint32 | ~uint64 +} + +// UintEncoding indicates how unsigned integers (of at most 64 bits) are +// encoded. It has two components: +// - the low bits indicate how many bytes per integer are used, with +// allowed values 0, 1, 2, 4, or 8. +// - whether we are using a delta encoding, meaning that a base (64-bit) value +// is encoded separately and each encoded value is a delta from that base. +// Delta encoding is never necessary when we use 8 bytes per integer. +// +// Note that 0-byte encodings imply that all values are equal (either to the +// base value if we are using a delta encoding, otherwise to 0). +// +// The UintEncoding byte is serialized to the uint column before the column +// data. +type UintEncoding uint8 + +const uintEncodingDeltaBit UintEncoding = 1 << 7 +const uintEncodingAllZero UintEncoding = 0 + +// IsDelta returns true if it is a delta encoding. +func (e UintEncoding) IsDelta() bool { + return e&uintEncodingDeltaBit != 0 +} + +// Width returns the number of bytes used per integer. It can be 0, 1, 2, 4, or 8. +func (e UintEncoding) Width() int { + return int(e &^ uintEncodingDeltaBit) +} + +// IsValid returns true if the encoding is valid. +func (e UintEncoding) IsValid() bool { + switch e.Width() { + case 0, 1, 2, 4: + return true + case 8: + // We should never need to do delta encoding if we store all 64 bits. + return !e.IsDelta() + default: + return false + } +} + +// String implements fmt.Stringer. 
+func (e UintEncoding) String() string { + if e.Width() == 0 { + if e.IsDelta() { + return "const" + } + return "zero" + } + deltaString := "" + if e.IsDelta() { + deltaString = ",delta" + } + return fmt.Sprintf("%db%s", e.Width(), deltaString) +} + +// UintEncodingRowThreshold is the threshold under which the number of rows can +// affect the best encoding. This happens when the constant 8 bytes for the +// delta base doesn't make up for saving a byte or two in the per-row encoding. +const UintEncodingRowThreshold = 8 + +// DetermineUintEncoding returns the best valid encoding that can be used to +// represent numRows integers in the range [minValue, maxValue]. +// +// DetermineUintEncoding returns the same result for any value of rows >= +// UintEncodingRowThreshold. +func DetermineUintEncoding(minValue, maxValue uint64, numRows int) UintEncoding { + b := byteWidth(maxValue - minValue) + if b == 8 { + return UintEncoding(8) + } + // Check if we can use the same number of bytes without a delta encoding. + isDelta := maxValue >= (1 << (b << 3)) + if isDelta && numRows < UintEncodingRowThreshold { + bNoDelta := byteWidth(maxValue) + // Check if saving (bNoDelta-b) bytes per row makes up for the 8 bytes + // required by the delta base. + if numRows*int(bNoDelta-b) < 8 { + b = bNoDelta + isDelta = false + } + } + return makeUintEncoding(b, isDelta) +} + +// DetermineUintEncodingNoDelta is a more efficient variant of +// DetermineUintEncoding when minValue is zero (or we don't need a delta +// encoding). +func DetermineUintEncodingNoDelta(maxValue uint64) UintEncoding { + return makeUintEncoding(byteWidth(maxValue), false /* isDelta */) +} + +// byteWidthTable maps a number’s bit‐length to the number of bytes needed. 
+var byteWidthTable = [65]uint8{
+	// 0 bits => 0 bytes
+	0,
+	// 1..8 bits => 1 byte
+	1, 1, 1, 1, 1, 1, 1, 1,
+	// 9..16 bits => 2 bytes
+	2, 2, 2, 2, 2, 2, 2, 2,
+	// 17..32 bits => 4 bytes (3-byte widths are not supported)
+	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+	// 33..64 bits => 8 bytes
+	8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+	8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+}
+
+// byteWidth returns the number of bytes necessary to represent the given value,
+// either 0, 1, 2, 4, or 8.
+func byteWidth(maxValue uint64) uint8 {
+	// bits.Len64 returns 0 for 0, 1..64 for others.
+	// We then simply return the precomputed result.
+	return byteWidthTable[bits.Len64(maxValue)]
+}
+
+// makeUintEncoding combines a byte width and a delta flag into a UintEncoding,
+// panicking under invariants if the combination is invalid (e.g. 8-byte delta).
+func makeUintEncoding(width uint8, isDelta bool) UintEncoding {
+	e := UintEncoding(width)
+	if isDelta {
+		e |= uintEncodingDeltaBit
+	}
+	if invariants.Enabled && !e.IsValid() {
+		panic(e)
+	}
+	return e
+}
+
+// UintBuilder builds a column of unsigned integers. It uses the smallest
+// possible UintEncoding, depending on the values.
+type UintBuilder struct {
+	// configuration fixed on Init; preserved across Reset
+	useDefault bool
+
+	elems []uint64
+
+	// stats holds state for the purpose of tracking which UintEncoding would
+	// be used if the caller Finished the column including all elements Set so
+	// far. The stats state is used by Size (and Finish) to cheaply determine
+	// which encoding may most concisely encode the array.
+	//
+	// Every Set(i, v) call updates minimum and maximum if necessary. If a call
+	// updates minimum, maximum or both, it recalculates the encoding and if it
+	// changed, sets encodingRow=i, indicating which row last updated the
+	// width.
+	//
+	// Any call to Size or Finish that supplies [rows] that's inclusive of the
+	// index stored in encodingRow may use the stored width. Calls with fewer
+	// [rows] must recompute the min/max. In expected usage, only Finish will be
+	// called with fewer rows and only with one less row than has been set,
+	// meaning that only if the last row updated the width is a recomputation
+	// necessary.
+	//
+	// TODO(jackson): There is a small discrete set of possible encodings, so we
+	// could instead track the index of the first row that makes each encoding
+	// impossible. This would allow us to avoid recomputing the min/max in all
+	// cases. Or, if we limit the API to only allow Finish to be called with one
+	// less than the last set row, we could maintain the width of only the last
+	// two rows.
+	stats struct {
+		minimum     uint64
+		maximum     uint64
+		encoding    UintEncoding
+		encodingRow int // index of last update to encoding
+	}
+}
+
+// Init initializes the UintBuilder.
+func (b *UintBuilder) Init() {
+	b.init(false)
+}
+
+// InitWithDefault initializes the UintBuilder. Any rows that are not explicitly
+// set are assumed to be zero. For the purpose of determining whether a delta
+// encoding is possible, the column is assumed to contain at least 1 default
+// value.
+//
+// InitWithDefault may be preferable when a nonzero value is uncommon, and the
+// caller can avoid explicitly Set-ing every zero value.
+func (b *UintBuilder) InitWithDefault() {
+	b.init(true)
+}
+
+// init records the default-value configuration and resets all builder state.
+func (b *UintBuilder) init(useDefault bool) {
+	b.useDefault = useDefault
+	b.Reset()
+}
+
+// NumColumns implements ColumnWriter.
+func (b *UintBuilder) NumColumns() int { return 1 }
+
+// DataType implements ColumnWriter.
+func (b *UintBuilder) DataType(int) DataType { return DataTypeUint }
+
+// Reset implements ColumnWriter and resets the builder, reusing existing
+// allocated memory.
+func (b *UintBuilder) Reset() {
+	if b.useDefault {
+		// If the caller configured a default zero, we assume that the array
+		// will include at least one default value.
+		b.stats.minimum = 0
+		b.stats.maximum = 0
+		clear(b.elems)
+	} else {
+		b.stats.minimum = math.MaxUint64
+		b.stats.maximum = 0
+		// We could reset all values as a precaution, but it has a visible cost
+		// in benchmarks.
+		if invariants.Enabled && invariants.Sometimes(50) {
+			// Poison stale values so reads of unset rows fail loudly in
+			// invariants builds.
+			for i := range b.elems {
+				b.elems[i] = math.MaxUint64
+			}
+		}
+	}
+	b.stats.encoding = uintEncodingAllZero
+	b.stats.encodingRow = 0
+}
+
+// Get gets the value of the provided row index. The provided row must have been
+// Set or the builder must have been initialized with InitWithDefault.
+func (b *UintBuilder) Get(row int) uint64 {
+	// If the UintBuilder is configured to use a zero value for unset rows, it's
+	// possible that the array has not been grown to a size that includes [row].
+	if len(b.elems) <= row {
+		if invariants.Enabled && !b.useDefault {
+			panic(errors.AssertionFailedf("Get(%d) on UintBuilder with array of size %d", row, len(b.elems)))
+		}
+		return 0
+	}
+	return b.elems[row]
+}
+
+// Set sets the value of the provided row index to v.
+func (b *UintBuilder) Set(row int, v uint64) {
+	if len(b.elems) <= row {
+		// Double the size of the allocated array, or initialize it to at least 32
+		// values (256 bytes) if this is the first allocation. Then double until
+		// there's sufficient space.
+		n2 := max(len(b.elems)<<1, 32)
+		for n2 <= row {
+			n2 <<= 1 // double the size
+		}
+		// NB: Go guarantees the allocated array will be 64-bit aligned.
+		newElems := make([]uint64, n2)
+		copy(newElems, b.elems)
+		b.elems = newElems
+	}
+	// Maintain the running minimum and maximum for the purpose of maintaining
+	// knowledge of the delta encoding that would be used.
+	if b.stats.minimum > v || b.stats.maximum < v || row < UintEncodingRowThreshold {
+		b.stats.minimum = min(v, b.stats.minimum)
+		b.stats.maximum = max(v, b.stats.maximum)
+		// If updating the minimum and maximum means that we now must use a wider
+		// width integer, update the encoding and the index of the update to it.
+		if e := DetermineUintEncoding(b.stats.minimum, b.stats.maximum, row+1); e != b.stats.encoding {
+			b.stats.encoding = e
+			b.stats.encodingRow = row
+		}
+	}
+	b.elems[row] = v
+}
+
+// Size implements ColumnWriter and returns the size of the column if its first
+// [rows] rows were serialized, serializing the column into offset [offset].
+func (b *UintBuilder) Size(rows int, offset uint32) uint32 {
+	if rows == 0 {
+		return offset
+	}
+	e, _ := b.determineEncoding(rows)
+	return uintColumnSize(uint32(rows), offset, e)
+}
+
+// determineEncoding determines the best encoding for a column containing the
+// first [rows], along with a lower bound on all the values which can be used as
+// a "base" if the encoding is a delta encoding.
+func (b *UintBuilder) determineEncoding(rows int) (_ UintEncoding, deltaBase uint64) {
+	if b.stats.encodingRow < rows {
+		// b.stats.encoding became the current value within the first [rows], so we
+		// can use it.
+		//
+		// Note that if useDefault is set, this encoding assumes there is at least
+		// one element with the default (zero) value, which might be pessimistic.
+		//
+		// Note that b.stats.minimum includes all rows set so far so it might be
+		// strictly smaller than all values up to [rows]; but it is still a suitable
+		// base for b.stats.encoding.
+		if invariants.Enabled && invariants.Sometimes(1) && rows > 0 {
+			// Occasionally cross-check the cached (fast-path) encoding
+			// against a full recomputation.
+			if enc, _ := b.recalculateEncoding(rows); enc != b.stats.encoding {
+				panic(fmt.Sprintf("fast and slow paths don't agree: %s vs %s", b.stats.encoding, enc))
+			}
+		}
+		return b.stats.encoding, b.stats.minimum
+	}
+	return b.recalculateEncoding(rows)
+}
+
+// recalculateEncoding is the slow path of determineEncoding: it rescans the
+// first [rows] elements to recompute the min/max and derive the encoding.
+func (b *UintBuilder) recalculateEncoding(rows int) (_ UintEncoding, deltaBase uint64) {
+	// We have to recalculate the minimum and maximum.
+	minimum, maximum := computeMinMax(b.elems[:min(rows, len(b.elems))])
+	if b.useDefault {
+		// Mirror the pessimism of the fast path so that the result is consistent.
+		// Otherwise, adding a row can result in a different encoding even when not
+		// including that row.
+		minimum = 0
+	}
+	return DetermineUintEncoding(minimum, maximum, rows), minimum
+}
+
+// uintColumnSize returns the size of a column of [rows] unsigned integers,
+// encoded at the provided offset using the provided encoding (including the
+// encoding byte, the optional 8-byte delta base, and any alignment padding).
+func uintColumnSize(rows, offset uint32, e UintEncoding) uint32 {
+	offset++ // DeltaEncoding byte
+	if e.IsDelta() {
+		// A delta encoding will be used. We need to first account for the constant
+		// that encodes the base value.
+		offset += 8
+	}
+	width := uint32(e.Width())
+	// Include alignment bytes necessary to align offset appropriately for
+	// elements of the delta width.
+	if width > 0 {
+		offset = align(offset, width)
+	}
+	// Now account for the array of [rows] x w elements encoding the deltas.
+	return offset + rows*width
+}
+
+// Finish implements ColumnWriter, serializing the column into offset [offset] of
+// [buf].
+func (b *UintBuilder) Finish(col, rows int, offset uint32, buf []byte) uint32 {
+	if rows == 0 {
+		return offset
+	}
+
+	e, minimum := b.determineEncoding(rows)
+
+	// NB: values may be shorter than rows when useDefault is set; the
+	// missing tail rows are treated as zero.
+	values := b.elems[:min(rows, len(b.elems))]
+	return uintColumnFinish(rows, minimum, values, e, offset, buf)
+}
+
+// uintColumnFinish finishes the column of unsigned integers of type T, applying
+// the given encoding.
+func uintColumnFinish(
+	rows int, minimum uint64, values []uint64, e UintEncoding, offset uint32, buf []byte,
+) uint32 {
+	buf[offset] = byte(e)
+	offset++
+
+	deltaBase := uint64(0)
+	if e.IsDelta() {
+		deltaBase = minimum
+		binary.LittleEndian.PutUint64(buf[offset:], minimum)
+		offset += 8
+	}
+	width := uint32(e.Width())
+	if width == 0 {
+		// All the column values are the same.
+		return offset
+	}
+	// Align the offset appropriately.
+	offset = alignWithZeroes(buf, offset, width)
+
+	if invariants.Enabled && len(buf) < int(offset)+rows*e.Width() {
+		panic("buffer too small")
+	}
+	// For widths > 1 the buffer region is reinterpreted as a typed slice;
+	// offset was just aligned to the element width above, which makes the
+	// unsafe.Slice conversions below well-defined.
+	switch e.Width() {
+	case 1:
+		dest := buf[offset : offset+uint32(rows)]
+		reduceUints(deltaBase, values, dest)
+
+	case 2:
+		dest := unsafe.Slice((*uint16)(unsafe.Pointer(&buf[offset])), rows)
+		reduceUints(deltaBase, values, dest)
+		if BigEndian {
+			ReverseBytes16(dest)
+		}
+
+	case 4:
+		dest := unsafe.Slice((*uint32)(unsafe.Pointer(&buf[offset])), rows)
+		reduceUints(deltaBase, values, dest)
+		if BigEndian {
+			ReverseBytes32(dest)
+		}
+
+	case 8:
+		if deltaBase != 0 {
+			panic("unreachable")
+		}
+		dest := unsafe.Slice((*uint64)(unsafe.Pointer(&buf[offset])), rows)
+		copy(dest, values)
+		// Zero any tail rows not present in values (useDefault builders).
+		for i := len(values); i < len(dest); i++ {
+			dest[i] = 0
+		}
+		if BigEndian {
+			ReverseBytes64(dest)
+		}
+
+	default:
+		panic("unreachable")
+	}
+	return offset + uint32(rows)*width
+}
+
+// WriteDebug implements Encoder.
+func (b *UintBuilder) WriteDebug(w io.Writer, rows int) {
+	fmt.Fprintf(w, "%s: %d rows", DataTypeUint, rows)
+}
+
+// reduceUints reduces the bit-width of a slice of unsigned integers by
+// subtracting a minimum value from each element and writing it to dst. For
+// example,
+//
+//	reduceUints[uint8](10, []uint64{10, 11, 12}, dst)
+//
+// could be used to reduce a slice of uint64 values to uint8 values {0, 1, 2}.
+//
+// The values slice can be smaller than dst; in that case, the values between
+// len(values) and len(dst) are assumed to be 0.
+func reduceUints[N constraints.Integer](minimum uint64, values []uint64, dst []N) {
+	// Bounds-check-elimination hint: prove dst is long enough once so the
+	// loop's dst[i] accesses need no per-iteration checks.
+	// NOTE(review): this panics if values is empty; callers appear to reach
+	// here only with a nonzero width, which implies len(values) > 0 — confirm.
+	_ = dst[len(values)-1]
+	for i, v := range values {
+		if invariants.Enabled {
+			if v < minimum {
+				panic("incorrect minimum value")
+			}
+			// N(0)-1 is the maximum representable value of the target type.
+			if v-minimum > uint64(N(0)-1) {
+				panic("incorrect target width")
+			}
+		}
+		//gcassert:bce
+		dst[i] = N(v - minimum)
+	}
+	// Tail rows beyond len(values) are implicitly zero, which is only valid
+	// when the delta base is zero.
+	if invariants.Enabled && len(values) < len(dst) && minimum != 0 {
+		panic("incorrect minimum value")
+	}
+	for i := len(values); i < len(dst); i++ {
+		dst[i] = 0
+	}
+}
+
+// computeMinMax computes the minimum and the maximum of the provided slice of
+// unsigned integers. For an empty slice it returns (max value of I, 0), i.e.
+// minimum > maximum.
+func computeMinMax[I constraints.Unsigned](values []I) (I, I) {
+	// I(0) - 1 wraps to the maximum value representable by I.
+	minimum := I(0) - 1
+	maximum := I(0)
+	for _, v := range values {
+		minimum = min(minimum, v)
+		maximum = max(maximum, v)
+	}
+	return minimum, maximum
+}
+
+// uintsToBinFormatter pretty-prints a serialized uint column (encoding byte,
+// optional delta base, padding, and per-row data) into the tree printer.
+func uintsToBinFormatter(
+	f *binfmt.Formatter, tp treeprinter.Node, rows int, uintFormatter func(el, base uint64) string,
+) {
+	if rows == 0 {
+		return
+	}
+	if uintFormatter == nil {
+		uintFormatter = func(v, base uint64) string {
+			if base == 0 {
+				return fmt.Sprint(v)
+			}
+			return fmt.Sprintf("%d + %d = %d", v, base, base+v)
+		}
+	}
+
+	e := UintEncoding(f.PeekUint(1)) // UintEncoding byte
+	if !e.IsValid() {
+		panic(fmt.Sprintf("%d", e))
+	}
+	f.HexBytesln(1, "encoding: %s", e)
+
+	var base uint64
+	if e.IsDelta() {
+		base = f.PeekUint(8)
+		f.HexBytesln(8, "64-bit constant: %d", base)
+	}
+	width := e.Width()
+	if width == 0 {
+		// The column is zero or constant.
+ f.ToTreePrinter(tp) + return + } + + if off := align(f.RelativeOffset(), width); off != f.RelativeOffset() { + f.HexBytesln(off-f.RelativeOffset(), "padding (aligning to %d-bit boundary)", width*8) + } + for i := 0; i < rows; i++ { + f.HexBytesln(width, "data[%d] = %s", i, uintFormatter(f.PeekUint(width), base)) + } + f.ToTreePrinter(tp) +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/uints_decode.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/uints_decode.go new file mode 100644 index 0000000..c77b91c --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk/uints_decode.go @@ -0,0 +1,34 @@ +// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package colblk + +import ( + "unsafe" + + "github.com/cockroachdb/errors" +) + +// unsafeUint64Decoder is used to access 64-bit unsigned integer backed by a +// buffer in little-ending format. It is desirable to keep this type as small as +// possible since it is embedded multiple times in block decoders. +// +// The At() method is defined in endian_little.go and endian_big.go. +type unsafeUint64Decoder struct { + ptr unsafe.Pointer +} + +func makeUnsafeUint64Decoder(buf []byte, n int) unsafeUint64Decoder { + if n == 0 { + return unsafeUint64Decoder{} + } + ptr := unsafe.Pointer(unsafe.SliceData(buf)) + if align(uintptr(ptr), align64) != uintptr(ptr) { + panic(errors.AssertionFailedf("slice pointer %p not %d-byte aligned", ptr, align64)) + } + if len(buf) < n< 0 { + off = align(off, uint32(w)) + } + return makeUnsafeUints(base, unsafe.Pointer(&b[off]), w), off + uint32(rows*w) +} + +// Assert that DecodeUnsafeIntegerSlice implements DecodeFunc. 
+var _ DecodeFunc[UnsafeUints] = DecodeUnsafeUints + +func makeUnsafeUints(base uint64, ptr unsafe.Pointer, width int) UnsafeUints { + switch width { + case 0, 1, 2, 4, 8: + default: + panic("invalid width") + } + return UnsafeUints{ + base: base, + ptr: ptr, + width: uint8(width), + } +} + +// UnsafeOffsets is a specialization of UnsafeInts (providing the same +// functionality) which is optimized when the integers are offsets inside a +// column block. It can only be used with 0, 1, 2, or 4 byte encoding without +// delta. +// +// The At() and At2() methods are defined in endian_little.go and endian_big.go. +type UnsafeOffsets struct { + ptr unsafe.Pointer + width uint8 +} + +// DecodeUnsafeOffsets decodes the structure of a slice of offsets from a byte +// slice. +func DecodeUnsafeOffsets(b []byte, off uint32, rows int) (_ UnsafeOffsets, endOffset uint32) { + ints, endOffset := DecodeUnsafeUints(b, off, rows) + if ints.base != 0 || ints.width == 8 { + panic(errors.AssertionFailedf("unexpected offsets encoding (base=%d, width=%d)", ints.base, ints.width)) + } + return UnsafeOffsets{ + ptr: ints.ptr, + width: ints.width, + }, endOffset +} + +// unsafeGetUint32 is just like slice[idx] but without bounds checking. +// +//gcassert:inline +func unsafeGetUint32(slice []uint32, idx int) uint32 { + if invariants.Enabled { + _ = slice[idx] + } + return *(*uint32)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(slice)), uintptr(idx)<0), we + // compute padding appropriately assuming the current byte f.Offset() is + // aligned. + f.SetAnchorOffset() + + n := tp.Child("reference liveness block header") + d.bd.HeaderToBinFormatter(f, n) + d.bd.ColumnToBinFormatter(f, n, 0, d.bd.Rows()) + f.HexBytesln(1, "block padding byte") + f.ToTreePrinter(n) +} + +// BlockDecoder returns the block decoder for the reference liveness block. 
+func (d *ReferenceLivenessBlockDecoder) BlockDecoder() *BlockDecoder { + return &d.bd +} + +// LivenessAtReference returns the liveness of the reference (the given index). +func (d *ReferenceLivenessBlockDecoder) LivenessAtReference(i int) []byte { + return d.values.At(i) +} + +// Assert that an ReferenceLivenessBlockDecoder can fit inside block.Metadata. +const _ uint = block.MetadataSize - uint(unsafe.Sizeof(ReferenceLivenessBlockDecoder{})) diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk_writer.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk_writer.go new file mode 100644 index 0000000..95fc415 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/colblk_writer.go @@ -0,0 +1,1298 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package sstable + +import ( + "bytes" + "context" + "encoding/binary" + "fmt" + "math" + "slices" + "sync" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/bytealloc" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/sstable/blob" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/sstable/block/blockkind" + "github.com/cockroachdb/pebble/v2/sstable/colblk" + "github.com/cockroachdb/pebble/v2/sstable/rowblk" + "github.com/cockroachdb/pebble/v2/sstable/valblk" +) + +// RawColumnWriter is a sstable RawWriter that writes sstables with +// column-oriented blocks. All table formats TableFormatPebblev5 and later write +// column-oriented blocks and use RawColumnWriter. 
+type RawColumnWriter struct { + comparer *base.Comparer + meta WriterMetadata + opts WriterOptions + err error + + dataFlush block.FlushGovernor + indexFlush block.FlushGovernor + blockPropCollectors []BlockPropertyCollector + blockPropsEncoder blockPropertiesEncoder + obsoleteCollector obsoleteKeyBlockPropertyCollector + props Properties + // block writers buffering unflushed data. + dataBlock struct { + colblk.DataBlockEncoder + // numDeletions stores the count of point tombstones in this data block. + // It's used to determine if this data block is considered + // tombstone-dense for the purposes of compaction. + numDeletions int + // deletionSize stores the raw size of point tombstones in this data + // block. It's used to determine if this data block is considered + // tombstone-dense for the purposes of compaction. + deletionSize int + } + indexBlock colblk.IndexBlockWriter + topLevelIndexBlock colblk.IndexBlockWriter + rangeDelBlock colblk.KeyspanBlockWriter + rangeKeyBlock colblk.KeyspanBlockWriter + valueBlock *valblk.Writer // nil iff WriterOptions.DisableValueBlocks=true + blobRefLivenessIndexBlock blobRefValueLivenessWriter + // filter accumulates the filter block. If populated, the filter ingests + // either the output of w.split (i.e. a prefix extractor) if w.split is not + // nil, or the full keys otherwise. + filterBlock filterWriter + prevPointKey struct { + trailer base.InternalKeyTrailer + isObsolete bool + } + pendingDataBlockSize int + indexBlockSize int + queuedDataSize uint64 + + // indexBuffering holds finished index blocks as they're completed while + // building the sstable. If an index block grows sufficiently large + // (IndexBlockSize) while an sstable is still being constructed, the sstable + // writer will create a two-level index structure. As index blocks are + // completed, they're finished and buffered in-memory until the table is + // finished. 
When the table is finished, the buffered index blocks are + // flushed in order after all the data blocks, and the top-level index block + // is constructed to point to all the individual index blocks. + indexBuffering struct { + // partitions holds all the completed, uncompressed index blocks. + // + // TODO(jackson): We should consider compressing these index blocks now, + // while buffering, to reduce the memory usage of the writer. + partitions []bufferedIndexBlock + // partitionSizeSum is the sum of the sizes of all the completed index + // blocks (in `partitions`). + partitionSizeSum uint64 + // blockAlloc is used to bulk-allocate byte slices used to store index + // blocks in partitions. These live until the sstable is finished. + blockAlloc []byte + // sepAlloc is used to bulk-allocate index block separator slices stored + // in partitions. These live until the sstable is finished. + sepAlloc bytealloc.A + } + + writeQueue struct { + wg sync.WaitGroup + ch chan *compressedBlock + err error + } + layout layoutWriter + + lastKeyBuf []byte + separatorBuf []byte + tmp [blockHandleLikelyMaxLen]byte + previousUserKey invariants.Value[[]byte] + validator invariants.Value[*colblk.DataBlockValidator] + disableKeyOrderChecks bool + cpuMeasurer base.CPUMeasurer +} + +// Assert that *RawColumnWriter implements RawWriter. +var _ RawWriter = (*RawColumnWriter)(nil) + +// cpuMeasurer, if non-nil, is only used for calling +// cpuMeasurer.MeasureCPUSSTableSecondary. 
+func newColumnarWriter( + writable objstorage.Writable, o WriterOptions, cpuMeasurer base.CPUMeasurer, +) *RawColumnWriter { + if writable == nil { + panic("pebble: nil writable") + } + if !o.TableFormat.BlockColumnar() { + panic(errors.AssertionFailedf("newColumnarWriter cannot create sstables with %s format", o.TableFormat)) + } + o = o.ensureDefaults() + w := &RawColumnWriter{ + comparer: o.Comparer, + meta: WriterMetadata{ + SmallestSeqNum: math.MaxUint64, + }, + opts: o, + layout: makeLayoutWriter(writable, o), + disableKeyOrderChecks: o.internal.DisableKeyOrderChecks, + } + w.dataFlush = block.MakeFlushGovernor(o.BlockSize, o.BlockSizeThreshold, o.SizeClassAwareThreshold, o.AllocatorSizeClasses) + w.indexFlush = block.MakeFlushGovernor(o.IndexBlockSize, o.BlockSizeThreshold, o.SizeClassAwareThreshold, o.AllocatorSizeClasses) + w.dataBlock.Init(o.KeySchema) + w.indexBlock.Init() + w.topLevelIndexBlock.Init() + w.rangeDelBlock.Init(w.comparer.Equal) + w.rangeKeyBlock.Init(w.comparer.Equal) + if !o.DisableValueBlocks { + w.valueBlock = valblk.NewWriter( + block.MakeFlushGovernor(o.BlockSize, o.BlockSizeThreshold, o.SizeClassAwareThreshold, o.AllocatorSizeClasses), + &w.layout.compressor, w.opts.Checksum, func(compressedSize int) {}) + } + if o.FilterPolicy != base.NoFilterPolicy { + switch o.FilterType { + case TableFilter: + w.filterBlock = newTableFilterWriter(o.FilterPolicy) + default: + panic(fmt.Sprintf("unknown filter type: %v", o.FilterType)) + } + } + + numBlockPropertyCollectors := len(o.BlockPropertyCollectors) + if !o.disableObsoleteCollector { + numBlockPropertyCollectors++ + } + if numBlockPropertyCollectors > maxPropertyCollectors { + panic(errors.New("pebble: too many block property collectors")) + } + w.blockPropCollectors = make([]BlockPropertyCollector, 0, numBlockPropertyCollectors) + for _, constructFn := range o.BlockPropertyCollectors { + w.blockPropCollectors = append(w.blockPropCollectors, constructFn()) + } + if 
!o.disableObsoleteCollector { + w.blockPropCollectors = append(w.blockPropCollectors, &w.obsoleteCollector) + } + var buf bytes.Buffer + buf.WriteString("[") + for i := range w.blockPropCollectors { + if i > 0 { + buf.WriteString(",") + } + buf.WriteString(w.blockPropCollectors[i].Name()) + } + buf.WriteString("]") + w.props.PropertyCollectorNames = buf.String() + + w.props.ComparerName = o.Comparer.Name + w.props.CompressionName = o.Compression.Name + w.props.KeySchemaName = o.KeySchema.Name + w.props.MergerName = o.MergerName + + w.writeQueue.ch = make(chan *compressedBlock) + w.writeQueue.wg.Add(1) + w.cpuMeasurer = cpuMeasurer + go w.drainWriteQueue() + + return w +} + +// Error returns the current accumulated error if any. +func (w *RawColumnWriter) Error() error { + return w.err +} + +// EstimatedSize returns the estimated size of the sstable being written if +// a call to Close() was made without adding additional keys. +func (w *RawColumnWriter) EstimatedSize() uint64 { + // Start with the size of the footer, which is fixed, and the size of all + // the finished data blocks. The size of the finished data blocks is all + // post-compression and the footer is not compressed, so this initial + // quantity is exact. + sz := uint64(w.opts.TableFormat.FooterSize()) + w.queuedDataSize + + // Add the size of value blocks. If any value blocks have already been + // finished, these blocks will contribute post-compression size. If there is + // currently an unfinished value block, it will contribute its pre-compression + // size. + if w.valueBlock != nil { + sz += w.valueBlock.Size() + } + + // Add the size of the completed but unflushed index partitions, the + // unfinished data block, the unfinished index block, the unfinished range + // deletion block, and the unfinished range key block. + // + // All of these sizes are uncompressed sizes. 
It's okay to be pessimistic + // here and use the uncompressed size because all this memory is buffered + // until the sstable is finished. Including the uncompressed size bounds + // the memory usage used by the writer to the physical size limit. + sz += w.indexBuffering.partitionSizeSum + sz += uint64(w.dataBlock.Size()) + sz += uint64(w.indexBlockSize) + if w.rangeDelBlock.KeyCount() > 0 { + sz += uint64(w.rangeDelBlock.Size()) + } + if w.rangeKeyBlock.KeyCount() > 0 { + sz += uint64(w.rangeKeyBlock.Size()) + } + + // TODO(jackson): Include an estimate of the properties, filter and meta + // index blocks sizes. + return sz +} + +// ComparePrev compares the provided user to the last point key written to the +// writer. The returned value is equivalent to Compare(key, prevKey) where +// prevKey is the last point key written to the writer. +// +// If no key has been written yet, ComparePrev returns +1. +// +// Must not be called after Writer is closed. +func (w *RawColumnWriter) ComparePrev(k []byte) int { + if w == nil || w.dataBlock.Rows() == 0 { + return +1 + } + return int(w.dataBlock.KeyWriter.ComparePrev(k).UserKeyComparison) +} + +// SetSnapshotPinnedProperties sets the properties for pinned keys. Should only +// be used internally by Pebble. +func (w *RawColumnWriter) SetSnapshotPinnedProperties( + pinnedKeyCount, pinnedKeySize, pinnedValueSize uint64, +) { + w.props.SnapshotPinnedKeys = pinnedKeyCount + w.props.SnapshotPinnedKeySize = pinnedKeySize + w.props.SnapshotPinnedValueSize = pinnedValueSize +} + +// Metadata returns the metadata for the finished sstable. Only valid to call +// after the sstable has been finished. +func (w *RawColumnWriter) Metadata() (*WriterMetadata, error) { + if !w.layout.IsFinished() { + return nil, errors.New("pebble: writer is not closed") + } + return &w.meta, nil +} + +// EncodeSpan encodes the keys in the given span. The span can contain either +// only RANGEDEL keys or only range keys. 
+func (w *RawColumnWriter) EncodeSpan(span keyspan.Span) error { + if span.Empty() { + return nil + } + for _, k := range span.Keys { + w.meta.updateSeqNum(k.SeqNum()) + } + + blockWriter := &w.rangeKeyBlock + if span.Keys[0].Kind() == base.InternalKeyKindRangeDelete { + blockWriter = &w.rangeDelBlock + // Update range delete properties. + // NB: These properties are computed differently than the rowblk sstable + // writer because this writer does not flatten them into row key-value + // pairs. + w.props.RawKeySize += uint64(len(span.Start) + len(span.End)) + count := uint64(len(span.Keys)) + w.props.NumEntries += count + w.props.NumDeletions += count + w.props.NumRangeDeletions += count + } else { + // Update range key properties. + // NB: These properties are computed differently than the rowblk sstable + // writer because this writer does not flatten them into row key-value + // pairs. + w.props.RawRangeKeyKeySize += uint64(len(span.Start) + len(span.End)) + for _, k := range span.Keys { + w.props.RawRangeKeyValueSize += uint64(len(k.Value)) + switch k.Kind() { + case base.InternalKeyKindRangeKeyDelete: + w.props.NumRangeKeyDels++ + case base.InternalKeyKindRangeKeySet: + w.props.NumRangeKeySets++ + case base.InternalKeyKindRangeKeyUnset: + w.props.NumRangeKeyUnsets++ + default: + panic(errors.Errorf("pebble: invalid range key type: %s", k.Kind())) + } + } + for i := range w.blockPropCollectors { + if err := w.blockPropCollectors[i].AddRangeKeys(span); err != nil { + return err + } + } + } + if !w.disableKeyOrderChecks && blockWriter.KeyCount() > 0 { + // Check that spans are being added in fragmented order. If the two + // tombstones overlap, their start and end keys must be identical. 
+ prevStart, prevEnd, prevTrailer := blockWriter.UnsafeLastSpan() + if w.opts.Comparer.Equal(prevStart, span.Start) && w.opts.Comparer.Equal(prevEnd, span.End) { + if prevTrailer < span.Keys[0].Trailer { + w.err = errors.Errorf("pebble: keys must be added in order: %s-%s:{(#%s)}, %s", + w.opts.Comparer.FormatKey(prevStart), + w.opts.Comparer.FormatKey(prevEnd), + prevTrailer, span.Pretty(w.opts.Comparer.FormatKey)) + } + } else if c := w.opts.Comparer.Compare(prevEnd, span.Start); c > 0 { + w.err = errors.Errorf("pebble: keys must be added in order: %s-%s:{(#%s)}, %s", + w.opts.Comparer.FormatKey(prevStart), + w.opts.Comparer.FormatKey(prevEnd), + prevTrailer, span.Pretty(w.opts.Comparer.FormatKey)) + return w.err + } + } + blockWriter.AddSpan(span) + return nil +} + +// Add adds a point key/value pair when writing a +// strict-obsolete sstable. For a given Writer, the keys passed to Add must be +// in increasing order. Span keys (range deletions, range keys) must be added +// through EncodeSpan. +// +// forceObsolete indicates whether the caller has determined that this key is +// obsolete even though it may be the latest point key for this userkey. This +// should be set to true for keys obsoleted by RANGEDELs, and is required for +// strict-obsolete sstables. +// +// Note that there are two properties, S1 and S2 (see comment in format.go) +// that strict-obsolete ssts must satisfy. S2, due to RANGEDELs, is solely the +// responsibility of the caller. S1 is solely the responsibility of the +// callee. 
+func (w *RawColumnWriter) Add(key InternalKey, value []byte, forceObsolete bool) error { + switch key.Kind() { + case base.InternalKeyKindRangeDelete, base.InternalKeyKindRangeKeySet, + base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete: + return errors.Newf("%s must be added through EncodeSpan", key.Kind()) + case base.InternalKeyKindMerge: + if w.opts.IsStrictObsolete { + return errors.Errorf("MERGE not supported in a strict-obsolete sstable") + } + } + + eval, err := w.evaluatePoint(key, len(value)) + if err != nil { + return err + } + eval.isObsolete = eval.isObsolete || forceObsolete + w.prevPointKey.trailer = key.Trailer + w.prevPointKey.isObsolete = eval.isObsolete + + var valuePrefix block.ValuePrefix + var valueStoredWithKey []byte + if eval.writeToValueBlock { + vh, err := w.valueBlock.AddValue(value) + if err != nil { + return err + } + n := valblk.EncodeHandle(w.tmp[:], vh) + valueStoredWithKey = w.tmp[:n] + var attribute base.ShortAttribute + if w.opts.ShortAttributeExtractor != nil { + // TODO(sumeer): for compactions, it is possible that the input sstable + // already has this value in the value section and so we have already + // extracted the ShortAttribute. Avoid extracting it again. This will + // require changing the RawWriter.Add interface. + if attribute, err = w.opts.ShortAttributeExtractor( + key.UserKey, int(eval.kcmp.PrefixLen), value); err != nil { + return err + } + } + valuePrefix = block.ValueBlockHandlePrefix(eval.kcmp.PrefixEqual(), attribute) + } else { + valueStoredWithKey = value + if len(value) > 0 { + valuePrefix = block.InPlaceValuePrefix(eval.kcmp.PrefixEqual()) + } + } + return w.add(key, len(value), valueStoredWithKey, valuePrefix, eval) +} + +// AddWithBlobHandle implements the RawWriter interface. +func (w *RawColumnWriter) AddWithBlobHandle( + key InternalKey, h blob.InlineHandle, attr base.ShortAttribute, forceObsolete bool, +) error { + // Blob value handles require at least TableFormatPebblev6. 
+ if w.opts.TableFormat <= TableFormatPebblev5 { + w.err = errors.Newf("pebble: blob value handles are not supported in %s", w.opts.TableFormat.String()) + return w.err + } + switch key.Kind() { + case base.InternalKeyKindRangeDelete, base.InternalKeyKindRangeKeySet, + base.InternalKeyKindRangeKeyUnset, base.InternalKeyKindRangeKeyDelete: + return errors.Newf("%s must be added through EncodeSpan", key.Kind()) + case base.InternalKeyKindMerge: + return errors.Errorf("MERGE does not support blob value handles") + } + + eval, err := w.evaluatePoint(key, int(h.ValueLen)) + if err != nil { + return err + } + eval.isObsolete = eval.isObsolete || forceObsolete + w.prevPointKey.trailer = key.Trailer + w.prevPointKey.isObsolete = eval.isObsolete + + n := h.Encode(w.tmp[:]) + valueStoredWithKey := w.tmp[:n] + valuePrefix := block.BlobValueHandlePrefix(eval.kcmp.PrefixEqual(), attr) + err = w.add(key, int(h.ValueLen), valueStoredWithKey, valuePrefix, eval) + if err != nil { + return err + } + w.props.NumValuesInBlobFiles++ + if err := w.blobRefLivenessIndexBlock.addLiveValue(h.ReferenceID, h.BlockID, h.ValueID, uint64(h.ValueLen)); err != nil { + return err + } + return nil +} + +func (w *RawColumnWriter) add( + key InternalKey, + valueLen int, + valueStoredWithKey []byte, + valuePrefix block.ValuePrefix, + eval pointKeyEvaluation, +) error { + // Append the key to the data block. We have NOT yet committed to + // including the key in the block. The data block writer permits us to + // finish the block excluding the last-appended KV. + entriesWithoutKV := w.dataBlock.Rows() + w.dataBlock.Add(key, valueStoredWithKey, valuePrefix, eval.kcmp, eval.isObsolete) + + // Now that we've appended the KV pair, we can compute the exact size of the + // block with this key-value pair included. Check to see if we should flush + // the current block, either with or without the added key-value pair. 
+ size := w.dataBlock.Size() + if shouldFlushWithoutLatestKV(size, w.pendingDataBlockSize, entriesWithoutKV, &w.dataFlush) { + // Flush the data block excluding the key we just added. + if err := w.flushDataBlockWithoutNextKey(key.UserKey); err != nil { + w.err = err + return err + } + // flushDataBlockWithoutNextKey reset the data block builder, and we can + // add the key to this next block now. + w.dataBlock.Add(key, valueStoredWithKey, valuePrefix, eval.kcmp, eval.isObsolete) + w.pendingDataBlockSize = w.dataBlock.Size() + } else { + // We're not flushing the data block, and we're committing to including + // the current KV in the block. Remember the new size of the data block + // with the current KV. + w.pendingDataBlockSize = size + } + + for i := range w.blockPropCollectors { + v := valueStoredWithKey + if key.Kind() == base.InternalKeyKindSet || key.Kind() == base.InternalKeyKindSetWithDelete || !valuePrefix.IsInPlaceValue() { + // Values for SET, SETWITHDEL keys are not required to be in-place, + // and may not even be read by the compaction, so pass nil values. + // Block property collectors in such Pebble DB's must not look at + // the value. 
+ v = nil + } + if err := w.blockPropCollectors[i].AddPointKey(key, v); err != nil { + w.err = err + return err + } + } + w.obsoleteCollector.AddPoint(eval.isObsolete) + if w.filterBlock != nil { + w.filterBlock.addKey(key.UserKey[:eval.kcmp.PrefixLen]) + } + w.meta.updateSeqNum(key.SeqNum()) + if !w.meta.HasPointKeys { + w.meta.SetSmallestPointKey(key.Clone()) + } + + w.props.NumEntries++ + switch key.Kind() { + case InternalKeyKindDelete, InternalKeyKindSingleDelete: + w.props.NumDeletions++ + w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey)) + w.dataBlock.numDeletions++ + w.dataBlock.deletionSize += len(key.UserKey) + case InternalKeyKindDeleteSized: + var size uint64 + if len(valueStoredWithKey) > 0 { + var n int + size, n = binary.Uvarint(valueStoredWithKey) + if n <= 0 { + return errors.Newf("%s key's value (%x) does not parse as uvarint", + errors.Safe(key.Kind().String()), valueStoredWithKey) + } + } + w.props.NumDeletions++ + w.props.NumSizedDeletions++ + w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey)) + w.props.RawPointTombstoneValueSize += size + w.dataBlock.numDeletions++ + w.dataBlock.deletionSize += len(key.UserKey) + case InternalKeyKindMerge: + w.props.NumMergeOperands++ + } + w.props.RawKeySize += uint64(key.Size()) + w.props.RawValueSize += uint64(valueLen) + return nil +} + +type pointKeyEvaluation struct { + kcmp colblk.KeyComparison + isObsolete bool + writeToValueBlock bool +} + +// evaluatePoint takes information about a point key being written to the +// sstable and decides how the point should be represented, where its value +// should be stored, etc. +func (w *RawColumnWriter) evaluatePoint( + key base.InternalKey, valueLen int, +) (eval pointKeyEvaluation, err error) { + eval.kcmp = w.dataBlock.KeyWriter.ComparePrev(key.UserKey) + + // When invariants are enabled, validate kcmp. 
+ if invariants.Enabled { + colblk.AssertKeyCompare(w.comparer, key.UserKey, w.previousUserKey.Get(), eval.kcmp) + w.previousUserKey.Set(append(w.previousUserKey.Get()[:0], key.UserKey...)) + } + + if !w.meta.HasPointKeys { + return eval, nil + } + keyKind := key.Kind() + // Ensure that no one adds a point key kind without considering the obsolete + // handling for that kind. + switch keyKind { + case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge, + InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: + default: + panic(errors.AssertionFailedf("unexpected key kind %s", keyKind.String())) + } + prevKeyKind := w.prevPointKey.trailer.Kind() + // If same user key, then the current key is obsolete if any of the + // following is true: + // C1 The prev key was obsolete. + // C2 The prev key was not a MERGE. When the previous key is a MERGE we must + // preserve SET* and MERGE since their values will be merged into the + // previous key. We also must preserve DEL* since there may be an older + // SET*/MERGE in a lower level that must not be merged with the MERGE -- + // if we omit the DEL* that lower SET*/MERGE will become visible. + // + // Regardless of whether it is the same user key or not + // C3 The current key is some kind of point delete, and we are writing to + // the lowest level, then it is also obsolete. The correctness of this + // relies on the same user key not spanning multiple sstables in a level. + // + // C1 ensures that for a user key there is at most one transition from + // !obsolete to obsolete. Consider a user key k, for which the first n keys + // are not obsolete. We consider the various value of n: + // + // n = 0: This happens due to forceObsolete being set by the caller, or due + // to C3. forceObsolete must only be set due a RANGEDEL, and that RANGEDEL + // must also delete all the lower seqnums for the same user key. 
C3 triggers + // due to a point delete and that deletes all the lower seqnums for the same + // user key. + // + // n = 1: This is the common case. It happens when the first key is not a + // MERGE, or the current key is some kind of point delete. + // + // n > 1: This is due to a sequence of MERGE keys, potentially followed by a + // single non-MERGE key. + isObsoleteC1AndC2 := eval.kcmp.UserKeyComparison == 0 && + (w.prevPointKey.isObsolete || prevKeyKind != InternalKeyKindMerge) + isObsoleteC3 := w.opts.WritingToLowestLevel && + (keyKind == InternalKeyKindDelete || keyKind == InternalKeyKindSingleDelete || + keyKind == InternalKeyKindDeleteSized) + eval.isObsolete = isObsoleteC1AndC2 || isObsoleteC3 + // TODO(sumeer): storing isObsolete SET and SETWITHDEL in value blocks is + // possible, but requires some care in documenting and checking invariants. + // There is code that assumes nothing in value blocks because of single MVCC + // version (those should be ok). We have to ensure setHasSamePrefix is + // correctly initialized here etc. + + if !w.disableKeyOrderChecks && (eval.kcmp.UserKeyComparison < 0 || + (eval.kcmp.UserKeyComparison == 0 && w.prevPointKey.trailer <= key.Trailer)) { + previousKey := base.InternalKey{ + UserKey: w.dataBlock.MaterializeLastUserKey(nil), + Trailer: w.prevPointKey.trailer, + } + return eval, errors.Errorf( + "pebble: keys must be added in strictly increasing order: %s, %s", + previousKey.Pretty(w.comparer.FormatKey), + key.Pretty(w.comparer.FormatKey)) + } + + // We might want to write this key's value to a value block if it has the + // same prefix. + // + // We require: + // . Value blocks to be enabled. + // . The current key to have the same prefix as the previous key. + // . The previous key to be a SET. + // . The current key to be a SET. + // . If there are bounds requiring some keys' values to be in-place, the + // key must not fall within those bounds. + // . The value to be sufficiently large. 
(Currently we simply require a + // non-zero length, so all non-empty values are eligible for storage + // out-of-band in a value block.) + // + // Use of 0 here is somewhat arbitrary. Given the minimum 3 byte encoding of + // valueHandle, this should be > 3. But tiny values are common in test and + // unlikely in production, so we use 0 here for better test coverage. + const tinyValueThreshold = 0 + useValueBlock := !w.opts.DisableValueBlocks && + eval.kcmp.PrefixEqual() && + prevKeyKind == InternalKeyKindSet && + keyKind == InternalKeyKindSet && + valueLen > tinyValueThreshold && + w.valueBlock != nil + if !useValueBlock { + return eval, nil + } + // NB: it is possible that eval.kcmp.UserKeyComparison == 0, i.e., these two + // SETs have identical user keys (because of an open snapshot). This should + // be the rare case. + eval.writeToValueBlock = true + return eval, nil +} + +var compressedBlockPool = sync.Pool{ + New: func() interface{} { + return new(compressedBlock) + }, +} + +type compressedBlock struct { + physical block.PhysicalBlock + blockBuf blockBuf +} + +func (w *RawColumnWriter) flushDataBlockWithoutNextKey(nextKey []byte) error { + serializedBlock, lastKey := w.dataBlock.Finish(w.dataBlock.Rows()-1, w.pendingDataBlockSize) + w.maybeIncrementTombstoneDenseBlocks(len(serializedBlock)) + // Compute the separator that will be written to the index block alongside + // this data block's end offset. It is the separator between the last key in + // the finished block and the [nextKey] that was excluded from the block. 
+ w.separatorBuf = w.comparer.Separator(w.separatorBuf[:0], lastKey.UserKey, nextKey) + if err := w.enqueueDataBlock(serializedBlock, lastKey, w.separatorBuf); err != nil { + return err + } + w.dataBlock.Reset() + w.pendingDataBlockSize = 0 + return nil +} + +// maybeIncrementTombstoneDenseBlocks increments the number of tombstone dense +// blocks if the number of deletions in the data block exceeds a threshold or +// the deletion size exceeds a threshold. It should be called after the +// data block has been finished. +// Invariant: w.dataBlockBuf.uncompressed must already be populated. +func (w *RawColumnWriter) maybeIncrementTombstoneDenseBlocks(uncompressedLen int) { + minSize := w.opts.DeletionSizeRatioThreshold * float32(uncompressedLen) + if w.dataBlock.numDeletions > w.opts.NumDeletionsThreshold || float32(w.dataBlock.deletionSize) > minSize { + w.props.NumTombstoneDenseBlocks++ + } + w.dataBlock.numDeletions = 0 + w.dataBlock.deletionSize = 0 +} + +// enqueueDataBlock compresses and checksums the provided data block and sends +// it to the write queue to be asynchronously written to the underlying storage. +// It also adds the block's index block separator to the pending index block, +// possibly triggering the index block to be finished and buffered. +func (w *RawColumnWriter) enqueueDataBlock( + serializedBlock []byte, lastKey base.InternalKey, separator []byte, +) error { + w.lastKeyBuf = append(w.lastKeyBuf[:0], lastKey.UserKey...) + w.meta.SetLargestPointKey(base.InternalKey{ + UserKey: w.lastKeyBuf, + Trailer: lastKey.Trailer, + }) + + if invariants.Enabled { + v := w.validator.Get() + if v == nil { + v = &colblk.DataBlockValidator{} + w.validator.Set(v) + } + if err := v.Validate(serializedBlock, w.comparer, w.opts.KeySchema); err != nil { + panic(err) + } + } + + // Serialize the data block, compress it and send it to the write queue. 
+ cb := compressedBlockPool.Get().(*compressedBlock) + cb.blockBuf.checksummer.Type = w.opts.Checksum + cb.physical = block.CompressAndChecksum( + &cb.blockBuf.dataBuf, + serializedBlock, + blockkind.SSTableData, + &w.layout.compressor, + &cb.blockBuf.checksummer, + ) + return w.enqueuePhysicalBlock(cb, separator) +} + +func (w *RawColumnWriter) enqueuePhysicalBlock(cb *compressedBlock, separator []byte) error { + dataBlockHandle := block.Handle{ + Offset: w.queuedDataSize, + Length: uint64(cb.physical.LengthWithoutTrailer()), + } + w.queuedDataSize += dataBlockHandle.Length + block.TrailerLen + w.writeQueue.ch <- cb + + var err error + w.blockPropsEncoder.resetProps() + for i := range w.blockPropCollectors { + scratch := w.blockPropsEncoder.getScratchForProp() + if scratch, err = w.blockPropCollectors[i].FinishDataBlock(scratch); err != nil { + return err + } + w.blockPropsEncoder.addProp(shortID(i), scratch) + } + dataBlockProps := w.blockPropsEncoder.unsafeProps() + + // Add the separator to the index block. This might trigger a flush of the + // index block too. + i := w.indexBlock.AddBlockHandle(separator, dataBlockHandle, dataBlockProps) + sizeWithEntry := w.indexBlock.Size() + if shouldFlushWithoutLatestKV(sizeWithEntry, w.indexBlockSize, i, &w.indexFlush) { + // NB: finishIndexBlock will use blockPropsEncoder, so we must clone the + // data block's props first. + dataBlockProps = slices.Clone(dataBlockProps) + + if err = w.finishIndexBlock(w.indexBlock.Rows() - 1); err != nil { + return err + } + // finishIndexBlock reset the index block builder, and we can + // add the block handle to this new index block. + _ = w.indexBlock.AddBlockHandle(separator, dataBlockHandle, dataBlockProps) + w.indexBlockSize = w.indexBlock.Size() + } else { + w.indexBlockSize = sizeWithEntry + } + // Incorporate the finished data block's property into the index block, now + // that we've flushed the index block without the new separator if + // necessary. 
+ for i := range w.blockPropCollectors { + w.blockPropCollectors[i].AddPrevDataBlockToIndexBlock() + } + return nil +} + +// finishIndexBlock finishes the currently pending index block with the first +// [rows] rows. In practice, [rows] is always w.indexBlock.Rows() or +// w.indexBlock.Rows()-1. +// +// The finished index block is buffered until the writer is closed. +func (w *RawColumnWriter) finishIndexBlock(rows int) error { + defer w.indexBlock.Reset() + w.blockPropsEncoder.resetProps() + for i := range w.blockPropCollectors { + scratch := w.blockPropsEncoder.getScratchForProp() + var err error + if scratch, err = w.blockPropCollectors[i].FinishIndexBlock(scratch); err != nil { + return err + } + w.blockPropsEncoder.addProp(shortID(i), scratch) + } + indexProps := w.blockPropsEncoder.props() + bib := bufferedIndexBlock{nEntries: rows, properties: indexProps} + + // Copy the last (greatest) separator key in the index block into bib.sep. + // It'll be the separator on the entry in the top-level index block. + // + // TODO(jackson): bib.sep.Trailer is unused within the columnar-block + // sstable writer. Its existence is a code artifact of reuse of the + // bufferedIndexBlock type between colblk and rowblk writers. This can be + // cleaned up. + bib.sep.Trailer = base.MakeTrailer(base.SeqNumMax, base.InternalKeyKindSeparator) + w.indexBuffering.sepAlloc, bib.sep.UserKey = w.indexBuffering.sepAlloc.Copy( + w.indexBlock.UnsafeSeparator(rows - 1)) + + // Finish the index block and copy it so that w.indexBlock may be reused. + blk := w.indexBlock.Finish(rows) + if len(w.indexBuffering.blockAlloc) < len(blk) { + // Allocate enough bytes for approximately 16 index blocks. 
+ w.indexBuffering.blockAlloc = make([]byte, len(blk)*16) + } + n := copy(w.indexBuffering.blockAlloc, blk) + bib.block = w.indexBuffering.blockAlloc[:n:n] + w.indexBuffering.blockAlloc = w.indexBuffering.blockAlloc[n:] + + w.indexBuffering.partitions = append(w.indexBuffering.partitions, bib) + // We include the separator user key to account for its bytes in the + // top-level index block. + // + // TODO(jackson): We could incrementally build the top-level index block + // and produce an exact calculation of the current top-level index + // block's size. + w.indexBuffering.partitionSizeSum += uint64(len(blk) + block.TrailerLen + len(bib.sep.UserKey)) + return nil +} + +// flushBufferedIndexBlocks writes all index blocks, including the top-level +// index block if necessary, to the underlying writable. It returns the block +// handle of the top index (either the only index block or the top-level index +// if two-level). +func (w *RawColumnWriter) flushBufferedIndexBlocks() (rootIndex block.Handle, err error) { + // If there's a currently-pending index block, finish it. + if w.indexBlock.Rows() > 0 || len(w.indexBuffering.partitions) == 0 { + if err := w.finishIndexBlock(w.indexBlock.Rows()); err != nil { + return block.Handle{}, err + } + } + // We've buffered all the index blocks. Typically there's just one index + // block, in which case we're writing a "single-level" index. If we're + // writing a large file or the index separators happen to be excessively + // long, we may have several index blocks and need to construct a + // "two-level" index structure. + switch len(w.indexBuffering.partitions) { + case 0: + // This is impossible because we'll flush the index block immediately + // above this switch statement if there are no buffered partitions + // (regardless of whether there are data block handles in the index + // block). + panic("unreachable") + case 1: + // Single-level index. 
+ rootIndex, err = w.layout.WriteIndexBlock(w.indexBuffering.partitions[0].block) + if err != nil { + return rootIndex, err + } + w.props.IndexSize = uint64(len(w.indexBuffering.partitions[0].block)) + w.props.NumDataBlocks = uint64(w.indexBuffering.partitions[0].nEntries) + w.props.IndexType = binarySearchIndex + default: + // Two-level index. + for _, part := range w.indexBuffering.partitions { + bh, err := w.layout.WriteIndexBlock(part.block) + if err != nil { + return block.Handle{}, err + } + w.props.IndexSize += uint64(len(part.block)) + w.props.NumDataBlocks += uint64(part.nEntries) + w.topLevelIndexBlock.AddBlockHandle(part.sep.UserKey, bh, part.properties) + } + topLevelIndex := w.topLevelIndexBlock.Finish(w.topLevelIndexBlock.Rows()) + rootIndex, err = w.layout.WriteIndexBlock(topLevelIndex) + if err != nil { + return block.Handle{}, err + } + w.props.TopLevelIndexSize = uint64(len(topLevelIndex)) + w.props.IndexSize += uint64(len(topLevelIndex)) + w.props.IndexType = twoLevelIndex + w.props.IndexPartitions = uint64(len(w.indexBuffering.partitions)) + } + return rootIndex, nil +} + +// drainWriteQueue runs in its own goroutine and is responsible for writing +// finished, compressed data blocks to the writable. It reads from w.writeQueue +// until the channel is closed. All data blocks are written by this goroutine. +// Other blocks are written directly by the client goroutine. See Close. +func (w *RawColumnWriter) drainWriteQueue() { + defer w.writeQueue.wg.Done() + // Call once to initialize the CPU measurer. + w.cpuMeasurer.MeasureCPU(base.CompactionGoroutineSSTableSecondary) + for cb := range w.writeQueue.ch { + if _, err := w.layout.WritePrecompressedDataBlock(cb.physical); err != nil { + w.writeQueue.err = err + } + // Report to the CPU measurer immediately after writing (note that there + // may be a time lag until the next block is available to write). 
+ w.cpuMeasurer.MeasureCPU(base.CompactionGoroutineSSTableSecondary) + cb.blockBuf.clear() + cb.physical = block.PhysicalBlock{} + compressedBlockPool.Put(cb) + } +} + +func (w *RawColumnWriter) Close() (err error) { + defer func() { + if w.valueBlock != nil { + w.valueBlock.Release() + // Defensive code in case Close gets called again. We don't want to put + // the same object to a sync.Pool. + w.valueBlock = nil + } + w.layout.Abort() + // Record any error in the writer (so we can exit early if Close is called + // again). + if err != nil { + w.err = err + } + }() + if w.layout.writable == nil { + return w.err + } + + // Finish the last data block and send it to the write queue if it contains + // any pending KVs. + if rows := w.dataBlock.Rows(); rows > 0 { + serializedBlock, lastKey := w.dataBlock.Finish(rows, w.pendingDataBlockSize) + w.separatorBuf = w.comparer.Successor(w.separatorBuf[:0], lastKey.UserKey) + w.err = errors.CombineErrors(w.err, w.enqueueDataBlock(serializedBlock, lastKey, w.separatorBuf)) + w.maybeIncrementTombstoneDenseBlocks(len(serializedBlock)) + } + // Close the write queue channel so that the goroutine responsible for + // writing data blocks to disk knows to exit. Any subsequent blocks (eg, + // index, metadata, range key, etc) will be written by the goroutine that + // called Close. + close(w.writeQueue.ch) + w.writeQueue.wg.Wait() + // If the write queue encountered any errors while writing out data blocks, + // it's stored in w.writeQueue.err. + w.err = firstError(w.err, w.writeQueue.err) + if w.err != nil { + return w.err + } + + // INVARIANT: w.queuedDataSize == w.layout.offset. + // All data blocks have been written to disk. The queuedDataSize is the + // cumulative size of all the data blocks we've sent to the write queue. Now + // that they've all been flushed, queuedDataSize should match w.layout's + // offset. 
+ if w.queuedDataSize != w.layout.offset { + panic(errors.AssertionFailedf("pebble: %d of queued data blocks but layout offset is %d", + w.queuedDataSize, w.layout.offset)) + } + w.props.DataSize = w.layout.offset + if _, err = w.flushBufferedIndexBlocks(); err != nil { + return err + } + + // Write the filter block. + if w.filterBlock != nil { + bh, err := w.layout.WriteFilterBlock(w.filterBlock) + if err != nil { + return err + } + w.props.FilterPolicyName = w.filterBlock.policyName() + w.props.FilterSize = bh.Length + } + + // Write the range deletion block if non-empty. + if w.rangeDelBlock.KeyCount() > 0 { + w.props.NumRangeDeletions = uint64(w.rangeDelBlock.KeyCount()) + sm, la := w.rangeDelBlock.UnsafeBoundaryKeys() + w.meta.SetSmallestRangeDelKey(sm) + w.meta.SetLargestRangeDelKey(la) + if _, err := w.layout.WriteRangeDeletionBlock(w.rangeDelBlock.Finish()); err != nil { + return err + } + } + + // Write the range key block if non-empty. + if w.rangeKeyBlock.KeyCount() > 0 { + sm, la := w.rangeKeyBlock.UnsafeBoundaryKeys() + w.meta.SetSmallestRangeKey(sm) + w.meta.SetLargestRangeKey(la) + if _, err := w.layout.WriteRangeKeyBlock(w.rangeKeyBlock.Finish()); err != nil { + return err + } + } + + // Write out the value block. + if w.valueBlock != nil { + _, vbStats, err := w.valueBlock.Finish(&w.layout, w.layout.offset) + if err != nil { + return err + } + w.props.NumValueBlocks = vbStats.NumValueBlocks + w.props.NumValuesInValueBlocks = vbStats.NumValuesInValueBlocks + w.props.ValueBlocksSize = vbStats.ValueBlocksAndIndexSize + } + + // Write the blob reference index block if non-empty. + if w.blobRefLivenessIndexBlock.numReferences() > 0 { + var encoder colblk.ReferenceLivenessBlockEncoder + encoder.Init() + for refID, buf := range w.blobRefLivenessIndexBlock.finish() { + encoder.AddReferenceLiveness(int(refID), buf) + } + if _, err := w.layout.WriteBlobRefIndexBlock(encoder.Finish()); err != nil { + return err + } + } + + // Write the properties block. 
+ { + // Finish and record the prop collectors if props are not yet recorded. + // Pre-computed props might have been copied by specialized sst creators + // like suffix replacer. + if len(w.props.UserProperties) == 0 { + userProps := make(map[string]string) + for i := range w.blockPropCollectors { + scratch := w.blockPropsEncoder.getScratchForProp() + // Place the shortID in the first byte. + scratch = append(scratch, byte(i)) + buf, err := w.blockPropCollectors[i].FinishTable(scratch) + if err != nil { + return err + } + var prop string + if len(buf) > 0 { + prop = string(buf) + } + // NB: The property is populated in the map even if it is the + // empty string, since the presence in the map is what indicates + // that the block property collector was used when writing. + userProps[w.blockPropCollectors[i].Name()] = prop + } + if len(userProps) > 0 { + w.props.UserProperties = userProps + } + } + + w.props.CompressionStats = w.layout.compressor.Stats().String() + var toWrite []byte + w.props.CompressionOptions = rocksDBCompressionOptions + if w.opts.TableFormat >= TableFormatPebblev7 { + var cw colblk.KeyValueBlockWriter + cw.Init() + w.props.saveToColWriter(w.opts.TableFormat, &cw) + toWrite = cw.Finish(cw.Rows()) + } else { + var raw rowblk.Writer + // The restart interval is set to infinity because the properties block + // is always read sequentially and cached in a heap located object. This + // reduces table size without a significant impact on performance. + raw.RestartInterval = propertiesBlockRestartInterval + if err = w.props.saveToRowWriter(w.opts.TableFormat, &raw); err != nil { + return err + } + toWrite = raw.Finish() + } + if _, err = w.layout.WritePropertiesBlock(toWrite); err != nil { + return err + } + } + + if w.opts.TableFormat >= TableFormatPebblev7 { + w.layout.attributes = w.props.toAttributes() + } + + // Write the table footer. 
+ w.meta.Size, err = w.layout.Finish() + if err != nil { + return err + } + w.meta.Properties = w.props + // Release any held memory and make any future calls error. + *w = RawColumnWriter{meta: w.meta, err: errWriterClosed} + return nil +} + +// rewriteSuffixes implements RawWriter. +func (w *RawColumnWriter) rewriteSuffixes( + r *Reader, sstBytes []byte, wo WriterOptions, from, to []byte, concurrency int, +) error { + for _, c := range w.blockPropCollectors { + if !c.SupportsSuffixReplacement() { + return errors.Errorf("block property collector %s does not support suffix replacement", c.Name()) + } + } + l, err := r.Layout() + if err != nil { + return errors.Wrap(err, "reading layout") + } + // Copy data blocks in parallel, rewriting suffixes as we go. + blocks, err := rewriteDataBlocksInParallel(r, sstBytes, wo, l.Data, from, to, concurrency, w.layout.compressor.Stats(), func() blockRewriter { + return colblk.NewDataBlockRewriter(wo.KeySchema, w.comparer) + }) + if err != nil { + return errors.Wrap(err, "rewriting data blocks") + } + + // oldShortIDs maps the shortID for the block property collector in the old + // blocks to the shortID in the new blocks. Initialized once for the sstable. + oldShortIDs, n, err := getShortIDs(r, w.blockPropCollectors) + if err != nil { + return errors.Wrap(err, "getting short IDs") + } + oldProps := make([][]byte, len(w.blockPropCollectors)) + for i := range blocks { + cb := compressedBlockPool.Get().(*compressedBlock) + cb.physical = blocks[i].physical + + // Load any previous values for our prop collectors into oldProps. 
+ for i := range oldProps { + oldProps[i] = nil + } + decoder := makeBlockPropertiesDecoder(n, l.Data[i].Props) + for !decoder.Done() { + id, val, err := decoder.Next() + if err != nil { + return err + } + if oldShortIDs[id].IsValid() { + oldProps[oldShortIDs[id]] = val + } + } + for i, p := range w.blockPropCollectors { + if err := p.AddCollectedWithSuffixReplacement(oldProps[i], from, to); err != nil { + return err + } + } + var separator []byte + if i+1 < len(blocks) { + w.separatorBuf = w.comparer.Separator(w.separatorBuf[:0], blocks[i].end.UserKey, blocks[i+1].start.UserKey) + separator = w.separatorBuf + } else { + w.separatorBuf = w.comparer.Successor(w.separatorBuf[:0], blocks[i].end.UserKey) + separator = w.separatorBuf + } + if err := w.enqueuePhysicalBlock(cb, separator); err != nil { + return err + } + } + + if len(blocks) > 0 { + props, err := r.ReadPropertiesBlock(context.TODO(), nil /* buffer pool */) + if err != nil { + return errors.Wrap(err, "reading properties block") + } + w.meta.updateSeqNum(blocks[0].start.SeqNum()) + w.props.NumEntries = props.NumEntries + w.props.RawKeySize = props.RawKeySize + w.props.RawValueSize = props.RawValueSize + w.meta.SetSmallestPointKey(blocks[0].start) + w.meta.SetLargestPointKey(blocks[len(blocks)-1].end) + } + + // Copy range key block, replacing suffixes if it exists. + if err := rewriteRangeKeyBlockToWriter(r, w, from, to); err != nil { + return errors.Wrap(err, "rewriting range key blocks") + } + // Copy over the filter block if it exists. 
+ if w.filterBlock != nil { + if filterBlockBH, ok := l.FilterByName(w.filterBlock.metaName()); ok { + filterBlock, _, err := readBlockBuf(sstBytes, filterBlockBH, r.blockReader.ChecksumType(), nil) + if err != nil { + return errors.Wrap(err, "reading filter") + } + w.filterBlock = copyFilterWriter{ + origPolicyName: w.filterBlock.policyName(), + origMetaName: w.filterBlock.metaName(), + // Clone the filter block, because readBlockBuf allows the + // returned byte slice to point directly into sst. + data: slices.Clone(filterBlock), + } + } + } + return nil +} + +func shouldFlushWithoutLatestKV( + sizeWithKV int, sizeWithoutKV int, entryCountWithoutKV int, flushGovernor *block.FlushGovernor, +) bool { + if entryCountWithoutKV == 0 { + return false + } + if sizeWithoutKV < flushGovernor.LowWatermark() { + // Fast path when the block is too small to flush. + return false + } + return flushGovernor.ShouldFlush(sizeWithoutKV, sizeWithKV) +} + +// copyDataBlocks adds a range of blocks to the table as-is. These blocks could be +// compressed. It's specifically used by the sstable copier that can copy parts +// of an sstable to a new sstable, using CopySpan(). +func (w *RawColumnWriter) copyDataBlocks( + ctx context.Context, blocks []indexEntry, rh objstorage.ReadHandle, +) error { + const readSizeTarget = 256 << 10 + readAndFlushBlocks := func(firstBlockIdx, lastBlockIdx int) error { + if firstBlockIdx > lastBlockIdx { + panic("pebble: readAndFlushBlocks called with invalid block range") + } + // We need to flush blocks[firstBlockIdx:lastBlockIdx+1] into the write queue. + // We do this by issuing one big read from the read handle into the buffer, and + // then enqueueing the writing of those blocks one-by-one. + // + // TODO(bilal): Consider refactoring the write queue to support writing multiple + // blocks in one request. 
+ lastBH := blocks[lastBlockIdx].bh + blocksToReadLen := lastBH.Offset + lastBH.Length + block.TrailerLen - blocks[firstBlockIdx].bh.Offset + // We need to create a new buffer for each read, as w.enqueuePhysicalBlock passes + // a pointer to the buffer to the write queue. + buf := make([]byte, 0, blocksToReadLen) + if err := rh.ReadAt(ctx, buf[:blocksToReadLen], int64(blocks[firstBlockIdx].bh.Offset)); err != nil { + return err + } + for i := firstBlockIdx; i <= lastBlockIdx; i++ { + offsetDiff := blocks[i].bh.Offset - blocks[firstBlockIdx].bh.Offset + blockBuf := buf[offsetDiff : offsetDiff+blocks[i].bh.Length+block.TrailerLen] + cb := compressedBlockPool.Get().(*compressedBlock) + cb.physical = block.NewPhysicalBlock(blockBuf) + if err := w.enqueuePhysicalBlock(cb, blocks[i].sep); err != nil { + return err + } + } + return nil + } + // Iterate through blocks until we have enough to fill readSizeTarget. When we have more than + // one block in blocksToRead and adding the next block would exceed the target buffer capacity, + // we read and flush existing blocks in blocksToRead. This allows us to read as many + // blocks in one IO request as possible, while still utilizing the write queue in this + // writer. + lastBlockOffset := uint64(0) + for i := 0; i < len(blocks); { + if blocks[i].bh.Offset < lastBlockOffset { + panic("pebble: copyDataBlocks called with blocks out of order") + } + start := i + // Note the i++ in the initializing condition; this means we will always flush at least + // one block. + for i++; i < len(blocks) && (blocks[i].bh.Length+blocks[i].bh.Offset+block.TrailerLen-blocks[start].bh.Offset) <= uint64(readSizeTarget); i++ { + } + // i points to one index past the last block we want to read. + if err := readAndFlushBlocks(start, i-1); err != nil { + return err + } + } + return nil +} + +// addDataBlock adds a raw uncompressed data block to the table as-is. 
It's specifically used +// by the sstable copier that can copy parts of an sstable to a new sstable, +// using CopySpan(). +func (w *RawColumnWriter) addDataBlock(b, sep []byte, bhp block.HandleWithProperties) error { + // Serialize the data block, compress it and send it to the write queue. + cb := compressedBlockPool.Get().(*compressedBlock) + cb.blockBuf.checksummer.Type = w.opts.Checksum + cb.physical = block.CompressAndChecksum( + &cb.blockBuf.dataBuf, + b, + blockkind.SSTableData, + &w.layout.compressor, + &cb.blockBuf.checksummer, + ) + if err := w.enqueuePhysicalBlock(cb, sep); err != nil { + return err + } + return nil +} + +// setFilter sets the filter to the specified filterWriter. It's specifically used +// by the sstable copier that can copy parts of an sstable to a new sstable, +// using CopySpan(). +func (w *RawColumnWriter) setFilter(fw filterWriter) { + w.filterBlock = fw +} + +// copyProperties copies properties from the specified props, and resets others +// to prepare for copying data blocks from another sstable, using the copy/addDataBlock(s) +// methods above. It's specifically used by the sstable copier that can copy parts of an +// sstable to a new sstable, using CopySpan(). +func (w *RawColumnWriter) copyProperties(props Properties) { + w.props = props + // Remove all user properties to disable block properties, which we do not + // calculate for CopySpan. + w.props.UserProperties = nil + // Reset props that we'll re-derive as we build our own index. 
+	w.props.IndexPartitions = 0
+	w.props.TopLevelIndexSize = 0
+	w.props.IndexSize = 0
+	w.props.IndexType = 0
+}
diff --git a/vendor/github.com/cockroachdb/pebble/sstable/comparer.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/comparer.go
similarity index 94%
rename from vendor/github.com/cockroachdb/pebble/sstable/comparer.go
rename to vendor/github.com/cockroachdb/pebble/v2/sstable/comparer.go
index 66a20b5..04c15d2 100644
--- a/vendor/github.com/cockroachdb/pebble/sstable/comparer.go
+++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/comparer.go
@@ -4,7 +4,7 @@
 
 package sstable
 
-import "github.com/cockroachdb/pebble/internal/base"
+import "github.com/cockroachdb/pebble/v2/internal/base"
 
 // Compare exports the base.Compare type.
 type Compare = base.Compare
diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/copier.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/copier.go
new file mode 100644
index 0000000..d9c13b6
--- /dev/null
+++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/copier.go
@@ -0,0 +1,319 @@
+// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use
+// of this source code is governed by a BSD-style license that can be found in
+// the LICENSE file.
+
+package sstable
+
+import (
+	"context"
+	"slices"
+	"strings"
+
+	"github.com/cockroachdb/errors"
+	"github.com/cockroachdb/pebble/v2/internal/base"
+	"github.com/cockroachdb/pebble/v2/internal/bytealloc"
+	"github.com/cockroachdb/pebble/v2/objstorage"
+	"github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider"
+	"github.com/cockroachdb/pebble/v2/sstable/block"
+)
+
+// CopySpan produces a copy of an approximate subset of an input sstable.
+//
+// The produced sstable contains all keys from the input sstable in the span
+// [start, end), as well as potentially some additional keys from the original
+// file that were adjacent to but outside that span.
+// +// CopySpan differs from simply seeking a reader to start and iterating until +// the end passing the results to a writer in that it does not write the new +// sstable from scratch, key-by-key, recompressing each key into new blocks and +// computing new filters and properties. Instead, it finds data _blocks_ that +// intersect the requested span and copies those, whole, to the new file, +// avoiding all decompression and recompression work. It then copies the +// original bloom filter - this filter is valid for the subset of data as well, +// just with potentially a higher false positive rate compared to one that would +// be computed just from the keys in it. +// +// The resulting sstable will have no block properties. +// +// The function might return ErrEmptySpan if there are no blocks that could +// include keys in the given range. See ErrEmptySpan for more details. +// +// Closes input and finishes or aborts output in all cases, including on errors. +// +// Note that CopySpan is not aware of any suffix or prefix replacement; the +// caller must account for those when specifying the bounds. +func CopySpan( + ctx context.Context, + input objstorage.Readable, + r *Reader, + rOpts ReaderOptions, + output objstorage.Writable, + o WriterOptions, + start, end InternalKey, +) (size uint64, _ error) { + defer func() { _ = input.Close() }() + + const unsupportedCopyFeatures = AttributeValueBlocks | AttributeRangeKeySets | AttributeRangeKeyUnsets | AttributeRangeKeyDels | AttributeRangeDels + if r.Attributes.Intersects(unsupportedCopyFeatures) { + return copyWholeFileBecauseOfUnsupportedFeature(ctx, input, output) // Finishes/Aborts output. + } + + // Don't initialize the writer with a filter policy. We'll copy whatever + // filter exists within the sstable verbatim regardless. + o.FilterPolicy = base.NoFilterPolicy + o.TableFormat = r.tableFormat + // We don't want the writer to attempt to write out block property data in + // index blocks. 
This data won't be valid since we're not passing the actual + // key data through the writer. We also remove the table-level properties + // below. + // + // TODO(dt,radu): Figure out how to populate the prop collector state with + // block props from the original sst. + o.BlockPropertyCollectors = nil + o.disableObsoleteCollector = true + w := NewRawWriter(output, o) + + defer func() { + if w != nil { + _ = w.Close() + } + }() + + var preallocRH objstorageprovider.PreallocatedReadHandle + // ReadBeforeForIndexAndFilter attempts to read the top-level index, filter + // and lower-level index blocks with one read. + rh := r.blockReader.UsePreallocatedReadHandle( + objstorage.ReadBeforeForIndexAndFilter, &preallocRH) + defer func() { _ = rh.Close() }() + rh.SetupForCompaction() + + bufferPool := metaBufferPools.Get().(*block.BufferPool) + defer metaBufferPools.Put(bufferPool) + defer bufferPool.Release() + + metaIndex, _, err := r.readAndDecodeMetaindex(ctx, bufferPool, rh) + if err != nil { + return 0, errors.Wrap(err, "reading metaindex") + } + props, err := r.readPropertiesBlockInternal(ctx, bufferPool, rh) + if err != nil { + return 0, errors.Wrap(err, "reading properties") + } + + // Copy the filter block if it exists. We iterate over the metaindex to find + // the appropriate filter block so that we copy the filter block even if the + // reader wasn't configured with the same filter policy. 
+ var filterBlockHandle block.Handle + var filterBlockName string + for name, bh := range metaIndex { + if !strings.HasPrefix(name, "fullfilter.") { + continue + } + filterBlockHandle = bh + filterBlockName = name + break + } + if filterBlockName != "" { + err = func() error { + filterBlock, err := r.readFilterBlock(ctx, block.NoReadEnv, rh, filterBlockHandle) + if err != nil { + return errors.Wrap(err, "reading filter") + } + defer filterBlock.Release() + + w.setFilter(copyFilterWriter{ + origMetaName: filterBlockName, + origPolicyName: props.FilterPolicyName, + data: slices.Clone(filterBlock.BlockData()), + }) + return nil + }() + if err != nil { + return 0, err + } + } + + indexH, err := r.readTopLevelIndexBlock(ctx, block.NoReadEnv, rh) + if err != nil { + return 0, err + } + defer indexH.Release() + + // Copy all the props from the source file; we can't compute our own for many + // that depend on seeing every key, such as total count or size so we copy the + // original props instead. This will result in over-counts but that is safer + // than under-counts. + w.copyProperties(props) + + // Find the blocks that intersect our span. + blocks, err := intersectingIndexEntries(ctx, r, rh, indexH, start, end, props.NumDataBlocks) + if err != nil { + return 0, err + } + + // In theory an empty SST is fine, but #3409 means they are not. We could make + // a non-empty sst by copying something outside the span, but #3907 means that + // the empty virtual span would still be a problem, so don't bother. + if len(blocks) < 1 { + return 0, ErrEmptySpan + } + + // Copy all blocks byte-for-byte without doing any per-key processing. + var blocksNotInCache []indexEntry + + for i := range blocks { + cv := r.blockReader.GetFromCache(blocks[i].bh.Handle) + if cv == nil { + // Cache miss. Add this block to the list of blocks that are not in cache. + blocksNotInCache = blocks[i-len(blocksNotInCache) : i+1] + continue + } + + // Cache hit. 
+		rh.RecordCacheHit(ctx, int64(blocks[i].bh.Offset), int64(blocks[i].bh.Length+block.TrailerLen))
+		if len(blocksNotInCache) > 0 {
+			// We have some blocks that were not in cache preceding this block.
+			// Copy them in bulk via copyDataBlocks.
+			if err := w.copyDataBlocks(ctx, blocksNotInCache, rh); err != nil {
+				cv.Release()
+				return 0, err
+			}
+			blocksNotInCache = nil
+		}
+
+		err := w.addDataBlock(block.CacheBufferHandle(cv).BlockData(), blocks[i].sep, blocks[i].bh)
+		cv.Release()
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	if len(blocksNotInCache) > 0 {
+		// We have some remaining blocks that were not in cache. Copy them
+		// in bulk via copyDataBlocks.
+		if err := w.copyDataBlocks(ctx, blocksNotInCache, rh); err != nil {
+			return 0, err
+		}
+		blocksNotInCache = nil
+	}
+
+	// TODO(dt): Copy range keys (the fact there are none is checked above).
+	// TODO(dt): Copy valblocks keys (the fact there are none is checked above).
+
+	if err := w.Close(); err != nil {
+		w = nil
+		return 0, err
+	}
+	meta, err := w.Metadata()
+	if err != nil {
+		return 0, err
+	}
+	wrote := meta.Size
+	w = nil
+	return wrote, nil
+}
+
+// ErrEmptySpan is returned by CopySpan if the input sstable has no keys in the
+// requested span.
+//
+// Note that CopySpan's determination of block overlap is best effort - we may
+// copy a block that doesn't actually contain any keys in the span, in which
+// case we won't generate this error. We currently only generate this error when
+// the span start is beyond all keys in the physical sstable.
+var ErrEmptySpan = errors.New("cannot copy empty span")
+
+// indexEntry captures the two components of an sst index entry: the key and the
+// decoded block handle value.
+type indexEntry struct {
+	sep []byte
+	bh  block.HandleWithProperties
+}
+
+// intersectingIndexEntries returns the entries from the index with separator
+// keys contained by [start, end), i.e. the subset of the sst's index that
+// intersects the provided span.
+func intersectingIndexEntries( + ctx context.Context, + r *Reader, + rh objstorage.ReadHandle, + indexH block.BufferHandle, + start, end InternalKey, + numDataBlocks uint64, +) ([]indexEntry, error) { + top := r.tableFormat.newIndexIter() + err := top.Init(r.Comparer, indexH.BlockData(), NoTransforms) + if err != nil { + return nil, err + } + defer func() { _ = top.Close() }() + + var alloc bytealloc.A + res := make([]indexEntry, 0, numDataBlocks) + for valid := top.SeekGE(start.UserKey); valid; valid = top.Next() { + bh, err := top.BlockHandleWithProperties() + if err != nil { + return nil, err + } + if !r.Attributes.Has(AttributeTwoLevelIndex) { + entry := indexEntry{bh: bh, sep: top.Separator()} + alloc, entry.bh.Props = alloc.Copy(entry.bh.Props) + alloc, entry.sep = alloc.Copy(entry.sep) + res = append(res, entry) + } else { + err := func() error { + subBlk, err := r.readIndexBlock(ctx, block.NoReadEnv, rh, bh.Handle) + if err != nil { + return err + } + defer subBlk.Release() + + sub := r.tableFormat.newIndexIter() + err = sub.Init(r.Comparer, subBlk.BlockData(), NoTransforms) + if err != nil { + return err + } + defer func() { _ = sub.Close() }() + + for valid := sub.SeekGE(start.UserKey); valid; valid = sub.Next() { + bh, err := sub.BlockHandleWithProperties() + if err != nil { + return err + } + entry := indexEntry{bh: bh, sep: sub.Separator()} + alloc, entry.bh.Props = alloc.Copy(entry.bh.Props) + alloc, entry.sep = alloc.Copy(entry.sep) + res = append(res, entry) + if r.Comparer.Compare(end.UserKey, entry.sep) <= 0 { + break + } + } + return nil + }() + if err != nil { + return nil, err + } + } + if top.SeparatorGT(end.UserKey, true /* inclusively */) { + break + } + } + return res, nil +} + +// copyWholeFileBecauseOfUnsupportedFeature is a thin wrapper around Copy that +// exists to ensure it is visible in profiles/stack traces if we are looking at +// cluster copying more than expected. +// +// Finishes or Aborts output; does *not* Close input. 
+func copyWholeFileBecauseOfUnsupportedFeature( + ctx context.Context, input objstorage.Readable, output objstorage.Writable, +) (size uint64, _ error) { + length := uint64(input.Size()) + rh := input.NewReadHandle(objstorage.NoReadBefore) + rh.SetupForCompaction() + if err := objstorage.Copy(ctx, rh, output, 0, length); err != nil { + output.Abort() + return 0, err + } + return length, output.Finish() +} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/filter.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/filter.go similarity index 80% rename from vendor/github.com/cockroachdb/pebble/sstable/filter.go rename to vendor/github.com/cockroachdb/pebble/v2/sstable/filter.go index 7b2e1ab..732c4ae 100644 --- a/vendor/github.com/cockroachdb/pebble/sstable/filter.go +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/filter.go @@ -28,14 +28,6 @@ type FilterMetricsTracker struct { misses atomic.Int64 } -var _ ReaderOption = (*FilterMetricsTracker)(nil) - -func (m *FilterMetricsTracker) readerApply(r *Reader) { - if r.tableFilter != nil { - r.tableFilter.metrics = m - } -} - // Load returns the current values as FilterMetrics. func (m *FilterMetricsTracker) Load() FilterMetrics { return FilterMetrics{ @@ -44,18 +36,6 @@ func (m *FilterMetricsTracker) Load() FilterMetrics { } } -// BlockHandle is the file offset and length of a block. -type BlockHandle struct { - Offset, Length uint64 -} - -// BlockHandleWithProperties is used for data blocks and first/lower level -// index blocks, since they can be annotated using BlockPropertyCollectors. 
-type BlockHandleWithProperties struct { - BlockHandle - Props []byte -} - type filterWriter interface { addKey(key []byte) finish() ([]byte, error) @@ -68,10 +48,10 @@ type tableFilterReader struct { metrics *FilterMetricsTracker } -func newTableFilterReader(policy FilterPolicy) *tableFilterReader { +func newTableFilterReader(policy FilterPolicy, metrics *FilterMetricsTracker) *tableFilterReader { return &tableFilterReader{ policy: policy, - metrics: nil, + metrics: metrics, } } diff --git a/vendor/github.com/cockroachdb/pebble/sstable/format.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/format.go similarity index 67% rename from vendor/github.com/cockroachdb/pebble/sstable/format.go rename to vendor/github.com/cockroachdb/pebble/v2/sstable/format.go index 82310a5..a9f6e12 100644 --- a/vendor/github.com/cockroachdb/pebble/sstable/format.go +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/format.go @@ -6,7 +6,10 @@ package sstable import ( "github.com/cockroachdb/errors" - "github.com/cockroachdb/pebble/internal/base" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/sstable/colblk" + "github.com/cockroachdb/pebble/v2/sstable/rowblk" ) // TableFormat specifies the format version for sstables. The legacy LevelDB @@ -21,15 +24,60 @@ const ( TableFormatUnspecified TableFormat = iota TableFormatLevelDB TableFormatRocksDBv2 - TableFormatPebblev1 // Block properties. - TableFormatPebblev2 // Range keys. - TableFormatPebblev3 // Value blocks. - TableFormatPebblev4 // DELSIZED tombstones. + + // TableFormatPebblev1 adds block properties. + TableFormatPebblev1 + + // TableFormatPebblev2 adds range keys + TableFormatPebblev2 + + // TableFormatPebblev3 adds value blocks. + TableFormatPebblev3 + + // TableFormatPebblev4 adds DELSIZED tombstones. + TableFormatPebblev4 + + // TableFormatPebblev5 adds columnar blocks. + TableFormatPebblev5 // Columnar blocks. 
+ + // TableFormatPebblev6 adds: + // - checksum for footer; + // - blob value handles; + // - columnar metaindex; + // - MinLZ compression support. + // + // Supported by CockroachDB v25.2 and later. + TableFormatPebblev6 + + // TableFormatPebblev7 adds: + // - columnar + compressed properties block; + // - footer attributes. + // + // Supported by CockroachDB v25.3 and later. + TableFormatPebblev7 + NumTableFormats TableFormatMax = NumTableFormats - 1 + + // TableFormatMinSupported is the minimum format supported by Pebble. This + // package still supports older formats for uses outside of Pebble + // (CockroachDB uses it to read data from backups that could be old). + TableFormatMinSupported = TableFormatPebblev1 ) +var footerSizes [NumTableFormats]int = [NumTableFormats]int{ + TableFormatLevelDB: levelDBFooterLen, + TableFormatRocksDBv2: rocksDBFooterLen, + TableFormatPebblev1: rocksDBFooterLen, + TableFormatPebblev2: rocksDBFooterLen, + TableFormatPebblev3: rocksDBFooterLen, + TableFormatPebblev4: rocksDBFooterLen, + TableFormatPebblev5: rocksDBFooterLen, + TableFormatPebblev6: checkedPebbleDBFooterLen, + TableFormatPebblev7: pebbleDBv7FooterLen, +} + // TableFormatPebblev4, in addition to DELSIZED, introduces the use of // InternalKeyKindSSTableInternalObsoleteBit. // @@ -134,9 +182,31 @@ const ( // // Note that we do not need to do anything special at write time for // SETWITHDEL and SINGLEDEL. This is because these key kinds are treated -// specially only by compactions, which do not hide obsolete points. For -// regular reads, SETWITHDEL behaves the same as SET and SINGLEDEL behaves the -// same as DEL. +// specially only by compactions, which typically do not hide obsolete points +// (see exception below). For regular reads, SETWITHDEL behaves the same as +// SET and SINGLEDEL behaves the same as DEL. 
+// +// 2.1.1 Compaction reads of a foreign sstable +// +// Compaction reads of a foreign sstable behave like regular reads in that +// only non-obsolete points are exposed. Consider a L5 foreign sstable with +// b.SINGLEDEL that is non-obsolete followed by obsolete b.DEL. And a L6 +// foreign sstable with two b.SETs. The SINGLEDEL will be exposed, and not the +// DEL, but this is not a correctness issue since only one of the SETs in the +// L6 sstable will be exposed. However, this works only because we have +// limited the number of foreign sst levels to two, and is extremely fragile. +// For robust correctness, non-obsolete SINGLEDELs in foreign sstables should +// be exposed as DELs. +// +// Additionally, to avoid false positive accounting errors in DELSIZED, we +// should expose them as DEL. +// +// NB: as of writing this comment, we do not have end-to-end support for +// SINGLEDEL for disaggregated storage since pointCollapsingIterator (used by +// ScanInternal) does not support SINGLEDEL. So the disaggregated key spans +// are required to never have SINGLEDELs (which is fine for CockroachDB since +// only the MVCC key space uses disaggregated storage, and SINGLEDELs are only +// used for the non-MVCC locks and intents). // // 2.2 Strictness and MERGE // @@ -181,17 +251,16 @@ const ( // RANGEDELs when a Pebble-external writer is trying to construct a strict // obsolete sstable. -// ParseTableFormat parses the given magic bytes and version into its +// parseTableFormat parses the given magic bytes and version into its // corresponding internal TableFormat. 
-func ParseTableFormat(magic []byte, version uint32) (TableFormat, error) { +func parseTableFormat(magic []byte, version uint32) (TableFormat, error) { switch string(magic) { case levelDBMagic: return TableFormatLevelDB, nil case rocksDBMagic: if version != rocksDBFormatVersion2 { return TableFormatUnspecified, base.CorruptionErrorf( - "pebble/table: unsupported rocksdb format version %d", errors.Safe(version), - ) + "(unsupported rocksdb format version %d)", errors.Safe(version)) } return TableFormatRocksDBv2, nil case pebbleDBMagic: @@ -204,18 +273,40 @@ func ParseTableFormat(magic []byte, version uint32) (TableFormat, error) { return TableFormatPebblev3, nil case 4: return TableFormatPebblev4, nil + case 5: + return TableFormatPebblev5, nil + case 6: + return TableFormatPebblev6, nil + case 7: + return TableFormatPebblev7, nil default: return TableFormatUnspecified, base.CorruptionErrorf( - "pebble/table: unsupported pebble format version %d", errors.Safe(version), - ) + "(unsupported pebble format version %d)", errors.Safe(version)) } default: return TableFormatUnspecified, base.CorruptionErrorf( - "pebble/table: invalid table (bad magic number: 0x%x)", magic, - ) + "(bad magic number: 0x%x)", magic) } } +// BlockColumnar returns true iff the table format uses the columnar format for +// data, index and keyspan blocks. +func (f TableFormat) BlockColumnar() bool { + return f >= TableFormatPebblev5 +} + +// FooterSize returns the maximum size of the footer for the table format. +func (f TableFormat) FooterSize() int { + return footerSizes[f] +} + +func (f TableFormat) newIndexIter() block.IndexBlockIterator { + if !f.BlockColumnar() { + return new(rowblk.IndexIter) + } + return new(colblk.IndexIter) +} + // AsTuple returns the TableFormat's (Magic String, Version) tuple. 
func (f TableFormat) AsTuple() (string, uint32) { switch f { @@ -231,6 +322,12 @@ func (f TableFormat) AsTuple() (string, uint32) { return pebbleDBMagic, 3 case TableFormatPebblev4: return pebbleDBMagic, 4 + case TableFormatPebblev5: + return pebbleDBMagic, 5 + case TableFormatPebblev6: + return pebbleDBMagic, 6 + case TableFormatPebblev7: + return pebbleDBMagic, 7 default: panic("sstable: unknown table format version tuple") } @@ -239,6 +336,8 @@ func (f TableFormat) AsTuple() (string, uint32) { // String returns the TableFormat (Magic String,Version) tuple. func (f TableFormat) String() string { switch f { + case TableFormatUnspecified: + return "unspecified" case TableFormatLevelDB: return "(LevelDB)" case TableFormatRocksDBv2: @@ -251,7 +350,31 @@ func (f TableFormat) String() string { return "(Pebble,v3)" case TableFormatPebblev4: return "(Pebble,v4)" + case TableFormatPebblev5: + return "(Pebble,v5)" + case TableFormatPebblev6: + return "(Pebble,v6)" + case TableFormatPebblev7: + return "(Pebble,v7)" default: panic("sstable: unknown table format version tuple") } } + +var tableFormatStrings = func() map[string]TableFormat { + strs := make(map[string]TableFormat, NumTableFormats) + for f := TableFormatUnspecified; f < NumTableFormats; f++ { + strs[f.String()] = f + } + return strs +}() + +// ParseTableFormatString parses a TableFormat from its human-readable string +// representation. +func ParseTableFormatString(s string) (TableFormat, error) { + f, ok := tableFormatStrings[s] + if !ok { + return TableFormatUnspecified, errors.Errorf("unknown table format %q", s) + } + return f, nil +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/internal.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/internal.go new file mode 100644 index 0000000..8fe64e6 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/internal.go @@ -0,0 +1,49 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. 
Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package sstable + +import ( + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/sstable/blob" + "github.com/cockroachdb/pebble/v2/sstable/valblk" +) + +// These constants are part of the file format, and should not be changed. +const ( + InternalKeyKindDelete = base.InternalKeyKindDelete + InternalKeyKindSet = base.InternalKeyKindSet + InternalKeyKindMerge = base.InternalKeyKindMerge + InternalKeyKindLogData = base.InternalKeyKindLogData + InternalKeyKindSingleDelete = base.InternalKeyKindSingleDelete + InternalKeyKindRangeDelete = base.InternalKeyKindRangeDelete + InternalKeyKindSetWithDelete = base.InternalKeyKindSetWithDelete + InternalKeyKindDeleteSized = base.InternalKeyKindDeleteSized + InternalKeyKindMax = base.InternalKeyKindMax + InternalKeyKindInvalid = base.InternalKeyKindInvalid +) + +// InternalKey exports the base.InternalKey type. +type InternalKey = base.InternalKey + +// Span exports the keyspan.Span type. +type Span = keyspan.Span + +const valueBlocksIndexHandleMaxLen = blockHandleMaxLenWithoutProperties + 3 + +// Assert blockHandleLikelyMaxLen >= valueBlocksIndexHandleMaxLen. +const _ = uint(blockHandleLikelyMaxLen - valueBlocksIndexHandleMaxLen) + +// Assert blockHandleLikelyMaxLen >= (valblk.HandleMaxLen+1). +// +// The additional 1 is for the 'valuePrefix' byte which prefaces values in +// recent SSTable versions. +const _ = uint(blockHandleLikelyMaxLen - valblk.HandleMaxLen - 1) + +// Assert blockHandleLikelyMaxLen >= (blob.MaxInlineHandleLength+1). +// +// The additional 1 is for the 'valuePrefix' byte which prefaces values in recent +// SSTable versions. 
+const _ = uint(blockHandleLikelyMaxLen - blob.MaxInlineHandleLength - 1) diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/layout.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/layout.go new file mode 100644 index 0000000..01ec710 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/layout.go @@ -0,0 +1,1090 @@ +// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package sstable + +import ( + "bytes" + "cmp" + "context" + "encoding/binary" + "fmt" + "io" + "slices" + "unsafe" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/binfmt" + "github.com/cockroachdb/pebble/v2/internal/bytealloc" + "github.com/cockroachdb/pebble/v2/internal/crc" + "github.com/cockroachdb/pebble/v2/internal/sstableinternal" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/sstable/blob" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/sstable/block/blockkind" + "github.com/cockroachdb/pebble/v2/sstable/colblk" + "github.com/cockroachdb/pebble/v2/sstable/rowblk" + "github.com/cockroachdb/pebble/v2/sstable/valblk" +) + +// Layout describes the block organization of an sstable. +type Layout struct { + // NOTE: changes to fields in this struct should also be reflected in + // ValidateBlockChecksums, which validates a static list of BlockHandles + // referenced in this struct. 
+ + Data []block.HandleWithProperties + Index []block.Handle + TopIndex block.Handle + Filter []NamedBlockHandle + RangeDel block.Handle + RangeKey block.Handle + ValueBlock []block.Handle + ValueIndex block.Handle + Properties block.Handle + MetaIndex block.Handle + BlobReferenceIndex block.Handle + Footer block.Handle + Format TableFormat +} + +// NamedBlockHandle holds a block.Handle and corresponding name. +type NamedBlockHandle struct { + block.Handle + Name string +} + +// FilterByName retrieves the block handle of the named filter, if it exists. +// The provided the name should be the name as it appears in the metaindex +// block. +func (l *Layout) FilterByName(name string) (block.Handle, bool) { + for i := range l.Filter { + if l.Filter[i].Name == name { + return l.Filter[i].Handle, true + } + } + return block.Handle{}, false +} + +func (l *Layout) orderedBlocks() []NamedBlockHandle { + var blocks []NamedBlockHandle + for i := range l.Data { + blocks = append(blocks, NamedBlockHandle{l.Data[i].Handle, "data"}) + } + for i := range l.Index { + blocks = append(blocks, NamedBlockHandle{l.Index[i], "index"}) + } + if l.TopIndex.Length != 0 { + blocks = append(blocks, NamedBlockHandle{l.TopIndex, "top-index"}) + } + blocks = append(blocks, l.Filter...) 
+ if l.RangeDel.Length != 0 { + blocks = append(blocks, NamedBlockHandle{l.RangeDel, "range-del"}) + } + if l.RangeKey.Length != 0 { + blocks = append(blocks, NamedBlockHandle{l.RangeKey, "range-key"}) + } + for i := range l.ValueBlock { + blocks = append(blocks, NamedBlockHandle{l.ValueBlock[i], "value-block"}) + } + if l.ValueIndex.Length != 0 { + blocks = append(blocks, NamedBlockHandle{l.ValueIndex, "value-index"}) + } + if l.Properties.Length != 0 { + blocks = append(blocks, NamedBlockHandle{l.Properties, "properties"}) + } + if l.MetaIndex.Length != 0 { + blocks = append(blocks, NamedBlockHandle{l.MetaIndex, "meta-index"}) + } + if l.BlobReferenceIndex.Length != 0 { + blocks = append(blocks, NamedBlockHandle{l.BlobReferenceIndex, "blob-reference-index"}) + } + if l.Footer.Length != 0 { + if l.Footer.Length == levelDBFooterLen { + blocks = append(blocks, NamedBlockHandle{l.Footer, "leveldb-footer"}) + } else { + blocks = append(blocks, NamedBlockHandle{l.Footer, "footer"}) + } + } + slices.SortFunc(blocks, func(a, b NamedBlockHandle) int { + return cmp.Compare(a.Offset, b.Offset) + }) + return blocks +} + +// Describe returns a description of the layout. If the verbose parameter is +// true, details of the structure of each block are returned as well. +// If verbose is true and fmtKV is non-nil, the output includes the KVs (as formatted by this function). 
+func (l *Layout) Describe( + verbose bool, r *Reader, fmtKV func(key *base.InternalKey, value []byte) string, +) string { + ctx := context.TODO() + + blocks := l.orderedBlocks() + formatting := rowblkFormatting + if l.Format.BlockColumnar() { + formatting = colblkFormatting + } + + tp := treeprinter.New() + root := tp.Child("sstable") + + for i := range blocks { + b := &blocks[i] + tpNode := root.Childf("%s offset: %d length: %d", b.Name, b.Offset, b.Length) + + if !verbose { + continue + } + if b.Name == "filter" { + continue + } + + if b.Name == "footer" || b.Name == "leveldb-footer" { + trailer, offset := make([]byte, b.Length), 0 + _ = r.blockReader.Readable().ReadAt(ctx, trailer, int64(b.Offset)) + + // In all cases, we know the version is right before the magic. + version := binary.LittleEndian.Uint32(trailer[len(trailer)-magicLen-versionLen:]) + magicNumber := trailer[len(trailer)-magicLen:] + format, err := parseTableFormat(magicNumber, version) + if err != nil { + panic("Error parsing table format.") + } + + var attributes Attributes + if format >= TableFormatPebblev7 { + attributes = Attributes(binary.LittleEndian.Uint32(trailer[pebbleDBV7FooterAttributesOffset:])) + } + + var computedChecksum uint32 + var encodedChecksum uint32 + if format >= TableFormatPebblev6 { + checksumOffset := checkedPebbleDBChecksumOffset + if format >= TableFormatPebblev7 { + checksumOffset = pebbleDBv7FooterChecksumOffset + } + computedChecksum = crc.CRC(0). + Update(trailer[:checksumOffset]). + Update(trailer[checksumOffset+checksumLen:]). 
+ Value() + encodedChecksum = binary.LittleEndian.Uint32(trailer[checksumOffset:]) + } + + if b.Name == "footer" { + checksumType := block.ChecksumType(trailer[0]) + tpNode.Childf("%03d checksum type: %s", offset, checksumType) + trailer, offset = trailer[1:], offset+1 + } + + metaHandle, n := binary.Uvarint(trailer) + metaLen, m := binary.Uvarint(trailer[n:]) + tpNode.Childf("%03d meta: offset=%d, length=%d", offset, metaHandle, metaLen) + trailer, offset = trailer[n+m:], offset+n+m + + indexHandle, n := binary.Uvarint(trailer) + indexLen, m := binary.Uvarint(trailer[n:]) + tpNode.Childf("%03d index: offset=%d, length=%d", offset, indexHandle, indexLen) + trailer, offset = trailer[n+m:], offset+n+m + + // Set the offset to the start of footer's remaining trailing fields. + trailing := magicLen + if b.Name != "leveldb-footer" { + trailing += versionLen + } + if format >= TableFormatPebblev6 { + trailing += checksumLen + } + offset += len(trailer) - trailing + + if format >= TableFormatPebblev7 { + // Attributes should be just prior to the checksum. + tpNode.Childf("%03d attributes: %s", offset-attributesLen, attributes.String()) + } + + if format >= TableFormatPebblev6 { + if computedChecksum == encodedChecksum { + tpNode.Childf("%03d footer checksum: 0x%04x", offset, encodedChecksum) + } else { + tpNode.Childf("%03d invalid footer checksum: 0x%04x, expected: 0x%04x", offset, encodedChecksum, computedChecksum) + } + offset += checksumLen + } + + tpNode.Childf("%03d version: %d", offset, version) + offset = offset + 4 + tpNode.Childf("%03d magic number: 0x%x", offset, magicNumber) + continue + } + // Read the block and format it. Returns an error if we couldn't read the + // block. + err := func() error { + var err error + var h block.BufferHandle + // Defer release of any block handle that will have been read. 
+ defer func() { h.Release() }() + + switch b.Name { + case "data": + h, err = r.readDataBlock(ctx, block.NoReadEnv, noReadHandle, b.Handle) + if err != nil { + return err + } + if fmtKV == nil { + err = formatting.formatDataBlock(tpNode, r, *b, h.BlockData(), nil) + } else { + var lastKey InternalKey + err = formatting.formatDataBlock(tpNode, r, *b, h.BlockData(), func(key *base.InternalKey, value []byte) string { + v := fmtKV(key, value) + if base.InternalCompare(r.Comparer.Compare, lastKey, *key) >= 0 { + v += " WARNING: OUT OF ORDER KEYS!" + } + lastKey.Trailer = key.Trailer + lastKey.UserKey = append(lastKey.UserKey[:0], key.UserKey...) + return v + }) + } + + case "range-del": + h, err = r.readRangeDelBlock(ctx, block.NoReadEnv, noReadHandle, b.Handle) + if err != nil { + return err + } + // TODO(jackson): colblk ignores fmtKV, because it doesn't + // make sense in the context. + err = formatting.formatKeyspanBlock(tpNode, r, *b, h.BlockData(), fmtKV) + + case "range-key": + h, err = r.readRangeKeyBlock(ctx, block.NoReadEnv, noReadHandle, b.Handle) + if err != nil { + return err + } + // TODO(jackson): colblk ignores fmtKV, because it doesn't + // make sense in the context. 
+ err = formatting.formatKeyspanBlock(tpNode, r, *b, h.BlockData(), fmtKV) + + case "index", "top-index": + h, err = r.readIndexBlock(ctx, block.NoReadEnv, noReadHandle, b.Handle) + if err != nil { + return err + } + err = formatting.formatIndexBlock(tpNode, r, *b, h.BlockData()) + + case "properties": + h, err = r.blockReader.Read(ctx, block.NoReadEnv, noReadHandle, b.Handle, blockkind.Metadata, noInitBlockMetadataFn) + if err != nil { + return err + } + if r.tableFormat >= TableFormatPebblev7 { + var decoder colblk.KeyValueBlockDecoder + decoder.Init(h.BlockData()) + offset := 0 + for i := 0; i < decoder.BlockDecoder().Rows(); i++ { + key := decoder.KeyAt(i) + value := decoder.ValueAt(i) + length := len(key) + len(value) + tpNode.Childf("%05d %s (%d)", offset, key, length) + offset += length + } + } else { + iter, _ := rowblk.NewRawIter(r.Comparer.Compare, h.BlockData()) + iter.Describe(tpNode, func(w io.Writer, key *base.InternalKey, value []byte, enc rowblk.KVEncoding) { + fmt.Fprintf(w, "%05d %s (%d)", enc.Offset, key.UserKey, enc.Length) + }) + } + + case "meta-index": + if b.Handle != r.metaindexBH { + return base.AssertionFailedf("range-del block handle does not match rangeDelBH") + } + h, err = r.readMetaindexBlock(ctx, block.NoReadEnv, noReadHandle) + if err != nil { + return err + } + + if r.tableFormat >= TableFormatPebblev6 { + var decoder colblk.KeyValueBlockDecoder + decoder.Init(h.BlockData()) + for i := 0; i < decoder.BlockDecoder().Rows(); i++ { + key := decoder.KeyAt(i) + value := decoder.ValueAt(i) + var bh block.Handle + var n int + var vbih valblk.IndexHandle + isValueBlocksIndexHandle := false + if bytes.Equal(key, []byte(metaValueIndexName)) { + vbih, n, err = valblk.DecodeIndexHandle(value) + bh = vbih.Handle + isValueBlocksIndexHandle = true + } else { + bh, n = block.DecodeHandle(value) + } + if n == 0 || n != len(value) { + tpNode.Childf("%04d [err: %s]\n", i, err) + continue + } + var vbihStr string + if isValueBlocksIndexHandle { + 
vbihStr = fmt.Sprintf(" value-blocks-index-lengths: %d(num), %d(offset), %d(length)", + vbih.BlockNumByteLength, vbih.BlockOffsetByteLength, vbih.BlockLengthByteLength) + } + tpNode.Childf("%04d %s block:%d/%d%s\n", + i, key, bh.Offset, bh.Length, vbihStr) + } + } else { + iter, _ := rowblk.NewRawIter(r.Comparer.Compare, h.BlockData()) + iter.Describe(tpNode, func(w io.Writer, key *base.InternalKey, value []byte, enc rowblk.KVEncoding) { + var bh block.Handle + var n int + var vbih valblk.IndexHandle + isValueBlocksIndexHandle := false + if bytes.Equal(iter.Key().UserKey, []byte(metaValueIndexName)) { + vbih, n, err = valblk.DecodeIndexHandle(value) + bh = vbih.Handle + isValueBlocksIndexHandle = true + } else { + bh, n = block.DecodeHandle(value) + } + if n == 0 || n != len(value) { + fmt.Fprintf(w, "%04d [err: %s]\n", enc.Offset, err) + return + } + var vbihStr string + if isValueBlocksIndexHandle { + vbihStr = fmt.Sprintf(" value-blocks-index-lengths: %d(num), %d(offset), %d(length)", + vbih.BlockNumByteLength, vbih.BlockOffsetByteLength, vbih.BlockLengthByteLength) + } + fmt.Fprintf(w, "%04d %s block:%d/%d%s", + uint64(enc.Offset), iter.Key().UserKey, bh.Offset, bh.Length, vbihStr) + }) + } + + case "value-block": + // We don't peer into the value-block since it can't be interpreted + // without the valueHandles. + case "value-index": + // We have already read the value-index to construct the list of + // value-blocks, so no need to do it again. 
+ case "blob-reference-index": + h, err = r.readBlobRefIndexBlock(ctx, block.NoReadEnv, noReadHandle) + if err != nil { + return err + } + var decoder colblk.ReferenceLivenessBlockDecoder + decoder.Init(h.BlockData()) + offset := 0 + for i := range decoder.BlockDecoder().Rows() { + value := decoder.LivenessAtReference(i) + encs := DecodeBlobRefLivenessEncoding(value) + length := len(value) + parent := tpNode.Childf("%05d (%d)", offset, length) + for _, enc := range encs { + parent.Childf("block: %d, values size: %d, bitmap size: %d byte(s), bitmap: %08b", + enc.BlockID, enc.ValuesSize, enc.BitmapSize, enc.Bitmap) + } + offset += length + } + } + + // Format the trailer. + trailer := make([]byte, block.TrailerLen) + _ = r.blockReader.Readable().ReadAt(ctx, trailer, int64(b.Offset+b.Length)) + algo := block.CompressionIndicator(trailer[0]) + checksum := binary.LittleEndian.Uint32(trailer[1:]) + tpNode.Childf("trailer [compression=%s checksum=0x%04x]", algo, checksum) + return nil + }() + if err != nil { + tpNode.Childf("error reading block: %v", err) + } + } + return tp.String() +} + +type blockFormatting struct { + formatIndexBlock formatBlockFunc + formatDataBlock formatBlockFuncKV + formatKeyspanBlock formatBlockFuncKV +} + +type ( + formatBlockFunc func(treeprinter.Node, *Reader, NamedBlockHandle, []byte) error + formatBlockFuncKV func(treeprinter.Node, *Reader, NamedBlockHandle, []byte, func(*base.InternalKey, []byte) string) error +) + +var ( + rowblkFormatting = blockFormatting{ + formatIndexBlock: formatRowblkIndexBlock, + formatDataBlock: formatRowblkDataBlock, + formatKeyspanBlock: formatRowblkDataBlock, + } + colblkFormatting = blockFormatting{ + formatIndexBlock: formatColblkIndexBlock, + formatDataBlock: formatColblkDataBlock, + formatKeyspanBlock: formatColblkKeyspanBlock, + } +) + +func formatColblkIndexBlock(tp treeprinter.Node, r *Reader, b NamedBlockHandle, data []byte) error { + var iter colblk.IndexIter + if err := iter.Init(r.Comparer, data, 
NoTransforms); err != nil { + return err + } + defer func() { _ = iter.Close() }() + i := 0 + for v := iter.First(); v; v = iter.Next() { + bh, err := iter.BlockHandleWithProperties() + if err != nil { + return err + } + tp.Childf("%05d block:%d/%d\n", i, bh.Offset, bh.Length) + i++ + } + return nil +} + +func formatColblkDataBlock( + tp treeprinter.Node, + r *Reader, + b NamedBlockHandle, + data []byte, + fmtKV func(key *base.InternalKey, value []byte) string, +) error { + var decoder colblk.DataBlockDecoder + decoder.Init(r.keySchema, data) + f := binfmt.New(data) + decoder.Describe(f, tp) + + if fmtKV != nil { + var iter colblk.DataBlockIter + iter.InitOnce(r.keySchema, r.Comparer, describingLazyValueHandler{}) + if err := iter.Init(&decoder, block.IterTransforms{}); err != nil { + return err + } + defer func() { _ = iter.Close() }() + for kv := iter.First(); kv != nil; kv = iter.Next() { + tp.Child(fmtKV(&kv.K, kv.V.LazyValue().ValueOrHandle)) + } + } + return nil +} + +// describingLazyValueHandler is a block.GetInternalValueForPrefixAndValueHandler +// that replaces a value handle with an in-place value describing the handle. +type describingLazyValueHandler struct{} + +// Assert that debugLazyValueHandler implements the +// block.GetInternalValueForPrefixAndValueHandler interface. 
var _ block.GetInternalValueForPrefixAndValueHandler = describingLazyValueHandler{}

// GetInternalValueForPrefixAndValueHandle returns an in-place value whose
// contents are a human-readable description of the given handle (value-block
// handle or blob handle) rather than the dereferenced value itself.
func (describingLazyValueHandler) GetInternalValueForPrefixAndValueHandle(
	handle []byte,
) base.InternalValue {
	vp := block.ValuePrefix(handle[0])
	var result string
	switch {
	case vp.IsValueBlockHandle():
		vh := valblk.DecodeHandle(handle[1:])
		result = fmt.Sprintf("value handle %+v", vh)
	case vp.IsBlobValueHandle():
		handlePreface, remainder := blob.DecodeInlineHandlePreface(handle[1:])
		handleSuffix := blob.DecodeHandleSuffix(remainder)
		ih := blob.InlineHandle{
			InlineHandlePreface: handlePreface,
			HandleSuffix:        handleSuffix,
		}
		result = fmt.Sprintf("blob handle %+v", ih)
	default:
		result = "unknown value type"
	}
	return base.MakeInPlaceValue([]byte(result))
}

// formatColblkKeyspanBlock dumps the structure of a columnar keyspan
// (range-del / range-key) block. The KV formatting callback is ignored.
func formatColblkKeyspanBlock(
	tp treeprinter.Node,
	r *Reader,
	b NamedBlockHandle,
	data []byte,
	_ func(*base.InternalKey, []byte) string,
) error {
	var decoder colblk.KeyspanDecoder
	decoder.Init(data)
	f := binfmt.New(data)
	decoder.Describe(f, tp)
	return nil
}

// formatRowblkIndexBlock prints each row-oriented index entry's decoded block
// handle, annotating restart points.
func formatRowblkIndexBlock(tp treeprinter.Node, r *Reader, b NamedBlockHandle, data []byte) error {
	iter, err := rowblk.NewIter(r.Comparer.Compare, r.Comparer.ComparePointSuffixes, r.Comparer.Split, data, NoTransforms)
	if err != nil {
		return err
	}
	iter.Describe(tp, func(w io.Writer, key *base.InternalKey, value []byte, enc rowblk.KVEncoding) {
		bh, err := block.DecodeHandleWithProperties(value)
		if err != nil {
			fmt.Fprintf(w, "%05d [err: %s]\n", enc.Offset, err)
			return
		}
		fmt.Fprintf(w, "%05d block:%d/%d", enc.Offset, bh.Offset, bh.Length)
		if enc.IsRestart {
			fmt.Fprintf(w, " [restart]")
		}
	})
	return nil
}

// formatRowblkDataBlock prints every record of a row-oriented data block,
// optionally formatting each KV with fmtRecord.
func formatRowblkDataBlock(
	tp treeprinter.Node,
	r *Reader,
	b NamedBlockHandle,
	data []byte,
	fmtRecord func(key *base.InternalKey, value []byte) string,
) error {
	iter, err := rowblk.NewIter(r.Comparer.Compare, r.Comparer.ComparePointSuffixes, r.Comparer.Split, data, NoTransforms)
	if err != nil {
		return err
	}
	iter.Describe(tp, func(w io.Writer, key *base.InternalKey, value []byte, enc rowblk.KVEncoding) {
		// The format of the numbers in the record line is:
		//
		//   (<total> = <length> [<shared>] + <unshared> + <value>)
		//
		// <total>    is the total number of bytes for the record.
		// <length>   is the size of the 3 varint encoded integers for <shared>,
		//            <unshared>, and <value>.
		// <shared>   is the number of key bytes shared with the previous key.
		// <unshared> is the number of unshared key bytes.
		// <value>    is the number of value bytes.
		fmt.Fprintf(w, "%05d record (%d = %d [%d] + %d + %d)",
			uint64(enc.Offset), enc.Length,
			enc.Length-int32(enc.KeyUnshared+enc.ValueLen), enc.KeyShared, enc.KeyUnshared, enc.ValueLen)
		if enc.IsRestart {
			fmt.Fprint(w, " [restart]")
		}
		if fmtRecord != nil {
			// Before v3, or for non-SET keys, values have no value prefix.
			if r.tableFormat < TableFormatPebblev3 || key.Kind() != InternalKeyKindSet {
				fmt.Fprintf(w, "\n %s", fmtRecord(key, value))
				return
			}
			vp := block.ValuePrefix(value[0])
			if vp.IsInPlaceValue() {
				fmt.Fprintf(w, "\n %s", fmtRecord(key, value[1:]))
			} else if vp.IsValueBlockHandle() {
				vh := valblk.DecodeHandle(value[1:])
				fmt.Fprintf(w, "\n %s", fmtRecord(key, []byte(fmt.Sprintf("value handle %+v", vh))))
			} else {
				panic(fmt.Sprintf("unknown value prefix: %d", value[0]))
			}
		}
	})
	return nil
}

// decodeLayout reconstructs a Layout from a raw, fully in-memory sstable: it
// parses the footer, decodes the metaindex, properties and index blocks, and
// (if present) the value-block index.
func decodeLayout(comparer *base.Comparer, data []byte, tableFormat TableFormat) (Layout, error) {
	foot, err := parseFooter(data, 0, int64(len(data)))
	if err != nil {
		return Layout{}, err
	}
	decompressedMeta, err := decompressInMemory(data, foot.metaindexBH)
	if err != nil {
		return Layout{}, errors.Wrap(err, "decompressing metaindex")
	}
	var meta map[string]block.Handle
	var vbih valblk.IndexHandle
	// v6+ uses a columnar metaindex; older formats use the row-oriented one.
	if tableFormat >= TableFormatPebblev6 {
		meta, vbih, err = decodeColumnarMetaIndex(decompressedMeta)
	} else {
		meta, vbih, err = decodeMetaindex(decompressedMeta)
	}
	if err != nil {
		return Layout{}, err
	}
	layout := Layout{
		MetaIndex:  foot.metaindexBH,
		Properties: meta[metaPropertiesName],
		RangeDel:   meta[metaRangeDelV2Name],
		RangeKey:   meta[metaRangeKeyName],
		ValueIndex: vbih.Handle,
		Footer:     foot.footerBH,
		Format:     foot.format,
	}
	decompressedProps, err := decompressInMemory(data, layout.Properties)
	if err != nil {
		return Layout{}, errors.Wrap(err, "decompressing properties")
	}
	props, err := decodePropertiesBlock(tableFormat, decompressedProps)
	if err != nil {
		return Layout{}, err
	}

	if props.IndexType == twoLevelIndex {
		// The footer's index handle points at the top-level index; collect the
		// second-level index block handles from it.
		decompressed, err := decompressInMemory(data, foot.indexBH)
		if err != nil {
			return Layout{}, errors.Wrap(err, "decompressing two-level index")
		}
		layout.TopIndex = foot.indexBH
		topLevelIter, err := newIndexIter(foot.format, comparer, decompressed)
		if err != nil {
			return Layout{}, err
		}
		err = forEachIndexEntry(topLevelIter, func(bhp block.HandleWithProperties) {
			layout.Index = append(layout.Index, bhp.Handle)
		})
		if err != nil {
			return Layout{}, err
		}
	} else {
		layout.Index = append(layout.Index, foot.indexBH)
	}
	// Walk every (second-level) index block, collecting data block handles.
	for _, indexBH := range layout.Index {
		decompressed, err := decompressInMemory(data, indexBH)
		if err != nil {
			return Layout{}, errors.Wrap(err, "decompressing index block")
		}
		indexIter, err := newIndexIter(foot.format, comparer, decompressed)
		if err != nil {
			return Layout{}, err
		}
		err = forEachIndexEntry(indexIter, func(bhp block.HandleWithProperties) {
			layout.Data = append(layout.Data, bhp)
		})
		if err != nil {
			return Layout{}, err
		}
	}

	if layout.ValueIndex.Length > 0 {
		vbiBlock, err := decompressInMemory(data, layout.ValueIndex)
		if err != nil {
			return Layout{}, errors.Wrap(err, "decompressing value index")
		}
		layout.ValueBlock, err = valblk.DecodeIndex(vbiBlock, vbih)
		if err != nil {
			return Layout{}, err
		}
	}

	return layout, nil
}

// decompressInMemory returns the decompressed contents of the block identified
// by bh within data. For uncompressed blocks the returned slice aliases data;
// otherwise a fresh slice is allocated.
func decompressInMemory(data []byte, bh block.Handle) ([]byte, error) {
	// The compression indicator is the first trailer byte, directly after the
	// block contents.
	typ := block.CompressionIndicator(data[bh.Offset+bh.Length])
	var decompressed []byte
	if typ == block.NoCompressionIndicator {
		return data[bh.Offset : bh.Offset+bh.Length], nil
	}
	// Decode the length of the decompressed value.
	decodedLen, err := block.DecompressedLen(typ, data[bh.Offset:bh.Offset+bh.Length])
	if err != nil {
		return nil, err
	}
	decompressed = make([]byte, decodedLen)
	if err := block.DecompressInto(typ, data[int(bh.Offset):bh.Offset+bh.Length], decompressed); err != nil {
		return nil, err
	}
	return decompressed, nil
}

// newIndexIter constructs an index block iterator appropriate for the table
// format: row-oriented through v4, columnar from v5 on.
func newIndexIter(
	tableFormat TableFormat, comparer *base.Comparer, data []byte,
) (block.IndexBlockIterator, error) {
	var iter block.IndexBlockIterator
	var err error
	if tableFormat <= TableFormatPebblev4 {
		iter = new(rowblk.IndexIter)
		err = iter.Init(comparer, data, block.NoTransforms)
	} else {
		iter = new(colblk.IndexIter)
		err = iter.Init(comparer, data, block.NoTransforms)
	}
	if err != nil {
		return nil, err
	}
	return iter, nil
}

// forEachIndexEntry invokes fn for every entry of the index block, then closes
// the iterator.
func forEachIndexEntry(
	indexIter block.IndexBlockIterator, fn func(block.HandleWithProperties),
) error {
	for v := indexIter.First(); v; v = indexIter.Next() {
		bhp, err := indexIter.BlockHandleWithProperties()
		if err != nil {
			return err
		}
		fn(bhp)
	}
	return indexIter.Close()
}

// decodeMetaindex decodes a row-based meta index block. The returned map owns
// all its memory and can outlive the provided data slice.
+func decodeMetaindex( + data []byte, +) (meta map[string]block.Handle, vbih valblk.IndexHandle, err error) { + i, err := rowblk.NewRawIter(bytes.Compare, data) + if err != nil { + return nil, valblk.IndexHandle{}, err + } + defer func() { err = firstError(err, i.Close()) }() + + var keysAlloc bytealloc.A + meta = map[string]block.Handle{} + for valid := i.First(); valid; valid = i.Next() { + value := i.Value() + var bh block.Handle + if bytes.Equal(i.Key().UserKey, []byte(metaValueIndexName)) { + var n int + vbih, n, err = valblk.DecodeIndexHandle(i.Value()) + if err != nil { + return nil, vbih, err + } + if n == 0 || n != len(value) { + return nil, vbih, base.CorruptionErrorf("pebble/table: invalid table (bad value blocks index handle)") + } + bh = vbih.Handle + } else { + var n int + bh, n = block.DecodeHandle(value) + if n == 0 || n != len(value) { + return nil, vbih, base.CorruptionErrorf("pebble/table: invalid table (bad block handle)") + } + } + var key []byte + keysAlloc, key = keysAlloc.Copy(i.Key().UserKey) + keyStr := unsafe.String(unsafe.SliceData(key), len(key)) + meta[keyStr] = bh + } + return meta, vbih, nil +} + +// decodeColumnarMetaIndex decodes a columnar meta index block. The returned map +// owns all its memory and can outlive the provided data slice. 
// decodeColumnarMetaIndex decodes a columnar meta index block. The returned
// map owns all its memory and can outlive the provided data slice. The
// value-blocks index entry, if present, is additionally returned decoded as
// vbih.
func decodeColumnarMetaIndex(
	data []byte,
) (meta map[string]block.Handle, vbih valblk.IndexHandle, err error) {
	var decoder colblk.KeyValueBlockDecoder
	decoder.Init(data)
	var keysAlloc bytealloc.A
	meta = map[string]block.Handle{}
	for i := 0; i < decoder.BlockDecoder().Rows(); i++ {
		key := decoder.KeyAt(i)
		value := decoder.ValueAt(i)

		var bh block.Handle
		if bytes.Equal(key, []byte(metaValueIndexName)) {
			// The value-index entry encodes a richer handle.
			var n int
			vbih, n, err = valblk.DecodeIndexHandle(value)
			if err != nil {
				return nil, vbih, err
			}
			if n == 0 || n != len(value) {
				return nil, vbih, base.CorruptionErrorf("pebble/table: invalid table (bad value blocks index handle)")
			}
			bh = vbih.Handle
		} else {
			var n int
			bh, n = block.DecodeHandle(value)
			if n == 0 || n != len(value) {
				return nil, vbih, base.CorruptionErrorf("pebble/table: invalid table (bad block handle)")
			}
		}
		// Copy the key out of the block so the map can outlive data.
		var keyCopy []byte
		keysAlloc, keyCopy = keysAlloc.Copy(key)
		keyStr := unsafe.String(unsafe.SliceData(keyCopy), len(keyCopy))
		meta[keyStr] = bh
	}
	return meta, vbih, nil
}

// layoutWriter writes the structure of an sstable to durable storage. It
// accepts serialized blocks, writes them to storage, and returns a block handle
// describing the offset and length of the block.
type layoutWriter struct {
	writable objstorage.Writable

	// cacheOpts are used to remove blocks written to the sstable from the cache,
	// providing a defense in depth against bugs which cause cache collisions.
	cacheOpts sstableinternal.CacheOptions

	// options copied from WriterOptions
	tableFormat  TableFormat
	compressor   block.Compressor
	checksumType block.ChecksumType

	// Attribute bitset of the sstable, derived from sstable Properties at the time
	// of writing.
	attributes Attributes

	// offset tracks the current write offset within the writable.
	offset uint64
	// lastIndexBlockHandle holds the handle to the most recently-written index
	// block. It's updated by writeIndexBlock. When writing sstables with a
	// single-level index, this field will be updated once. When writing
	// sstables with a two-level index, the last update will set the two-level
	// index.
	lastIndexBlockHandle block.Handle
	// handles accumulates the entries destined for the meta index block;
	// handlesBuf backs their encoded block handles.
	handles    []metaIndexHandle
	handlesBuf bytealloc.A
	// tmp is scratch space for varint-encoding block handles.
	tmp [blockHandleLikelyMaxLen]byte
	buf blockBuf
}

// makeLayoutWriter constructs a layoutWriter writing to w with the given
// options.
func makeLayoutWriter(w objstorage.Writable, opts WriterOptions) layoutWriter {
	return layoutWriter{
		writable:     w,
		cacheOpts:    opts.internal.CacheOpts,
		tableFormat:  opts.TableFormat,
		compressor:   block.MakeCompressor(opts.Compression),
		checksumType: opts.Checksum,
		buf: blockBuf{
			checksummer: block.Checksummer{Type: opts.Checksum},
		},
	}
}

// metaIndexHandle is one entry of the meta index block: a block name and its
// varint-encoded block handle.
type metaIndexHandle struct {
	key                string
	encodedBlockHandle []byte
}

// Abort aborts writing the table, aborting the underlying writable too. Abort
// is idempotent.
func (w *layoutWriter) Abort() {
	if w.writable != nil {
		w.writable.Abort()
		w.writable = nil
		w.compressor.Close()
	}
}

// WriteDataBlock constructs a trailer for the provided data block and writes
// the block and trailer to the writer. It returns the block's handle.
func (w *layoutWriter) WriteDataBlock(b []byte, buf *blockBuf) (block.Handle, error) {
	return w.writeBlock(b, blockkind.SSTableData, buf)
}

// WritePrecompressedDataBlock writes a pre-compressed data block and its
// pre-computed trailer to the writer, returning its block handle. It can mangle
// the block data.
func (w *layoutWriter) WritePrecompressedDataBlock(blk block.PhysicalBlock) (block.Handle, error) {
	return w.writePrecompressedBlock(blk)
}

// WriteIndexBlock constructs a trailer for the provided index (first or
// second-level) and writes the block and trailer to the writer. It remembers
// the last-written index block's handle and adds it to the file's meta index
// when the writer is finished.
+func (w *layoutWriter) WriteIndexBlock(b []byte) (block.Handle, error) { + h, err := w.writeBlock(b, blockkind.SSTableIndex, &w.buf) + if err == nil { + w.lastIndexBlockHandle = h + } + return h, err +} + +// WriteFilterBlock finishes the provided filter, constructs a trailer, and +// writes the block and trailer to the writer. It automatically adds the filter +// block to the file's meta index when the writer is finished. +func (w *layoutWriter) WriteFilterBlock(f filterWriter) (bh block.Handle, err error) { + b, err := f.finish() + if err != nil { + return block.Handle{}, err + } + return w.writeNamedBlockUncompressed(b, blockkind.Filter, f.metaName()) +} + +// WritePropertiesBlock constructs a trailer for the provided properties block +// and writes the block and trailer to the writer. It automatically adds the +// properties block to the file's meta index when the writer is finished. +func (w *layoutWriter) WritePropertiesBlock(b []byte) (bh block.Handle, err error) { + // In v6 and earlier, we use a row oriented block with an infinite restart + // interval, which provides very good prefix compression. Since v7, we use the + // columnar format without prefix compression for this block; we enable block + // compression to compensate. + if w.tableFormat < TableFormatPebblev7 { + bh, err = w.writeBlockUncompressed(b, blockkind.Metadata, &w.buf) + } else { + bh, err = w.writeBlock(b, blockkind.Metadata, &w.buf) + } + if err == nil { + w.recordToMetaindex(metaPropertiesName, bh) + } + return bh, err +} + +// WriteRangeKeyBlock constructs a trailer for the provided range key block and +// writes the block and trailer to the writer. It automatically adds the range +// key block to the file's meta index when the writer is finished. 
+func (w *layoutWriter) WriteRangeKeyBlock(b []byte) (block.Handle, error) { + return w.writeNamedBlockUncompressed(b, blockkind.RangeKey, metaRangeKeyName) +} + +// WriteBlobRefIndexBlock constructs a trailer for the provided blob reference +// index block and writes the block and trailer to the writer. It automatically +// adds the blob reference index block to the file's meta index when the writer +// is finished. +func (w *layoutWriter) WriteBlobRefIndexBlock(b []byte) (block.Handle, error) { + return w.writeNamedBlockUncompressed(b, blockkind.BlobReferenceValueLivenessIndex, metaBlobRefIndexName) +} + +// WriteRangeDeletionBlock constructs a trailer for the provided range deletion +// block and writes the block and trailer to the writer. It automatically adds +// the range deletion block to the file's meta index when the writer is +// finished. +func (w *layoutWriter) WriteRangeDeletionBlock(b []byte) (block.Handle, error) { + return w.writeNamedBlockUncompressed(b, blockkind.RangeDel, metaRangeDelV2Name) +} + +// writeNamedBlockUncompressed writes a block without compressing it and adds it to the metaindex. +func (w *layoutWriter) writeNamedBlockUncompressed( + b []byte, kind block.Kind, name string, +) (bh block.Handle, err error) { + bh, err = w.writeBlockUncompressed(b, kind, &w.buf) + if err == nil { + w.recordToMetaindex(name, bh) + } + return bh, err +} + +// WriteValueBlock writes a pre-finished value block (with the trailer) to the +// writer. It can mangle the block data. +func (w *layoutWriter) WriteValueBlock(blk block.PhysicalBlock) (block.Handle, error) { + return w.writePrecompressedBlock(blk) +} + +// WriteValueIndexBlock writes a value index block and adds it to the meta +// index. It can mangle the block data. 
+func (w *layoutWriter) WriteValueIndexBlock( + blk block.PhysicalBlock, vbih valblk.IndexHandle, +) (block.Handle, error) { + h, err := w.writePrecompressedBlock(blk) + if err != nil { + return block.Handle{}, err + } + n := valblk.EncodeIndexHandle(w.tmp[:], vbih) + w.recordToMetaindexRaw(metaValueIndexName, w.tmp[:n]) + return h, nil +} + +// writeBlock checksums, compresses, and writes out a block. +func (w *layoutWriter) writeBlock(b []byte, kind block.Kind, buf *blockBuf) (block.Handle, error) { + pb := block.CompressAndChecksum(&buf.dataBuf, b, kind, &w.compressor, &buf.checksummer) + h, err := w.writePrecompressedBlock(pb) + return h, err +} + +// writeBlock checksums and writes out a block. +func (w *layoutWriter) writeBlockUncompressed( + b []byte, kind block.Kind, buf *blockBuf, +) (block.Handle, error) { + pb := block.CopyAndChecksum(&buf.dataBuf, b, kind, &w.compressor, &buf.checksummer) + h, err := w.writePrecompressedBlock(pb) + return h, err +} + +// writePrecompressedBlock writes a pre-compressed block and its +// pre-computed trailer to the writer, returning its block handle. +// +// writePrecompressedBlock might mangle the block data. +func (w *layoutWriter) writePrecompressedBlock(blk block.PhysicalBlock) (block.Handle, error) { + w.clearFromCache(w.offset) + // Write the bytes to the file. + n, err := blk.WriteTo(w.writable) + if err != nil { + return block.Handle{}, err + } + bh := block.Handle{Offset: w.offset, Length: uint64(blk.LengthWithoutTrailer())} + w.offset += uint64(n) + return bh, nil +} + +// Write implements io.Writer (with the caveat that it can mangle the block +// data). This is analogous to writePrecompressedBlock for blocks that already +// incorporate the trailer, and don't need the callee to return a BlockHandle. +func (w *layoutWriter) Write(blockWithTrailer []byte) (n int, err error) { + offset := w.offset + w.clearFromCache(offset) + w.offset += uint64(len(blockWithTrailer)) + // This call can mangle blockWithTrailer. 
+ if err := w.writable.Write(blockWithTrailer); err != nil { + return 0, err + } + return len(blockWithTrailer), nil +} + +// clearFromCache removes the block at the provided offset from the cache. This provides defense in +// depth against bugs which cause cache collisions. +func (w *layoutWriter) clearFromCache(offset uint64) { + if w.cacheOpts.CacheHandle != nil { + // TODO(peter): Alternatively, we could add the uncompressed value to the + // cache. + w.cacheOpts.CacheHandle.Delete(w.cacheOpts.FileNum, offset) + } +} + +func (w *layoutWriter) recordToMetaindex(key string, h block.Handle) { + n := h.EncodeVarints(w.tmp[:]) + w.recordToMetaindexRaw(key, w.tmp[:n]) +} + +func (w *layoutWriter) recordToMetaindexRaw(key string, h []byte) { + var encodedHandle []byte + w.handlesBuf, encodedHandle = w.handlesBuf.Alloc(len(h)) + copy(encodedHandle, h) + w.handles = append(w.handles, metaIndexHandle{key: key, encodedBlockHandle: encodedHandle}) +} + +func (w *layoutWriter) IsFinished() bool { return w.writable == nil } + +// Finish serializes the sstable, writing out the meta index block and sstable +// footer and closing the file. It returns the total size of the resulting +// sstable. +func (w *layoutWriter) Finish() (size uint64, err error) { + // Sort the meta index handles by key and write the meta index block. 
+ slices.SortFunc(w.handles, func(a, b metaIndexHandle) int { + return cmp.Compare(a.key, b.key) + }) + var b []byte + if w.tableFormat >= TableFormatPebblev6 { + var cw colblk.KeyValueBlockWriter + cw.Init() + for _, h := range w.handles { + cw.AddKV(unsafe.Slice(unsafe.StringData(h.key), len(h.key)), h.encodedBlockHandle) + } + b = cw.Finish(cw.Rows()) + } else { + bw := rowblk.Writer{RestartInterval: 1} + for _, h := range w.handles { + if err := bw.AddRaw(unsafe.Slice(unsafe.StringData(h.key), len(h.key)), h.encodedBlockHandle); err != nil { + return 0, err + } + } + b = bw.Finish() + } + metaIndexHandle, err := w.writeBlockUncompressed(b, blockkind.Metadata, &w.buf) + if err != nil { + return 0, err + } + + // Write the table footer. + footer := footer{ + format: w.tableFormat, + checksum: w.checksumType, + metaindexBH: metaIndexHandle, + indexBH: w.lastIndexBlockHandle, + attributes: w.attributes, + } + encodedFooter := footer.encode(w.tmp[:]) + if err := w.writable.Write(encodedFooter); err != nil { + return 0, err + } + w.offset += uint64(len(encodedFooter)) + + err = w.writable.Finish() + w.writable = nil + w.compressor.Close() + return w.offset, err +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/options.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/options.go new file mode 100644 index 0000000..d02ee96 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/options.go @@ -0,0 +1,379 @@ +// Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package sstable + +import ( + "fmt" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/sstableinternal" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/sstable/colblk" + "github.com/cockroachdb/pebble/v2/sstable/rowblk" +) + +const ( + // MaximumRestartOffset is the maximum permissible value for a restart + // offset within a block. That is, the maximum block size that allows adding + // an additional restart point. + MaximumRestartOffset = rowblk.MaximumRestartOffset + // DefaultNumDeletionsThreshold defines the minimum number of point + // tombstones that must be present in a data block for it to be + // considered tombstone-dense. + DefaultNumDeletionsThreshold = 100 + // DefaultDeletionSizeRatioThreshold defines the minimum ratio of the size + // of point tombstones to the size of the data block in order to consider the + // block as tombstone-dense. + DefaultDeletionSizeRatioThreshold = 0.5 +) + +var ignoredInternalProperties = map[string]struct{}{ + "rocksdb.column.family.id": {}, + "rocksdb.fixed.key.length": {}, + "rocksdb.index.key.is.user.key": {}, + "rocksdb.index.value.is.delta.encoded": {}, + "rocksdb.oldest.key.time": {}, + "rocksdb.creation.time": {}, + "rocksdb.file.creation.time": {}, + "rocksdb.format.version": {}, +} + +// FilterType exports the base.FilterType type. +type FilterType = base.FilterType + +// Exported TableFilter constants. +const ( + TableFilter = base.TableFilter +) + +// FilterWriter exports the base.FilterWriter type. +type FilterWriter = base.FilterWriter + +// FilterPolicy exports the base.FilterPolicy type. +type FilterPolicy = base.FilterPolicy + +// Comparers is a map from comparer name to comparer. It is used for debugging +// tools which may be used on multiple databases configured with different +// comparers. +type Comparers map[string]*base.Comparer + +// Mergers is a map from merger name to merger. 
It is used for debugging tools +// which may be used on multiple databases configured with different +// mergers. +type Mergers map[string]*base.Merger + +// KeySchemas is a map from key schema name to key schema. A single database may +// contain sstables with multiple key schemas. +type KeySchemas map[string]*colblk.KeySchema + +// MakeKeySchemas constructs a KeySchemas from a slice of key schemas. +func MakeKeySchemas(keySchemas ...*colblk.KeySchema) KeySchemas { + m := make(KeySchemas, len(keySchemas)) + for _, keySchema := range keySchemas { + if _, ok := m[keySchema.Name]; ok { + panic(fmt.Sprintf("duplicate key schemas with name %q", keySchema.Name)) + } + m[keySchema.Name] = keySchema + } + return m +} + +// ReaderOptions holds the parameters needed for reading an sstable. +type ReaderOptions struct { + block.ReaderOptions + + // Comparer defines a total ordering over the space of []byte keys: a 'less + // than' relationship. The same comparison algorithm must be used for reads + // and writes over the lifetime of the DB. + // + // The default value uses the same ordering as bytes.Compare. + Comparer *Comparer + + // Merger defines the Merge function in use for this keyspace. + Merger *Merger + + Comparers Comparers + Mergers Mergers + // KeySchemas contains the set of known key schemas to use when interpreting + // columnar data blocks. Only used for sstables encoded in format + // TableFormatPebblev5 or higher. + KeySchemas KeySchemas + + // Filters is a map from filter policy name to filter policy. Filters with + // policies that are not in this map will be ignored. + Filters map[string]FilterPolicy + + // FilterMetricsTracker is optionally used to track filter metrics. 
+ FilterMetricsTracker *FilterMetricsTracker +} + +func (o ReaderOptions) ensureDefaults() ReaderOptions { + if o.Comparer == nil { + o.Comparer = base.DefaultComparer + } + if o.Merger == nil { + o.Merger = base.DefaultMerger + } + if o.LoggerAndTracer == nil { + o.LoggerAndTracer = base.NoopLoggerAndTracer{} + } + if o.KeySchemas == nil { + o.KeySchemas = defaultKeySchemas + } + return o +} + +var defaultKeySchema = colblk.DefaultKeySchema(base.DefaultComparer, 16) +var defaultKeySchemas = MakeKeySchemas(&defaultKeySchema) + +type CompressionProfile = block.CompressionProfile + +// Exported CompressionProfile constants. +var ( + DefaultCompression = block.DefaultCompression + NoCompression = block.NoCompression + SnappyCompression = block.SnappyCompression + ZstdCompression = block.ZstdCompression + // MinLZCompression is only supported with table formats v6+. Older formats + // fall back to snappy. + MinLZCompression = block.MinLZCompression + FastestCompression = block.FastestCompression + FastCompression = block.FastCompression + BalancedCompression = block.BalancedCompression + GoodCompression = block.GoodCompression +) + +// WriterOptions holds the parameters used to control building an sstable. +type WriterOptions struct { + // BlockRestartInterval is the number of keys between restart points + // for delta encoding of keys. + // + // The default value is 16. + BlockRestartInterval int + + // BlockSize is the target uncompressed size in bytes of each table block. + // + // The default value is 4096. + BlockSize int + + // BlockSizeThreshold finishes a block if the block size is larger than the + // specified percentage of the target block size and adding the next entry + // would cause the block to be larger than the target block size. + // + // The default value is 90. + BlockSizeThreshold int + + // SizeClassAwareThreshold imposes a minimum block size restriction for blocks + // to be flushed, that is computed as the percentage of the target block size. 
+ // Note that this threshold takes precedence over BlockSizeThreshold when + // valid AllocatorSizeClasses are specified. + // + // The default value is 60. + SizeClassAwareThreshold int + + // Comparer defines a total ordering over the space of []byte keys: a 'less + // than' relationship. The same comparison algorithm must be used for reads + // and writes over the lifetime of the DB. + // + // The default value uses the same ordering as bytes.Compare. + Comparer *Comparer + + // Compression defines the per-block compression to use. + // + // The default value uses snappy compression. + Compression *CompressionProfile + + // FilterPolicy defines a filter algorithm (such as a Bloom filter) that can + // reduce disk reads for Get calls. + // + // One such implementation is bloom.FilterPolicy(10) from the pebble/bloom + // package. + // + // The default value is NoFilterPolicy. + FilterPolicy FilterPolicy + + // FilterType defines whether an existing filter policy is applied at a + // block-level or table-level. Block-level filters use less memory to create, + // but are slower to access as a check for the key in the index must first be + // performed to locate the filter block. A table-level filter will require + // memory proportional to the number of keys in an sstable to create, but + // avoids the index lookup when determining if a key is present. Table-level + // filters should be preferred except under constrained memory situations. + FilterType FilterType + + // IndexBlockSize is the target uncompressed size in bytes of each index + // block. When the index block size is larger than this target, two-level + // indexes are automatically enabled. Setting this option to a large value + // (such as math.MaxInt32) disables the automatic creation of two-level + // indexes. + // + // The default value is the value of BlockSize. 
+ IndexBlockSize int + + // KeySchema describes the schema to use for sstable formats that make use + // of columnar blocks, decomposing keys into their constituent components. + // Ignored if TableFormat <= TableFormatPebblev4. + KeySchema *colblk.KeySchema + + // Merger defines the associative merge operation to use for merging values + // written with {Batch,DB}.Merge. The MergerName is checked for consistency + // with the value stored in the sstable when it was written. + MergerName string + + // TableFormat specifies the format version for writing sstables. The default + // is TableFormatMinSupported. + TableFormat TableFormat + + // IsStrictObsolete is only relevant for >= TableFormatPebblev4. See comment + // in format.go. Must be false if format < TableFormatPebblev4. + // + // TODO(bilal): set this when writing shared ssts. + IsStrictObsolete bool + + // WritingToLowestLevel is only relevant for >= TableFormatPebblev4. It is + // used to set the obsolete bit on DEL/DELSIZED/SINGLEDEL if they are the + // youngest for a userkey. + WritingToLowestLevel bool + + // BlockPropertyCollectors is a list of BlockPropertyCollector creation + // functions. A new BlockPropertyCollector is created for each sstable + // built and lives for the lifetime of writing that table. + BlockPropertyCollectors []func() BlockPropertyCollector + + // Checksum specifies which checksum to use. + Checksum block.ChecksumType + + // ShortAttributeExtractor mirrors + // Options.Experimental.ShortAttributeExtractor. + ShortAttributeExtractor base.ShortAttributeExtractor + + // DisableValueBlocks is only used for TableFormat >= TableFormatPebblev3, + // and if set to true, does not write any values to value blocks. This is + // only intended for cases where the in-memory buffering of all value blocks + // while writing a sstable is too expensive and likely to cause an OOM. 
It + // is never set to true by a Pebble DB, and can be set to true when some + // external code is directly generating huge sstables using Pebble's + // sstable.Writer (for example, CockroachDB backups can sometimes write + // 750MB sstables -- see + // https://github.com/cockroachdb/cockroach/issues/117113). + DisableValueBlocks bool + + // AllocatorSizeClasses provides a sorted list containing the supported size + // classes of the underlying memory allocator. This provides hints to the + // writer's flushing policy to select block sizes that preemptively reduce + // internal fragmentation when loaded into the block cache. + AllocatorSizeClasses []int + + // internal options can only be used from within the pebble package. + internal sstableinternal.WriterOptions + + // NumDeletionsThreshold mirrors Options.Experimental.NumDeletionsThreshold. + NumDeletionsThreshold int + + // DeletionSizeRatioThreshold mirrors + // Options.Experimental.DeletionSizeRatioThreshold. + DeletionSizeRatioThreshold float32 + + // disableObsoleteCollector is used to disable the obsolete key block property + // collector automatically added by sstable block writers. + disableObsoleteCollector bool +} + +// UserKeyPrefixBound represents a [Lower,Upper) bound of user key prefixes. +// If both are nil, there is no bound specified. Else, Compare(Lower,Upper) +// must be < 0. +type UserKeyPrefixBound struct { + // Lower is a lower bound user key prefix. + Lower []byte + // Upper is an upper bound user key prefix. + Upper []byte +} + +// IsEmpty returns true iff the bound is empty. +func (ukb *UserKeyPrefixBound) IsEmpty() bool { + return len(ukb.Lower) == 0 && len(ukb.Upper) == 0 +} + +// JemallocSizeClasses are a subset of available size classes in jemalloc[1], +// suitable for the AllocatorSizeClasses option. 
+// +// The size classes are used when writing sstables for determining target block +// sizes for flushes, with the goal of reducing internal memory fragmentation +// when the blocks are later loaded into the block cache. We only use the size +// classes between 16KiB - 256KiB as block limits fall in that range. +// +// [1] https://jemalloc.net/jemalloc.3.html#size_classes +var JemallocSizeClasses = []int{ + 16 * 1024, + 20 * 1024, 24 * 1024, 28 * 1024, 32 * 1024, // 4KiB spacing + 40 * 1024, 48 * 1024, 56 * 1024, 64 * 1024, // 8KiB spacing + 80 * 1024, 96 * 1024, 112 * 1024, 128 * 1024, // 16KiB spacing. + 160 * 1024, 192 * 1024, 224 * 1024, 256 * 1024, // 32KiB spacing. + 320 * 1024, +} + +// SetInternal sets the internal writer options. Note that even though this +// method is public, a caller outside the pebble package can't construct a value +// to pass to it. +func (o *WriterOptions) SetInternal(internalOpts sstableinternal.WriterOptions) { + o.internal = internalOpts +} + +func (o WriterOptions) ensureDefaults() WriterOptions { + if o.BlockRestartInterval <= 0 { + o.BlockRestartInterval = base.DefaultBlockRestartInterval + } + if o.BlockSize <= 0 { + o.BlockSize = base.DefaultBlockSize + } + if o.BlockSizeThreshold <= 0 { + o.BlockSizeThreshold = base.DefaultBlockSizeThreshold + } + if o.SizeClassAwareThreshold <= 0 { + o.SizeClassAwareThreshold = base.SizeClassAwareBlockSizeThreshold + } + if o.Comparer == nil { + o.Comparer = base.DefaultComparer + } + if o.IndexBlockSize <= 0 { + o.IndexBlockSize = o.BlockSize + } + if o.MergerName == "" { + o.MergerName = base.DefaultMerger.Name + } + if o.Checksum == block.ChecksumTypeNone { + o.Checksum = block.ChecksumTypeCRC32c + } + // By default, if the table format is not specified, fall back to using the + // most compatible format that is supported by Pebble. 
+ if o.TableFormat == TableFormatUnspecified { + o.TableFormat = TableFormatMinSupported + } + if o.NumDeletionsThreshold == 0 { + o.NumDeletionsThreshold = DefaultNumDeletionsThreshold + } + if o.DeletionSizeRatioThreshold == 0 { + o.DeletionSizeRatioThreshold = DefaultDeletionSizeRatioThreshold + } + if o.KeySchema == nil && o.TableFormat.BlockColumnar() { + s := colblk.DefaultKeySchema(o.Comparer, 16 /* bundle size */) + o.KeySchema = &s + } + if o.Compression == nil || !tableFormatSupportsCompressionProfile(o.TableFormat, o.Compression) { + o.Compression = block.SnappyCompression + } + if o.FilterPolicy == nil { + o.FilterPolicy = base.NoFilterPolicy + } + return o +} + +func tableFormatSupportsCompressionProfile(tf TableFormat, profile *CompressionProfile) bool { + // MinLZ is only supported in TableFormatPebblev6 and higher. + if tf < TableFormatPebblev6 && profile.UsesMinLZ() { + return false + } + return true +} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/properties.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/properties.go similarity index 68% rename from vendor/github.com/cockroachdb/pebble/sstable/properties.go rename to vendor/github.com/cockroachdb/pebble/v2/sstable/properties.go index 3bbf34a..99383a3 100644 --- a/vendor/github.com/cockroachdb/pebble/sstable/properties.go +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/properties.go @@ -8,16 +8,22 @@ import ( "bytes" "encoding/binary" "fmt" + "iter" + "maps" "math" "reflect" + "slices" "sort" + "strings" "unsafe" - "github.com/cockroachdb/pebble/internal/intern" + "github.com/cockroachdb/pebble/v2/internal/intern" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/sstable/colblk" + "github.com/cockroachdb/pebble/v2/sstable/rowblk" ) const propertiesBlockRestartInterval = math.MaxInt32 -const propGlobalSeqnumName = "rocksdb.external_sst_file.global_seqno" var propTagMap = make(map[string]reflect.StructField) var propBoolTrue = 
[]byte{'1'} @@ -67,7 +73,7 @@ func init() { // and virtual sstables properties. // // For virtual sstables, fields are constructed through extrapolation upon virtual -// reader construction. See MakeVirtualReader for implementation details. +// reader construction. // // NB: The values of these properties can affect correctness. For example, // if NumRangeKeySets == 0, but the sstable actually contains range keys, then @@ -77,7 +83,8 @@ type CommonProperties struct { NumEntries uint64 `prop:"rocksdb.num.entries"` // Total raw key size. RawKeySize uint64 `prop:"rocksdb.raw.key.size"` - // Total raw value size. + // Total raw value size. If values are separated, this includes the size of + // the separated value, NOT the value handle. RawValueSize uint64 `prop:"rocksdb.raw.value.size"` // Total raw key size of point deletion tombstones. This value is comparable // to RawKeySize. @@ -100,6 +107,17 @@ type CommonProperties struct { NumRangeKeySets uint64 `prop:"pebble.num.range-key-sets"` // Total size of value blocks and value index block. Only serialized if > 0. ValueBlocksSize uint64 `prop:"pebble.value-blocks.size"` + // NumDataBlocks is the number of data blocks in this table. + NumDataBlocks uint64 `prop:"rocksdb.num.data.blocks"` + // NumTombstoneDenseBlocks is the number of data blocks in this table that + // are considered tombstone-dense. See the TombstoneDenseBlocksRatio field + // in manifest.TableStats for the criteria used to determine if a data + // block is tombstone-dense. + NumTombstoneDenseBlocks uint64 `prop:"pebble.num.tombstone-dense-blocks"` + // The compression algorithm used to compress blocks. + CompressionName string `prop:"rocksdb.compression"` + // The compression options used to compress blocks. + CompressionOptions string `prop:"rocksdb.compression_options"` } // String is only used for testing purposes. @@ -114,7 +132,7 @@ func (c *CommonProperties) String() string { // NumPointDeletions is the number of point deletions in the sstable. 
For virtual // sstables, this is an estimate. func (c *CommonProperties) NumPointDeletions() uint64 { - return c.NumDeletions - c.NumRangeDeletions + return invariants.SafeSub(c.NumDeletions, c.NumRangeDeletions) } // Properties holds the sstable property values. The properties are @@ -128,37 +146,27 @@ type Properties struct { // The name of the comparer used in this table. ComparerName string `prop:"rocksdb.comparator"` - // The compression algorithm used to compress blocks. - CompressionName string `prop:"rocksdb.compression"` - // The compression options used to compress blocks. - CompressionOptions string `prop:"rocksdb.compression_options"` // The total size of all data blocks. DataSize uint64 `prop:"rocksdb.data.size"` - // The external sstable version format. Version 2 is the one RocksDB has been - // using since 5.13. RocksDB only uses the global sequence number for an - // sstable if this property has been set. - ExternalFormatVersion uint32 `prop:"rocksdb.external_sst_file.version"` // The name of the filter policy used in this table. Empty if no filter // policy is used. FilterPolicyName string `prop:"rocksdb.filter.policy"` // The size of filter block. FilterSize uint64 `prop:"rocksdb.filter.size"` - // The global sequence number to use for all entries in the table. Present if - // the table was created externally and ingested whole. - GlobalSeqNum uint64 `prop:"rocksdb.external_sst_file.global_seqno"` // Total number of index partitions if kTwoLevelIndexSearch is used. IndexPartitions uint64 `prop:"rocksdb.index.partitions"` - // The size of index block. + // The size (uncompressed) of index block. IndexSize uint64 `prop:"rocksdb.index.size"` // The index type. TODO(peter): add a more detailed description. IndexType uint32 `prop:"rocksdb.block.based.table.index.type"` // For formats >= TableFormatPebblev4, this is set to true if the obsolete // bit is strict for all the point keys. 
IsStrictObsolete bool `prop:"pebble.obsolete.is_strict"` + // The name of the key schema used in this table. Empty for formats <= + // TableFormatPebblev4. + KeySchemaName string `prop:"pebble.colblk.schema"` // The name of the merger used in this table. Empty if no merger is used. MergerName string `prop:"rocksdb.merge.operator"` - // The number of blocks in this table. - NumDataBlocks uint64 `prop:"rocksdb.num.data.blocks"` // The number of merge operands in the table. NumMergeOperands uint64 `prop:"rocksdb.merge.operands"` // The number of RANGEKEYUNSETs in this table. @@ -167,11 +175,8 @@ type Properties struct { NumValueBlocks uint64 `prop:"pebble.num.value-blocks"` // The number of values stored in value blocks. Only serialized if > 0. NumValuesInValueBlocks uint64 `prop:"pebble.num.values.in.value-blocks"` - // The name of the prefix extractor used in this table. Empty if no prefix - // extractor is used. - PrefixExtractorName string `prop:"rocksdb.prefix.extractor.name"` - // If filtering is enabled, was the filter created on the key prefix. - PrefixFiltering bool `prop:"rocksdb.block.based.table.prefix.filtering"` + // The number of values stored in blob files. Only serialized if > 0. + NumValuesInBlobFiles uint64 `prop:"pebble.num.values.in.blob-files"` // A comma separated list of names of the property collectors used in this // table. PropertyCollectorNames string `prop:"rocksdb.property.collectors"` @@ -187,12 +192,14 @@ type Properties struct { // The cumulative bytes of values in this table that were pinned by // open snapshots. This value is comparable to RawValueSize. SnapshotPinnedValueSize uint64 `prop:"pebble.raw.snapshot-pinned-values.size"` - // Size of the top-level index if kTwoLevelIndexSearch is used. + // Size (uncompressed) of the top-level index if kTwoLevelIndexSearch is used. TopLevelIndexSize uint64 `prop:"rocksdb.top-level.index.size"` - // User collected properties. + // The compression statistics encoded as a string. 
The format is: + // ":/,:/,..." + CompressionStats string `prop:"pebble.compression_stats"` + // User collected properties. Currently, we only use them to store block + // properties aggregated at the table level. UserProperties map[string]string - // If filtering is enabled, was the filter created on the whole key. - WholeKeyFiltering bool `prop:"rocksdb.block.based.table.whole.key.filtering"` // Loaded set indicating which fields have been loaded from disk. Indexed by // the field's byte offset within the struct @@ -226,8 +233,7 @@ func writeProperties(loaded map[uintptr]struct{}, v reflect.Value, buf *bytes.Bu } f := v.Field(i) - // TODO(peter): Use f.IsZero() when we can rely on go1.13. - if zero := reflect.Zero(f.Type()); zero.Interface() == f.Interface() { + if f.IsZero() { // Skip printing of zero values which were not loaded from disk. if _, ok := loaded[ft.Offset]; !ok { continue @@ -250,6 +256,47 @@ func writeProperties(loaded map[uintptr]struct{}, v reflect.Value, buf *bytes.Bu } } +func (p *Properties) GetScaledProperties(backingSize, size uint64) CommonProperties { + // Make sure the sizes are sane, just in case. + size = max(size, 1) + backingSize = max(backingSize, size) + + scale := func(a uint64) uint64 { + return (a*size + backingSize - 1) / backingSize + } + // It's important that no non-zero fields (like NumDeletions, NumRangeKeySets) + // become zero (or vice-versa). 
+ if invariants.Enabled && (scale(1) != 1 || scale(0) != 0) { + panic("bad scale()") + } + + props := p.CommonProperties + props.RawKeySize = scale(p.RawKeySize) + props.RawValueSize = scale(p.RawValueSize) + props.NumEntries = scale(p.NumEntries) + props.NumDataBlocks = scale(p.NumDataBlocks) + props.NumTombstoneDenseBlocks = scale(p.NumTombstoneDenseBlocks) + + props.NumRangeDeletions = scale(p.NumRangeDeletions) + props.NumSizedDeletions = scale(p.NumSizedDeletions) + // We cannot directly scale NumDeletions, because it is supposed to be the sum + // of various types of deletions. See #4670. + numOtherDeletions := scale(invariants.SafeSub(p.NumDeletions, p.NumRangeDeletions) + p.NumSizedDeletions) + props.NumDeletions = numOtherDeletions + props.NumRangeDeletions + props.NumSizedDeletions + + props.NumRangeKeyDels = scale(p.NumRangeKeyDels) + props.NumRangeKeySets = scale(p.NumRangeKeySets) + + props.ValueBlocksSize = scale(p.ValueBlocksSize) + + props.RawPointTombstoneKeySize = scale(p.RawPointTombstoneKeySize) + props.RawPointTombstoneValueSize = scale(p.RawPointTombstoneValueSize) + + props.CompressionName = p.CompressionName + + return props +} + func (p *Properties) String() string { var buf bytes.Buffer v := reflect.ValueOf(*p) @@ -262,40 +309,35 @@ func (p *Properties) String() string { } sort.Strings(keys) for _, key := range keys { - fmt.Fprintf(&buf, "%s: %s\n", key, p.UserProperties[key]) + // If there are characters outside of the printable ASCII range, print + // the value in hexadecimal. 
+ if strings.IndexFunc(p.UserProperties[key], func(r rune) bool { return r < ' ' || r > '~' }) != -1 { + fmt.Fprintf(&buf, "%s: hex:%x\n", key, p.UserProperties[key]) + } else { + fmt.Fprintf(&buf, "%s: %s\n", key, p.UserProperties[key]) + } } return buf.String() } -func (p *Properties) load( - b block, blockOffset uint64, deniedUserProperties map[string]struct{}, -) error { - i, err := newRawBlockIter(bytes.Compare, b) - if err != nil { - return err - } +func (p *Properties) load(i iter.Seq2[[]byte, []byte]) error { p.Loaded = make(map[uintptr]struct{}) v := reflect.ValueOf(p).Elem() - for valid := i.First(); valid; valid = i.Next() { - if f, ok := propTagMap[string(i.Key().UserKey)]; ok { + for key, val := range i { + if f, ok := propTagMap[string(key)]; ok { p.Loaded[f.Offset] = struct{}{} field := v.FieldByIndex(f.Index) switch f.Type.Kind() { case reflect.Bool: - field.SetBool(bytes.Equal(i.Value(), propBoolTrue)) + field.SetBool(bytes.Equal(val, propBoolTrue)) case reflect.Uint32: - field.SetUint(uint64(binary.LittleEndian.Uint32(i.Value()))) + field.SetUint(uint64(binary.LittleEndian.Uint32(val))) case reflect.Uint64: - var n uint64 - if string(i.Key().UserKey) == propGlobalSeqnumName { - n = binary.LittleEndian.Uint64(i.Value()) - } else { - n, _ = binary.Uvarint(i.Value()) - } + n, _ := binary.Uvarint(val) field.SetUint(n) case reflect.String: - field.SetString(intern.Bytes(i.Value())) + field.SetString(intern.Bytes(val)) default: panic("not reached") } @@ -305,8 +347,8 @@ func (p *Properties) load( p.UserProperties = make(map[string]string) } - if _, denied := deniedUserProperties[string(i.Key().UserKey)]; !denied { - p.UserProperties[intern.Bytes(i.Key().UserKey)] = string(i.Value()) + if _, denied := ignoredInternalProperties[string(key)]; !denied { + p.UserProperties[intern.Bytes(key)] = string(val) } } return nil @@ -333,6 +375,8 @@ func (p *Properties) saveUint64(m map[string][]byte, offset uintptr, value uint6 m[propOffsetTagMap[offset]] = buf[:] } 
+var _ = (*Properties).saveUint64 + func (p *Properties) saveUvarint(m map[string][]byte, offset uintptr, value uint64) { var buf [10]byte n := binary.PutUvarint(buf[:], value) @@ -343,7 +387,7 @@ func (p *Properties) saveString(m map[string][]byte, offset uintptr, value strin m[propOffsetTagMap[offset]] = []byte(value) } -func (p *Properties) save(tblFormat TableFormat, w *rawBlockWriter) { +func (p *Properties) accumulateProps(tblFormat TableFormat) ([]string, map[string][]byte) { m := make(map[string][]byte) for k, v := range p.UserProperties { m[k] = []byte(v) @@ -359,10 +403,6 @@ func (p *Properties) save(tblFormat TableFormat, w *rawBlockWriter) { p.saveString(m, unsafe.Offsetof(p.CompressionOptions), p.CompressionOptions) } p.saveUvarint(m, unsafe.Offsetof(p.DataSize), p.DataSize) - if p.ExternalFormatVersion != 0 { - p.saveUint32(m, unsafe.Offsetof(p.ExternalFormatVersion), p.ExternalFormatVersion) - p.saveUint64(m, unsafe.Offsetof(p.GlobalSeqNum), p.GlobalSeqNum) - } if p.FilterPolicyName != "" { p.saveString(m, unsafe.Offsetof(p.FilterPolicyName), p.FilterPolicyName) } @@ -376,6 +416,9 @@ func (p *Properties) save(tblFormat TableFormat, w *rawBlockWriter) { if p.IsStrictObsolete { p.saveBool(m, unsafe.Offsetof(p.IsStrictObsolete), p.IsStrictObsolete) } + if p.KeySchemaName != "" { + p.saveString(m, unsafe.Offsetof(p.KeySchemaName), p.KeySchemaName) + } if p.MergerName != "" { p.saveString(m, unsafe.Offsetof(p.MergerName), p.MergerName) } @@ -410,10 +453,9 @@ func (p *Properties) save(tblFormat TableFormat, w *rawBlockWriter) { if p.NumValuesInValueBlocks > 0 { p.saveUvarint(m, unsafe.Offsetof(p.NumValuesInValueBlocks), p.NumValuesInValueBlocks) } - if p.PrefixExtractorName != "" { - p.saveString(m, unsafe.Offsetof(p.PrefixExtractorName), p.PrefixExtractorName) + if p.NumValuesInBlobFiles > 0 { + p.saveUvarint(m, unsafe.Offsetof(p.NumValuesInBlobFiles), p.NumValuesInBlobFiles) } - p.saveBool(m, unsafe.Offsetof(p.PrefixFiltering), p.PrefixFiltering) if 
p.PropertyCollectorNames != "" { p.saveString(m, unsafe.Offsetof(p.PropertyCollectorNames), p.PropertyCollectorNames) } @@ -427,24 +469,87 @@ func (p *Properties) save(tblFormat TableFormat, w *rawBlockWriter) { if p.ValueBlocksSize > 0 { p.saveUvarint(m, unsafe.Offsetof(p.ValueBlocksSize), p.ValueBlocksSize) } - p.saveBool(m, unsafe.Offsetof(p.WholeKeyFiltering), p.WholeKeyFiltering) + if p.NumTombstoneDenseBlocks != 0 { + p.saveUvarint(m, unsafe.Offsetof(p.NumTombstoneDenseBlocks), p.NumTombstoneDenseBlocks) + } + if p.CompressionStats != "" { + p.saveString(m, unsafe.Offsetof(p.CompressionStats), p.CompressionStats) + } if tblFormat < TableFormatPebblev1 { - m["rocksdb.column.family.id"] = binary.AppendUvarint([]byte(nil), math.MaxInt32) - m["rocksdb.fixed.key.length"] = []byte{0x00} - m["rocksdb.index.key.is.user.key"] = []byte{0x00} - m["rocksdb.index.value.is.delta.encoded"] = []byte{0x00} - m["rocksdb.oldest.key.time"] = []byte{0x00} - m["rocksdb.creation.time"] = []byte{0x00} - m["rocksdb.format.version"] = []byte{0x00} + m["rocksdb.column.family.id"] = maxInt32Slice + m["rocksdb.fixed.key.length"] = singleZeroSlice + m["rocksdb.index.key.is.user.key"] = singleZeroSlice + m["rocksdb.index.value.is.delta.encoded"] = singleZeroSlice + m["rocksdb.oldest.key.time"] = singleZeroSlice + m["rocksdb.creation.time"] = singleZeroSlice + m["rocksdb.format.version"] = singleZeroSlice } - keys := make([]string, 0, len(m)) - for key := range m { - keys = append(keys, key) - } + keys := slices.Collect(maps.Keys(m)) sort.Strings(keys) + + return keys, m +} + +func (p *Properties) saveToRowWriter(tblFormat TableFormat, w *rowblk.Writer) error { + keys, m := p.accumulateProps(tblFormat) + for _, key := range keys { + if err := w.AddRawString(key, m[key]); err != nil { + return err + } + } + return nil +} + +func (p *Properties) saveToColWriter(tblFormat TableFormat, w *colblk.KeyValueBlockWriter) { + keys, m := p.accumulateProps(tblFormat) for _, key := range keys { - 
w.add(InternalKey{UserKey: []byte(key)}, m[key]) + // Zero-length keys are unsupported. See below about StringData. + if len(key) == 0 { + continue + } + // Use an unsafe conversion to avoid allocating. AddKV is not + // supposed to modify the given slice, so the unsafe conversion + // is okay. Note that unsafe.StringData panics if len(key) == 0, + // so we explicitly skip zero-length keys above. They shouldn't + // occur in practice. + w.AddKV(unsafe.Slice(unsafe.StringData(key), len(key)), m[key]) } } + +func (p *Properties) toAttributes() Attributes { + var attributes Attributes + + if p.NumValueBlocks > 0 || p.NumValuesInValueBlocks > 0 { + attributes.Add(AttributeValueBlocks) + } + if p.NumRangeKeySets > 0 { + attributes.Add(AttributeRangeKeySets) + } + if p.NumRangeKeyUnsets > 0 { + attributes.Add(AttributeRangeKeyUnsets) + } + if p.NumRangeKeyDels > 0 { + attributes.Add(AttributeRangeKeyDels) + } + if p.NumRangeDeletions > 0 { + attributes.Add(AttributeRangeDels) + } + if p.IndexType == twoLevelIndex { + attributes.Add(AttributeTwoLevelIndex) + } + if p.NumValuesInBlobFiles > 0 { + attributes.Add(AttributeBlobValues) + } + if p.NumDataBlocks > 0 { + attributes.Add(AttributePointKeys) + } + + return attributes +} + +var ( + singleZeroSlice = []byte{0x00} + maxInt32Slice = binary.AppendUvarint([]byte(nil), math.MaxInt32) +) diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/reader.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/reader.go new file mode 100644 index 0000000..2799430 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/reader.go @@ -0,0 +1,1139 @@ +// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package sstable + +import ( + "bytes" + "cmp" + "context" + "fmt" + "io" + "slices" + "strings" + "sync" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/bytealloc" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/rangekey" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/sstable/block/blockkind" + "github.com/cockroachdb/pebble/v2/sstable/colblk" + "github.com/cockroachdb/pebble/v2/sstable/rowblk" + "github.com/cockroachdb/pebble/v2/sstable/valblk" + "github.com/cockroachdb/pebble/v2/sstable/virtual" + "github.com/cockroachdb/pebble/v2/vfs" +) + +var errReaderClosed = errors.New("pebble/table: reader is closed") + +type loadBlockResult int8 + +const ( + loadBlockOK loadBlockResult = iota + // Could be due to error or because no block left to load. + loadBlockFailed + loadBlockIrrelevant +) + +// Reader is a table reader. +// If you update this struct, make sure you also update the magic number in +// StringForTests() in metrics.go. +type Reader struct { + blockReader block.Reader + + // The following fields are copied from the ReadOptions. 
+ keySchema *colblk.KeySchema + filterMetricsTracker *FilterMetricsTracker + Comparer *base.Comparer + + tableFilter *tableFilterReader + + err error + + indexBH block.Handle + filterBH block.Handle + rangeDelBH block.Handle + rangeKeyBH block.Handle + valueBIH valblk.IndexHandle + propertiesBH block.Handle + metaindexBH block.Handle + footerBH block.Handle + blobRefIndexBH block.Handle + + tableFormat TableFormat + Attributes Attributes + UserProperties map[string]string +} + +type ReadEnv struct { + Virtual *virtual.VirtualReaderParams + // IsSharedIngested is true if this is a shared table that was ingested. Can + // only be set when Virtual is non-nil. + IsSharedIngested bool + Block block.ReadEnv +} + +var NoReadEnv = ReadEnv{} + +// Close the reader and the underlying objstorage.Readable. +func (r *Reader) Close() error { + r.err = firstError(r.err, r.blockReader.Close()) + if r.err != nil { + return r.err + } + // Make any future calls to Get, NewIter or Close return an error. + r.err = errReaderClosed + return nil +} + +// IterOptions defines options for configuring a sstable pointer iterator. +type IterOptions struct { + Lower, Upper []byte + Transforms IterTransforms + Filterer *BlockPropertiesFilterer + FilterBlockSizeLimit FilterBlockSizeLimit + Env ReadEnv + ReaderProvider valblk.ReaderProvider + BlobContext TableBlobContext +} + +// NewPointIter returns an iterator for the point keys in the table. +// +// If transform.HideObsoletePoints is set, the callee assumes that filterer +// already includes obsoleteKeyBlockPropertyFilter. The caller can satisfy this +// contract by first calling TryAddBlockPropertyFilterForHideObsoletePoints. 
+func (r *Reader) NewPointIter(ctx context.Context, opts IterOptions) (Iterator, error) { + return r.newPointIter(ctx, opts) +} + +// TryAddBlockPropertyFilterForHideObsoletePoints is expected to be called +// before the call to NewPointIter, to get the value of hideObsoletePoints and +// potentially add a block property filter. +func (r *Reader) TryAddBlockPropertyFilterForHideObsoletePoints( + snapshotForHideObsoletePoints base.SeqNum, + fileLargestSeqNum base.SeqNum, + pointKeyFilters []BlockPropertyFilter, +) (hideObsoletePoints bool, filters []BlockPropertyFilter) { + hideObsoletePoints = r.tableFormat >= TableFormatPebblev4 && + snapshotForHideObsoletePoints > fileLargestSeqNum + if hideObsoletePoints { + pointKeyFilters = append(pointKeyFilters, obsoleteKeyBlockPropertyFilter{}) + } + return hideObsoletePoints, pointKeyFilters +} + +func (r *Reader) newPointIter(ctx context.Context, opts IterOptions) (Iterator, error) { + // NB: pebble.fileCache wraps the returned iterator with one which performs + // reference counting on the Reader, preventing the Reader from being closed + // until the final iterator closes. + var res Iterator + var err error + if r.Attributes.Has(AttributeTwoLevelIndex) { + if r.tableFormat.BlockColumnar() { + res, err = newColumnBlockTwoLevelIterator( + ctx, r, opts) + } else { + res, err = newRowBlockTwoLevelIterator( + ctx, r, opts) + } + } else { + if r.tableFormat.BlockColumnar() { + res, err = newColumnBlockSingleLevelIterator( + ctx, r, opts) + } else { + res, err = newRowBlockSingleLevelIterator( + ctx, r, opts) + } + } + if err != nil { + // Note: we don't want to return res here - it will be a nil + // single/twoLevelIterator, not a nil Iterator. + return nil, err + } + return res, nil +} + +// NewIter returns an iterator for the point keys in the table. It is a +// simplified version of NewPointIter and should only be used for tests and +// tooling. 
+// +// NewIter must only be used when the Reader is guaranteed to outlive any +// LazyValues returned from the iter. +func (r *Reader) NewIter( + transforms IterTransforms, lower, upper []byte, blobContext TableBlobContext, +) (Iterator, error) { + // TODO(radu): we should probably not use bloom filters in this case, as there + // likely isn't a cache set up. + opts := IterOptions{ + Lower: lower, + Upper: upper, + Transforms: transforms, + Filterer: nil, + FilterBlockSizeLimit: AlwaysUseFilterBlock, + Env: NoReadEnv, + ReaderProvider: MakeTrivialReaderProvider(r), + BlobContext: blobContext, + } + return r.NewPointIter(context.TODO(), opts) +} + +// NewCompactionIter returns an iterator similar to NewIter but it also increments +// the number of bytes iterated. If an error occurs, NewCompactionIter cleans up +// after itself and returns a nil iterator. +func (r *Reader) NewCompactionIter( + transforms IterTransforms, env ReadEnv, rp valblk.ReaderProvider, blobContext TableBlobContext, +) (Iterator, error) { + return r.newCompactionIter(transforms, env, rp, blobContext) +} + +func (r *Reader) newCompactionIter( + transforms IterTransforms, env ReadEnv, rp valblk.ReaderProvider, blobContext TableBlobContext, +) (Iterator, error) { + if env.IsSharedIngested { + transforms.HideObsoletePoints = true + } + ctx := context.Background() + opts := IterOptions{ + Transforms: transforms, + Filterer: nil, + FilterBlockSizeLimit: NeverUseFilterBlock, + Env: env, + ReaderProvider: rp, + BlobContext: blobContext, + } + + if r.Attributes.Has(AttributeTwoLevelIndex) { + if !r.tableFormat.BlockColumnar() { + i, err := newRowBlockTwoLevelIterator(ctx, r, opts) + if err != nil { + return nil, err + } + i.SetupForCompaction() + return i, nil + } + i, err := newColumnBlockTwoLevelIterator(ctx, r, opts) + if err != nil { + return nil, err + } + i.SetupForCompaction() + return i, nil + } + if !r.tableFormat.BlockColumnar() { + i, err := newRowBlockSingleLevelIterator(ctx, r, opts) + if 
err != nil { + return nil, err + } + i.SetupForCompaction() + return i, nil + } + i, err := newColumnBlockSingleLevelIterator(ctx, r, opts) + if err != nil { + return nil, err + } + i.SetupForCompaction() + return i, nil +} + +// NewRawRangeDelIter returns an internal iterator for the contents of the +// range-del block for the table. Returns nil if the table does not contain +// any range deletions. +func (r *Reader) NewRawRangeDelIter( + ctx context.Context, transforms FragmentIterTransforms, env ReadEnv, +) (iter keyspan.FragmentIterator, err error) { + if r.rangeDelBH.Length == 0 { + return nil, nil + } + h, err := r.readRangeDelBlock(ctx, env.Block, noReadHandle, r.rangeDelBH) + if err != nil { + return nil, err + } + if r.tableFormat.BlockColumnar() { + iter = colblk.NewKeyspanIter(r.Comparer.Compare, h, transforms) + } else { + iter, err = rowblk.NewFragmentIter(r.blockReader.FileNum(), r.Comparer, h, transforms) + if err != nil { + return nil, err + } + } + + i := keyspan.MaybeAssert(iter, r.Comparer.Compare) + if env.Virtual != nil { + i = keyspan.Truncate( + r.Comparer.Compare, i, + base.UserKeyBoundsFromInternal(env.Virtual.Lower, env.Virtual.Upper), + ) + } + return i, nil +} + +// NewRawRangeKeyIter returns an internal iterator for the contents of the +// range-key block for the table. Returns nil if the table does not contain any +// range keys. +func (r *Reader) NewRawRangeKeyIter( + ctx context.Context, transforms FragmentIterTransforms, env ReadEnv, +) (iter keyspan.FragmentIterator, err error) { + syntheticSeqNum := transforms.SyntheticSeqNum + if env.IsSharedIngested { + // Don't pass a synthetic sequence number for shared ingested sstables. We + // need to know the materialized sequence numbers, and we will set up the + // appropriate sequence number substitution below. 
+ transforms.SyntheticSeqNum = 0 + } + + if r.rangeKeyBH.Length == 0 { + return nil, nil + } + h, err := r.readRangeKeyBlock(ctx, env.Block, noReadHandle, r.rangeKeyBH) + if err != nil { + return nil, err + } + if r.tableFormat.BlockColumnar() { + iter = colblk.NewKeyspanIter(r.Comparer.Compare, h, transforms) + } else { + iter, err = rowblk.NewFragmentIter(r.blockReader.FileNum(), r.Comparer, h, transforms) + if err != nil { + return nil, err + } + } + i := keyspan.MaybeAssert(iter, r.Comparer.Compare) + + if env.Virtual != nil { + // We need to coalesce range keys within each sstable, and then apply the + // synthetic sequence number. For this, we use ForeignSSTTransformer. + // + // TODO(bilal): Avoid these allocations by hoisting the transformer and + // transform iter up. + if env.IsSharedIngested { + transform := &rangekey.ForeignSSTTransformer{ + Equal: r.Comparer.Equal, + SeqNum: base.SeqNum(syntheticSeqNum), + } + transformIter := &keyspan.TransformerIter{ + FragmentIterator: i, + Transformer: transform, + SuffixCmp: r.Comparer.CompareRangeSuffixes, + } + i = transformIter + } + i = keyspan.Truncate( + r.Comparer.Compare, i, + base.UserKeyBoundsFromInternal(env.Virtual.Lower, env.Virtual.Upper), + ) + } + return i, nil +} + +// noReadHandle is used when we don't want to pass a ReadHandle to one of the +// read block methods. +var noReadHandle objstorage.ReadHandle = nil + +var noInitBlockMetadataFn = func(*block.Metadata, []byte) error { return nil } + +// readMetaindexBlock reads the metaindex block. +func (r *Reader) readMetaindexBlock( + ctx context.Context, env block.ReadEnv, readHandle objstorage.ReadHandle, +) (block.BufferHandle, error) { + return r.blockReader.Read(ctx, env, readHandle, r.metaindexBH, blockkind.Metadata, noInitBlockMetadataFn) +} + +// readTopLevelIndexBlock reads the top-level index block. 
+func (r *Reader) readTopLevelIndexBlock( + ctx context.Context, env block.ReadEnv, readHandle objstorage.ReadHandle, +) (block.BufferHandle, error) { + return r.readIndexBlock(ctx, env, readHandle, r.indexBH) +} + +// readIndexBlock reads a top-level or second-level index block. +func (r *Reader) readIndexBlock( + ctx context.Context, env block.ReadEnv, readHandle objstorage.ReadHandle, bh block.Handle, +) (block.BufferHandle, error) { + return r.blockReader.Read(ctx, env, readHandle, bh, blockkind.Index, r.initIndexBlockMetadata) +} + +// initIndexBlockMetadata initializes the Metadata for a data block. This will +// later be used (and reused) when reading from the block. +func (r *Reader) initIndexBlockMetadata(metadata *block.Metadata, data []byte) error { + if r.tableFormat.BlockColumnar() { + return colblk.InitIndexBlockMetadata(metadata, data) + } + return nil +} + +func (r *Reader) readDataBlock( + ctx context.Context, env block.ReadEnv, readHandle objstorage.ReadHandle, bh block.Handle, +) (block.BufferHandle, error) { + return r.blockReader.Read(ctx, env, readHandle, bh, blockkind.SSTableData, r.initDataBlockMetadata) +} + +// initDataBlockMetadata initializes the Metadata for a data block. This will +// later be used (and reused) when reading from the block. 
+func (r *Reader) initDataBlockMetadata(metadata *block.Metadata, data []byte) error { + if r.tableFormat.BlockColumnar() { + return colblk.InitDataBlockMetadata(r.keySchema, metadata, data) + } + return nil +} + +func (r *Reader) readFilterBlock( + ctx context.Context, env block.ReadEnv, readHandle objstorage.ReadHandle, bh block.Handle, +) (block.BufferHandle, error) { + return r.blockReader.Read(ctx, env, readHandle, bh, blockkind.Filter, noInitBlockMetadataFn) +} + +func (r *Reader) readRangeDelBlock( + ctx context.Context, env block.ReadEnv, readHandle objstorage.ReadHandle, bh block.Handle, +) (block.BufferHandle, error) { + return r.blockReader.Read(ctx, env, readHandle, bh, blockkind.RangeDel, r.initKeyspanBlockMetadata) +} + +func (r *Reader) readRangeKeyBlock( + ctx context.Context, env block.ReadEnv, readHandle objstorage.ReadHandle, bh block.Handle, +) (block.BufferHandle, error) { + return r.blockReader.Read(ctx, env, readHandle, bh, blockkind.RangeKey, r.initKeyspanBlockMetadata) +} + +// initKeyspanBlockMetadata initializes the Metadata for a rangedel or range key +// block. This will later be used (and reused) when reading from the block. +func (r *Reader) initKeyspanBlockMetadata(metadata *block.Metadata, data []byte) error { + if r.tableFormat.BlockColumnar() { + return colblk.InitKeyspanBlockMetadata(metadata, data) + } + return nil +} + +// ReadValueBlockExternal implements valblk.ExternalBlockReader, allowing a +// base.LazyValue to read a value block. 
+func (r *Reader) ReadValueBlockExternal( + ctx context.Context, bh block.Handle, +) (block.BufferHandle, error) { + return r.readValueBlock(ctx, block.NoReadEnv, noReadHandle, bh) +} + +func (r *Reader) readValueBlock( + ctx context.Context, env block.ReadEnv, readHandle objstorage.ReadHandle, bh block.Handle, +) (block.BufferHandle, error) { + return r.blockReader.Read(ctx, env, readHandle, bh, blockkind.SSTableValue, noInitBlockMetadataFn) +} + +func (r *Reader) ReadBlobRefIndexBlock( + ctx context.Context, env block.ReadEnv, +) (block.BufferHandle, error) { + return r.readBlobRefIndexBlock(ctx, env, noReadHandle) +} + +func (r *Reader) readBlobRefIndexBlock( + ctx context.Context, env block.ReadEnv, readHandle objstorage.ReadHandle, +) (block.BufferHandle, error) { + return r.blockReader.Read(ctx, env, readHandle, r.blobRefIndexBH, blockkind.BlobReferenceValueLivenessIndex, noInitBlockMetadataFn) +} + +// metaBufferPools is a sync pool of BufferPools used exclusively when opening a +// table and loading its meta blocks. +var metaBufferPools = sync.Pool{ + New: func() any { + bp := new(block.BufferPool) + // New pools are initialized with a capacity of 3 to accommodate the + // meta block (1), and both the compressed properties block (1) and + // decompressed properties block (1) simultaneously. 
+ bp.Init(3) + return bp + }, +} + +func (r *Reader) readAndDecodeMetaindex( + ctx context.Context, bufferPool *block.BufferPool, readHandle objstorage.ReadHandle, +) (map[string]block.Handle, valblk.IndexHandle, error) { + metaEnv := block.ReadEnv{BufferPool: bufferPool} + b, err := r.readMetaindexBlock(ctx, metaEnv, readHandle) + if err != nil { + return nil, valblk.IndexHandle{}, err + } + data := b.BlockData() + defer b.Release() + + if uint64(len(data)) != r.metaindexBH.Length { + return nil, valblk.IndexHandle{}, base.CorruptionErrorf("pebble/table: unexpected metaindex block size: %d vs %d", + errors.Safe(len(data)), errors.Safe(r.metaindexBH.Length)) + } + + var meta map[string]block.Handle + var valueBIH valblk.IndexHandle + if r.tableFormat >= TableFormatPebblev6 { + meta, valueBIH, err = decodeColumnarMetaIndex(data) + } else { + meta, valueBIH, err = decodeMetaindex(data) + } + return meta, valueBIH, err +} + +func (r *Reader) initMetaindexBlocks( + ctx context.Context, + bufferPool *block.BufferPool, + readHandle objstorage.ReadHandle, + filters map[string]FilterPolicy, +) error { + var meta map[string]block.Handle + var err error + meta, r.valueBIH, err = r.readAndDecodeMetaindex(ctx, bufferPool, readHandle) + if err != nil { + return err + } + + if bh, ok := meta[metaPropertiesName]; ok { + r.propertiesBH = bh + } else { + return errors.New("did not read any value for the properties block in the meta index") + } + + if bh, ok := meta[metaBlobRefIndexName]; ok { + r.blobRefIndexBH = bh + } + + if bh, ok := meta[metaRangeDelV2Name]; ok { + r.rangeDelBH = bh + } else if _, ok := meta[metaRangeDelV1Name]; ok { + // This version of Pebble requires a format major version at least as + // high as FormatFlushableIngest (see pebble.FormatMinSupported). In + // this format major verison, we have a guarantee that we've compacted + // away all RocksDB sstables. 
It should not be possible to encounter an + // sstable with a v1 range deletion block but not a v2 range deletion + // block. + err := errors.Newf("pebble/table: unexpected range-del block type: %s", metaRangeDelV1Name) + return errors.Mark(err, base.ErrCorruption) + } + + if bh, ok := meta[metaRangeKeyName]; ok { + r.rangeKeyBH = bh + } + + for name, fp := range filters { + if bh, ok := meta["fullfilter."+name]; ok { + r.filterBH = bh + r.tableFilter = newTableFilterReader(fp, r.filterMetricsTracker) + break + } + } + return nil +} + +// decodePropertiesBlock decodes the (uncompressed) properties block. +func decodePropertiesBlock(tableFormat TableFormat, blockData []byte) (Properties, error) { + var props Properties + if tableFormat >= TableFormatPebblev7 { + var decoder colblk.KeyValueBlockDecoder + decoder.Init(blockData) + if err := props.load(decoder.All()); err != nil { + return Properties{}, err + } + } else { + i, err := rowblk.NewRawIter(bytes.Compare, blockData) + if err != nil { + return Properties{}, err + } + if err := props.load(i.All()); err != nil { + return Properties{}, err + } + } + return props, nil +} + +var propertiesBlockBufPools = sync.Pool{ + New: func() any { + bp := new(block.BufferPool) + // New pools are initialized with a capacity of 2 to accommodate + // both the compressed properties block (1) and decompressed + // properties block (1). + bp.Init(2) + return bp + }, +} + +// ReadPropertiesBlock reads the properties block +// from the table. We always read the properties block into a buffer pool +// instead of the block cache. 
+func (r *Reader) ReadPropertiesBlock( + ctx context.Context, bufferPool *block.BufferPool, +) (Properties, error) { + return r.readPropertiesBlockInternal(ctx, bufferPool, noReadHandle) +} + +func (r *Reader) readPropertiesBlockInternal( + ctx context.Context, bufferPool *block.BufferPool, readHandle objstorage.ReadHandle, +) (Properties, error) { + if bufferPool == nil { + // We always use a buffer pool when reading the properties block as + // we don't want it in the block cache. + bufferPool = propertiesBlockBufPools.Get().(*block.BufferPool) + defer propertiesBlockBufPools.Put(bufferPool) + defer bufferPool.Release() + } + env := block.ReadEnv{BufferPool: bufferPool} + b, err := r.blockReader.Read(ctx, env, readHandle, r.propertiesBH, blockkind.Metadata, noInitBlockMetadataFn) + if err != nil { + return Properties{}, err + } + defer b.Release() + return decodePropertiesBlock(r.tableFormat, b.BlockData()) +} + +// Layout returns the layout (block organization) for an sstable. +func (r *Reader) Layout() (*Layout, error) { + if r.err != nil { + return nil, r.err + } + + l := &Layout{ + Data: make([]block.HandleWithProperties, 0), + RangeDel: r.rangeDelBH, + RangeKey: r.rangeKeyBH, + ValueIndex: r.valueBIH.Handle, + Properties: r.propertiesBH, + MetaIndex: r.metaindexBH, + Footer: r.footerBH, + Format: r.tableFormat, + BlobReferenceIndex: r.blobRefIndexBH, + } + + bufferPool := metaBufferPools.Get().(*block.BufferPool) + defer metaBufferPools.Put(bufferPool) + defer bufferPool.Release() + + ctx := context.TODO() + meta, _, err := r.readAndDecodeMetaindex(ctx, bufferPool, noReadHandle) + if err != nil { + return nil, err + } + for name, bh := range meta { + if strings.HasPrefix(name, "fullfilter.") { + l.Filter = append(l.Filter, NamedBlockHandle{Name: name, Handle: bh}) + } + } + + indexH, err := r.readTopLevelIndexBlock(ctx, block.NoReadEnv, noReadHandle) + if err != nil { + return nil, err + } + defer indexH.Release() + + var alloc bytealloc.A + + if 
!r.Attributes.Has(AttributeTwoLevelIndex) { + l.Index = append(l.Index, r.indexBH) + iter := r.tableFormat.newIndexIter() + err := iter.Init(r.Comparer, indexH.BlockData(), NoTransforms) + if err != nil { + return nil, errors.Wrap(err, "reading index block") + } + for valid := iter.First(); valid; valid = iter.Next() { + dataBH, err := iter.BlockHandleWithProperties() + if err != nil { + return nil, errCorruptIndexEntry(err) + } + if len(dataBH.Props) > 0 { + alloc, dataBH.Props = alloc.Copy(dataBH.Props) + } + l.Data = append(l.Data, dataBH) + } + } else { + l.TopIndex = r.indexBH + topIter := r.tableFormat.newIndexIter() + err := topIter.Init(r.Comparer, indexH.BlockData(), NoTransforms) + if err != nil { + return nil, errors.Wrap(err, "reading index block") + } + iter := r.tableFormat.newIndexIter() + for valid := topIter.First(); valid; valid = topIter.Next() { + indexBH, err := topIter.BlockHandleWithProperties() + if err != nil { + return nil, errCorruptIndexEntry(err) + } + l.Index = append(l.Index, indexBH.Handle) + + subIndex, err := r.readIndexBlock(ctx, block.NoReadEnv, noReadHandle, indexBH.Handle) + if err != nil { + return nil, err + } + err = func() error { + defer subIndex.Release() + // TODO(msbutler): figure out how to pass virtualState to layout call. 
+ if err := iter.Init(r.Comparer, subIndex.BlockData(), NoTransforms); err != nil { + return err + } + for valid := iter.First(); valid; valid = iter.Next() { + dataBH, err := iter.BlockHandleWithProperties() + if err != nil { + return errCorruptIndexEntry(err) + } + if len(dataBH.Props) > 0 { + alloc, dataBH.Props = alloc.Copy(dataBH.Props) + } + l.Data = append(l.Data, dataBH) + } + return nil + }() + if err != nil { + return nil, err + } + } + } + if r.valueBIH.Handle.Length != 0 { + vbiH, err := r.readValueBlock(context.Background(), block.NoReadEnv, noReadHandle, r.valueBIH.Handle) + if err != nil { + return nil, err + } + defer vbiH.Release() + l.ValueBlock, err = valblk.DecodeIndex(vbiH.BlockData(), r.valueBIH) + if err != nil { + return nil, err + } + } + + return l, nil +} + +// ValidateBlockChecksums validates the checksums for each block in the SSTable. +func (r *Reader) ValidateBlockChecksums() error { + // Pre-compute the BlockHandles for the underlying file. + l, err := r.Layout() + if err != nil { + return err + } + + type blk struct { + bh block.Handle + readFn func(context.Context, block.ReadEnv, objstorage.ReadHandle, block.Handle) (block.BufferHandle, error) + } + // Construct the set of blocks to check. Note that the footer is not checked + // as it is not a block with a checksum. 
+ blocks := make([]blk, 0, len(l.Data)+6) + for i := range l.Data { + blocks = append(blocks, blk{ + bh: l.Data[i].Handle, + readFn: r.readDataBlock, + }) + } + for _, h := range l.Index { + blocks = append(blocks, blk{ + bh: h, + readFn: r.readIndexBlock, + }) + } + blocks = append(blocks, blk{ + bh: l.TopIndex, + readFn: r.readIndexBlock, + }) + for _, bh := range l.Filter { + blocks = append(blocks, blk{ + bh: bh.Handle, + readFn: r.readFilterBlock, + }) + } + blocks = append(blocks, blk{ + bh: l.RangeDel, + readFn: r.readRangeDelBlock, + }) + blocks = append(blocks, blk{ + bh: l.RangeKey, + readFn: r.readRangeKeyBlock, + }) + readNoInit := func(ctx context.Context, env block.ReadEnv, rh objstorage.ReadHandle, bh block.Handle) (block.BufferHandle, error) { + return r.blockReader.Read(ctx, env, rh, bh, blockkind.Metadata, noInitBlockMetadataFn) + } + blocks = append(blocks, blk{ + bh: l.Properties, + readFn: readNoInit, + }) + blocks = append(blocks, blk{ + bh: l.MetaIndex, + readFn: readNoInit, + }) + blocks = append(blocks, blk{ + bh: l.BlobReferenceIndex, + readFn: readNoInit, + }) + + // Sorting by offset ensures we are performing a sequential scan of the + // file. + slices.SortFunc(blocks, func(a, b blk) int { + return cmp.Compare(a.bh.Offset, b.bh.Offset) + }) + + ctx := context.Background() + for _, b := range blocks { + // Certain blocks may not be present, in which case we skip them. + if b.bh.Length == 0 { + continue + } + h, err := b.readFn(ctx, block.NoReadEnv, noReadHandle, b.bh) + if err != nil { + return err + } + h.Release() + } + + return nil +} + +// EstimateDiskUsage returns the total size of data blocks overlapping the range +// `[start, end]`. Even if a data block partially overlaps, or we cannot +// determine overlap due to abbreviated index keys, the full data block size is +// included in the estimation. +// +// This function does not account for any metablock space usage. 
Assumes there +// is at least partial overlap, i.e., `[start, end]` falls neither completely +// before nor completely after the file's range. +// +// Only blocks containing point keys are considered. Range deletion and range +// key blocks are not considered. +// +// TODO(ajkr): account for metablock space usage. Perhaps look at the fraction of +// data blocks overlapped and add that same fraction of the metadata blocks to the +// estimate. +func (r *Reader) EstimateDiskUsage(start []byte, end []byte, env ReadEnv) (uint64, error) { + if env.Virtual != nil { + _, start, end = env.Virtual.ConstrainBounds(start, end, false, r.Comparer.Compare) + } + + if !r.tableFormat.BlockColumnar() { + return estimateDiskUsage[rowblk.IndexIter, *rowblk.IndexIter](r, start, end) + } + return estimateDiskUsage[colblk.IndexIter, *colblk.IndexIter](r, start, end) +} + +func estimateDiskUsage[I any, PI indexBlockIterator[I]]( + r *Reader, start, end []byte, +) (uint64, error) { + if r.err != nil { + return 0, r.err + } + ctx := context.TODO() + + indexH, err := r.readTopLevelIndexBlock(ctx, block.NoReadEnv, noReadHandle) + if err != nil { + return 0, err + } + // We are using InitHandle below but we never Close those iterators, which + // allows us to release the index handle ourselves. + // TODO(radu): clean this up. + defer indexH.Release() + + // Iterators over the bottom-level index blocks containing start and end. + // These may be different in case of partitioned index but will both point + // to the same blockIter over the single index in the unpartitioned case. 
+ var startIdxIter, endIdxIter PI + if !r.Attributes.Has(AttributeTwoLevelIndex) { + startIdxIter = new(I) + if err := startIdxIter.InitHandle(r.Comparer, indexH, NoTransforms); err != nil { + return 0, err + } + endIdxIter = startIdxIter + } else { + var topIter PI = new(I) + if err := topIter.InitHandle(r.Comparer, indexH, NoTransforms); err != nil { + return 0, err + } + if !topIter.SeekGE(start) { + // The range falls completely after this file. + return 0, nil + } + startIndexBH, err := topIter.BlockHandleWithProperties() + if err != nil { + return 0, errCorruptIndexEntry(err) + } + startIdxBlock, err := r.readIndexBlock(ctx, block.NoReadEnv, noReadHandle, startIndexBH.Handle) + if err != nil { + return 0, err + } + defer startIdxBlock.Release() + startIdxIter = new(I) + err = startIdxIter.InitHandle(r.Comparer, startIdxBlock, NoTransforms) + if err != nil { + return 0, err + } + + if topIter.SeekGE(end) { + endIndexBH, err := topIter.BlockHandleWithProperties() + if err != nil { + return 0, errCorruptIndexEntry(err) + } + endIdxBlock, err := r.readIndexBlock(ctx, block.NoReadEnv, noReadHandle, endIndexBH.Handle) + if err != nil { + return 0, err + } + defer endIdxBlock.Release() + endIdxIter = new(I) + err = endIdxIter.InitHandle(r.Comparer, endIdxBlock, NoTransforms) + if err != nil { + return 0, err + } + } + } + // startIdxIter should not be nil at this point, while endIdxIter can be if the + // range spans past the end of the file. + + if !startIdxIter.SeekGE(start) { + // The range falls completely after this file. + return 0, nil + } + startBH, err := startIdxIter.BlockHandleWithProperties() + if err != nil { + return 0, errCorruptIndexEntry(err) + } + + props, err := r.ReadPropertiesBlock(ctx, nil /* buffer pool */) + if err != nil { + return 0, err + } + + includeInterpolatedValueBlocksSize := func(dataBlockSize uint64) uint64 { + // INVARIANT: props.DataSize > 0 since startIdxIter is not nil. + // Linearly interpolate what is stored in value blocks. 
+ // + // TODO(sumeer): if we need more accuracy, without loading any data blocks + // (which contain the value handles, and which may also be insufficient if + // the values are in separate files), we will need to accumulate the + // logical size of the key-value pairs and store the cumulative value for + // each data block in the index block entry. This increases the size of + // the BlockHandle, so wait until this becomes necessary. + return dataBlockSize + + uint64((float64(dataBlockSize)/float64(props.DataSize))* + float64(props.ValueBlocksSize)) + } + if endIdxIter == nil { + // The range spans beyond this file. Include data blocks through the last. + return includeInterpolatedValueBlocksSize(props.DataSize - startBH.Offset), nil + } + if !endIdxIter.SeekGE(end) { + // The range spans beyond this file. Include data blocks through the last. + return includeInterpolatedValueBlocksSize(props.DataSize - startBH.Offset), nil + } + endBH, err := endIdxIter.BlockHandleWithProperties() + if err != nil { + return 0, errCorruptIndexEntry(err) + } + return includeInterpolatedValueBlocksSize( + endBH.Offset + endBH.Length + block.TrailerLen - startBH.Offset), nil +} + +// TableFormat returns the format version for the table. +func (r *Reader) TableFormat() (TableFormat, error) { + if r.err != nil { + return TableFormatUnspecified, r.err + } + return r.tableFormat, nil +} + +// BlockReader returns the block.Reader that can be used to directly read +// blocks from the sstable. +func (r *Reader) BlockReader() *block.Reader { + return &r.blockReader +} + +// NewReader returns a new table reader for the file. Closing the reader will +// close the file. +// +// The context is used for tracing any operations performed by NewReader; it is +// NOT stored for future use. +// +// In error cases, the objstorage.Readable is still open. The caller remains +// responsible for closing it if necessary. 
+func NewReader(ctx context.Context, f objstorage.Readable, o ReaderOptions) (*Reader, error) { + if f == nil { + return nil, errors.New("pebble/table: nil file") + } + o = o.ensureDefaults() + + r := &Reader{ + filterMetricsTracker: o.FilterMetricsTracker, + } + + var preallocRH objstorageprovider.PreallocatedReadHandle + rh := objstorageprovider.UsePreallocatedReadHandle( + f, objstorage.ReadBeforeForNewReader, &preallocRH) + defer func() { _ = rh.Close() }() + + footer, err := readFooter(ctx, f, rh, o.LoggerAndTracer, o.CacheOpts.FileNum) + if err != nil { + return nil, err + } + r.blockReader.Init(f, o.ReaderOptions, footer.checksum) + r.tableFormat = footer.format + r.indexBH = footer.indexBH + r.metaindexBH = footer.metaindexBH + r.footerBH = footer.footerBH + + // Read the metaindex and properties blocks. + // We use a BufferPool when reading metaindex blocks in order to avoid + // populating the block cache with these blocks. In heavy-write workloads, + // especially with high compaction concurrency, new tables may be created + // frequently. Populating the block cache with these metaindex blocks adds + // additional contention on the block cache mutexes (see #1997). + // Additionally, these blocks are exceedingly unlikely to be read again + // while they're still in the block cache except in misconfigurations with + // excessive sstables counts or a file cache that's far too small. + bufferPool := metaBufferPools.Get().(*block.BufferPool) + defer metaBufferPools.Put(bufferPool) + // When we're finished, release the buffers we've allocated back to memory + // allocator. + defer bufferPool.Release() + + if err := r.initMetaindexBlocks(ctx, bufferPool, rh, o.Filters); err != nil { + r.err = err + return nil, err + } + + props, err := r.readPropertiesBlockInternal(ctx, bufferPool, rh) + if err != nil { + r.err = err + return nil, err + } + r.UserProperties = props.UserProperties + + // Set which attributes are in use based on property values. 
+ r.Attributes = props.toAttributes() + if footer.format >= TableFormatPebblev7 && footer.attributes != r.Attributes { + // For now we just verify that our derived attributes from the properties match the bitset + // on the footer. + r.err = base.CorruptionErrorf("pebble/table: %d: attributes mismatch: %s (footer) vs %s (derived)", + errors.Safe(r.blockReader.FileNum()), errors.Safe(footer.attributes), errors.Safe(r.Attributes)) + } + + if props.ComparerName == "" || o.Comparer.Name == props.ComparerName { + r.Comparer = o.Comparer + } else if comparer, ok := o.Comparers[props.ComparerName]; ok { + r.Comparer = comparer + } else { + r.err = errors.Errorf("pebble/table: %d: unknown comparer %s", + errors.Safe(r.blockReader.FileNum()), errors.Safe(props.ComparerName)) + } + + if mergerName := props.MergerName; mergerName != "" && mergerName != "nullptr" { + if o.Merger != nil && o.Merger.Name == mergerName { + // opts.Merger matches. + } else if _, ok := o.Mergers[mergerName]; ok { + // Known merger. + } else { + r.err = errors.Errorf("pebble/table: %d: unknown merger %s", + errors.Safe(r.blockReader.FileNum()), errors.Safe(props.MergerName)) + } + } + + if r.tableFormat.BlockColumnar() { + if ks, ok := o.KeySchemas[props.KeySchemaName]; ok { + r.keySchema = ks + } else { + var known []string + for name := range o.KeySchemas { + known = append(known, fmt.Sprintf("%q", name)) + } + slices.Sort(known) + + r.err = errors.Newf("pebble/table: %d: unknown key schema %q; known key schemas: %s", + errors.Safe(r.blockReader.FileNum()), errors.Safe(props.KeySchemaName), errors.Safe(known)) + panic(r.err) + } + } + + if r.err != nil { + return nil, r.err + } + return r, nil +} + +// ReadableFile describes the smallest subset of vfs.File that is required for +// reading SSTs. 
+type ReadableFile interface { + io.ReaderAt + io.Closer + Stat() (vfs.FileInfo, error) +} + +// NewSimpleReadable wraps a ReadableFile in a objstorage.Readable +// implementation (which does not support read-ahead) +func NewSimpleReadable(r ReadableFile) (objstorage.Readable, error) { + info, err := r.Stat() + if err != nil { + return nil, err + } + res := &simpleReadable{ + f: r, + size: info.Size(), + } + res.rh = objstorage.MakeNoopReadHandle(res) + return res, nil +} + +// simpleReadable wraps a ReadableFile to implement objstorage.Readable. +type simpleReadable struct { + f ReadableFile + size int64 + rh objstorage.NoopReadHandle +} + +var _ objstorage.Readable = (*simpleReadable)(nil) + +// ReadAt is part of the objstorage.Readable interface. +func (s *simpleReadable) ReadAt(_ context.Context, p []byte, off int64) error { + n, err := s.f.ReadAt(p, off) + if invariants.Enabled && err == nil && n != len(p) { + panic("short read") + } + return err +} + +// Close is part of the objstorage.Readable interface. +func (s *simpleReadable) Close() error { + return s.f.Close() +} + +// Size is part of the objstorage.Readable interface. +func (s *simpleReadable) Size() int64 { + return s.size +} + +// NewReadHandle is part of the objstorage.Readable interface. +func (s *simpleReadable) NewReadHandle( + readBeforeSize objstorage.ReadBeforeSize, +) objstorage.ReadHandle { + return &s.rh +} + +func errCorruptIndexEntry(err error) error { + err = base.CorruptionErrorf("pebble/table: corrupt index entry: %v", err) + if invariants.Enabled { + panic(err) + } + return err +} + +// MakeTrivialReaderProvider creates a valblk.ReaderProvider which always +// returns the given reader. It should be used when the Reader will outlive the +// iterator tree. 
+func MakeTrivialReaderProvider(r *Reader) valblk.ReaderProvider { + return (*trivialReaderProvider)(r) +} + +// trivialReaderProvider implements valblk.ReaderProvider for a Reader that will +// outlive the top-level iterator in the iterator tree. +// +// Defining the type in this manner (as opposed to a struct) avoids allocation. +type trivialReaderProvider Reader + +var _ valblk.ReaderProvider = (*trivialReaderProvider)(nil) + +// GetReader implements ReaderProvider. +func (trp *trivialReaderProvider) GetReader( + ctx context.Context, +) (valblk.ExternalBlockReader, error) { + return (*Reader)(trp), nil +} + +// Close implements ReaderProvider. +func (trp *trivialReaderProvider) Close() {} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/reader_common.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/reader_common.go new file mode 100644 index 0000000..5cd672f --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/reader_common.go @@ -0,0 +1,58 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package sstable + +import ( + "math" + + "github.com/cockroachdb/pebble/v2/sstable/block" +) + +// FilterBlockSizeLimit is a size limit for bloom filter blocks - if a bloom +// filter is present, it is used only when it is at most this size. +type FilterBlockSizeLimit uint32 + +const ( + // NeverUseFilterBlock indicates that bloom filter blocks should never be used. + NeverUseFilterBlock FilterBlockSizeLimit = 0 + // AlwaysUseFilterBlock indicates that bloom filter blocks should always be + // used, regardless of size. + AlwaysUseFilterBlock FilterBlockSizeLimit = math.MaxUint32 +) + +type ( + // BufferPool re-exports block.BufferPool. + BufferPool = block.BufferPool + // IterTransforms re-exports block.IterTransforms. 
+ IterTransforms = block.IterTransforms + // FragmentIterTransforms re-exports block.FragmentIterTransforms. + FragmentIterTransforms = block.FragmentIterTransforms + // SyntheticSeqNum re-exports block.SyntheticSeqNum. + SyntheticSeqNum = block.SyntheticSeqNum + // SyntheticSuffix re-exports block.SyntheticSuffix. + SyntheticSuffix = block.SyntheticSuffix + // SyntheticPrefix re-exports block.SyntheticPrefix. + SyntheticPrefix = block.SyntheticPrefix + // SyntheticPrefixAndSuffix re-exports block.SyntheticPrefixAndSuffix. + SyntheticPrefixAndSuffix = block.SyntheticPrefixAndSuffix +) + +// NoTransforms is the default value for IterTransforms. +var NoTransforms = block.NoTransforms + +// NoFragmentTransforms is the default value for FragmentIterTransforms. +var NoFragmentTransforms = block.NoFragmentTransforms + +// MakeSyntheticPrefixAndSuffix returns a SyntheticPrefixAndSuffix with the +// given prefix and suffix. +func MakeSyntheticPrefixAndSuffix( + prefix SyntheticPrefix, suffix SyntheticSuffix, +) SyntheticPrefixAndSuffix { + return block.MakeSyntheticPrefixAndSuffix(prefix, suffix) +} + +// NoSyntheticSeqNum is the default zero value for SyntheticSeqNum, which +// disables overriding the sequence number. +const NoSyntheticSeqNum = block.NoSyntheticSeqNum diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/reader_iter.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/reader_iter.go new file mode 100644 index 0000000..758e3ab --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/reader_iter.go @@ -0,0 +1,226 @@ +// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. 
+ +package sstable + +import ( + "fmt" + "os" + "sync" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/sstable/colblk" + "github.com/cockroachdb/pebble/v2/sstable/rowblk" +) + +// dataBlockIterator extends the block.IndexBlockIterator interface with a +// constraint that the implementing type be a pointer to a type I. +// +// DataBlockIterator requires that the type be a pointer to its type parameter, +// D, to allow sstable iterators embed the block iterator within its struct. See +// this example from the Go generics proposal: +// https://go.googlesource.com/proposal/+/refs/heads/master/design/43651-type-parameters.md#pointer-method-example +type dataBlockIterator[D any] interface { + block.DataBlockIterator + + *D // non-interface type constraint element +} + +// indexBlockIterator extends the block.IndexBlockIterator interface with a +// constraint that the implementing type be a pointer to a type I. +// +// indexBlockIterator requires that the type be a pointer to its type parameter, +// I, to allow sstable iterators embed the block iterator within its struct. See +// this example from the Go generics proposal: +// https://go.googlesource.com/proposal/+/refs/heads/master/design/43651-type-parameters.md#pointer-method-example +type indexBlockIterator[I any] interface { + block.IndexBlockIterator + + *I // non-interface type constraint element +} + +// Iterator iterates over an entire table of data. +type Iterator interface { + base.InternalIterator + + // NextPrefix implements (base.InternalIterator).NextPrefix. + NextPrefix(succKey []byte) *base.InternalKV + + // SetCloseHook sets a function that will be called when the iterator is + // closed. This is used by the file cache to release the reference count on + // the open sstable.Reader when the iterator is closed. 
+	SetCloseHook(func())
+}
+
+// Iterator positioning optimizations and singleLevelIterator and
+// twoLevelIterator:
+//
+// An iterator is absolute positioned using one of the Seek or First or Last
+// calls. After absolute positioning, there can be relative positioning done
+// by stepping using Prev or Next.
+//
+// We implement optimizations below where an absolute positioning call can in
+// some cases use the current position to do less work. To understand these,
+// we first define some terms. An iterator is bounds-exhausted if the bounds
+// (upper or lower) have been reached. An iterator is data-exhausted if it has
+// reached the end of the data (forward or reverse) in the sstable. A
+// singleLevelIterator only knows a local-data-exhausted property since when
+// it is used as part of a twoLevelIterator, the twoLevelIterator can step to
+// the next lower-level index block.
+//
+// The bounds-exhausted property is tracked by
+// singleLevelIterator.exhaustedBounds being +1 (upper bound reached) or -1
+// (lower bound reached). The same field is reused by twoLevelIterator. Either
+// may notice the exhaustion of the bound and set it. Note that if
+// singleLevelIterator sets this property, it is not a local property (since
+// the bound has been reached regardless of whether this is in the context of
+// the twoLevelIterator or not).
+//
+// The data-exhausted property is tracked in a more subtle manner. We define
+// two predicates:
+// - partial-local-data-exhausted (PLDE):
+//   i.data.IsDataInvalidated() || !i.data.Valid()
+// - partial-global-data-exhausted (PGDE):
+//   i.index.IsDataInvalidated() || !i.index.Valid() || i.data.IsDataInvalidated() ||
+//   !i.data.Valid()
+//
+// PLDE is defined for a singleLevelIterator. PGDE is defined for a
+// twoLevelIterator. Oddly, in our code below the singleLevelIterator does not
+// know when it is part of a twoLevelIterator so it does not know when its
+// property is local or global. 
+// +// Now to define data-exhausted: +// - Prerequisite: we must know that the iterator has been positioned and +// i.err is nil. +// - bounds-exhausted must not be true: +// If bounds-exhausted is true, we have incomplete knowledge of +// data-exhausted since PLDE or PGDE could be true because we could have +// chosen not to load index block or data block and figured out that the +// bound is exhausted (due to block property filters filtering out index and +// data blocks and going past the bound on the top level index block). Note +// that if we tried to separate out the BPF case from others we could +// develop more knowledge here. +// - PGDE is true for twoLevelIterator. PLDE is true if it is a standalone +// singleLevelIterator. !PLDE or !PGDE of course imply that data-exhausted +// is not true. +// +// An implication of the above is that if we are going to somehow utilize +// knowledge of data-exhausted in an optimization, we must not forget the +// existing value of bounds-exhausted since by forgetting the latter we can +// erroneously think that data-exhausted is true. Bug #2036 was due to this +// forgetting. +// +// Now to the two categories of optimizations we currently have: +// - Monotonic bounds optimization that reuse prior iterator position when +// doing seek: These only work with !data-exhausted. We could choose to make +// these work with data-exhausted but have not bothered because in the +// context of a DB if data-exhausted were true, the DB would move to the +// next file in the level. Note that this behavior of moving to the next +// file is not necessarily true for L0 files, so there could be some benefit +// in the future in this optimization. See the WARNING-data-exhausted +// comments if trying to optimize this in the future. +// - TrySeekUsingNext optimizations: these work regardless of exhaustion +// state. +// +// Implementation detail: In the code PLDE only checks that +// i.data.IsDataInvalidated(). 
This narrower check is safe, since this is a +// subset of the set expressed by the OR expression. Also, it is not a +// de-optimization since whenever we exhaust the iterator we explicitly call +// i.data.Invalidate(). PGDE checks i.index.IsDataInvalidated() && +// i.data.IsDataInvalidated(). Again, this narrower check is safe, and not a +// de-optimization since whenever we exhaust the iterator we explicitly call +// i.index.Invalidate() and i.data.Invalidate(). The && is questionable -- for +// now this is a bit of defensive code. We should seriously consider removing +// it, since defensive code suggests we are not confident about our invariants +// (and if we are not confident, we need more invariant assertions, not +// defensive code). +// +// TODO(sumeer): remove the aforementioned defensive code. + +type ( + singleLevelIteratorRowBlocks = singleLevelIterator[rowblk.IndexIter, *rowblk.IndexIter, rowblk.Iter, *rowblk.Iter] + twoLevelIteratorRowBlocks = twoLevelIterator[rowblk.IndexIter, *rowblk.IndexIter, rowblk.Iter, *rowblk.Iter] + singleLevelIteratorColumnBlocks = singleLevelIterator[colblk.IndexIter, *colblk.IndexIter, colblk.DataBlockIter, *colblk.DataBlockIter] + twoLevelIteratorColumnBlocks = twoLevelIterator[colblk.IndexIter, *colblk.IndexIter, colblk.DataBlockIter, *colblk.DataBlockIter] +) + +var ( + singleLevelIterRowBlockPool sync.Pool // *singleLevelIteratorRowBlocks + twoLevelIterRowBlockPool sync.Pool // *twoLevelIteratorRowBlocks + singleLevelIterColumnBlockPool sync.Pool // *singleLevelIteratorColumnBlocks + twoLevelIterColumnBlockPool sync.Pool // *singleLevelIteratorColumnBlocks +) + +func init() { + singleLevelIterRowBlockPool = sync.Pool{ + New: func() interface{} { + i := &singleLevelIteratorRowBlocks{pool: &singleLevelIterRowBlockPool} + if invariants.UseFinalizers { + invariants.SetFinalizer(i, checkSingleLevelIterator[rowblk.IndexIter, *rowblk.IndexIter, rowblk.Iter, *rowblk.Iter]) + } + return i + }, + } + twoLevelIterRowBlockPool = 
sync.Pool{ + New: func() interface{} { + i := &twoLevelIteratorRowBlocks{pool: &twoLevelIterRowBlockPool} + if invariants.UseFinalizers { + invariants.SetFinalizer(i, checkTwoLevelIterator[rowblk.IndexIter, *rowblk.IndexIter, rowblk.Iter, *rowblk.Iter]) + } + return i + }, + } + singleLevelIterColumnBlockPool = sync.Pool{ + New: func() interface{} { + i := &singleLevelIteratorColumnBlocks{ + pool: &singleLevelIterColumnBlockPool, + } + if invariants.UseFinalizers { + invariants.SetFinalizer(i, checkSingleLevelIterator[colblk.IndexIter, *colblk.IndexIter, colblk.DataBlockIter, *colblk.DataBlockIter]) + } + return i + }, + } + twoLevelIterColumnBlockPool = sync.Pool{ + New: func() interface{} { + i := &twoLevelIteratorColumnBlocks{ + pool: &twoLevelIterColumnBlockPool, + } + if invariants.UseFinalizers { + invariants.SetFinalizer(i, checkTwoLevelIterator[colblk.IndexIter, *colblk.IndexIter, colblk.DataBlockIter, *colblk.DataBlockIter]) + } + return i + }, + } +} + +func checkSingleLevelIterator[I any, PI indexBlockIterator[I], D any, PD dataBlockIterator[D]]( + obj interface{}, +) { + i := obj.(*singleLevelIterator[I, PI, D, PD]) + if h := PD(&i.data).Handle(); h.Valid() { + fmt.Fprintf(os.Stderr, "singleLevelIterator.data.handle is not nil: %#v\n", h) + os.Exit(1) + } + if h := PI(&i.index).Handle(); h.Valid() { + fmt.Fprintf(os.Stderr, "singleLevelIterator.index.handle is not nil: %#v\n", h) + os.Exit(1) + } +} + +func checkTwoLevelIterator[I any, PI indexBlockIterator[I], D any, PD dataBlockIterator[D]]( + obj interface{}, +) { + i := obj.(*twoLevelIterator[I, PI, D, PD]) + if h := PD(&i.secondLevel.data).Handle(); h.Valid() { + fmt.Fprintf(os.Stderr, "singleLevelIterator.data.handle is not nil: %#v\n", h) + os.Exit(1) + } + if h := PI(&i.secondLevel.index).Handle(); h.Valid() { + fmt.Fprintf(os.Stderr, "singleLevelIterator.index.handle is not nil: %#v\n", h) + os.Exit(1) + } +} diff --git a/vendor/github.com/cockroachdb/pebble/sstable/reader_iter_single_lvl.go 
b/vendor/github.com/cockroachdb/pebble/v2/sstable/reader_iter_single_lvl.go similarity index 51% rename from vendor/github.com/cockroachdb/pebble/sstable/reader_iter_single_lvl.go rename to vendor/github.com/cockroachdb/pebble/v2/sstable/reader_iter_single_lvl.go index 8b094cd..9019c17 100644 --- a/vendor/github.com/cockroachdb/pebble/sstable/reader_iter_single_lvl.go +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/reader_iter_single_lvl.go @@ -5,21 +5,41 @@ package sstable import ( + "bytes" "context" "fmt" + "sync" "unsafe" - "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/invariants" - "github.com/cockroachdb/pebble/objstorage" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider" - "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/objstorage/objstorageprovider" + "github.com/cockroachdb/pebble/v2/sstable/block" + "github.com/cockroachdb/pebble/v2/sstable/valblk" ) // singleLevelIterator iterates over an entire table of data. To seek for a given // key, it first looks in the index for the block that contains that key, and then // looks inside that block. -type singleLevelIterator struct { +// +// singleLevelIterator is parameterized by the type of the data block iterator +// and index block iterator. The type parameters are designed to allow the +// singleLevelIterator to embed the data block and index block iterator structs +// within itself, avoiding an extra allocation and pointer indirection. The +// complication comes from the fact that we want to implement the interfaces on +// pointer receivers but embed the non-pointer types within the struct. 
The D +// and I type parameters are the non-pointer data and index block iterator +// types, and the PD and PI type parameters are the *D and *I types that +// actually implement the DataBlockIterator and IndexBlockIterator constraints. +// +// Unfortunately, uses of the [data] and [index] fields must explicitly cast +// &data/&index to the PD/PI type in order to access its interface methods. This +// pattern is taken from the Go generics proposal: +// https://go.googlesource.com/proposal/+/refs/heads/master/design/43651-type-parameters.md#pointer-method-example +type singleLevelIterator[I any, PI indexBlockIterator[I], D any, PD dataBlockIterator[D]] struct { ctx context.Context cmp Compare // Global lower/upper bound for the iterator. @@ -31,29 +51,26 @@ type singleLevelIterator struct { blockLower []byte blockUpper []byte reader *Reader - // vState will be set iff the iterator is constructed for virtual sstable - // iteration. - vState *virtualState // endKeyInclusive is set to force the iterator to treat the upper field as // inclusive while iterating instead of exclusive. - endKeyInclusive bool - index blockIter - data blockIter - dataRH objstorage.ReadHandle - dataRHPrealloc objstorageprovider.PreallocatedReadHandle + endKeyInclusive bool + indexFilterRH objstorage.ReadHandle + indexFilterRHPrealloc objstorageprovider.PreallocatedReadHandle + dataRH objstorage.ReadHandle + dataRHPrealloc objstorageprovider.PreallocatedReadHandle // dataBH refers to the last data block that the iterator considered // loading. It may not actually have loaded the block, due to an error or // because it was considered irrelevant. - dataBH BlockHandle - vbReader *valueBlockReader + dataBH block.Handle + internalValueConstructor defaultInternalValueConstructor // vbRH is the read handle for value blocks, which are in a different // part of the sstable than data blocks. 
vbRH objstorage.ReadHandle vbRHPrealloc objstorageprovider.PreallocatedReadHandle err error - closeHook func(i Iterator) error - stats *base.InternalIteratorStats - bufferPool *BufferPool + closeHook func() + + readEnv ReadEnv // boundsCmp and positionedUsingLatestBounds are for optimizing iteration // that uses multiple adjacent bounds. The seek after setting a new bound @@ -141,162 +158,224 @@ type singleLevelIterator struct { // singleLevelIterator, given that these two iterators share this field. exhaustedBounds int8 - // maybeFilteredKeysSingleLevel indicates whether the last iterator - // positioning operation may have skipped any data blocks due to - // block-property filters when positioning the index. - maybeFilteredKeysSingleLevel bool - - // useFilter specifies whether the filter block in this sstable, if present, - // should be used for prefix seeks or not. In some cases it is beneficial - // to skip a filter block even if it exists (eg. if probability of a match - // is high). - useFilter bool + // useFilterBlock controls whether the bloom filter block in this sstable, if + // present, should be used for prefix seeks or not. In some cases it is + // beneficial to skip a filter block even if it exists (eg. if probability of + // a match is high). + useFilterBlock bool lastBloomFilterMatched bool - hideObsoletePoints bool + transforms IterTransforms + + // All fields above this field are cleared when resetting the iterator for reuse. + clearForResetBoundary struct{} + + index I + data D + // inPool is set to true before putting the iterator in the reusable pool; + // used to detect double-close. + inPool bool + // pool is the pool from which the iterator was allocated and to which the + // iterator should be returned on Close. Because the iterator is + // parameterized by the type of the data block iterator, pools must be + // specific to the type of the data block iterator. 
+ // + // If the iterator is embedded within a twoLevelIterator, pool is nil and + // the twoLevelIterator.pool field may be non-nil. + pool *sync.Pool + + // NOTE: any new fields should be added above the clearForResetBoundary field, + // unless they need to be retained when resetting the iterator. } // singleLevelIterator implements the base.InternalIterator interface. -var _ base.InternalIterator = (*singleLevelIterator)(nil) +var _ base.InternalIterator = (*singleLevelIteratorRowBlocks)(nil) -// init initializes a singleLevelIterator for reading from the table. It is -// synonmous with Reader.NewIter, but allows for reusing of the iterator -// between different Readers. +// newColumnBlockSingleLevelIterator reads the index block and creates and +// initializes a singleLevelIterator over an sstable with column-oriented data +// blocks. // -// Note that lower, upper passed into init has nothing to do with virtual sstable -// bounds. If the virtualState passed in is not nil, then virtual sstable bounds -// will be enforced. -func (i *singleLevelIterator) init( - ctx context.Context, - r *Reader, - v *virtualState, - lower, upper []byte, - filterer *BlockPropertiesFilterer, - useFilter, hideObsoletePoints bool, - stats *base.InternalIteratorStats, - rp ReaderProvider, - bufferPool *BufferPool, -) error { +// Note that lower, upper are iterator bounds and are separate from virtual +// sstable bounds. If the virtualState passed in is not nil, then virtual +// sstable bounds will be enforced. 
+func newColumnBlockSingleLevelIterator( + ctx context.Context, r *Reader, opts IterOptions, +) (*singleLevelIteratorColumnBlocks, error) { if r.err != nil { - return r.err + return nil, r.err } - indexH, err := r.readIndex(ctx, stats) - if err != nil { - return err + if !r.tableFormat.BlockColumnar() { + panic(errors.AssertionFailedf("table format %d should not use columnar block format", r.tableFormat)) } - if v != nil { - i.vState = v - i.endKeyInclusive, lower, upper = v.constrainBounds(lower, upper, false /* endInclusive */) + i := singleLevelIterColumnBlockPool.Get().(*singleLevelIteratorColumnBlocks) + i.init(ctx, r, opts) + if r.Attributes.Has(AttributeValueBlocks) { + i.internalValueConstructor.vbReader = valblk.MakeReader(i, opts.ReaderProvider, r.valueBIH, opts.Env.Block.Stats) + i.vbRH = r.blockReader.UsePreallocatedReadHandle(objstorage.NoReadBefore, &i.vbRHPrealloc) + } + i.data.InitOnce(r.keySchema, r.Comparer, &i.internalValueConstructor) + indexH, err := r.readTopLevelIndexBlock(ctx, i.readEnv.Block, i.indexFilterRH) + if err == nil { + err = i.index.InitHandle(r.Comparer, indexH, opts.Transforms) } - - i.ctx = ctx - i.lower = lower - i.upper = upper - i.bpfs = filterer - i.useFilter = useFilter - i.reader = r - i.cmp = r.Compare - i.stats = stats - i.hideObsoletePoints = hideObsoletePoints - i.bufferPool = bufferPool - err = i.index.initHandle(i.cmp, indexH, r.Properties.GlobalSeqNum, false) if err != nil { - // blockIter.Close releases indexH and always returns a nil error - _ = i.index.Close() - return err + _ = i.Close() + return nil, err } - i.dataRH = objstorageprovider.UsePreallocatedReadHandle(ctx, r.readable, &i.dataRHPrealloc) + return i, nil +} + +// newRowBlockSingleLevelIterator reads the index block and creates and +// initializes a singleLevelIterator over an sstable with row-oriented data +// blocks. +// +// Note that lower, upper are iterator bounds and are separate from virtual +// sstable bounds. 
If the virtualState passed in is not nil, then virtual +// sstable bounds will be enforced. +func newRowBlockSingleLevelIterator( + ctx context.Context, r *Reader, opts IterOptions, +) (*singleLevelIteratorRowBlocks, error) { + if r.err != nil { + return nil, r.err + } + if r.tableFormat.BlockColumnar() { + panic(errors.AssertionFailedf("table format %s uses block columnar format", r.tableFormat)) + } + i := singleLevelIterRowBlockPool.Get().(*singleLevelIteratorRowBlocks) + i.init(ctx, r, opts) if r.tableFormat >= TableFormatPebblev3 { - if r.Properties.NumValueBlocks > 0 { - // NB: we cannot avoid this ~248 byte allocation, since valueBlockReader - // can outlive the singleLevelIterator due to be being embedded in a - // LazyValue. This consumes ~2% in microbenchmark CPU profiles, but we - // should only optimize this if it shows up as significant in end-to-end - // CockroachDB benchmarks, since it is tricky to do so. One possibility - // is that if many sstable iterators only get positioned at latest - // versions of keys, and therefore never expose a LazyValue that is - // separated to their callers, they can put this valueBlockReader into a - // sync.Pool. - i.vbReader = &valueBlockReader{ - ctx: ctx, - bpOpen: i, - rp: rp, - vbih: r.valueBIH, - stats: stats, - } - i.data.lazyValueHandling.vbr = i.vbReader - i.vbRH = objstorageprovider.UsePreallocatedReadHandle(ctx, r.readable, &i.vbRHPrealloc) + if r.Attributes.Has(AttributeValueBlocks) { + i.internalValueConstructor.vbReader = valblk.MakeReader(i, opts.ReaderProvider, r.valueBIH, opts.Env.Block.Stats) + // We can set the GetLazyValuer directly to the vbReader because + // rowblk sstables never contain blob value handles. 
+ (&i.data).SetGetLazyValuer(&i.internalValueConstructor.vbReader) + i.vbRH = r.blockReader.UsePreallocatedReadHandle(objstorage.NoReadBefore, &i.vbRHPrealloc) } - i.data.lazyValueHandling.hasValuePrefix = true + i.data.SetHasValuePrefix(true) } - return nil -} -// Helper function to check if keys returned from iterator are within global and virtual bounds. -func (i *singleLevelIterator) maybeVerifyKey( - iKey *InternalKey, val base.LazyValue, -) (*InternalKey, base.LazyValue) { - // maybeVerify key is only used for virtual sstable iterators. - if invariants.Enabled && i.vState != nil && iKey != nil { - key := iKey.UserKey + indexH, err := r.readTopLevelIndexBlock(ctx, i.readEnv.Block, i.indexFilterRH) + if err == nil { + err = i.index.InitHandle(r.Comparer, indexH, opts.Transforms) + } + if err != nil { + _ = i.Close() + return nil, err + } + return i, nil +} - uc, vuc := i.cmp(key, i.upper), i.cmp(key, i.vState.upper.UserKey) - lc, vlc := i.cmp(key, i.lower), i.cmp(key, i.vState.lower.UserKey) +// init initializes the singleLevelIterator struct. It does not read the index. 
+func (i *singleLevelIterator[I, PI, D, PD]) init(ctx context.Context, r *Reader, opts IterOptions) { + i.inPool = false + i.ctx = ctx + i.lower = opts.Lower + i.upper = opts.Upper + i.bpfs = opts.Filterer + i.useFilterBlock = shouldUseFilterBlock(r, opts.FilterBlockSizeLimit) + i.reader = r + i.cmp = r.Comparer.Compare + i.transforms = opts.Transforms + i.readEnv = opts.Env + i.internalValueConstructor.blobContext = opts.BlobContext + i.internalValueConstructor.env = &i.readEnv.Block + if opts.Env.Virtual != nil { + i.endKeyInclusive, i.lower, i.upper = opts.Env.Virtual.ConstrainBounds(opts.Lower, opts.Upper, false /* endInclusive */, r.Comparer.Compare) + } + + i.indexFilterRH = r.blockReader.UsePreallocatedReadHandle( + objstorage.ReadBeforeForIndexAndFilter, &i.indexFilterRHPrealloc) + i.dataRH = r.blockReader.UsePreallocatedReadHandle( + objstorage.NoReadBefore, &i.dataRHPrealloc) +} - if (i.vState.upper.IsExclusiveSentinel() && vuc == 0) || (!i.endKeyInclusive && uc == 0) || uc > 0 || vuc > 0 || lc < 0 || vlc < 0 { - panic(fmt.Sprintf("key: %s out of bounds of singleLevelIterator", key)) +// Helper function to check if keys returned from iterator are within virtual bounds. +func (i *singleLevelIterator[I, PI, D, PD]) maybeVerifyKey(kv *base.InternalKV) *base.InternalKV { + if invariants.Enabled && kv != nil && i.readEnv.Virtual != nil { + key := kv.K.UserKey + v := i.readEnv.Virtual + lc := i.cmp(key, v.Lower.UserKey) + uc := i.cmp(key, v.Upper.UserKey) + if lc < 0 || uc > 0 || (uc == 0 && v.Upper.IsExclusiveSentinel()) { + panic(fmt.Sprintf("key %q out of singleLeveliterator virtual bounds %s %s", key, v.Lower.UserKey, v.Upper.UserKey)) } } - return iKey, val + return kv } -// setupForCompaction sets up the singleLevelIterator for use with compactionIter. +// SetupForCompaction sets up the singleLevelIterator for use with compactionIter. // Currently, it skips readahead ramp-up. It should be called after init is called. 
-func (i *singleLevelIterator) setupForCompaction() {
+func (i *singleLevelIterator[I, PI, D, PD]) SetupForCompaction() {
 	i.dataRH.SetupForCompaction()
 	if i.vbRH != nil {
 		i.vbRH.SetupForCompaction()
 	}
 }
 
-func (i *singleLevelIterator) resetForReuse() singleLevelIterator {
-	return singleLevelIterator{
-		index: i.index.resetForReuse(),
-		data:  i.data.resetForReuse(),
-	}
+const clearLen = unsafe.Offsetof(singleLevelIteratorRowBlocks{}.clearForResetBoundary)
+
+// Assert that clearLen is consistent between the row and columnar implementations.
+const clearLenColBlocks = unsafe.Offsetof(singleLevelIteratorColumnBlocks{}.clearForResetBoundary)
+const _ uintptr = clearLen - clearLenColBlocks
+const _ uintptr = clearLenColBlocks - clearLen
+
+func (i *singleLevelIterator[I, PI, D, PD]) resetForReuse() {
+	*(*[clearLen]byte)(unsafe.Pointer(i)) = [clearLen]byte{}
+	i.inPool = true
 }
 
-func (i *singleLevelIterator) initBounds() {
+func (i *singleLevelIterator[I, PI, D, PD]) initBounds() {
 	// Trim the iteration bounds for the current block. We don't have to check
 	// the bounds on each iteration if the block is entirely contained within the
 	// iteration bounds.
 	i.blockLower = i.lower
 	if i.blockLower != nil {
-		key, _ := i.data.First()
-		if key != nil && i.cmp(i.blockLower, key.UserKey) < 0 {
+		kv := PD(&i.data).First()
+		// TODO(radu): this should be <= 0
+		if kv != nil && i.cmp(i.blockLower, kv.K.UserKey) < 0 {
 			// The lower-bound is less than the first key in the block. No need
 			// to check the lower-bound again for this block.
 			i.blockLower = nil
 		}
 	}
 	i.blockUpper = i.upper
-	if i.blockUpper != nil && i.cmp(i.blockUpper, i.index.Key().UserKey) > 0 {
+	// TODO(radu): this should be >= 0 if blockUpper is inclusive.
+	if i.blockUpper != nil && PI(&i.index).SeparatorLT(i.blockUpper) {
 		// The upper-bound is greater than the index key which itself is greater
 		// than or equal to every key in the block. No need to check the
 		// upper-bound again for this block. 
Even if blockUpper is inclusive // because of upper being inclusive, we can still safely set blockUpper // to nil here. - // - // TODO(bananabrick): We could also set blockUpper to nil for the >= - // case, if blockUpper is inclusive. i.blockUpper = nil } } -// Deterministic disabling of the bounds-based optimization that avoids seeking. -// Uses the iterator pointer, since we want diversity in iterator behavior for -// the same SetBounds call. Used for tests. -func disableBoundsOpt(bound []byte, ptr uintptr) bool { +func (i *singleLevelIterator[I, PI, D, PD]) initBoundsForAlreadyLoadedBlock() { + // TODO(radu): determine automatically if we need to call First or not and + // unify this function with initBounds(). + i.blockLower = i.lower + if i.blockLower != nil && PD(&i.data).IsLowerBound(i.blockLower) { + // The lower-bound is less than the first key in the block. No need + // to check the lower-bound again for this block. + i.blockLower = nil + } + i.blockUpper = i.upper + // TODO(radu): this should be >= 0 if blockUpper is inclusive. + if i.blockUpper != nil && PI(&i.index).SeparatorLT(i.blockUpper) { + // The upper-bound is greater than the index key which itself is greater + // than or equal to every key in the block. No need to check the + // upper-bound again for this block. + i.blockUpper = nil + } +} + +// Deterministic disabling (in testing mode) of the bounds-based optimization +// that avoids seeking. Uses the iterator pointer, since we want diversity in +// iterator behavior for the same SetBounds call. Used for tests. 
+func testingDisableBoundsOpt(bound []byte, ptr uintptr) bool {
+	if !invariants.Enabled || ensureBoundsOptDeterminism {
+		return false
+	}
 	// Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
 	simpleHash := (11400714819323198485 * uint64(ptr)) >> 63
 	return bound[len(bound)-1]&byte(1) == 0 && simpleHash == 0
@@ -308,17 +387,34 @@ func disableBoundsOpt(bound []byte, ptr uintptr) bool {
 // state and require this behavior to be deterministic.
 var ensureBoundsOptDeterminism bool
 
+// SetBoundsWithSyntheticPrefix indicates whether this iterator requires keys
+// passed to its SetBounds() method by a prefix rewriting wrapper to be *not*
+// rewritten to be in terms of this iterator's content, but instead be passed
+// as-is, i.e. with the synthetic prefix still on them.
+//
+// This allows an optimization when this iterator is passing these bounds on to
+// a vState to additionally constrain them. In said vState, passed bounds are
+// combined with the vState bounds which are in terms of the rewritten prefix.
+// If the caller rewrote bounds to be in terms of content prefix and SetBounds
+// passed those to vState, the vState would need to *un*rewrite them back to the
+// synthetic prefix in order to combine them with the vState bounds. Thus, if
+// this iterator knows bounds will be passed to vState, it can signal that
+// they should be passed without being rewritten to skip converting to and fro.
+func (i singleLevelIterator[I, PI, P, PD]) SetBoundsWithSyntheticPrefix() bool {
+	return i.readEnv.Virtual != nil
+}
+
 // SetBounds implements internalIterator.SetBounds, as documented in the pebble
 // package. Note that the upper field is exclusive. 
-func (i *singleLevelIterator) SetBounds(lower, upper []byte) { +func (i *singleLevelIterator[I, PI, P, PD]) SetBounds(lower, upper []byte) { i.boundsCmp = 0 - if i.vState != nil { + if i.readEnv.Virtual != nil { // If the reader is constructed for a virtual sstable, then we must // constrain the bounds of the reader. For physical sstables, the bounds // can be wider than the actual sstable's bounds because we won't // accidentally expose additional keys as there are no additional keys. - i.endKeyInclusive, lower, upper = i.vState.constrainBounds( - lower, upper, false, + i.endKeyInclusive, lower, upper = i.readEnv.Virtual.ConstrainBounds( + lower, upper, false, i.reader.Comparer.Compare, ) } else { // TODO(bananabrick): Figure out the logic here to enable the boundsCmp @@ -326,14 +422,12 @@ func (i *singleLevelIterator) SetBounds(lower, upper []byte) { if i.positionedUsingLatestBounds { if i.upper != nil && lower != nil && i.cmp(i.upper, lower) <= 0 { i.boundsCmp = +1 - if invariants.Enabled && !ensureBoundsOptDeterminism && - disableBoundsOpt(lower, uintptr(unsafe.Pointer(i))) { + if testingDisableBoundsOpt(lower, uintptr(unsafe.Pointer(i))) { i.boundsCmp = 0 } } else if i.lower != nil && upper != nil && i.cmp(upper, i.lower) <= 0 { i.boundsCmp = -1 - if invariants.Enabled && !ensureBoundsOptDeterminism && - disableBoundsOpt(upper, uintptr(unsafe.Pointer(i))) { + if testingDisableBoundsOpt(upper, uintptr(unsafe.Pointer(i))) { i.boundsCmp = 0 } } @@ -347,84 +441,86 @@ func (i *singleLevelIterator) SetBounds(lower, upper []byte) { i.blockUpper = nil } -// loadBlock loads the block at the current index position and leaves i.data +func (i *singleLevelIterator[I, PI, P, PD]) SetContext(ctx context.Context) { + i.ctx = ctx +} + +// loadDataBlock loads the block at the current index position and leaves i.data // unpositioned. If unsuccessful, it sets i.err to any error encountered, which // may be nil if we have simply exhausted the entire table. 
-func (i *singleLevelIterator) loadBlock(dir int8) loadBlockResult { - if !i.index.valid() { +func (i *singleLevelIterator[I, PI, P, PD]) loadDataBlock(dir int8) loadBlockResult { + if !PI(&i.index).Valid() { // Ensure the data block iterator is invalidated even if loading of the block // fails. - i.data.invalidate() + PD(&i.data).Invalidate() return loadBlockFailed } // Load the next block. - v := i.index.value() - bhp, err := decodeBlockHandleWithProperties(v.InPlaceValue()) - if i.dataBH == bhp.BlockHandle && i.data.valid() { + bhp, err := PI(&i.index).BlockHandleWithProperties() + if i.dataBH == bhp.Handle && PD(&i.data).Valid() { // We're already at the data block we want to load. Reset bounds in case // they changed since the last seek, but don't reload the block from cache // or disk. // // It's safe to leave i.data in its original state here, as all callers to - // loadBlock make an absolute positioning call (i.e. a seek, first, or last) - // to `i.data` right after loadBlock returns loadBlockOK. + // loadDataBlock make an absolute positioning call (i.e. a seek, first, or last) + // to `i.data` right after loadDataBlock returns loadBlockOK. i.initBounds() return loadBlockOK } // Ensure the data block iterator is invalidated even if loading of the block // fails. 
- i.data.invalidate() - i.dataBH = bhp.BlockHandle + PD(&i.data).Invalidate() + i.dataBH = bhp.Handle if err != nil { - i.err = errCorruptIndexEntry + i.err = errCorruptIndexEntry(err) return loadBlockFailed } if i.bpfs != nil { intersects, err := i.bpfs.intersects(bhp.Props) if err != nil { - i.err = errCorruptIndexEntry + i.err = errCorruptIndexEntry(err) return loadBlockFailed } if intersects == blockMaybeExcluded { intersects = i.resolveMaybeExcluded(dir) } if intersects == blockExcluded { - i.maybeFilteredKeysSingleLevel = true return loadBlockIrrelevant } // blockIntersects } - ctx := objiotracing.WithBlockType(i.ctx, objiotracing.DataBlock) - block, err := i.reader.readBlock(ctx, i.dataBH, nil /* transform */, i.dataRH, i.stats, i.bufferPool) + block, err := i.reader.readDataBlock(i.ctx, i.readEnv.Block, i.dataRH, i.dataBH) if err != nil { i.err = err return loadBlockFailed } - i.err = i.data.initHandle(i.cmp, block, i.reader.Properties.GlobalSeqNum, i.hideObsoletePoints) + i.err = PD(&i.data).InitHandle(i.reader.Comparer, block, i.transforms) if i.err != nil { // The block is partially loaded, and we don't want it to appear valid. - i.data.invalidate() + PD(&i.data).Invalidate() return loadBlockFailed } i.initBounds() return loadBlockOK } -// readBlockForVBR implements the blockProviderWhenOpen interface for use by -// the valueBlockReader. -func (i *singleLevelIterator) readBlockForVBR( - ctx context.Context, h BlockHandle, stats *base.InternalIteratorStats, -) (bufferHandle, error) { - ctx = objiotracing.WithBlockType(ctx, objiotracing.ValueBlock) - return i.reader.readBlock(ctx, h, nil, i.vbRH, stats, i.bufferPool) +// ReadValueBlock implements the valblk.BlockProviderWhenOpen interface for use +// by the valblk.Reader. 
+func (i *singleLevelIterator[I, PI, D, PD]) ReadValueBlock( + bh block.Handle, stats *base.InternalIteratorStats, +) (block.BufferHandle, error) { + env := i.readEnv.Block + env.Stats = stats + return i.reader.readValueBlock(i.ctx, env, i.vbRH, bh) } // resolveMaybeExcluded is invoked when the block-property filterer has found // that a block is excluded according to its properties but only if its bounds // fall within the filter's current bounds. This function consults the // apprioriate bound, depending on the iteration direction, and returns either -// `blockIntersects` or `blockMaybeExcluded`. -func (i *singleLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult { +// `blockIntersects` or `blockExcluded`. +func (i *singleLevelIterator[I, PI, D, PD]) resolveMaybeExcluded(dir int8) intersectsResult { // TODO(jackson): We could first try comparing to top-level index block's // key, and if within bounds avoid per-data block key comparisons. @@ -446,7 +542,7 @@ func (i *singleLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult { // need. if dir > 0 { // Forward iteration. - if i.bpfs.boundLimitedFilter.KeyIsWithinUpperBound(i.index.Key().UserKey) { + if i.bpfs.boundLimitedFilter.KeyIsWithinUpperBound(PI(&i.index).Separator()) { return blockExcluded } return blockIntersects @@ -470,120 +566,80 @@ func (i *singleLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult { // previous block's separator, which provides an inclusive lower bound on // the original block's keys. Afterwards, we step forward to restore our // index position. - if peekKey, _ := i.index.Prev(); peekKey == nil { + if !PI(&i.index).Prev() { // The original block points to the first block of this index block. If // there's a two-level index, it could potentially provide a lower // bound, but the code refactoring necessary to read it doesn't seem // worth the payoff. We fall through to loading the block. 
- } else if i.bpfs.boundLimitedFilter.KeyIsWithinLowerBound(peekKey.UserKey) { + } else if i.bpfs.boundLimitedFilter.KeyIsWithinLowerBound(PI(&i.index).Separator()) { // The lower-bound on the original block falls within the filter's // bounds, and we can skip the block (after restoring our current index // position). - _, _ = i.index.Next() + _ = PI(&i.index).Next() return blockExcluded } - _, _ = i.index.Next() + _ = PI(&i.index).Next() return blockIntersects } -func (i *singleLevelIterator) initBoundsForAlreadyLoadedBlock() { - if i.data.getFirstUserKey() == nil { - panic("initBoundsForAlreadyLoadedBlock must not be called on empty or corrupted block") - } - i.blockLower = i.lower - if i.blockLower != nil { - firstUserKey := i.data.getFirstUserKey() - if firstUserKey != nil && i.cmp(i.blockLower, firstUserKey) < 0 { - // The lower-bound is less than the first key in the block. No need - // to check the lower-bound again for this block. - i.blockLower = nil - } - } - i.blockUpper = i.upper - if i.blockUpper != nil && i.cmp(i.blockUpper, i.index.Key().UserKey) > 0 { - // The upper-bound is greater than the index key which itself is greater - // than or equal to every key in the block. No need to check the - // upper-bound again for this block. - i.blockUpper = nil - } -} - // The number of times to call Next/Prev in a block before giving up and seeking. // The value of 4 is arbitrary. // TODO(sumeer): experiment with dynamic adjustment based on the history of // seeks for a particular iterator. 
const numStepsBeforeSeek = 4 -func (i *singleLevelIterator) trySeekGEUsingNextWithinBlock( +func (i *singleLevelIterator[I, PI, D, PD]) trySeekGEUsingNextWithinBlock( key []byte, -) (k *InternalKey, v base.LazyValue, done bool) { - k, v = i.data.Key(), i.data.value() +) (kv *base.InternalKV, done bool) { + kv = PD(&i.data).KV() for j := 0; j < numStepsBeforeSeek; j++ { - curKeyCmp := i.cmp(k.UserKey, key) + curKeyCmp := i.cmp(kv.K.UserKey, key) if curKeyCmp >= 0 { if i.blockUpper != nil { - cmp := i.cmp(k.UserKey, i.blockUpper) + cmp := i.cmp(kv.K.UserKey, i.blockUpper) if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { i.exhaustedBounds = +1 - return nil, base.LazyValue{}, true + return nil, true } } - return k, v, true + return kv, true } - k, v = i.data.Next() - if k == nil { + kv = PD(&i.data).Next() + if kv == nil { break } } - return k, v, false + return kv, false } -func (i *singleLevelIterator) trySeekLTUsingPrevWithinBlock( +func (i *singleLevelIterator[I, PI, D, PD]) trySeekLTUsingPrevWithinBlock( key []byte, -) (k *InternalKey, v base.LazyValue, done bool) { - k, v = i.data.Key(), i.data.value() +) (kv *base.InternalKV, done bool) { + kv = PD(&i.data).KV() for j := 0; j < numStepsBeforeSeek; j++ { - curKeyCmp := i.cmp(k.UserKey, key) + curKeyCmp := i.cmp(kv.K.UserKey, key) if curKeyCmp < 0 { - if i.blockLower != nil && i.cmp(k.UserKey, i.blockLower) < 0 { + if i.blockLower != nil && i.cmp(kv.K.UserKey, i.blockLower) < 0 { i.exhaustedBounds = -1 - return nil, base.LazyValue{}, true + return nil, true } - return k, v, true + return kv, true } - k, v = i.data.Prev() - if k == nil { + kv = PD(&i.data).Prev() + if kv == nil { break } } - return k, v, false -} - -func (i *singleLevelIterator) recordOffset() uint64 { - offset := i.dataBH.Offset - if i.data.valid() { - // - i.dataBH.Length/len(i.data.data) is the compression ratio. If - // uncompressed, this is 1. - // - i.data.nextOffset is the uncompressed position of the current record - // in the block. 
- // - i.dataBH.Offset is the offset of the block in the sstable before - // decompression. - offset += (uint64(i.data.nextOffset) * i.dataBH.Length) / uint64(len(i.data.data)) - } else { - // Last entry in the block must increment bytes iterated by the size of the block trailer - // and restart points. - offset += i.dataBH.Length + blockTrailerLen - } - return offset + return kv, false } // SeekGE implements internalIterator.SeekGE, as documented in the pebble // package. Note that SeekGE only checks the upper bound. It is up to the // caller to ensure that key is greater than or equal to the lower bound. -func (i *singleLevelIterator) SeekGE( +func (i *singleLevelIterator[I, PI, D, PD]) SeekGE( key []byte, flags base.SeekGEFlags, -) (*InternalKey, base.LazyValue) { - if i.vState != nil { +) *base.InternalKV { + if i.readEnv.Virtual != nil { // Callers of SeekGE don't know about virtual sstable bounds, so we may // have to internally restrict the bounds. // @@ -598,9 +654,9 @@ func (i *singleLevelIterator) SeekGE( // The i.exhaustedBounds comparison indicates that the upper bound was // reached. The i.data.isDataInvalidated() indicates that the sstable was // exhausted. - if (i.exhaustedBounds == +1 || i.data.isDataInvalidated()) && i.err == nil { + if (i.exhaustedBounds == +1 || PD(&i.data).IsDataInvalidated()) && i.err == nil { // Already exhausted, so return nil. - return nil, base.LazyValue{} + return nil } if i.err != nil { // The current iterator position cannot be used. @@ -623,42 +679,32 @@ func (i *singleLevelIterator) SeekGE( } // seekGEHelper contains the common functionality for SeekGE and SeekPrefixGE. 
-func (i *singleLevelIterator) seekGEHelper( +func (i *singleLevelIterator[I, PI, D, PD]) seekGEHelper( key []byte, boundsCmp int, flags base.SeekGEFlags, -) (*InternalKey, base.LazyValue) { +) *base.InternalKV { // Invariant: trySeekUsingNext => !i.data.isDataInvalidated() && i.exhaustedBounds != +1 // SeekGE performs various step-instead-of-seeking optimizations: eg enabled // by trySeekUsingNext, or by monotonically increasing bounds (i.boundsCmp). - // Care must be taken to ensure that when performing these optimizations and - // the iterator becomes exhausted, i.maybeFilteredKeys is set appropriately. - // Consider a previous SeekGE that filtered keys from k until the current - // iterator position. - // - // If the previous SeekGE exhausted the iterator, it's possible keys greater - // than or equal to the current search key were filtered. We must not reuse - // the current iterator position without remembering the previous value of - // maybeFilteredKeys. var dontSeekWithinBlock bool - if !i.data.isDataInvalidated() && !i.index.isDataInvalidated() && i.data.valid() && i.index.valid() && - boundsCmp > 0 && i.cmp(key, i.index.Key().UserKey) <= 0 { + if !PD(&i.data).IsDataInvalidated() && PD(&i.data).Valid() && PI(&i.index).Valid() && + boundsCmp > 0 && PI(&i.index).SeparatorGT(key, true /* orEqual */) { // Fast-path: The bounds have moved forward and this SeekGE is - // respecting the lower bound (guaranteed by Iterator). We know that - // the iterator must already be positioned within or just outside the - // previous bounds. Therefore it cannot be positioned at a block (or - // the position within that block) that is ahead of the seek position. + // respecting the lower bound (guaranteed by Iterator). We know that the + // iterator must already be positioned within or just outside the + // previous bounds. Therefore it cannot be positioned at a block (or the + // position within that block) that is ahead of the seek position. 
// However it can be positioned at an earlier block. This fast-path to // use Next() on the block is only applied when we are already at the - // block that the slow-path (the else-clause) would load -- this is - // the motivation for the i.cmp(key, i.index.Key().UserKey) <= 0 - // predicate. + // block that the slow-path (the else-clause) would load -- this is the + // motivation for the IsSeparatorUpperBound(key, true) predicate. i.initBoundsForAlreadyLoadedBlock() - ikey, val, done := i.trySeekGEUsingNextWithinBlock(key) + kv, done := i.trySeekGEUsingNextWithinBlock(key) if done { - return ikey, val + return kv } - if ikey == nil { + if kv == nil { // Done with this block. dontSeekWithinBlock = true } @@ -669,78 +715,70 @@ func (i *singleLevelIterator) seekGEHelper( if flags.TrySeekUsingNext() { // seekPrefixGE or SeekGE has already ensured // !i.data.isDataInvalidated() && i.exhaustedBounds != +1 - currKey := i.data.Key() - value := i.data.value() - less := i.cmp(currKey.UserKey, key) < 0 + curr := PD(&i.data).KV() + less := i.cmp(curr.K.UserKey, key) < 0 // We could be more sophisticated and confirm that the seek // position is within the current block before applying this // optimization. But there may be some benefit even if it is in // the next block, since we can avoid seeking i.index. for j := 0; less && j < numStepsBeforeSeek; j++ { - currKey, value = i.Next() - if currKey == nil { - return nil, base.LazyValue{} + curr = i.Next() + if curr == nil { + return nil } - less = i.cmp(currKey.UserKey, key) < 0 + less = i.cmp(curr.K.UserKey, key) < 0 } if !less { if i.blockUpper != nil { - cmp := i.cmp(currKey.UserKey, i.blockUpper) + cmp := i.cmp(curr.K.UserKey, i.blockUpper) if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { i.exhaustedBounds = +1 - return nil, base.LazyValue{} + return nil } } - return currKey, value + return curr } } // Slow-path. - // Since we're re-seeking the iterator, the previous value of - // maybeFilteredKeysSingleLevel is irrelevant. 
If we filter out blocks - // during seeking, loadBlock will set it to true. - i.maybeFilteredKeysSingleLevel = false - var ikey *InternalKey - if ikey, _ = i.index.SeekGE(key, flags.DisableTrySeekUsingNext()); ikey == nil { + if !PI(&i.index).SeekGE(key) { // The target key is greater than any key in the index block. // Invalidate the block iterator so that a subsequent call to Prev() // will return the last key in the table. - i.data.invalidate() - return nil, base.LazyValue{} + PD(&i.data).Invalidate() + return nil } - result := i.loadBlock(+1) + result := i.loadDataBlock(+1) if result == loadBlockFailed { - return nil, base.LazyValue{} + return nil } if result == loadBlockIrrelevant { - // Enforce the upper bound here since don't want to bother moving - // to the next block if upper bound is already exceeded. Note that - // the next block starts with keys >= ikey.UserKey since even + // Enforce the upper bound here since don't want to bother moving to + // the next block if upper bound is already exceeded. Note that the + // next block may start with keys >= index.Separator() since even // though this is the block separator, the same user key can span - // multiple blocks. If upper is exclusive we use >= below, else - // we use >. - if i.upper != nil { - cmp := i.cmp(ikey.UserKey, i.upper) - if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { - i.exhaustedBounds = +1 - return nil, base.LazyValue{} - } + // multiple blocks. If upper is exclusive we pass orEqual=true + // below, else we require the separator to be strictly greater than + // upper. + if i.upper != nil && PI(&i.index).SeparatorGT(i.upper, !i.endKeyInclusive) { + i.exhaustedBounds = +1 + return nil } // Want to skip to the next block. 
dontSeekWithinBlock = true } } if !dontSeekWithinBlock { - if ikey, val := i.data.SeekGE(key, flags.DisableTrySeekUsingNext()); ikey != nil { + if ikv := PD(&i.data).SeekGE(key, flags.DisableTrySeekUsingNext()); ikv != nil { if i.blockUpper != nil { - cmp := i.cmp(ikey.UserKey, i.blockUpper) + cmp := i.cmp(ikv.K.UserKey, i.blockUpper) if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { i.exhaustedBounds = +1 - return nil, base.LazyValue{} + return nil } } - return ikey, val + return ikv } } return i.skipForward() @@ -749,10 +787,10 @@ func (i *singleLevelIterator) seekGEHelper( // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the // pebble package. Note that SeekPrefixGE only checks the upper bound. It is up // to the caller to ensure that key is greater than or equal to the lower bound. -func (i *singleLevelIterator) SeekPrefixGE( +func (i *singleLevelIterator[I, PI, D, PD]) SeekPrefixGE( prefix, key []byte, flags base.SeekGEFlags, -) (*base.InternalKey, base.LazyValue) { - if i.vState != nil { +) *base.InternalKV { + if i.readEnv.Virtual != nil { // Callers of SeekPrefixGE aren't aware of virtual sstable bounds, so // we may have to internally restrict the bounds. // @@ -762,41 +800,35 @@ func (i *singleLevelIterator) SeekPrefixGE( key = i.lower } } - return i.seekPrefixGE(prefix, key, flags, i.useFilter) + return i.seekPrefixGE(prefix, key, flags) } -func (i *singleLevelIterator) seekPrefixGE( - prefix, key []byte, flags base.SeekGEFlags, checkFilter bool, -) (k *InternalKey, value base.LazyValue) { +func (i *singleLevelIterator[I, PI, D, PD]) seekPrefixGE( + prefix, key []byte, flags base.SeekGEFlags, +) (kv *base.InternalKV) { // NOTE: prefix is only used for bloom filter checking and not later work in // this method. Hence, we can use the existing iterator position if the last // SeekPrefixGE did not fail bloom filter matching. 
err := i.err i.err = nil // clear cached iteration error - if checkFilter && i.reader.tableFilter != nil { + if i.useFilterBlock { if !i.lastBloomFilterMatched { // Iterator is not positioned based on last seek. flags = flags.DisableTrySeekUsingNext() } i.lastBloomFilterMatched = false // Check prefix bloom filter. - var dataH bufferHandle - dataH, i.err = i.reader.readFilter(i.ctx, i.stats) - if i.err != nil { - i.data.invalidate() - return nil, base.LazyValue{} - } - mayContain := i.reader.tableFilter.mayContain(dataH.Get(), prefix) - dataH.Release() - if !mayContain { - // This invalidation may not be necessary for correctness, and may - // be a place to optimize later by reusing the already loaded - // block. It was necessary in earlier versions of the code since - // the caller was allowed to call Next when SeekPrefixGE returned + var mayContain bool + mayContain, i.err = i.bloomFilterMayContain(prefix) + if i.err != nil || !mayContain { + // In the i.err == nil case, this invalidation may not be necessary for + // correctness, and may be a place to optimize later by reusing the + // already loaded block. It was necessary in earlier versions of the code + // since the caller was allowed to call Next when SeekPrefixGE returned // nil. This is no longer allowed. - i.data.invalidate() - return nil, base.LazyValue{} + PD(&i.data).Invalidate() + return nil } i.lastBloomFilterMatched = true } @@ -804,9 +836,9 @@ func (i *singleLevelIterator) seekPrefixGE( // The i.exhaustedBounds comparison indicates that the upper bound was // reached. The i.data.isDataInvalidated() indicates that the sstable was // exhausted. - if (i.exhaustedBounds == +1 || i.data.isDataInvalidated()) && err == nil { + if (i.exhaustedBounds == +1 || PD(&i.data).IsDataInvalidated()) && err == nil { // Already exhausted, so return nil. - return nil, base.LazyValue{} + return nil } if err != nil { // The current iterator position cannot be used. 
@@ -825,45 +857,143 @@ func (i *singleLevelIterator) seekPrefixGE( // Seek optimization only applies until iterator is first positioned after SetBounds. i.boundsCmp = 0 i.positionedUsingLatestBounds = true - k, value = i.seekGEHelper(key, boundsCmp, flags) - return i.maybeVerifyKey(k, value) + return i.maybeVerifyKey(i.seekGEHelper(key, boundsCmp, flags)) +} + +// shouldUseFilterBlock returns whether we should use the filter block, based on +// its length and the size limit. +func shouldUseFilterBlock(reader *Reader, filterBlockSizeLimit FilterBlockSizeLimit) bool { + return reader.tableFilter != nil && reader.filterBH.Length <= uint64(filterBlockSizeLimit) +} + +func (i *singleLevelIterator[I, PI, D, PD]) bloomFilterMayContain(prefix []byte) (bool, error) { + // Check prefix bloom filter. + prefixToCheck := prefix + if i.transforms.HasSyntheticPrefix() { + // We have to remove the synthetic prefix. + var ok bool + prefixToCheck, ok = bytes.CutPrefix(prefix, i.transforms.SyntheticPrefix()) + if !ok { + // This prefix will not be found inside this table. + return false, nil + } + } + + dataH, err := i.reader.readFilterBlock(i.ctx, i.readEnv.Block, i.indexFilterRH, i.reader.filterBH) + if err != nil { + return false, err + } + defer dataH.Release() + return i.reader.tableFilter.mayContain(dataH.BlockData(), prefixToCheck), nil } -// virtualLast should only be called if i.vReader != nil. -func (i *singleLevelIterator) virtualLast() (*InternalKey, base.LazyValue) { - if i.vState == nil { +// virtualLast should only be called if i.readBlockEnv.Virtual != nil +func (i *singleLevelIterator[I, PI, D, PD]) virtualLast() *base.InternalKV { + if i.readEnv.Virtual == nil { panic("pebble: invalid call to virtualLast") } - // Seek to the first internal key. - ikey, _ := i.SeekGE(i.upper, base.SeekGEFlagsNone) - if i.endKeyInclusive { - // Let's say the virtual sstable upper bound is c#1, with the keys c#3, c#2, - // c#1, d, e, ... in the sstable. 
So, the last key in the virtual sstable is - // c#1. We can perform SeekGE(i.upper) and then keep nexting until we find - // the last key with userkey == i.upper. - // - // TODO(bananabrick): Think about how to improve this. If many internal keys - // with the same user key at the upper bound then this could be slow, but - // maybe the odds of having many internal keys with the same user key at the - // upper bound are low. - for ikey != nil && i.cmp(ikey.UserKey, i.upper) == 0 { - ikey, _ = i.Next() - } - return i.Prev() + if !i.endKeyInclusive { + // Trivial case. + return i.SeekLT(i.upper, base.SeekLTFlagsNone) } + return i.virtualLastSeekLE() +} - // We seeked to the first key >= i.upper. - return i.Prev() +// virtualLastSeekLE is called by virtualLast to do a SeekLE as part of a +// virtualLast. Consider generalizing this into a SeekLE() if there are other +// uses of this method in the future. Does a SeekLE on the upper bound of the +// file/iterator. +func (i *singleLevelIterator[I, PI, D, PD]) virtualLastSeekLE() *base.InternalKV { + // Callers of SeekLE don't know about virtual sstable bounds, so we may + // have to internally restrict the bounds. + // + // TODO(bananabrick): We can optimize this check away for the level iter + // if necessary. + if !i.endKeyInclusive { + panic("unexpected virtualLastSeekLE with exclusive upper bounds") + } + key := i.upper + + i.exhaustedBounds = 0 + i.err = nil // clear cached iteration error + // Seek optimization only applies until iterator is first positioned with a + // SeekGE or SeekLT after SetBounds. + i.boundsCmp = 0 + i.positionedUsingLatestBounds = true + + indexOk := PI(&i.index).SeekGE(key) + // We can have multiple internal keys with the same user key as the seek + // key. In that case, we want the last (greatest) internal key. + // + // INVARIANT: One of two cases: + // A. !indexOk. There is no data block with index key >= key. So all keys + // in the last data block are < key. + // B. 
i.index.Separator() >= key. This data block may have some keys > key. + // + // Subcases of B: + // B1. Separator() == key. This is when loop iteration happens. + // Since Separator() >= largest data key in the block, the largest data + // key in this block is <= key. + // B2. Separator() > key. Loop iteration will not happen. + // + // NB: We can avoid this Next()ing if we just implement a blockIter.SeekLE(). + // This might be challenging to do correctly, so impose regular operations + // for now. + // TODO(jackson): Consider implementing SeekLE since it's easier to do in + // colblk. + for indexOk && bytes.Equal(PI(&i.index).Separator(), key) { + indexOk = PI(&i.index).Next() + } + if !indexOk { + // Cases A or B1 where B1 exhausted all blocks. In both cases the last block + // has all keys <= key. skipBackward enforces the lower bound. + return i.skipBackward() + } + // Case B. We are here because we were originally in case B2, or we were in B1 + // and we arrived at a block where ikey.UserKey > key. Either way, ikey.UserKey + // > key. So there could be keys in the block > key. But the block preceding + // this block cannot have any keys > key, otherwise it would have been the + // result of the original index.SeekGE. + result := i.loadDataBlock(-1) + if result == loadBlockFailed { + return nil + } + if result == loadBlockIrrelevant { + // Want to skip to the previous block. + return i.skipBackward() + } + ikv := PD(&i.data).SeekGE(key, base.SeekGEFlagsNone) + // Go to the last user key that matches key, and then Prev() on the data + // block. + for ikv != nil && bytes.Equal(ikv.K.UserKey, key) { + ikv = PD(&i.data).Next() + } + ikv = PD(&i.data).Prev() + if ikv != nil { + // Enforce the lower bound here, as we could have gone past it. This happens + // if keys between `i.blockLower` and `key` are obsolete, for instance. 
Even + // though i.blockLower (which is either nil or equal to i.lower) is <= key, + // all internal keys in the user key interval [i.blockLower, key] could be + // obsolete (due to a RANGEDEL which will not be observed here). And + // i.data.Prev will skip all these obsolete keys, and could land on a key + // below the lower bound, requiring the lower bound check. + if i.blockLower != nil && i.cmp(ikv.K.UserKey, i.blockLower) < 0 { + i.exhaustedBounds = -1 + return nil + } + return ikv + } + return i.skipBackward() } // SeekLT implements internalIterator.SeekLT, as documented in the pebble // package. Note that SeekLT only checks the lower bound. It is up to the // caller to ensure that key is less than or equal to the upper bound. -func (i *singleLevelIterator) SeekLT( +func (i *singleLevelIterator[I, PI, D, PD]) SeekLT( key []byte, flags base.SeekLTFlags, -) (*InternalKey, base.LazyValue) { - if i.vState != nil { +) *base.InternalKV { + if i.readEnv.Virtual != nil { // Might have to fix upper bound since virtual sstable bounds are not // known to callers of SeekLT. // @@ -874,7 +1004,7 @@ func (i *singleLevelIterator) SeekLT( // first internal key with user key < key. if cmp > 0 { // Return the last key in the virtual sstable. - return i.virtualLast() + return i.maybeVerifyKey(i.virtualLast()) } } @@ -885,22 +1015,13 @@ func (i *singleLevelIterator) SeekLT( i.boundsCmp = 0 // Seeking operations perform various step-instead-of-seeking optimizations: - // eg by considering monotonically increasing bounds (i.boundsCmp). Care - // must be taken to ensure that when performing these optimizations and the - // iterator becomes exhausted i.maybeFilteredKeysSingleLevel is set - // appropriately. Consider a previous SeekLT that filtered keys from k - // until the current iterator position. - // - // If the previous SeekLT did exhausted the iterator, it's possible keys - // less than the current search key were filtered. 
We must not reuse the - // current iterator position without remembering the previous value of - // maybeFilteredKeysSingleLevel. + // eg by considering monotonically increasing bounds (i.boundsCmp). i.positionedUsingLatestBounds = true var dontSeekWithinBlock bool - if !i.data.isDataInvalidated() && !i.index.isDataInvalidated() && i.data.valid() && i.index.valid() && - boundsCmp < 0 && i.cmp(i.data.getFirstUserKey(), key) < 0 { + if !PD(&i.data).IsDataInvalidated() && PD(&i.data).Valid() && PI(&i.index).Valid() && + boundsCmp < 0 && !PD(&i.data).IsLowerBound(key) { // Fast-path: The bounds have moved backward, and this SeekLT is // respecting the upper bound (guaranteed by Iterator). We know that // the iterator must already be positioned within or just outside the @@ -911,33 +1032,30 @@ func (i *singleLevelIterator) SeekLT( // block that can satisfy this seek -- this is the motivation for the // the i.cmp(i.data.firstKey.UserKey, key) < 0 predicate. i.initBoundsForAlreadyLoadedBlock() - ikey, val, done := i.trySeekLTUsingPrevWithinBlock(key) + ikv, done := i.trySeekLTUsingPrevWithinBlock(key) if done { - return ikey, val + return ikv } - if ikey == nil { + if ikv == nil { // Done with this block. dontSeekWithinBlock = true } } else { // Slow-path. - i.maybeFilteredKeysSingleLevel = false - var ikey *InternalKey // NB: If a bound-limited block property filter is configured, it's // externally ensured that the filter is disabled (through returning // Intersects=false irrespective of the block props provided) during // seeks. - if ikey, _ = i.index.SeekGE(key, base.SeekGEFlagsNone); ikey == nil { - ikey, _ = i.index.Last() - if ikey == nil { - return nil, base.LazyValue{} + if !PI(&i.index).SeekGE(key) { + if !PI(&i.index).Last() { + return nil } } // INVARIANT: ikey != nil. 
- result := i.loadBlock(-1) + result := i.loadDataBlock(-1) if result == loadBlockFailed { - return nil, base.LazyValue{} + return nil } if result == loadBlockIrrelevant { // Enforce the lower bound here since don't want to bother moving @@ -945,21 +1063,21 @@ func (i *singleLevelIterator) SeekLT( // that the previous block starts with keys <= ikey.UserKey since // even though this is the current block's separator, the same // user key can span multiple blocks. - if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { + if i.lower != nil && PI(&i.index).SeparatorLT(i.lower) { i.exhaustedBounds = -1 - return nil, base.LazyValue{} + return nil } // Want to skip to the previous block. dontSeekWithinBlock = true } } if !dontSeekWithinBlock { - if ikey, val := i.data.SeekLT(key, flags); ikey != nil { - if i.blockLower != nil && i.cmp(ikey.UserKey, i.blockLower) < 0 { + if ikv := PD(&i.data).SeekLT(key, flags); ikv != nil { + if i.blockLower != nil && i.cmp(ikv.K.UserKey, i.blockLower) < 0 { i.exhaustedBounds = -1 - return nil, base.LazyValue{} + return nil } - return ikey, val + return ikv } } // The index contains separator keys which may lie between @@ -980,19 +1098,15 @@ func (i *singleLevelIterator) SeekLT( // package. Note that First only checks the upper bound. It is up to the caller // to ensure that key is greater than or equal to the lower bound (e.g. via a // call to SeekGE(lower)). -func (i *singleLevelIterator) First() (*InternalKey, base.LazyValue) { - // If the iterator was created on a virtual sstable, we will SeekGE to the - // lower bound instead of using First, because First does not respect - // bounds. - if i.vState != nil { +func (i *singleLevelIterator[I, PI, D, PD]) First() *base.InternalKV { + // If we have a lower bound, use SeekGE. Note that in general this is not + // supported usage, except when the lower bound is there because the table is + // virtual. 
+ if i.lower != nil { return i.SeekGE(i.lower, base.SeekGEFlagsNone) } - if i.lower != nil { - panic("singleLevelIterator.First() used despite lower bound") - } i.positionedUsingLatestBounds = true - i.maybeFilteredKeysSingleLevel = false return i.firstInternal() } @@ -1001,46 +1115,43 @@ func (i *singleLevelIterator) First() (*InternalKey, base.LazyValue) { // index file, or for positioning in the second-level index in a two-level // index file. For the latter, one cannot make any claims about absolute // positioning. -func (i *singleLevelIterator) firstInternal() (*InternalKey, base.LazyValue) { +func (i *singleLevelIterator[I, PI, D, PD]) firstInternal() *base.InternalKV { i.exhaustedBounds = 0 i.err = nil // clear cached iteration error // Seek optimization only applies until iterator is first positioned after SetBounds. i.boundsCmp = 0 - var ikey *InternalKey - if ikey, _ = i.index.First(); ikey == nil { - i.data.invalidate() - return nil, base.LazyValue{} + if !PI(&i.index).First() { + PD(&i.data).Invalidate() + return nil } - result := i.loadBlock(+1) + result := i.loadDataBlock(+1) if result == loadBlockFailed { - return nil, base.LazyValue{} + return nil } if result == loadBlockOK { - if ikey, val := i.data.First(); ikey != nil { + if kv := PD(&i.data).First(); kv != nil { if i.blockUpper != nil { - cmp := i.cmp(ikey.UserKey, i.blockUpper) + cmp := i.cmp(kv.K.UserKey, i.blockUpper) if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { i.exhaustedBounds = +1 - return nil, base.LazyValue{} + return nil } } - return ikey, val + return kv } // Else fall through to skipForward. } else { // result == loadBlockIrrelevant. Enforce the upper bound here since // don't want to bother moving to the next block if upper bound is - // already exceeded. Note that the next block starts with keys >= - // ikey.UserKey since even though this is the block separator, the - // same user key can span multiple blocks. If upper is exclusive we - // use >= below, else we use >. 
- if i.upper != nil { - cmp := i.cmp(ikey.UserKey, i.upper) - if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { - i.exhaustedBounds = +1 - return nil, base.LazyValue{} - } + // already exceeded. Note that the next block may start with keys >= + // index.Separator() since even though this is the block separator, the + // same user key can span multiple blocks. If upper is exclusive we pass + // orEqual=true below, else we require the separator to be strictly + // greater than upper. + if i.upper != nil && PI(&i.index).SeparatorGT(i.upper, !i.endKeyInclusive) { + i.exhaustedBounds = +1 + return nil } // Else fall through to skipForward. } @@ -1052,16 +1163,15 @@ func (i *singleLevelIterator) firstInternal() (*InternalKey, base.LazyValue) { // package. Note that Last only checks the lower bound. It is up to the caller // to ensure that key is less than the upper bound (e.g. via a call to // SeekLT(upper)) -func (i *singleLevelIterator) Last() (*InternalKey, base.LazyValue) { - if i.vState != nil { - return i.virtualLast() +func (i *singleLevelIterator[I, PI, D, PD]) Last() *base.InternalKV { + if i.readEnv.Virtual != nil { + return i.maybeVerifyKey(i.virtualLast()) } if i.upper != nil { panic("singleLevelIterator.Last() used despite upper bound") } i.positionedUsingLatestBounds = true - i.maybeFilteredKeysSingleLevel = false return i.lastInternal() } @@ -1069,28 +1179,27 @@ func (i *singleLevelIterator) Last() (*InternalKey, base.LazyValue) { // index file, or for positioning in the second-level index in a two-level // index file. For the latter, one cannot make any claims about absolute // positioning. -func (i *singleLevelIterator) lastInternal() (*InternalKey, base.LazyValue) { +func (i *singleLevelIterator[I, PI, D, PD]) lastInternal() *base.InternalKV { i.exhaustedBounds = 0 i.err = nil // clear cached iteration error // Seek optimization only applies until iterator is first positioned after SetBounds. 
i.boundsCmp = 0 - var ikey *InternalKey - if ikey, _ = i.index.Last(); ikey == nil { - i.data.invalidate() - return nil, base.LazyValue{} + if !PI(&i.index).Last() { + PD(&i.data).Invalidate() + return nil } - result := i.loadBlock(-1) + result := i.loadDataBlock(-1) if result == loadBlockFailed { - return nil, base.LazyValue{} + return nil } if result == loadBlockOK { - if ikey, val := i.data.Last(); ikey != nil { - if i.blockLower != nil && i.cmp(ikey.UserKey, i.blockLower) < 0 { + if ikv := PD(&i.data).Last(); ikv != nil { + if i.blockLower != nil && i.cmp(ikv.K.UserKey, i.blockLower) < 0 { i.exhaustedBounds = -1 - return nil, base.LazyValue{} + return nil } - return ikey, val + return ikv } // Else fall through to skipBackward. } else { @@ -1099,9 +1208,9 @@ func (i *singleLevelIterator) lastInternal() (*InternalKey, base.LazyValue) { // already exceeded. Note that the previous block starts with keys <= // key.UserKey since even though this is the current block's // separator, the same user key can span multiple blocks. - if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { + if i.lower != nil && PI(&i.index).SeparatorLT(i.lower) { i.exhaustedBounds = -1 - return nil, base.LazyValue{} + return nil } } @@ -1112,101 +1221,99 @@ func (i *singleLevelIterator) lastInternal() (*InternalKey, base.LazyValue) { // package. // Note: compactionIterator.Next mirrors the implementation of Iterator.Next // due to performance. Keep the two in sync. -func (i *singleLevelIterator) Next() (*InternalKey, base.LazyValue) { +func (i *singleLevelIterator[I, PI, D, PD]) Next() *base.InternalKV { if i.exhaustedBounds == +1 { panic("Next called even though exhausted upper bound") } i.exhaustedBounds = 0 - i.maybeFilteredKeysSingleLevel = false // Seek optimization only applies until iterator is first positioned after SetBounds. i.boundsCmp = 0 if i.err != nil { - return nil, base.LazyValue{} + // TODO(jackson): Can this case be turned into a panic? 
Once an error is + // encountered, the iterator must be re-seeked. + return nil } - if key, val := i.data.Next(); key != nil { + if kv := PD(&i.data).Next(); kv != nil { if i.blockUpper != nil { - cmp := i.cmp(key.UserKey, i.blockUpper) + cmp := i.cmp(kv.K.UserKey, i.blockUpper) if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { i.exhaustedBounds = +1 - return nil, base.LazyValue{} + return nil } } - return key, val + return kv } return i.skipForward() } // NextPrefix implements (base.InternalIterator).NextPrefix. -func (i *singleLevelIterator) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { +func (i *singleLevelIterator[I, PI, D, PD]) NextPrefix(succKey []byte) *base.InternalKV { if i.exhaustedBounds == +1 { panic("NextPrefix called even though exhausted upper bound") } i.exhaustedBounds = 0 - i.maybeFilteredKeysSingleLevel = false // Seek optimization only applies until iterator is first positioned after SetBounds. i.boundsCmp = 0 if i.err != nil { - return nil, base.LazyValue{} + // TODO(jackson): Can this case be turned into a panic? Once an error is + // encountered, the iterator must be re-seeked. + return nil } - if key, val := i.data.NextPrefix(succKey); key != nil { + if kv := PD(&i.data).NextPrefix(succKey); kv != nil { if i.blockUpper != nil { - cmp := i.cmp(key.UserKey, i.blockUpper) + cmp := i.cmp(kv.K.UserKey, i.blockUpper) if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { i.exhaustedBounds = +1 - return nil, base.LazyValue{} + return nil } } - return key, val + return kv } // Did not find prefix in the existing data block. This is the slow-path // where we effectively seek the iterator. - var ikey *InternalKey // The key is likely to be in the next data block, so try one step. - if ikey, _ = i.index.Next(); ikey == nil { + if !PI(&i.index).Next() { // The target key is greater than any key in the index block. // Invalidate the block iterator so that a subsequent call to Prev() // will return the last key in the table. 
- i.data.invalidate() - return nil, base.LazyValue{} + PD(&i.data).Invalidate() + return nil } - if i.cmp(succKey, ikey.UserKey) > 0 { + if PI(&i.index).SeparatorLT(succKey) { // Not in the next data block, so seek the index. - if ikey, _ = i.index.SeekGE(succKey, base.SeekGEFlagsNone); ikey == nil { + if !PI(&i.index).SeekGE(succKey) { // The target key is greater than any key in the index block. // Invalidate the block iterator so that a subsequent call to Prev() // will return the last key in the table. - i.data.invalidate() - return nil, base.LazyValue{} + PD(&i.data).Invalidate() + return nil } } - result := i.loadBlock(+1) + result := i.loadDataBlock(+1) if result == loadBlockFailed { - return nil, base.LazyValue{} + return nil } if result == loadBlockIrrelevant { - // Enforce the upper bound here since don't want to bother moving - // to the next block if upper bound is already exceeded. Note that - // the next block starts with keys >= ikey.UserKey since even - // though this is the block separator, the same user key can span - // multiple blocks. If upper is exclusive we use >= below, else we use - // >. - if i.upper != nil { - cmp := i.cmp(ikey.UserKey, i.upper) - if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { - i.exhaustedBounds = +1 - return nil, base.LazyValue{} - } + // Enforce the upper bound here since don't want to bother moving to the + // next block if upper bound is already exceeded. Note that the next + // block may start with keys >= index.Separator() since even though this + // is the block separator, the same user key can span multiple blocks. + // If upper is exclusive we pass orEqual=true below, else we require + // the separator to be strictly greater than upper. 
+ if i.upper != nil && PI(&i.index).SeparatorGT(i.upper, !i.endKeyInclusive) { + i.exhaustedBounds = +1 + return nil } - } else if key, val := i.data.SeekGE(succKey, base.SeekGEFlagsNone); key != nil { + } else if kv := PD(&i.data).SeekGE(succKey, base.SeekGEFlagsNone); kv != nil { if i.blockUpper != nil { - cmp := i.cmp(key.UserKey, i.blockUpper) + cmp := i.cmp(kv.K.UserKey, i.blockUpper) if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { i.exhaustedBounds = +1 - return nil, base.LazyValue{} + return nil } } - return i.maybeVerifyKey(key, val) + return i.maybeVerifyKey(kv) } return i.skipForward() @@ -1214,83 +1321,112 @@ func (i *singleLevelIterator) NextPrefix(succKey []byte) (*InternalKey, base.Laz // Prev implements internalIterator.Prev, as documented in the pebble // package. -func (i *singleLevelIterator) Prev() (*InternalKey, base.LazyValue) { +func (i *singleLevelIterator[I, PI, D, PD]) Prev() *base.InternalKV { if i.exhaustedBounds == -1 { panic("Prev called even though exhausted lower bound") } i.exhaustedBounds = 0 - i.maybeFilteredKeysSingleLevel = false // Seek optimization only applies until iterator is first positioned after SetBounds. 
i.boundsCmp = 0 if i.err != nil { - return nil, base.LazyValue{} + return nil } - if key, val := i.data.Prev(); key != nil { - if i.blockLower != nil && i.cmp(key.UserKey, i.blockLower) < 0 { + if kv := PD(&i.data).Prev(); kv != nil { + if i.blockLower != nil && i.cmp(kv.K.UserKey, i.blockLower) < 0 { i.exhaustedBounds = -1 - return nil, base.LazyValue{} + return nil } - return key, val + return kv } return i.skipBackward() } -func (i *singleLevelIterator) skipForward() (*InternalKey, base.LazyValue) { +func (i *singleLevelIterator[I, PI, D, PD]) skipForward() *base.InternalKV { for { - var key *InternalKey - if key, _ = i.index.Next(); key == nil { - i.data.invalidate() + if !PI(&i.index).Next() { + PD(&i.data).Invalidate() break } - result := i.loadBlock(+1) + result := i.loadDataBlock(+1) if result != loadBlockOK { if i.err != nil { break } if result == loadBlockFailed { // We checked that i.index was at a valid entry, so - // loadBlockFailed could not have happened due to to i.index + // loadBlockFailed could not have happened due to i.index // being exhausted, and must be due to an error. - panic("loadBlock should not have failed with no error") + panic("loadDataBlock should not have failed with no error") } - // result == loadBlockIrrelevant. Enforce the upper bound here - // since don't want to bother moving to the next block if upper - // bound is already exceeded. Note that the next block starts with - // keys >= key.UserKey since even though this is the block - // separator, the same user key can span multiple blocks. If upper - // is exclusive we use >= below, else we use >. - if i.upper != nil { - cmp := i.cmp(key.UserKey, i.upper) - if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { - i.exhaustedBounds = +1 - return nil, base.LazyValue{} - } + // result == loadBlockIrrelevant. Enforce the upper bound here since + // don't want to bother moving to the next block if upper bound is + // already exceeded. 
Note that the next block may start with keys >= + // index.Separator() since even though this is the block separator, + // the same user key can span multiple blocks. If upper is exclusive + // we pass orEqual=true below, else we require the separator to be + // strictly greater than upper. + if i.upper != nil && PI(&i.index).SeparatorGT(i.upper, !i.endKeyInclusive) { + i.exhaustedBounds = +1 + return nil } continue } - if key, val := i.data.First(); key != nil { + var kv *base.InternalKV + // It is possible that skipBackward went too far and the virtual table lower + // bound is after the first key in the block we are about to load, in which + // case we must use SeekGE. + // + // An example of how this can happen: + // + // Data block 1 - contains keys a@1, c@1 + // Data block 2 - contains keys e@1, g@1 + // Data block 3 - contains keys i@2, k@2 + // + // The virtual table lower bound is f. We have a range key masking filter + // that filters keys with @1 suffix. We are positioned inside block 3 then + // we Prev(). Block 2 is entirely filtered out, which makes us move to + // block 1. Now the range key masking filter gets an update (via + // SpanChanged) and it no longer filters out any keys. At this point if a + // Next happens, we will load block 2 but it would not be legal to return + // "e@1" which is outside the virtual bounds. + // + // The core of the problem is that skipBackward doesn't know it can stop + // at block 2, because it doesn't know what keys are at the start of that + // block. This is why we don't have this problem in the opposite + // direction: skipForward will never go beyond the last relevant block + // because it looks at the separator key which is an upper bound for the + // block. + // + // Note that this is only a problem with virtual tables; we make no + // guarantees wrt an iterator lower bound when we iterate forward. But we + // must never return keys that are not inside the virtual table. 
+ if i.readEnv.Virtual != nil && i.blockLower != nil { + kv = PD(&i.data).SeekGE(i.lower, base.SeekGEFlagsNone) + } else { + kv = PD(&i.data).First() + } + if kv != nil { if i.blockUpper != nil { - cmp := i.cmp(key.UserKey, i.blockUpper) + cmp := i.cmp(kv.K.UserKey, i.blockUpper) if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { i.exhaustedBounds = +1 - return nil, base.LazyValue{} + return nil } } - return i.maybeVerifyKey(key, val) + return i.maybeVerifyKey(kv) } } - return nil, base.LazyValue{} + return nil } -func (i *singleLevelIterator) skipBackward() (*InternalKey, base.LazyValue) { +func (i *singleLevelIterator[I, PI, D, PD]) skipBackward() *base.InternalKV { for { - var key *InternalKey - if key, _ = i.index.Prev(); key == nil { - i.data.invalidate() + if !PI(&i.index).Prev() { + PD(&i.data).Invalidate() break } - result := i.loadBlock(-1) + result := i.loadDataBlock(-1) if result != loadBlockOK { if i.err != nil { break @@ -1299,51 +1435,53 @@ func (i *singleLevelIterator) skipBackward() (*InternalKey, base.LazyValue) { // We checked that i.index was at a valid entry, so // loadBlockFailed could not have happened due to to i.index // being exhausted, and must be due to an error. - panic("loadBlock should not have failed with no error") + panic("loadDataBlock should not have failed with no error") } // result == loadBlockIrrelevant. Enforce the lower bound here // since don't want to bother moving to the previous block if lower // bound is already exceeded. Note that the previous block starts with // keys <= key.UserKey since even though this is the current block's // separator, the same user key can span multiple blocks. 
- if i.lower != nil && i.cmp(key.UserKey, i.lower) < 0 { + if i.lower != nil && PI(&i.index).SeparatorLT(i.lower) { i.exhaustedBounds = -1 - return nil, base.LazyValue{} + return nil } continue } - key, val := i.data.Last() - if key == nil { - return nil, base.LazyValue{} + kv := PD(&i.data).Last() + if kv == nil { + // The block iter could have hid some obsolete points, so it isn't + // safe to assume that there are no keys if we keep skipping backwards. + // Check the previous block, but check the lower bound before doing + // that. + if i.lower != nil && PI(&i.index).SeparatorLT(i.lower) { + i.exhaustedBounds = -1 + return nil + } + continue } - if i.blockLower != nil && i.cmp(key.UserKey, i.blockLower) < 0 { + if i.blockLower != nil && i.cmp(kv.K.UserKey, i.blockLower) < 0 { i.exhaustedBounds = -1 - return nil, base.LazyValue{} + return nil } - return i.maybeVerifyKey(key, val) + return i.maybeVerifyKey(kv) } - return nil, base.LazyValue{} + return nil } // Error implements internalIterator.Error, as documented in the pebble // package. -func (i *singleLevelIterator) Error() error { - if err := i.data.Error(); err != nil { +func (i *singleLevelIterator[I, PI, D, PD]) Error() error { + if err := PD(&i.data).Error(); err != nil { return err } return i.err } -// MaybeFilteredKeys may be called when an iterator is exhausted to indicate -// whether or not the last positioning method may have skipped any keys due to -// block-property filters. -func (i *singleLevelIterator) MaybeFilteredKeys() bool { - return i.maybeFilteredKeysSingleLevel -} - -// SetCloseHook sets a function that will be called when the iterator is -// closed. -func (i *singleLevelIterator) SetCloseHook(fn func(i Iterator) error) { +// SetCloseHook sets a function that will be called when the iterator is closed. +// This is used by the file cache to release the reference count on the open +// sstable.Reader when the iterator is closed. 
+func (i *singleLevelIterator[I, PI, D, PD]) SetCloseHook(fn func()) { i.closeHook = fn } @@ -1356,13 +1494,31 @@ func firstError(err0, err1 error) error { // Close implements internalIterator.Close, as documented in the pebble // package. -func (i *singleLevelIterator) Close() error { - var err error +func (i *singleLevelIterator[I, PI, D, PD]) Close() error { + err := i.closeInternal() + pool := i.pool + i.resetForReuse() + if pool != nil { + pool.Put(i) + } + return err +} + +func (i *singleLevelIterator[I, PI, D, PD]) closeInternal() error { + if invariants.Enabled && i.inPool { + panic("Close called on interator in pool") + } + if i.closeHook != nil { - err = firstError(err, i.closeHook(i)) + i.closeHook() + } + var err error + err = firstError(err, PD(&i.data).Close()) + err = firstError(err, PI(&i.index).Close()) + if i.indexFilterRH != nil { + err = firstError(err, i.indexFilterRH.Close()) + i.indexFilterRH = nil } - err = firstError(err, i.data.Close()) - err = firstError(err, i.index.Close()) if i.dataRH != nil { err = firstError(err, i.dataRH.Close()) i.dataRH = nil @@ -1371,21 +1527,22 @@ func (i *singleLevelIterator) Close() error { if i.bpfs != nil { releaseBlockPropertiesFilterer(i.bpfs) } - if i.vbReader != nil { - i.vbReader.close() - } + i.internalValueConstructor.vbReader.Close() if i.vbRH != nil { err = firstError(err, i.vbRH.Close()) i.vbRH = nil } - *i = i.resetForReuse() - singleLevelIterPool.Put(i) return err } -func (i *singleLevelIterator) String() string { - if i.vState != nil { - return i.vState.fileNum.String() +func (i *singleLevelIterator[I, PI, D, PD]) String() string { + if i.readEnv.Virtual != nil { + return i.readEnv.Virtual.FileNum.String() } - return i.reader.fileNum.String() + return i.reader.blockReader.FileNum().String() +} + +// DebugTree is part of the InternalIterator interface. 
+func (i *singleLevelIterator[I, PI, D, PD]) DebugTree(tp treeprinter.Node) { + tp.Childf("%T(%p) fileNum=%s", i, i, i.String()) } diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/reader_iter_two_lvl.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/reader_iter_two_lvl.go new file mode 100644 index 0000000..0e7292f --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/reader_iter_two_lvl.go @@ -0,0 +1,1026 @@ +// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package sstable + +import ( + "bytes" + "context" + "fmt" + "sync" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" + "github.com/cockroachdb/pebble/v2/objstorage" + "github.com/cockroachdb/pebble/v2/sstable/valblk" +) + +type twoLevelIterator[I any, PI indexBlockIterator[I], D any, PD dataBlockIterator[D]] struct { + secondLevel singleLevelIterator[I, PI, D, PD] + topLevelIndex I + // pool is the pool from which the iterator was allocated and to which the + // iterator should be returned on Close. Because the iterator is + // parameterized by the type of the data block iterator, pools must be + // specific to the type of the data block iterator. + pool *sync.Pool + + // useFilterBlock controls whether we consult the bloom filter in the + // twoLevelIterator code. Note that secondLevel.useFilterBlock is always + // false - any filtering happens at the top level. + useFilterBlock bool + lastBloomFilterMatched bool +} + +var _ Iterator = (*twoLevelIteratorRowBlocks)(nil) + +// loadSecondLevelIndexBlock loads the index block at the current top level +// index position and leaves i.index unpositioned. 
If unsuccessful, it gets +// i.secondLevel.err to any error encountered, which may be nil if we have +// simply exhausted the entire table. This is used for two level indexes. +func (i *twoLevelIterator[I, PI, D, PD]) loadSecondLevelIndexBlock(dir int8) loadBlockResult { + // Ensure the index data block iterators are invalidated even if loading of + // the index fails. + PD(&i.secondLevel.data).Invalidate() + PI(&i.secondLevel.index).Invalidate() + if !PI(&i.topLevelIndex).Valid() { + return loadBlockFailed + } + bhp, err := PI(&i.topLevelIndex).BlockHandleWithProperties() + if err != nil { + i.secondLevel.err = base.CorruptionErrorf("pebble/table: corrupt top level index entry (%v)", err) + return loadBlockFailed + } + if i.secondLevel.bpfs != nil { + intersects, err := i.secondLevel.bpfs.intersects(bhp.Props) + if err != nil { + i.secondLevel.err = errCorruptIndexEntry(err) + return loadBlockFailed + } + if intersects == blockMaybeExcluded { + intersects = i.resolveMaybeExcluded(dir) + } + if intersects == blockExcluded { + return loadBlockIrrelevant + } + // blockIntersects + } + indexBlock, err := i.secondLevel.reader.readIndexBlock(i.secondLevel.ctx, i.secondLevel.readEnv.Block, i.secondLevel.indexFilterRH, bhp.Handle) + if err != nil { + i.secondLevel.err = err + return loadBlockFailed + } + err = PI(&i.secondLevel.index).InitHandle(i.secondLevel.reader.Comparer, indexBlock, i.secondLevel.transforms) + if err != nil { + PI(&i.secondLevel.index).Invalidate() + i.secondLevel.err = err + return loadBlockFailed + } + return loadBlockOK +} + +// resolveMaybeExcluded is invoked when the block-property filterer has found +// that an index block is excluded according to its properties but only if its +// bounds fall within the filter's current bounds. This function consults the +// appropriate bound, depending on the iteration direction, and returns either +// `blockIntersects` or `blockExcluded`. 
+func (i *twoLevelIterator[I, PI, D, PD]) resolveMaybeExcluded(dir int8) intersectsResult { + // This iterator is configured with a bound-limited block property filter. + // The bpf determined this entire index block could be excluded from + // iteration based on the property encoded in the block handle. However, we + // still need to determine if the index block is wholly contained within the + // filter's key bounds. + // + // External guarantees ensure all its data blocks' keys are ≥ the filter's + // lower bound during forward iteration, and that all its data blocks' keys + // are < the filter's upper bound during backward iteration. We only need to + // determine if the opposite bound is also met. + // + // The index separator in topLevelIndex.Separator() provides an inclusive + // upper-bound for the index block's keys, guaranteeing that all its keys + // are ≤ topLevelIndex.Separator(). For forward iteration, this is all we + // need. + if dir > 0 { + // Forward iteration. + if i.secondLevel.bpfs.boundLimitedFilter.KeyIsWithinUpperBound(PI(&i.topLevelIndex).Separator()) { + return blockExcluded + } + return blockIntersects + } + + // Reverse iteration. + // + // Because we're iterating in the reverse direction, we don't yet have + // enough context available to determine if the block is wholly contained + // within its bounds. This case arises only during backward iteration, + // because of the way the index is structured. + // + // Consider a bound-limited bpf limited to the bounds [b,d), loading the + // block with separator `c`. During reverse iteration, the guarantee that + // all the block's keys are < `d` is externally provided, but no guarantee + // is made on the bpf's lower bound. The separator `c` only provides an + // inclusive upper bound on the block's keys, indicating that the + // corresponding block handle points to a block containing only keys ≤ `c`. 
+ // + // To establish a lower bound, we step the top-level index backwards to read + // the previous block's separator, which provides an inclusive lower bound + // on the original index block's keys. Afterwards, we step forward to + // restore our top-level index position. + if !PI(&i.topLevelIndex).Prev() { + // The original block points to the first index block of this table. If + // we knew the lower bound for the entire table, it could provide a + // lower bound, but the code refactoring necessary to read it doesn't + // seem worth the payoff. We fall through to loading the block. + } else if i.secondLevel.bpfs.boundLimitedFilter.KeyIsWithinLowerBound(PI(&i.topLevelIndex).Separator()) { + // The lower-bound on the original index block falls within the filter's + // bounds, and we can skip the block (after restoring our current + // top-level index position). + _ = PI(&i.topLevelIndex).Next() + return blockExcluded + } + _ = PI(&i.topLevelIndex).Next() + return blockIntersects +} + +// newColumnBlockTwoLevelIterator reads the top-level index block and creates and +// initializes a two-level iterator over an sstable with column-oriented data +// blocks. +// +// Note that lower, upper are iterator bounds and are separate from virtual +// sstable bounds. If the virtualState passed in is not nil, then virtual +// sstable bounds will be enforced. +func newColumnBlockTwoLevelIterator( + ctx context.Context, r *Reader, opts IterOptions, +) (*twoLevelIteratorColumnBlocks, error) { + if r.err != nil { + return nil, r.err + } + if !r.tableFormat.BlockColumnar() { + panic(errors.AssertionFailedf("table format %d should not use columnar block format", r.tableFormat)) + } + i := twoLevelIterColumnBlockPool.Get().(*twoLevelIteratorColumnBlocks) + i.secondLevel.init(ctx, r, opts) + // Only check the bloom filter at the top level. 
+ i.useFilterBlock = i.secondLevel.useFilterBlock + i.secondLevel.useFilterBlock = false + + if r.Attributes.Has(AttributeValueBlocks) { + // NB: we cannot avoid this ~248 byte allocation, since valueBlockReader + // can outlive the singleLevelIterator due to be being embedded in a + // LazyValue. This consumes ~2% in microbenchmark CPU profiles, but we + // should only optimize this if it shows up as significant in end-to-end + // CockroachDB benchmarks, since it is tricky to do so. One possibility + // is that if many sstable iterators only get positioned at latest + // versions of keys, and therefore never expose a LazyValue that is + // separated to their callers, they can put this valueBlockReader into a + // sync.Pool. + i.secondLevel.internalValueConstructor.vbReader = valblk.MakeReader(&i.secondLevel, opts.ReaderProvider, r.valueBIH, opts.Env.Block.Stats) + i.secondLevel.vbRH = r.blockReader.UsePreallocatedReadHandle( + objstorage.NoReadBefore, &i.secondLevel.vbRHPrealloc) + } + i.secondLevel.data.InitOnce(r.keySchema, r.Comparer, &i.secondLevel.internalValueConstructor) + topLevelIndexH, err := r.readTopLevelIndexBlock(ctx, i.secondLevel.readEnv.Block, i.secondLevel.indexFilterRH) + if err == nil { + err = i.topLevelIndex.InitHandle(r.Comparer, topLevelIndexH, opts.Transforms) + } + if err != nil { + _ = i.Close() + return nil, err + } + return i, nil +} + +// newRowBlockTwoLevelIterator reads the top-level index block and creates and +// initializes a two-level iterator over an sstable with row-oriented data +// blocks. +// +// Note that lower, upper are iterator bounds and are separate from virtual +// sstable bounds. If the virtualState passed in is not nil, then virtual +// sstable bounds will be enforced. 
+func newRowBlockTwoLevelIterator( + ctx context.Context, r *Reader, opts IterOptions, +) (*twoLevelIteratorRowBlocks, error) { + if r.err != nil { + return nil, r.err + } + if r.tableFormat.BlockColumnar() { + panic(errors.AssertionFailedf("table format %s uses block columnar format", r.tableFormat)) + } + i := twoLevelIterRowBlockPool.Get().(*twoLevelIteratorRowBlocks) + i.secondLevel.init(ctx, r, opts) + // Only check the bloom filter at the top level. + i.useFilterBlock = i.secondLevel.useFilterBlock + i.secondLevel.useFilterBlock = false + + if r.tableFormat >= TableFormatPebblev3 { + if r.Attributes.Has(AttributeValueBlocks) { + // NB: we cannot avoid this ~248 byte allocation, since valueBlockReader + // can outlive the singleLevelIterator due to be being embedded in a + // LazyValue. This consumes ~2% in microbenchmark CPU profiles, but we + // should only optimize this if it shows up as significant in end-to-end + // CockroachDB benchmarks, since it is tricky to do so. One possibility + // is that if many sstable iterators only get positioned at latest + // versions of keys, and therefore never expose a LazyValue that is + // separated to their callers, they can put this valueBlockReader into a + // sync.Pool. + i.secondLevel.internalValueConstructor.vbReader = valblk.MakeReader(&i.secondLevel, opts.ReaderProvider, r.valueBIH, opts.Env.Block.Stats) + // We can set the GetLazyValuer directly to the vbReader because + // rowblk sstables never contain blob value handles. 
+ i.secondLevel.data.SetGetLazyValuer(&i.secondLevel.internalValueConstructor.vbReader) + i.secondLevel.vbRH = r.blockReader.UsePreallocatedReadHandle( + objstorage.NoReadBefore, &i.secondLevel.vbRHPrealloc) + } + i.secondLevel.data.SetHasValuePrefix(true) + } + + topLevelIndexH, err := r.readTopLevelIndexBlock(ctx, i.secondLevel.readEnv.Block, i.secondLevel.indexFilterRH) + if err == nil { + err = i.topLevelIndex.InitHandle(r.Comparer, topLevelIndexH, opts.Transforms) + } + if err != nil { + _ = i.Close() + return nil, err + } + return i, nil +} + +func (i *twoLevelIterator[I, PI, D, PD]) String() string { + return i.secondLevel.String() +} + +// DebugTree is part of the InternalIterator interface. +func (i *twoLevelIterator[I, PI, D, PD]) DebugTree(tp treeprinter.Node) { + tp.Childf("%T(%p) fileNum=%s", i, i, i.String()) +} + +// SeekGE implements internalIterator.SeekGE, as documented in the pebble +// package. Note that SeekGE only checks the upper bound. It is up to the +// caller to ensure that key is greater than or equal to the lower bound. +func (i *twoLevelIterator[I, PI, D, PD]) SeekGE( + key []byte, flags base.SeekGEFlags, +) *base.InternalKV { + if i.secondLevel.readEnv.Virtual != nil { + // Callers of SeekGE don't know about virtual sstable bounds, so we may + // have to internally restrict the bounds. + // + // TODO(bananabrick): We can optimize away this check for the level iter + // if necessary. + if i.secondLevel.cmp(key, i.secondLevel.lower) < 0 { + key = i.secondLevel.lower + } + } + + err := i.secondLevel.err + i.secondLevel.err = nil // clear cached iteration error + + // The twoLevelIterator could be already exhausted. Utilize that when + // trySeekUsingNext is true. See the comment about data-exhausted, PGDE, and + // bounds-exhausted near the top of the file. 
+ if flags.TrySeekUsingNext() && + (i.secondLevel.exhaustedBounds == +1 || (PD(&i.secondLevel.data).IsDataInvalidated() && PI(&i.secondLevel.index).IsDataInvalidated())) && + err == nil { + // Already exhausted, so return nil. + return nil + } + + // SeekGE performs various step-instead-of-seeking optimizations: eg enabled + // by trySeekUsingNext, or by monotonically increasing bounds (i.boundsCmp). + + // We fall into the slow path if i.index.IsDataInvalidated() even if the + // top-level iterator is already positioned correctly and all other + // conditions are met. An alternative structure could reuse topLevelIndex's + // current position and reload the index block to which it points. Arguably, + // an index block load is expensive and the index block may still be earlier + // than the index block containing the sought key, resulting in a wasteful + // block load. + + var dontSeekWithinSingleLevelIter bool + if PI(&i.topLevelIndex).IsDataInvalidated() || !PI(&i.topLevelIndex).Valid() || PI(&i.secondLevel.index).IsDataInvalidated() || err != nil || + (i.secondLevel.boundsCmp <= 0 && !flags.TrySeekUsingNext()) || PI(&i.topLevelIndex).SeparatorLT(key) { + // Slow-path: need to position the topLevelIndex. + + // The previous exhausted state of singleLevelIterator is no longer + // relevant, since we may be moving to a different index block. + i.secondLevel.exhaustedBounds = 0 + flags = flags.DisableTrySeekUsingNext() + if !PI(&i.topLevelIndex).SeekGE(key) { + PD(&i.secondLevel.data).Invalidate() + PI(&i.secondLevel.index).Invalidate() + return nil + } + + result := i.loadSecondLevelIndexBlock(+1) + if result == loadBlockFailed { + i.secondLevel.boundsCmp = 0 + return nil + } + if result == loadBlockIrrelevant { + // Enforce the upper bound here since don't want to bother moving to + // the next entry in the top level index if upper bound is already + // exceeded. 
Note that the next entry starts with keys >= + // topLevelIndex.Separator() since even though this is the block + // separator, the same user key can span multiple index blocks. If + // upper is exclusive we pass orEqual=true below, else we require + // the separator to be strictly greater than upper. + if i.secondLevel.upper != nil && PI(&i.topLevelIndex).SeparatorGT( + i.secondLevel.upper, !i.secondLevel.endKeyInclusive) { + i.secondLevel.exhaustedBounds = +1 + } + // Fall through to skipForward. + dontSeekWithinSingleLevelIter = true + // Clear boundsCmp. + // + // In the typical cases where dontSeekWithinSingleLevelIter=false, + // the singleLevelIterator.SeekGE call will clear boundsCmp. + // However, in this case where dontSeekWithinSingleLevelIter=true, + // we never seek on the single-level iterator. This call will fall + // through to skipForward, which may improperly leave boundsCmp=+1 + // unless we clear it here. + i.secondLevel.boundsCmp = 0 + } + } else { + // INVARIANT: err == nil. + // + // Else fast-path: There are two possible cases, from + // (i.boundsCmp > 0 || flags.TrySeekUsingNext()): + // + // 1) The bounds have moved forward (i.boundsCmp > 0) and this SeekGE is + // respecting the lower bound (guaranteed by Iterator). We know that the + // iterator must already be positioned within or just outside the previous + // bounds. Therefore, the topLevelIndex iter cannot be positioned at an + // entry ahead of the seek position (though it can be positioned behind). + // The !i.cmp(key, i.topLevelIndex.Separator) > 0 confirms that it is + // not behind. Since it is not ahead and not behind it must be at the + // right position. + // + // 2) This SeekGE will land on a key that is greater than the key we are + // currently at (guaranteed by trySeekUsingNext), but since i.cmp(key, + // i.topLevelIndex.Separator()) <= 0, we are at the correct lower level + // index block. No need to reset the state of singleLevelIterator. 
+		//
+		// Note that cases 1 and 2 never overlap, and one of them must be true.
+		// This invariant checking is important enough that we do not gate it
+		// behind invariants.Enabled.
+		if i.secondLevel.boundsCmp > 0 == flags.TrySeekUsingNext() {
+			panic(fmt.Sprintf("inconsistency in optimization case 1 %t and case 2 %t",
+				i.secondLevel.boundsCmp > 0, flags.TrySeekUsingNext()))
+		}
+
+		if !flags.TrySeekUsingNext() {
+			// Case 1. Bounds have changed so the previous exhausted bounds state is
+			// irrelevant.
+			// WARNING-data-exhausted: this is safe to do only because the monotonic
+			// bounds optimizations only work when !data-exhausted. If they also
+			// worked with data-exhausted, we have made it unclear whether
+			// data-exhausted is actually true. See the comment at the top of the
+			// file.
+			i.secondLevel.exhaustedBounds = 0
+		}
+		// Else flags.TrySeekUsingNext(). The i.exhaustedBounds is important to
+		// preserve for singleLevelIterator, and twoLevelIterator.skipForward. See
+		// bug https://github.com/cockroachdb/pebble/issues/2036.
+	}
+
+	if !dontSeekWithinSingleLevelIter {
+		// Note that while trySeekUsingNext could be false here, singleLevelIterator
+		// could do its own boundsCmp-based optimization to seek using next.
+		if ikv := i.secondLevel.SeekGE(key, flags); ikv != nil {
+			return ikv
+		}
+	}
+	return i.skipForward()
+}
+
+// SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
+// pebble package. Note that SeekPrefixGE only checks the upper bound. It is up
+// to the caller to ensure that key is greater than or equal to the lower bound.
+func (i *twoLevelIterator[I, PI, D, PD]) SeekPrefixGE(
+	prefix, key []byte, flags base.SeekGEFlags,
+) *base.InternalKV {
+	if i.secondLevel.readEnv.Virtual != nil {
+		// Callers of SeekGE don't know about virtual sstable bounds, so we may
+		// have to internally restrict the bounds.
+		//
+		// TODO(bananabrick): We can optimize away this check for the level iter
+		// if necessary.
+ if i.secondLevel.cmp(key, i.secondLevel.lower) < 0 { + key = i.secondLevel.lower + } + } + + // NOTE: prefix is only used for bloom filter checking and not later work in + // this method. Hence, we can use the existing iterator position if the last + // SeekPrefixGE did not fail bloom filter matching. + + err := i.secondLevel.err + i.secondLevel.err = nil // clear cached iteration error + + // The twoLevelIterator could be already exhausted. Utilize that when + // trySeekUsingNext is true. See the comment about data-exhausted, PGDE, and + // bounds-exhausted near the top of the file. + filterUsedAndDidNotMatch := i.useFilterBlock && !i.lastBloomFilterMatched + if flags.TrySeekUsingNext() && !filterUsedAndDidNotMatch && + (i.secondLevel.exhaustedBounds == +1 || (PD(&i.secondLevel.data).IsDataInvalidated() && PI(&i.secondLevel.index).IsDataInvalidated())) && + err == nil { + // Already exhausted, so return nil. + return nil + } + + // Check prefix bloom filter. + if i.useFilterBlock { + if !i.lastBloomFilterMatched { + // Iterator is not positioned based on last seek. + flags = flags.DisableTrySeekUsingNext() + } + i.lastBloomFilterMatched = false + var mayContain bool + mayContain, i.secondLevel.err = i.secondLevel.bloomFilterMayContain(prefix) + if i.secondLevel.err != nil || !mayContain { + // In the i.secondLevel.err == nil case, this invalidation may not be necessary for + // correctness, and may be a place to optimize later by reusing the + // already loaded block. It was necessary in earlier versions of the code + // since the caller was allowed to call Next when SeekPrefixGE returned + // nil. This is no longer allowed. + PD(&i.secondLevel.data).Invalidate() + return nil + } + i.lastBloomFilterMatched = true + } + + // Bloom filter matches. + + // SeekPrefixGE performs various step-instead-of-seeking optimizations: eg + // enabled by trySeekUsingNext, or by monotonically increasing bounds + // (i.boundsCmp). 
+ + // We fall into the slow path if i.index.IsDataInvalidated() even if the + // top-level iterator is already positioned correctly and all other + // conditions are met. An alternative structure could reuse topLevelIndex's + // current position and reload the index block to which it points. Arguably, + // an index block load is expensive and the index block may still be earlier + // than the index block containing the sought key, resulting in a wasteful + // block load. + + var dontSeekWithinSingleLevelIter bool + if PI(&i.topLevelIndex).IsDataInvalidated() || !PI(&i.topLevelIndex).Valid() || PI(&i.secondLevel.index).IsDataInvalidated() || err != nil || + (i.secondLevel.boundsCmp <= 0 && !flags.TrySeekUsingNext()) || PI(&i.topLevelIndex).SeparatorLT(key) { + // Slow-path: need to position the topLevelIndex. + + // The previous exhausted state of singleLevelIterator is no longer + // relevant, since we may be moving to a different index block. + i.secondLevel.exhaustedBounds = 0 + flags = flags.DisableTrySeekUsingNext() + if !PI(&i.topLevelIndex).SeekGE(key) { + PD(&i.secondLevel.data).Invalidate() + PI(&i.secondLevel.index).Invalidate() + return nil + } + + result := i.loadSecondLevelIndexBlock(+1) + if result == loadBlockFailed { + i.secondLevel.boundsCmp = 0 + return nil + } + if result == loadBlockIrrelevant { + // Enforce the upper bound here since don't want to bother moving to + // the next entry in the top level index if upper bound is already + // exceeded. Note that the next entry starts with keys >= + // topLevelIndex.Separator() since even though this is the block + // separator, the same user key can span multiple index blocks. If + // upper is exclusive we pass orEqual=true below, else we require + // the separator to be strictly greater than upper. + if i.secondLevel.upper != nil && PI(&i.topLevelIndex).SeparatorGT( + i.secondLevel.upper, !i.secondLevel.endKeyInclusive) { + i.secondLevel.exhaustedBounds = +1 + } + // Fall through to skipForward. 
+ dontSeekWithinSingleLevelIter = true + // Clear boundsCmp. + // + // In the typical cases where dontSeekWithinSingleLevelIter=false, + // the singleLevelIterator.SeekPrefixGE call will clear boundsCmp. + // However, in this case where dontSeekWithinSingleLevelIter=true, + // we never seek on the single-level iterator. This call will fall + // through to skipForward, which may improperly leave boundsCmp=+1 + // unless we clear it here. + i.secondLevel.boundsCmp = 0 + } + } else { + // INVARIANT: err == nil. + // + // Else fast-path: There are two possible cases, from + // (i.boundsCmp > 0 || flags.TrySeekUsingNext()): + // + // 1) The bounds have moved forward (i.boundsCmp > 0) and this + // SeekPrefixGE is respecting the lower bound (guaranteed by Iterator). We + // know that the iterator must already be positioned within or just + // outside the previous bounds. Therefore, the topLevelIndex iter cannot + // be positioned at an entry ahead of the seek position (though it can be + // positioned behind). The !i.cmp(key, i.topLevelIndex.Separator()) > 0 + // confirms that it is not behind. Since it is not ahead and not behind it + // must be at the right position. + // + // 2) This SeekPrefixGE will land on a key that is greater than the key we + // are currently at (guaranteed by trySeekUsingNext), but since i.cmp(key, + // i.topLevelIndex.Separator()) <= 0, we are at the correct lower level + // index block. No need to reset the state of singleLevelIterator. + // + // Note that cases 1 and 2 never overlap, and one of them must be true. + // This invariant checking is important enough that we do not gate it + // behind invariants.Enabled. + if i.secondLevel.boundsCmp > 0 == flags.TrySeekUsingNext() { + panic(fmt.Sprintf("inconsistency in optimization case 1 %t and case 2 %t", + i.secondLevel.boundsCmp > 0, flags.TrySeekUsingNext())) + } + + if !flags.TrySeekUsingNext() { + // Case 1. Bounds have changed so the previous exhausted bounds state is + // irrelevant. 
+			// WARNING-data-exhausted: this is safe to do only because the monotonic
+			// bounds optimizations only work when !data-exhausted. If they also
+			// worked with data-exhausted, we have made it unclear whether
+			// data-exhausted is actually true. See the comment at the top of the
+			// file.
+			i.secondLevel.exhaustedBounds = 0
+		}
+		// Else flags.TrySeekUsingNext(). The i.exhaustedBounds is important to
+		// preserve for singleLevelIterator, and twoLevelIterator.skipForward. See
+		// bug https://github.com/cockroachdb/pebble/issues/2036.
+	}
+
+	if !dontSeekWithinSingleLevelIter {
+		if ikv := i.secondLevel.seekPrefixGE(prefix, key, flags); ikv != nil {
+			return ikv
+		}
+	}
+	// NB: skipForward checks whether exhaustedBounds is already +1.
+	return i.skipForward()
+}
+
+// virtualLast should only be called if i.secondLevel.readEnv.Virtual != nil.
+func (i *twoLevelIterator[I, PI, D, PD]) virtualLast() *base.InternalKV {
+	if i.secondLevel.readEnv.Virtual == nil {
+		panic("pebble: invalid call to virtualLast")
+	}
+	if !i.secondLevel.endKeyInclusive {
+		// Trivial case.
+		return i.SeekLT(i.secondLevel.upper, base.SeekLTFlagsNone)
+	}
+	return i.virtualLastSeekLE()
+}
+
+// virtualLastSeekLE implements a SeekLE() that can be used as part
+// of reverse-iteration calls such as a Last() on a virtual sstable. Does a
+// SeekLE on the upper bound of the file/iterator.
+func (i *twoLevelIterator[I, PI, D, PD]) virtualLastSeekLE() *base.InternalKV {
+	// Callers of SeekLE don't know about virtual sstable bounds, so we may
+	// have to internally restrict the bounds.
+	//
+	// TODO(bananabrick): We can optimize this check away for the level iter
+	// if necessary.
+	if !i.secondLevel.endKeyInclusive {
+		panic("unexpected virtualLastSeekLE with exclusive upper bounds")
+	}
+	key := i.secondLevel.upper
+	// Need to position the topLevelIndex.
+ // + // The previous exhausted state of singleLevelIterator is no longer + // relevant, since we may be moving to a different index block. + i.secondLevel.exhaustedBounds = 0 + // Seek optimization only applies until iterator is first positioned with a + // SeekGE or SeekLT after SetBounds. + i.secondLevel.boundsCmp = 0 + topLevelOk := PI(&i.topLevelIndex).SeekGE(key) + // We can have multiple internal keys with the same user key as the seek + // key. In that case, we want the last (greatest) internal key. + for topLevelOk && bytes.Equal(PI(&i.topLevelIndex).Separator(), key) { + topLevelOk = PI(&i.topLevelIndex).Next() + } + if !topLevelOk { + return i.skipBackward() + } + result := i.loadSecondLevelIndexBlock(-1) + if result == loadBlockFailed { + i.secondLevel.boundsCmp = 0 + return nil + } + if result == loadBlockIrrelevant { + // Load the previous block. + return i.skipBackward() + } + if ikv := i.secondLevel.virtualLastSeekLE(); ikv != nil { + return ikv + } + return i.skipBackward() +} + +// SeekLT implements internalIterator.SeekLT, as documented in the pebble +// package. Note that SeekLT only checks the lower bound. It is up to the +// caller to ensure that key is less than the upper bound. +func (i *twoLevelIterator[I, PI, D, PD]) SeekLT( + key []byte, flags base.SeekLTFlags, +) *base.InternalKV { + if i.secondLevel.readEnv.Virtual != nil { + // Might have to fix upper bound since virtual sstable bounds are not + // known to callers of SeekLT. + // + // TODO(bananabrick): We can optimize away this check for the level iter + // if necessary. + cmp := i.secondLevel.cmp(key, i.secondLevel.upper) + // key == i.secondLevel.upper is fine. We'll do the right thing and return the + // first internal key with user key < key. + if cmp > 0 { + return i.virtualLast() + } + } + + i.secondLevel.exhaustedBounds = 0 + i.secondLevel.err = nil // clear cached iteration error + // Seek optimization only applies until iterator is first positioned after SetBounds. 
+ i.secondLevel.boundsCmp = 0 + + var result loadBlockResult + // NB: Unlike SeekGE, we don't have a fast-path here since we don't know + // whether the topLevelIndex is positioned after the position that would + // be returned by doing i.topLevelIndex.SeekGE(). To know this we would + // need to know the index key preceding the current one. + // NB: If a bound-limited block property filter is configured, it's + // externally ensured that the filter is disabled (through returning + // Intersects=false irrespective of the block props provided) during seeks. + if !PI(&i.topLevelIndex).SeekGE(key) { + if !PI(&i.topLevelIndex).Last() { + PD(&i.secondLevel.data).Invalidate() + PI(&i.secondLevel.index).Invalidate() + return nil + } + + result = i.loadSecondLevelIndexBlock(-1) + if result == loadBlockFailed { + return nil + } + if result == loadBlockOK { + if ikv := i.secondLevel.lastInternal(); ikv != nil { + return i.secondLevel.maybeVerifyKey(ikv) + } + // Fall through to skipBackward since the singleLevelIterator did + // not have any blocks that satisfy the block interval + // constraints, or the lower bound was reached. + } + // Else loadBlockIrrelevant, so fall through. + } else { + result = i.loadSecondLevelIndexBlock(-1) + if result == loadBlockFailed { + return nil + } + if result == loadBlockOK { + if ikv := i.secondLevel.SeekLT(key, flags); ikv != nil { + return i.secondLevel.maybeVerifyKey(ikv) + } + // Fall through to skipBackward since the singleLevelIterator did + // not have any blocks that satisfy the block interval + // constraint, or the lower bound was reached. + } + // Else loadBlockIrrelevant, so fall through. + } + if result == loadBlockIrrelevant { + // Enforce the lower bound here since don't want to bother moving to + // the previous entry in the top level index if lower bound is already + // exceeded. 
Note that the previous entry starts with keys <= + // ikey.InternalKey.UserKey since even though this is the current block's + // separator, the same user key can span multiple index blocks. + if i.secondLevel.lower != nil && PI(&i.topLevelIndex).SeparatorLT(i.secondLevel.lower) { + i.secondLevel.exhaustedBounds = -1 + } + } + // NB: skipBackward checks whether exhaustedBounds is already -1. + return i.skipBackward() +} + +// First implements internalIterator.First, as documented in the pebble +// package. Note that First only checks the upper bound. It is up to the caller +// to ensure that key is greater than or equal to the lower bound (e.g. via a +// call to SeekGE(lower)). +func (i *twoLevelIterator[I, PI, D, PD]) First() *base.InternalKV { + // If we have a lower bound, use SeekGE. Note that in general this is not + // supported usage, except when the lower bound is there because the table is + // virtual. + if i.secondLevel.lower != nil { + return i.SeekGE(i.secondLevel.lower, base.SeekGEFlagsNone) + } + i.secondLevel.exhaustedBounds = 0 + i.secondLevel.err = nil // clear cached iteration error + // Seek optimization only applies until iterator is first positioned after SetBounds. + i.secondLevel.boundsCmp = 0 + + if !PI(&i.topLevelIndex).First() { + return nil + } + result := i.loadSecondLevelIndexBlock(+1) + if result == loadBlockFailed { + return nil + } + if result == loadBlockOK { + if ikv := i.secondLevel.First(); ikv != nil { + return ikv + } + // Else fall through to skipForward. + } else { + // result == loadBlockIrrelevant. Enforce the upper bound here since + // don't want to bother moving to the next entry in the top level index + // if upper bound is already exceeded. Note that the next entry starts + // with keys >= topLevelIndex.Separator() since even though this is the + // block separator, the same user key can span multiple index blocks. 
+ // If upper is exclusive we pass orEqual=true below, else we require the + // separator to be strictly greater than upper. + if i.secondLevel.upper != nil && PI(&i.topLevelIndex).SeparatorGT( + i.secondLevel.upper, !i.secondLevel.endKeyInclusive) { + i.secondLevel.exhaustedBounds = +1 + } + } + // NB: skipForward checks whether exhaustedBounds is already +1. + return i.skipForward() +} + +// Last implements internalIterator.Last, as documented in the pebble +// package. Note that Last only checks the lower bound. It is up to the caller +// to ensure that key is less than the upper bound (e.g. via a call to +// SeekLT(upper)) +func (i *twoLevelIterator[I, PI, D, PD]) Last() *base.InternalKV { + if i.secondLevel.readEnv.Virtual != nil { + if i.secondLevel.endKeyInclusive { + return i.virtualLast() + } + return i.SeekLT(i.secondLevel.upper, base.SeekLTFlagsNone) + } + + if i.secondLevel.upper != nil { + panic("twoLevelIterator.Last() used despite upper bound") + } + i.secondLevel.exhaustedBounds = 0 + i.secondLevel.err = nil // clear cached iteration error + // Seek optimization only applies until iterator is first positioned after SetBounds. + i.secondLevel.boundsCmp = 0 + + if !PI(&i.topLevelIndex).Last() { + return nil + } + result := i.loadSecondLevelIndexBlock(-1) + if result == loadBlockFailed { + return nil + } + if result == loadBlockOK { + if ikv := i.secondLevel.Last(); ikv != nil { + return ikv + } + // Else fall through to skipBackward. + } else { + // result == loadBlockIrrelevant. Enforce the lower bound here since + // don't want to bother moving to the previous entry in the top level + // index if lower bound is already exceeded. Note that the previous + // entry starts with keys <= ikv.InternalKey.UserKey since even though + // this is the current block's separator, the same user key can span + // multiple index blocks. 
+ if i.secondLevel.lower != nil && PI(&i.topLevelIndex).SeparatorLT(i.secondLevel.lower) { + i.secondLevel.exhaustedBounds = -1 + } + } + // NB: skipBackward checks whether exhaustedBounds is already -1. + return i.skipBackward() +} + +// Next implements internalIterator.Next, as documented in the pebble +// package. +// Note: twoLevelCompactionIterator.Next mirrors the implementation of +// twoLevelIterator.Next due to performance. Keep the two in sync. +func (i *twoLevelIterator[I, PI, D, PD]) Next() *base.InternalKV { + // Seek optimization only applies until iterator is first positioned after SetBounds. + i.secondLevel.boundsCmp = 0 + if i.secondLevel.err != nil { + // TODO(jackson): Can this case be turned into a panic? Once an error is + // encountered, the iterator must be re-seeked. + return nil + } + if ikv := i.secondLevel.Next(); ikv != nil { + return ikv + } + return i.skipForward() +} + +// NextPrefix implements (base.InternalIterator).NextPrefix. +func (i *twoLevelIterator[I, PI, D, PD]) NextPrefix(succKey []byte) *base.InternalKV { + if i.secondLevel.exhaustedBounds == +1 { + panic("Next called even though exhausted upper bound") + } + // Seek optimization only applies until iterator is first positioned after SetBounds. + i.secondLevel.boundsCmp = 0 + if i.secondLevel.err != nil { + // TODO(jackson): Can this case be turned into a panic? Once an error is + // encountered, the iterator must be re-seeked. + return nil + } + if ikv := i.secondLevel.NextPrefix(succKey); ikv != nil { + return ikv + } + // ikv == nil + if i.secondLevel.err != nil { + return nil + } + + // Did not find prefix in the existing second-level index block. This is the + // slow-path where we seek the iterator. 
+ if !PI(&i.topLevelIndex).SeekGE(succKey) { + PD(&i.secondLevel.data).Invalidate() + PI(&i.secondLevel.index).Invalidate() + return nil + } + result := i.loadSecondLevelIndexBlock(+1) + if result == loadBlockFailed { + return nil + } + if result == loadBlockIrrelevant { + // Enforce the upper bound here since don't want to bother moving to the + // next entry in the top level index if upper bound is already exceeded. + // Note that the next entry starts with keys >= + // topLevelIndex.Separator() since even though this is the block + // separator, the same user key can span multiple index blocks. If upper + // is exclusive we pass orEqual=true below, else we require the + // separator to be strictly greater than upper. + if i.secondLevel.upper != nil && PI(&i.topLevelIndex).SeparatorGT( + i.secondLevel.upper, !i.secondLevel.endKeyInclusive) { + i.secondLevel.exhaustedBounds = +1 + } + } else if kv := i.secondLevel.SeekGE(succKey, base.SeekGEFlagsNone); kv != nil { + return i.secondLevel.maybeVerifyKey(kv) + } + return i.skipForward() +} + +// Prev implements internalIterator.Prev, as documented in the pebble +// package. +func (i *twoLevelIterator[I, PI, D, PD]) Prev() *base.InternalKV { + // Seek optimization only applies until iterator is first positioned after SetBounds. + i.secondLevel.boundsCmp = 0 + if i.secondLevel.err != nil { + return nil + } + if kv := i.secondLevel.Prev(); kv != nil { + return kv + } + return i.skipBackward() +} + +func (i *twoLevelIterator[I, PI, D, PD]) skipForward() *base.InternalKV { + for { + if i.secondLevel.err != nil || i.secondLevel.exhaustedBounds > 0 { + return nil + } + + // It is possible that skipBackward went too far and the virtual table lower + // bound is after the first key in the block we are about to load, in which + // case we must use SeekGE below. The keys in the block we are about to load + // start right after the topLevelIndex key (before we Next). 
+ // + // An example of how this can happen: + // + // Second-level index block 1 - contains keys a@1, c@1 + // Second-level index block 2 - contains keys e@1, g@1 + // Second-level index block 3 - contains keys i@2, k@2 + // + // The virtual table lower bound is f. We have a range key masking filter + // that filters keys with @1 suffix. We are positioned inside block 3 then + // we Prev(). Block 2 is entirely filtered out, which makes us move to + // block 1. Now the range key masking filter gets an update (via + // SpanChanged) and it no longer filters out any keys. At this point if a + // Next happens, we will load block 2 but it would not be legal to return + // "e@1" which is outside the virtual bounds. + // + // The core of the problem is that skipBackward doesn't know it can stop + // at block 2, because it doesn't know what keys are at the start of that + // block. This is why we don't have this problem in the opposite + // direction: skipForward will never go beyond the last relevant block + // because it looks at the separator key which is an upper bound for the + // block. + // + // Note that this is only a problem with virtual tables; we make no + // guarantees wrt an iterator lower bound when we iterate forward. But we + // must never return keys that are not inside the virtual table. 
+ useSeek := i.secondLevel.readEnv.Virtual != nil && (!PI(&i.topLevelIndex).Valid() || + PI(&i.topLevelIndex).SeparatorLT(i.secondLevel.readEnv.Virtual.Lower.UserKey)) + + i.secondLevel.exhaustedBounds = 0 + if !PI(&i.topLevelIndex).Next() { + PD(&i.secondLevel.data).Invalidate() + PI(&i.secondLevel.index).Invalidate() + return nil + } + result := i.loadSecondLevelIndexBlock(+1) + if result == loadBlockFailed { + return nil + } + if result == loadBlockOK { + var ikv *base.InternalKV + if useSeek { + ikv = i.secondLevel.SeekGE(i.secondLevel.lower, base.SeekGEFlagsNone) + } else { + ikv = i.secondLevel.firstInternal() + } + if ikv != nil { + return i.secondLevel.maybeVerifyKey(ikv) + } + // Next iteration will return if singleLevelIterator set + // exhaustedBounds = +1. + } else { + // result == loadBlockIrrelevant. Enforce the upper bound here since + // don't want to bother moving to the next entry in the top level + // index if upper bound is already exceeded. Note that the next + // entry starts with keys >= i.topLevelIndex.Separator() since even + // though this is the block separator, the same user key can span + // multiple index blocks. If upper is exclusive we pass orEqual=true + // below, else we require the separator to be strictly greater than + // upper. + if i.secondLevel.upper != nil && PI(&i.topLevelIndex).SeparatorGT( + i.secondLevel.upper, !i.secondLevel.endKeyInclusive) { + i.secondLevel.exhaustedBounds = +1 + // Next iteration will return. 
+ } + } + } +} + +func (i *twoLevelIterator[I, PI, D, PD]) skipBackward() *base.InternalKV { + for { + if i.secondLevel.err != nil || i.secondLevel.exhaustedBounds < 0 { + return nil + } + i.secondLevel.exhaustedBounds = 0 + if !PI(&i.topLevelIndex).Prev() { + PD(&i.secondLevel.data).Invalidate() + PI(&i.secondLevel.index).Invalidate() + return nil + } + result := i.loadSecondLevelIndexBlock(-1) + if result == loadBlockFailed { + return nil + } + if result == loadBlockOK { + ikv := i.secondLevel.lastInternal() + if ikv != nil { + return i.secondLevel.maybeVerifyKey(ikv) + } + + // Next iteration will return if singleLevelIterator set + // exhaustedBounds = -1. + } else { + // result == loadBlockIrrelevant. Enforce the lower bound here since + // don't want to bother moving to the previous entry in the top + // level index if lower bound is already exceeded. Note that the + // previous entry starts with keys <= i.topLevelIndex.Separator() since + // even though this is the current block's separator, the same user + // key can span multiple index blocks. + if i.secondLevel.lower != nil && PI(&i.topLevelIndex).SeparatorLT(i.secondLevel.lower) { + i.secondLevel.exhaustedBounds = -1 + // Next iteration will return. + } + } + } +} + +func (i *twoLevelIterator[I, PI, D, PD]) Error() error { + return i.secondLevel.Error() +} + +func (i *twoLevelIterator[I, PI, D, PD]) SetBounds(lower, upper []byte) { + i.secondLevel.SetBounds(lower, upper) +} + +func (i *twoLevelIterator[I, PI, D, PD]) SetContext(ctx context.Context) { + i.secondLevel.SetContext(ctx) +} + +func (i *twoLevelIterator[I, PI, D, PD]) SetCloseHook(fn func()) { + i.secondLevel.SetCloseHook(fn) +} + +func (i *twoLevelIterator[I, PI, D, PD]) SetupForCompaction() { + i.secondLevel.SetupForCompaction() +} + +// Close implements internalIterator.Close, as documented in the pebble +// package. 
+func (i *twoLevelIterator[I, PI, D, PD]) Close() error { + if invariants.Enabled && i.secondLevel.pool != nil { + panic("twoLevelIterator's singleLevelIterator has its own non-nil pool") + } + pool := i.pool + err := i.secondLevel.closeInternal() + i.secondLevel.resetForReuse() + err = firstError(err, PI(&i.topLevelIndex).Close()) + i.useFilterBlock = false + i.lastBloomFilterMatched = false + if pool != nil { + pool.Put(i) + } + return err +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/rowblk/rowblk_fragment_iter.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/rowblk/rowblk_fragment_iter.go new file mode 100644 index 0000000..20a998f --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/rowblk/rowblk_fragment_iter.go @@ -0,0 +1,424 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package rowblk + +import ( + "bytes" + "context" + "fmt" + "os" + "sync" + + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/keyspan" + "github.com/cockroachdb/pebble/v2/internal/rangedel" + "github.com/cockroachdb/pebble/v2/internal/rangekey" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" + "github.com/cockroachdb/pebble/v2/sstable/block" +) + +// fragmentIter wraps an Iter, implementing the keyspan.FragmentIterator +// interface. It's used for reading range deletion and range key blocks. +// +// Range deletions and range keys are fragmented before they're persisted to the +// block. Overlapping fragments have identical bounds. The fragmentIter gathers +// all the fragments with identical bounds within a block and returns a single +// keyspan.Span describing all the keys defined over the span. 
+// +// # Memory lifetime +// +// A Span returned by fragmentIter is only guaranteed to be stable until the +// next fragmentIter iteration positioning method. A Span's Keys slice may be +// reused, so the user must not assume it's stable. +// +// Blocks holding range deletions and range keys are configured to use a restart +// interval of 1. This provides key stability. The caller may treat the various +// byte slices (start, end, suffix, value) as stable for the lifetime of the +// iterator. +type fragmentIter struct { + suffixCmp base.CompareRangeSuffixes + blockIter Iter + keyBuf [2]keyspan.Key + span keyspan.Span + dir int8 + + // fileNum is used for logging/debugging. + fileNum base.DiskFileNum + + syntheticPrefixAndSuffix block.SyntheticPrefixAndSuffix + // startKeyBuf is a buffer that is reused to store the start key of the span + // when a synthetic prefix is used. + startKeyBuf []byte + // endKeyBuf is a buffer that is reused to generate the end key of the span + // when a synthetic prefix is set. It always starts with syntheticPrefix. + endKeyBuf []byte + + closeCheck invariants.CloseChecker +} + +var _ keyspan.FragmentIterator = (*fragmentIter)(nil) + +var fragmentBlockIterPool = sync.Pool{ + New: func() interface{} { + i := &fragmentIter{} + if invariants.UseFinalizers { + invariants.SetFinalizer(i, checkFragmentBlockIterator) + } + return i + }, +} + +// NewFragmentIter returns a new keyspan iterator that iterates over a block's +// spans. +func NewFragmentIter( + fileNum base.DiskFileNum, + comparer *base.Comparer, + blockHandle block.BufferHandle, + transforms block.FragmentIterTransforms, +) (keyspan.FragmentIterator, error) { + i := fragmentBlockIterPool.Get().(*fragmentIter) + + i.suffixCmp = comparer.CompareRangeSuffixes + // Use the i.keyBuf array to back the Keys slice to prevent an allocation + // when the spans contain few keys. 
+ i.span.Keys = i.keyBuf[:0] + i.fileNum = fileNum + i.syntheticPrefixAndSuffix = transforms.SyntheticPrefixAndSuffix + if transforms.HasSyntheticPrefix() { + i.endKeyBuf = append(i.endKeyBuf[:0], transforms.SyntheticPrefix()...) + } + i.closeCheck = invariants.CloseChecker{} + + if err := i.blockIter.InitHandle(comparer, blockHandle, block.IterTransforms{ + SyntheticSeqNum: transforms.SyntheticSeqNum, + // We let the blockIter prepend the prefix to span start keys; the fragment + // iterator will prepend it for end keys. We could do everything in the + // fragment iterator, but we'd have to duplicate the logic for adjusting the + // seek key for SeekGE/SeekLT. + SyntheticPrefixAndSuffix: transforms.SyntheticPrefixAndSuffix.RemoveSuffix(), + // It's okay for HideObsoletePoints to be false here, even for shared + // ingested sstables. This is because rangedels do not apply to points in + // the same sstable at the same sequence number anyway, so exposing obsolete + // rangedels is harmless. + HideObsoletePoints: false, + }); err != nil { + i.Close() + return nil, err + } + return i, nil +} + +// initSpan initializes the span with a single fragment. +// +// Note that the span start and end keys and range key contents are aliased to +// the key or value when we don't have a synthetic prefix. This is ok because +// the range del/key block doesn't use prefix compression, so the key/value will +// be pointing directly into the buffer data. +func (i *fragmentIter) initSpan(ik base.InternalKey, internalValue []byte) error { + if ik.Kind() == base.InternalKeyKindRangeDelete { + i.span = rangedel.Decode(ik, internalValue, i.span.Keys[:0]) + } else { + var err error + i.span, err = rangekey.Decode(ik, internalValue, i.span.Keys[:0]) + if err != nil { + return err + } + } + // When synthetic prefix is used in the blockIter, the keys cannot be used + // across multiple blockIter operations; we have to make a copy in this case. 
+ if i.syntheticPrefixAndSuffix.HasPrefix() || invariants.Sometimes(10) { + i.startKeyBuf = append(i.startKeyBuf[:0], i.span.Start...) + i.span.Start = i.startKeyBuf + } + return nil +} + +// addToSpan adds a fragment to the existing span. The fragment must be for the +// same start/end keys. +func (i *fragmentIter) addToSpan( + cmp base.Compare, ik base.InternalKey, internalValue []byte, +) error { + var err error + if ik.Kind() == base.InternalKeyKindRangeDelete { + err = rangedel.DecodeIntoSpan(cmp, ik, internalValue, &i.span) + } else { + err = rangekey.DecodeIntoSpan(cmp, ik, internalValue, &i.span) + } + return err +} + +// applySpanTransforms applies changes to the span that we decoded, if +// appropriate. +func (i *fragmentIter) applySpanTransforms() error { + if i.syntheticPrefixAndSuffix.HasPrefix() || invariants.Sometimes(10) { + syntheticPrefix := i.syntheticPrefixAndSuffix.Prefix() + // We have to make a copy of the start key because it will not stay valid + // across multiple blockIter operations. + i.startKeyBuf = append(i.startKeyBuf[:0], i.span.Start...) + i.span.Start = i.startKeyBuf + if invariants.Enabled && !bytes.Equal(syntheticPrefix, i.endKeyBuf[:len(syntheticPrefix)]) { + panic("pebble: invariant violation: synthetic prefix mismatch") + } + i.endKeyBuf = append(i.endKeyBuf[:len(syntheticPrefix)], i.span.End...) + i.span.End = i.endKeyBuf + } + + if i.syntheticPrefixAndSuffix.HasSuffix() { + syntheticSuffix := i.syntheticPrefixAndSuffix.Suffix() + for keyIdx := range i.span.Keys { + k := &i.span.Keys[keyIdx] + + switch k.Kind() { + case base.InternalKeyKindRangeKeySet: + if len(k.Suffix) > 0 { + if invariants.Enabled && i.suffixCmp(syntheticSuffix, k.Suffix) >= 0 { + return base.AssertionFailedf("synthetic suffix %q >= RangeKeySet suffix %q", + syntheticSuffix, k.Suffix) + } + k.Suffix = syntheticSuffix + } + case base.InternalKeyKindRangeKeyDelete: + // Nothing to do. 
+ default: + return base.AssertionFailedf("synthetic suffix not supported with key kind %s", k.Kind()) + } + } + } + return nil +} + +// gatherForward gathers internal keys with identical bounds. Keys defined over +// spans of the keyspace are fragmented such that any overlapping key spans have +// identical bounds. When these spans are persisted to a range deletion or range +// key block, they may be persisted as multiple internal keys in order to encode +// multiple sequence numbers or key kinds. +// +// gatherForward iterates forward, re-combining the fragmented internal keys to +// reconstruct a keyspan.Span that holds all the keys defined over the span. +func (i *fragmentIter) gatherForward(kv *base.InternalKV) (*keyspan.Span, error) { + i.span = keyspan.Span{} + if kv == nil || !i.blockIter.Valid() { + return nil, nil + } + // Use the i.keyBuf array to back the Keys slice to prevent an allocation + // when a span contains few keys. + i.span.Keys = i.keyBuf[:0] + + // Decode the span's end key and individual keys from the value. + if err := i.initSpan(kv.K, kv.InPlaceValue()); err != nil { + return nil, err + } + + // There might exist additional internal keys with identical bounds encoded + // within the block. Iterate forward, accumulating all the keys with + // identical bounds to s. + + // Overlapping fragments are required to have exactly equal start and + // end bounds. + for kv = i.blockIter.Next(); kv != nil && i.blockIter.cmp(kv.K.UserKey, i.span.Start) == 0; kv = i.blockIter.Next() { + if err := i.addToSpan(i.blockIter.cmp, kv.K, kv.InPlaceValue()); err != nil { + return nil, err + } + } + if err := i.applySpanTransforms(); err != nil { + return nil, err + } + + // Apply a consistent ordering. + keyspan.SortKeysByTrailer(i.span.Keys) + + // i.blockIter is positioned over the first internal key for the next span. + return &i.span, nil +} + +// gatherBackward gathers internal keys with identical bounds. 
Keys defined over +// spans of the keyspace are fragmented such that any overlapping key spans have +// identical bounds. When these spans are persisted to a range deletion or range +// key block, they may be persisted as multiple internal keys in order to encode +// multiple sequence numbers or key kinds. +// +// gatherBackward iterates backwards, re-combining the fragmented internal keys +// to reconstruct a keyspan.Span that holds all the keys defined over the span. +func (i *fragmentIter) gatherBackward(kv *base.InternalKV) (*keyspan.Span, error) { + i.span = keyspan.Span{} + if kv == nil || !i.blockIter.Valid() { + return nil, nil + } + + // Decode the span's end key and individual keys from the value. + if err := i.initSpan(kv.K, kv.InPlaceValue()); err != nil { + return nil, err + } + + // There might exist additional internal keys with identical bounds encoded + // within the block. Iterate backward, accumulating all the keys with + // identical bounds to s. + // + // Overlapping fragments are required to have exactly equal start and + // end bounds. + for kv = i.blockIter.Prev(); kv != nil && i.blockIter.cmp(kv.K.UserKey, i.span.Start) == 0; kv = i.blockIter.Prev() { + if err := i.addToSpan(i.blockIter.cmp, kv.K, kv.InPlaceValue()); err != nil { + return nil, err + } + } + // i.blockIter is positioned over the last internal key for the previous + // span. + + // Apply a consistent ordering. + keyspan.SortKeysByTrailer(i.span.Keys) + + if err := i.applySpanTransforms(); err != nil { + return nil, err + } + return &i.span, nil +} + +// SetContext is part of the FragmentIterator interface. +func (i *fragmentIter) SetContext(ctx context.Context) {} + +// Close implements (keyspan.FragmentIterator).Close. 
+func (i *fragmentIter) Close() {
+	_ = i.blockIter.Close()
+	i.closeCheck.Close()
+
+	if invariants.Sometimes(25) {
+		// In invariants mode, sometimes don't add the object to the pool so that we
+		// can check for double closes that take longer than the object stays in the
+		// pool.
+		return
+	}
+	i.span = keyspan.Span{}
+	i.dir = 0
+	i.fileNum = 0
+	i.syntheticPrefixAndSuffix = block.SyntheticPrefixAndSuffix{}
+	i.startKeyBuf = i.startKeyBuf[:0]
+	i.endKeyBuf = i.endKeyBuf[:0]
+	fragmentBlockIterPool.Put(i)
+}
+
+// First implements (keyspan.FragmentIterator).First.
+func (i *fragmentIter) First() (*keyspan.Span, error) {
+	i.dir = +1
+	return i.gatherForward(i.blockIter.First())
+}
+
+// Last implements (keyspan.FragmentIterator).Last.
+func (i *fragmentIter) Last() (*keyspan.Span, error) {
+	i.dir = -1
+	return i.gatherBackward(i.blockIter.Last())
+}
+
+// Next implements (keyspan.FragmentIterator).Next.
+func (i *fragmentIter) Next() (*keyspan.Span, error) {
+	switch {
+	case i.dir == -1 && !i.span.Valid():
+		// Switching directions.
+		//
+		// i.blockIter is exhausted, before the first key. Move onto the first.
+		i.blockIter.First()
+		i.dir = +1
+	case i.dir == -1 && i.span.Valid():
+		// Switching directions.
+		//
+		// i.blockIter is currently positioned over the last internal key for
+		// the previous span. Next it once to move to the first internal key
+		// that makes up the current span, and gatherForward to land on the
+		// first internal key making up the next span.
+		//
+		// In the diagram below, if the last span returned to the user during
+		// reverse iteration was [b,c), i.blockIter is currently positioned at
+		// [a,b). The block iter must be positioned over [d,e) to gather the
+		// next span's fragments.
+		//
+		// ... [a,b) [b,c) [b,c) [b,c) [d,e) ...
+ // ^ ^ + // i.blockIter want + if x, err := i.gatherForward(i.blockIter.Next()); err != nil { + return nil, err + } else if invariants.Enabled && !x.Valid() { + panic("pebble: invariant violation: next entry unexpectedly invalid") + } + i.dir = +1 + } + // We know that this blockIter has in-place values. + return i.gatherForward(i.blockIter.KV()) +} + +// Prev implements (keyspan.FragmentIterator).Prev. +func (i *fragmentIter) Prev() (*keyspan.Span, error) { + switch { + case i.dir == +1 && !i.span.Valid(): + // Switching directions. + // + // i.blockIter is exhausted, after the last key. Move onto the last. + i.blockIter.Last() + i.dir = -1 + case i.dir == +1 && i.span.Valid(): + // Switching directions. + // + // i.blockIter is currently positioned over the first internal key for + // the next span. Prev it once to move to the last internal key that + // makes up the current span, and gatherBackward to land on the last + // internal key making up the previous span. + // + // In the diagram below, if the last span returned to the user during + // forward iteration was [b,c), i.blockIter is currently positioned at + // [d,e). The block iter must be positioned over [a,b) to gather the + // previous span's fragments. + // + // ... [a,b) [b,c) [b,c) [b,c) [d,e) ... + // ^ ^ + // want i.blockIter + if x, err := i.gatherBackward(i.blockIter.Prev()); err != nil { + return nil, err + } else if invariants.Enabled && !x.Valid() { + panic("pebble: invariant violation: previous entry unexpectedly invalid") + } + i.dir = -1 + } + // We know that this blockIter has in-place values. + return i.gatherBackward(i.blockIter.KV()) +} + +// SeekGE implements (keyspan.FragmentIterator).SeekGE. 
+func (i *fragmentIter) SeekGE(k []byte) (*keyspan.Span, error) { + if s, err := i.SeekLT(k); err != nil { + return nil, err + } else if s != nil && i.blockIter.cmp(k, s.End) < 0 { + return s, nil + } + // TODO(jackson): If the above i.SeekLT(k) discovers a span but the span + // doesn't meet the k < s.End comparison, then there's no need for the + // SeekLT to gatherBackward. + return i.Next() +} + +// SeekLT implements (keyspan.FragmentIterator).SeekLT. +func (i *fragmentIter) SeekLT(k []byte) (*keyspan.Span, error) { + i.dir = -1 + return i.gatherBackward(i.blockIter.SeekLT(k, base.SeekLTFlagsNone)) +} + +// String implements fmt.Stringer. +func (i *fragmentIter) String() string { + return "fragment-block-iter" +} + +// WrapChildren implements FragmentIterator. +func (i *fragmentIter) WrapChildren(wrap keyspan.WrapFn) {} + +// DebugTree is part of the FragmentIterator interface. +func (i *fragmentIter) DebugTree(tp treeprinter.Node) { + tp.Childf("%T(%p) fileNum=%s", i, i, i.fileNum) +} + +func checkFragmentBlockIterator(obj interface{}) { + i := obj.(*fragmentIter) + if h := i.blockIter.Handle(); h.Valid() { + fmt.Fprintf(os.Stderr, "fragmentBlockIter.blockIter.handle is not nil: %#v\n", h) + os.Exit(1) + } +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/rowblk/rowblk_index_iter.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/rowblk/rowblk_index_iter.go new file mode 100644 index 0000000..1901afe --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/rowblk/rowblk_index_iter.go @@ -0,0 +1,121 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package rowblk + +import ( + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/sstable/block" +) + +// IndexIter is a lightweight adapter that implements block.IndexIterator for a +// row-based index block. 
+type IndexIter struct { + iter Iter +} + +// Assert that IndexIter satisfies the block.IndexBlockIterator interface. +var _ block.IndexBlockIterator = (*IndexIter)(nil) + +// Init initializes an iterator from the provided block data slice. +func (i *IndexIter) Init(c *base.Comparer, blk []byte, transforms block.IterTransforms) error { + return i.iter.Init(c.Compare, c.ComparePointSuffixes, c.Split, blk, transforms) +} + +// InitHandle initializes an iterator from the provided block handle. +func (i *IndexIter) InitHandle( + comparer *base.Comparer, block block.BufferHandle, transforms block.IterTransforms, +) error { + return i.iter.InitHandle(comparer, block, transforms) +} + +// Valid returns true if the iterator is currently positioned at a valid block +// handle. +func (i *IndexIter) Valid() bool { + return i.iter.offset >= 0 && i.iter.offset < i.iter.restarts +} + +// IsDataInvalidated returns true when the blockIter has been invalidated +// using an invalidate call. NB: this is different from blockIter.Valid +// which is part of the InternalIterator implementation. +func (i *IndexIter) IsDataInvalidated() bool { + return i.iter.IsDataInvalidated() +} + +// Invalidate invalidates the block iterator, removing references to the block +// it was initialized with. +func (i *IndexIter) Invalidate() { + i.iter.Invalidate() +} + +// Handle returns the underlying block buffer handle, if the iterator was +// initialized with one. +func (i *IndexIter) Handle() block.BufferHandle { + return i.iter.handle +} + +// Separator returns the separator at the iterator's current position. The +// iterator must be positioned at a valid row. A Separator is a user key +// guaranteed to be greater than or equal to every key contained within the +// referenced block(s). +func (i *IndexIter) Separator() []byte { + return i.iter.ikv.K.UserKey +} + +// SeparatorLT returns true if the separator at the iterator's current +// position is strictly less than the provided key. 
+func (i *IndexIter) SeparatorLT(key []byte) bool {
+	return i.iter.cmp(i.iter.ikv.K.UserKey, key) < 0
+}
+
+// SeparatorGT returns true if the separator at the iterator's current position
+// is strictly greater than (or equal, if inclusively=true) the provided key.
+func (i *IndexIter) SeparatorGT(key []byte, inclusively bool) bool {
+	cmp := i.iter.cmp(i.iter.ikv.K.UserKey, key)
+	return cmp > 0 || (cmp == 0 && inclusively)
+}
+
+// BlockHandleWithProperties decodes the block handle with any encoded
+// properties at the iterator's current position.
+func (i *IndexIter) BlockHandleWithProperties() (block.HandleWithProperties, error) {
+	return block.DecodeHandleWithProperties(i.iter.ikv.InPlaceValue())
+}
+
+// SeekGE seeks the index iterator to the first block entry with a separator key
+// greater or equal to the given key. If it returns true, the iterator is
+// positioned over the first block that might contain the key [key], and
+// following blocks have keys ≥ Separator(). It returns false if the seek key is
+// greater than all index block separators.
+func (i *IndexIter) SeekGE(key []byte) bool {
+	return i.iter.SeekGE(key, base.SeekGEFlagsNone) != nil
+}
+
+// First seeks index iterator to the first block entry. It returns false if
+// the index block is empty.
+func (i *IndexIter) First() bool {
+	return i.iter.First() != nil
+}
+
+// Last seeks index iterator to the last block entry. It returns false if
+// the index block is empty.
+func (i *IndexIter) Last() bool {
+	return i.iter.Last() != nil
+}
+
+// Next steps the index iterator to the next block entry. It returns false
+// if the index block is exhausted.
+func (i *IndexIter) Next() bool {
+	return i.iter.Next() != nil
+}
+
+// Prev steps the index iterator to the previous block entry. It returns
+// false if the index block is exhausted.
+func (i *IndexIter) Prev() bool {
+	return i.iter.Prev() != nil
+}
+
+// Close closes the iterator, releasing any resources it holds.
+func (i *IndexIter) Close() error { + return i.iter.Close() +} diff --git a/vendor/github.com/cockroachdb/pebble/v2/sstable/rowblk/rowblk_iter.go b/vendor/github.com/cockroachdb/pebble/v2/sstable/rowblk/rowblk_iter.go new file mode 100644 index 0000000..ff81a21 --- /dev/null +++ b/vendor/github.com/cockroachdb/pebble/v2/sstable/rowblk/rowblk_iter.go @@ -0,0 +1,1976 @@ +// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package rowblk + +import ( + "bytes" + "context" + "encoding/binary" + "io" + "iter" + "slices" + "sort" + "unsafe" + + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2/internal/base" + "github.com/cockroachdb/pebble/v2/internal/invariants" + "github.com/cockroachdb/pebble/v2/internal/treeprinter" + "github.com/cockroachdb/pebble/v2/sstable/block" +) + +// Iter is an iterator over a single block of data. +// +// An Iter provides an additional guarantee around key stability when a block +// has a restart interval of 1 (i.e. when there is no prefix compression). Key +// stability refers to whether the InternalKey.UserKey bytes returned by a +// positioning call will remain stable after a subsequent positioning call. The +// normal case is that a positioning call will invalidate any previously +// returned InternalKey.UserKey. If a block has a restart interval of 1 (no +// prefix compression), Iter guarantees that InternalKey.UserKey will point to +// the key as stored in the block itself which will remain valid until the Iter +// is closed. The key stability guarantee is used by the range tombstone and +// range key code, which knows that the respective blocks are always encoded +// with a restart interval of 1. This per-block key stability guarantee is +// sufficient for range tombstones and range deletes as they are always encoded +// in a single block. 
Note: this stability guarantee no longer holds for a block +// iter with synthetic prefix/suffix replacement, but we don't use the synthetic +// suffix/prefix functionality of Iter for range keys. +// +// An Iter also provides a value stability guarantee for range deletions and +// range keys since there is only a single range deletion and range key block +// per sstable and the Iter will not release the bytes for the block until it is +// closed. +// +// Note on why Iter knows about lazyValueHandling: +// +// Iter's positioning functions (that return a LazyValue), are too +// complex to inline even prior to lazyValueHandling. Iter.Next and +// Iter.First were by far the cheapest and had costs 195 and 180 +// respectively, which exceeds the budget of 80. We initially tried to keep +// the lazyValueHandling logic out of Iter by wrapping it with a +// lazyValueDataBlockIter. singleLevelIter and twoLevelIter would use this +// wrapped iter. The functions in lazyValueDataBlockIter were simple, in that +// they called the corresponding Iter func and then decided whether the +// value was in fact in-place (so return immediately) or needed further +// handling. But these also turned out too costly for mid-stack inlining since +// simple calls like the following have a high cost that is barely under the +// budget of 80 +// +// k, v := i.data.SeekGE(key, flags) // cost 74 +// k, v := i.data.Next() // cost 72 +// +// We have 2 options for minimizing performance regressions: +// - Include the lazyValueHandling logic in the already non-inlineable +// Iter functions: Since most of the time is spent in data block iters, +// it is acceptable to take the small hit of unnecessary branching (which +// hopefully branch prediction will predict correctly) for other kinds of +// blocks. +// - Duplicate the logic of singleLevelIterator and twoLevelIterator for the +// v3 sstable and only use the aforementioned lazyValueDataBlockIter for a +// v3 sstable. 
We would want to manage these copies via code generation. +// +// We have picked the first option here. +type Iter struct { + cmp base.Compare + split base.Split + + // Iterator transforms. + // + // SyntheticSuffix, if set, will replace the decoded ikey.UserKey suffix + // before the key is returned to the user. A sequence of iter operations on a + // block with a syntheticSuffix rule should return keys as if those operations + // ran on a block with keys that all had the syntheticSuffix. As an example: + // any sequence of block iter cmds should return the same keys for the + // following two blocks: + // + // blockA: a@3,b@3,c@3 + // blockB: a@1,b@2,c@1 with syntheticSuffix=3 + // + // To ensure this, Suffix replacement will not change the ordering of keys in + // the block because the iter assumes that no two keys in the block share the + // same prefix. Furthermore, during SeekGE and SeekLT operations, the block + // iterator handles "off by one" errors (explained in more detail in those + // functions) when, for a given key, originalSuffix < searchSuffix < + // replacementSuffix, with integer comparison. To handle these cases, the + // iterator assumes: + // + // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{originalSuffix}) < 0 + // for keys with a suffix. + // + // NB: it is possible for a block iter to add a synthetic suffix on a key + // without a suffix, which implies + // pebble.Compare(keyPrefix{replacementSuffix},keyPrefix{noSuffix}) > 0 , + // however, the iterator would never need to handle an off by one error in + // this case since originalSuffix (empty) > searchSuffix (non empty), with + // integer comparison. + // + // + // In addition, we also assume that any block with rangekeys will not contain + // a synthetic suffix. + transforms block.IterTransforms + + // offset is the byte index that marks where the current key/value is + // encoded in the block. 
+	offset offsetInBlock
+	// nextOffset is the byte index where the next key/value is encoded in the
+	// block.
+	nextOffset offsetInBlock
+	// A "restart point" in a block is a point where the full key is encoded,
+	// instead of just having a suffix of the key encoded. See readEntry() for
+	// how prefix compression of keys works. Keys in between two restart points
+	// only have a suffix encoded in the block. When restart interval is 1, no
+	// prefix compression of keys happens. This is the case with range tombstone
+	// blocks.
+	//
+	// All restart offsets are listed in increasing order in
+	// i.ptr[i.restarts:len(block)-4], while numRestarts is encoded in the last
+	// 4 bytes of the block as a uint32 (i.ptr[len(block)-4:]). i.restarts can
+	// therefore be seen as the point where data in the block ends, and a list
+	// of offsets of all restart points begins.
+	//
+	// int64 is used to prevent overflow and preserve signedness for binary
+	// search invariants.
+	restarts offsetInBlock
+	// Number of restart points in this block. Encoded at the end of the block
+	// as a uint32.
+	numRestarts int32
+	ptr         unsafe.Pointer
+	data        []byte
+	// key contains the raw key the iterator is currently pointed at. This may
+	// point directly to data stored in the block (for a key which has no prefix
+	// compression), to fullKey (for a prefix compressed key), or to a slice of
+	// data stored in cachedBuf (during reverse iteration).
+	//
+	// NB: In general, key contains the same logical content as ikey
+	// (i.e. ikey = decode(key)), but if the iterator contains a synthetic suffix
+	// replacement rule, this will not be the case. Therefore, key should never
+	// be used after ikey is set.
+	key []byte
+	// fullKey is a buffer used for key prefix decompression. Note that if
+	// transforms.SyntheticPrefix is not nil, fullKey always starts with that
+	// prefix.
+	fullKey []byte
+	// val contains the value the iterator is currently pointed at.
If non-nil, + // this points to a slice of the block data. + val []byte + // ikv contains the decoded internal KV the iterator is currently positioned + // at. + // + // ikv.InternalKey contains the decoded InternalKey the iterator is + // currently pointed at. Note that the memory backing ikv.UserKey is either + // data stored directly in the block, fullKey, or cachedBuf. The key + // stability guarantee for blocks built with a restart interval of 1 is + // achieved by having ikv.UserKey always point to data stored directly in + // the block. + // + // ikv.LazyValue is val turned into a LazyValue, whenever a positioning + // method returns a non-nil key-value pair. + ikv base.InternalKV + // cached and cachedBuf are used during reverse iteration. They are needed + // because we can't perform prefix decoding in reverse, only in the forward + // direction. In order to iterate in reverse, we decode and cache the entries + // between two restart points. + // + // Note that cached[len(cached)-1] contains the previous entry to the one the + // blockIter is currently pointed at. As usual, nextOffset will contain the + // offset of the next entry. During reverse iteration, nextOffset will be + // updated to point to offset, and we'll set the blockIter to point at the + // entry cached[len(cached)-1]. See Prev() for more details. + // + // For a block encoded with a restart interval of 1, cached and cachedBuf + // will not be used as there are no prefix compressed entries between the + // restart points. + cached []blockEntry + cachedBuf []byte + handle block.BufferHandle + // for block iteration for already loaded blocks. 
+	firstUserKey      []byte
+	lazyValueHandling struct {
+		getValue       block.GetInternalValueForPrefixAndValueHandler
+		hasValuePrefix bool
+	}
+	synthSuffixBuf            []byte
+	firstUserKeyWithPrefixBuf []byte
+}
+
+// offsetInBlock represents an offset in a block.
+//
+// While restart points are serialized as uint32's, it is possible for offsets to
+// be greater than math.MaxUint32 since they may point to an offset after the KVs.
+//
+// Previously, offsets were represented as int32, which caused problems with
+// integer overflows while indexing into blocks (i.data) with large KVs in SeekGE()
+// and SeekLT(). Using an int64 solves the problem of overflows as wraparounds will
+// be prevented. Additionally, the signedness of int64 allows representation of
+// iterators that have conducted backward iteration and allows for binary search
+// invariants in SeekGE() and SeekLT() to be preserved.
+type offsetInBlock int64
+
+type blockEntry struct {
+	offset   offsetInBlock
+	keyStart offsetInBlock
+	keyEnd   offsetInBlock
+	valStart offsetInBlock
+	valSize  uint32
+}
+
+// *Iter implements the block.DataBlockIterator interface.
+var _ block.DataBlockIterator = (*Iter)(nil)
+
+// NewIter constructs a new row-oriented block iterator over the provided serialized block.
+func NewIter(
+	cmp base.Compare,
+	suffixCmp base.ComparePointSuffixes,
+	split base.Split,
+	block []byte,
+	transforms block.IterTransforms,
+) (*Iter, error) {
+	i := &Iter{}
+	return i, i.Init(cmp, suffixCmp, split, block, transforms)
+}
+
+// String implements fmt.Stringer.
+func (i *Iter) String() string {
+	return "block"
+}
+
+// Init initializes the block iterator from the provided block.
+func (i *Iter) Init( + cmp base.Compare, + suffixCmp base.ComparePointSuffixes, + split base.Split, + blk []byte, + transforms block.IterTransforms, +) error { + numRestarts := int32(binary.LittleEndian.Uint32(blk[len(blk)-4:])) + if numRestarts == 0 { + return base.CorruptionErrorf("pebble/table: invalid table (block has no restart points)") + } + i.transforms = transforms + i.synthSuffixBuf = i.synthSuffixBuf[:0] + i.split = split + i.cmp = cmp + i.restarts = offsetInBlock(len(blk)) - 4*(1+offsetInBlock(numRestarts)) + i.numRestarts = numRestarts + i.ptr = unsafe.Pointer(&blk[0]) + i.data = blk + if i.transforms.HasSyntheticPrefix() { + i.fullKey = append(i.fullKey[:0], i.transforms.SyntheticPrefix()...) + } else { + i.fullKey = i.fullKey[:0] + } + i.val = nil + i.clearCache() + if i.restarts > 0 { + if err := i.readFirstKey(); err != nil { + return err + } + } else { + // Block is empty. + i.firstUserKey = nil + } + return nil +} + +// InitHandle initializes an iterator from the provided block handle. +// NB: two cases of hideObsoletePoints: +// - Local sstable iteration: syntheticSeqNum will be set iff the sstable was +// ingested. +// - Foreign sstable iteration: syntheticSeqNum is always set. +func (i *Iter) InitHandle( + comparer *base.Comparer, block block.BufferHandle, transforms block.IterTransforms, +) error { + i.handle.Release() + i.handle = block + return i.Init( + comparer.Compare, + comparer.ComparePointSuffixes, + comparer.Split, + block.BlockData(), + transforms) +} + +// SetHasValuePrefix sets whether or not the block iterator should expect values +// corresponding to Set keys to have a prefix byte. +func (i *Iter) SetHasValuePrefix(hasValuePrefix bool) { + i.lazyValueHandling.hasValuePrefix = hasValuePrefix +} + +// SetGetLazyValuer sets the value block reader the iterator should use to get +// lazy values when the value encodes a value prefix. 
+func (i *Iter) SetGetLazyValuer(g block.GetInternalValueForPrefixAndValueHandler) { + i.lazyValueHandling.getValue = g + +} + +// Handle returns the underlying block buffer handle, if the iterator was +// initialized with one. +func (i *Iter) Handle() block.BufferHandle { + return i.handle +} + +// Invalidate invalidates the block iterator, removing references to the block +// it was initialized with. +func (i *Iter) Invalidate() { + i.clearCache() + i.offset = 0 + i.nextOffset = 0 + i.restarts = 0 + i.numRestarts = 0 + i.data = nil +} + +// IsDataInvalidated returns true when the blockIter has been invalidated +// using an invalidate call. NB: this is different from blockIter.Valid +// which is part of the InternalIterator implementation. +func (i *Iter) IsDataInvalidated() bool { + return i.data == nil +} + +func (i *Iter) readEntry() { + ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(i.offset)) + + // This is an ugly performance hack. Reading entries from blocks is one of + // the inner-most routines and decoding the 3 varints per-entry takes + // significant time. Neither go1.11 or go1.12 will inline decodeVarint for + // us, so we do it manually. This provides a 10-15% performance improvement + // on blockIter benchmarks on both go1.11 and go1.12. + // + // TODO(peter): remove this hack if go:inline is ever supported. 
+
+	// Decode the entry header: three unsigned varints giving (1) the number
+	// of key bytes shared with the previous key, (2) the number of unshared
+	// key bytes that follow, and (3) the value length. Each stanza below is a
+	// manually inlined 32-bit uvarint decode (up to 5 bytes), advancing ptr
+	// past the consumed bytes.
+	var shared uint32
+	if a := *((*uint8)(ptr)); a < 128 {
+		shared = uint32(a)
+		ptr = unsafe.Add(ptr, 1)
+	} else if a, b := a&0x7f, *((*uint8)(unsafe.Add(ptr, 1))); b < 128 {
+		shared = uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 2)
+	} else if b, c := b&0x7f, *((*uint8)(unsafe.Add(ptr, 2))); c < 128 {
+		shared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 3)
+	} else if c, d := c&0x7f, *((*uint8)(unsafe.Add(ptr, 3))); d < 128 {
+		shared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 4)
+	} else {
+		d, e := d&0x7f, *((*uint8)(unsafe.Add(ptr, 4)))
+		shared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 5)
+	}
+
+	var unshared uint32
+	if a := *((*uint8)(ptr)); a < 128 {
+		unshared = uint32(a)
+		ptr = unsafe.Add(ptr, 1)
+	} else if a, b := a&0x7f, *((*uint8)(unsafe.Add(ptr, 1))); b < 128 {
+		unshared = uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 2)
+	} else if b, c := b&0x7f, *((*uint8)(unsafe.Add(ptr, 2))); c < 128 {
+		unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 3)
+	} else if c, d := c&0x7f, *((*uint8)(unsafe.Add(ptr, 3))); d < 128 {
+		unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 4)
+	} else {
+		d, e := d&0x7f, *((*uint8)(unsafe.Add(ptr, 4)))
+		unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 5)
+	}
+
+	var value uint32
+	if a := *((*uint8)(ptr)); a < 128 {
+		value = uint32(a)
+		ptr = unsafe.Add(ptr, 1)
+	} else if a, b := a&0x7f, *((*uint8)(unsafe.Add(ptr, 1))); b < 128 {
+		value = uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 2)
+	} else if b, c := b&0x7f, *((*uint8)(unsafe.Add(ptr, 2))); c < 128 {
+		value = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 3)
+	} else if c, d := c&0x7f, *((*uint8)(unsafe.Add(ptr, 3))); d < 128 {
+		value = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 4)
+	} else {
+		d, e := d&0x7f, *((*uint8)(unsafe.Add(ptr, 4)))
+		value = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 5)
+	}
+	// NOTE(review): shared is widened by the synthetic prefix length —
+	// presumably i.fullKey retains the synthetic prefix at its start across
+	// entries; confirm against the iterator's initialization.
+	shared += i.transforms.SyntheticPrefixAndSuffix.PrefixLen()
+	unsharedKey := unsafe.Slice((*byte)(ptr), int(unshared))
+	// TODO(sumeer): move this into the else block below.
+	i.fullKey = append(i.fullKey[:shared], unsharedKey...)
+	if shared == 0 {
+		// Provide stability for the key across positioning calls if the key
+		// doesn't share a prefix with the previous key. This removes requiring the
+		// key to be copied if the caller knows the block has a restart interval of
+		// 1. An important example of this is range-del blocks.
+		i.key = unsharedKey
+	} else {
+		i.key = i.fullKey
+	}
+	ptr = unsafe.Add(ptr, unshared)
+	i.val = unsafe.Slice((*byte)(ptr), int(value))
+	i.nextOffset = offsetInBlock(uintptr(ptr)-uintptr(i.ptr)) + offsetInBlock(value)
+}
+
+// readFirstKey decodes the first entry of the block (at offset 0) and stores
+// its user key into i.firstUserKey, prepending the synthetic prefix when one
+// is configured. It returns a corruption error if the key is too short to
+// contain an 8-byte internal-key trailer.
+func (i *Iter) readFirstKey() error {
+	ptr := i.ptr
+
+	// This is an ugly performance hack. Reading entries from blocks is one of
+	// the inner-most routines and decoding the 3 varints per-entry takes
+	// significant time. Neither go1.11 or go1.12 will inline decodeVarint for
+	// us, so we do it manually. This provides a 10-15% performance improvement
+	// on blockIter benchmarks on both go1.11 and go1.12.
+	//
+	// TODO(peter): remove this hack if go:inline is ever supported.
+
+	if shared := *((*uint8)(ptr)); shared == 0 {
+		ptr = unsafe.Add(ptr, 1)
+	} else {
+		// The shared length is != 0, which is invalid.
+		panic("first key in block must have zero shared length")
+	}
+
+	// Manually inlined uvarint decode of the unshared key length.
+	var unshared uint32
+	if a := *((*uint8)(ptr)); a < 128 {
+		unshared = uint32(a)
+		ptr = unsafe.Add(ptr, 1)
+	} else if a, b := a&0x7f, *((*uint8)(unsafe.Add(ptr, 1))); b < 128 {
+		unshared = uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 2)
+	} else if b, c := b&0x7f, *((*uint8)(unsafe.Add(ptr, 2))); c < 128 {
+		unshared = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 3)
+	} else if c, d := c&0x7f, *((*uint8)(unsafe.Add(ptr, 3))); d < 128 {
+		unshared = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 4)
+	} else {
+		d, e := d&0x7f, *((*uint8)(unsafe.Add(ptr, 4)))
+		unshared = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+		ptr = unsafe.Add(ptr, 5)
+	}
+
+	// Skip the value length. Only the number of varint bytes matters here,
+	// not the decoded value.
+	if a := *((*uint8)(ptr)); a < 128 {
+		ptr = unsafe.Add(ptr, 1)
+	} else if a := *((*uint8)(unsafe.Add(ptr, 1))); a < 128 {
+		ptr = unsafe.Add(ptr, 2)
+	} else if a := *((*uint8)(unsafe.Add(ptr, 2))); a < 128 {
+		ptr = unsafe.Add(ptr, 3)
+	} else if a := *((*uint8)(unsafe.Add(ptr, 3))); a < 128 {
+		ptr = unsafe.Add(ptr, 4)
+	} else {
+		ptr = unsafe.Add(ptr, 5)
+	}
+
+	firstKey := unsafe.Slice((*byte)(ptr), int(unshared))
+	// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
+	// BlockIter benchmarks.
+	if n := len(firstKey) - 8; n >= 0 {
+		i.firstUserKey = firstKey[:n:n]
+	} else {
+		i.firstUserKey = nil
+		return base.CorruptionErrorf("pebble/table: invalid firstKey in block")
+	}
+	if i.transforms.HasSyntheticPrefix() {
+		// Stored keys do not contain the synthetic prefix; build the prefixed
+		// user key in a reusable buffer.
+		syntheticPrefix := i.transforms.SyntheticPrefix()
+		i.firstUserKeyWithPrefixBuf = slices.Grow(i.firstUserKeyWithPrefixBuf[:0], len(syntheticPrefix)+len(i.firstUserKey))
+		i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, syntheticPrefix...)
+		i.firstUserKeyWithPrefixBuf = append(i.firstUserKeyWithPrefixBuf, i.firstUserKey...)
+		i.firstUserKey = i.firstUserKeyWithPrefixBuf
+	}
+	return nil
+}
+
+// decodeInternalKey splits key into its user key and 8-byte trailer,
+// populating i.ikv.K. It applies the SyntheticSeqNum transform when one is
+// set, and reports whether the point should be hidden because it is marked
+// obsolete (when HideObsoletePoints is enabled).
+func (i *Iter) decodeInternalKey(key []byte) (hiddenPoint bool) {
+	// Manually inlining base.DecodeInternalKey provides a 5-10% speedup on
+	// BlockIter benchmarks.
+	if n := len(key) - 8; n >= 0 {
+		trailer := base.InternalKeyTrailer(binary.LittleEndian.Uint64(key[n:]))
+		hiddenPoint = i.transforms.HideObsoletePoints &&
+			(trailer&TrailerObsoleteBit != 0)
+		i.ikv.K.Trailer = trailer & TrailerObsoleteMask
+		i.ikv.K.UserKey = key[:n:n]
+		if n := i.transforms.SyntheticSeqNum; n != 0 {
+			i.ikv.K.SetSeqNum(base.SeqNum(n))
+		}
+	} else {
+		// Too short to contain a trailer: mark the decoded key invalid.
+		i.ikv.K.Trailer = base.InternalKeyTrailer(base.InternalKeyKindInvalid)
+		i.ikv.K.UserKey = nil
+	}
+	return hiddenPoint
+}
+
+// maybeReplaceSuffix replaces the suffix in i.ikey.UserKey with
+// i.transforms.syntheticSuffix.
+func (i *Iter) maybeReplaceSuffix() {
+	if i.transforms.HasSyntheticSuffix() && i.ikv.K.UserKey != nil {
+		prefixLen := i.split(i.ikv.K.UserKey)
+		// If ikey is cached or may get cached, we must copy
+		// UserKey to a new buffer before suffix replacement.
+		i.synthSuffixBuf = append(i.synthSuffixBuf[:0], i.ikv.K.UserKey[:prefixLen]...)
+		i.synthSuffixBuf = append(i.synthSuffixBuf, i.transforms.SyntheticSuffix()...)
+		i.ikv.K.UserKey = i.synthSuffixBuf
+	}
+}
+
+// clearCache discards the cached entries, retaining the slices' capacity for
+// reuse.
+func (i *Iter) clearCache() {
+	i.cached = i.cached[:0]
+	i.cachedBuf = i.cachedBuf[:0]
+}
+
+// cacheEntry records the current entry: its offset, its key bytes (appended
+// to i.cachedBuf), and its value's position expressed relative to i.ptr.
+func (i *Iter) cacheEntry() {
+	var valStart offsetInBlock
+	valSize := uint32(len(i.val))
+	if valSize > 0 {
+		valStart = offsetInBlock(uintptr(unsafe.Pointer(&i.val[0])) - uintptr(i.ptr))
+	}
+
+	i.cached = append(i.cached, blockEntry{
+		offset:   i.offset,
+		keyStart: offsetInBlock(len(i.cachedBuf)),
+		keyEnd:   offsetInBlock(len(i.cachedBuf) + len(i.key)),
+		valStart: valStart,
+		valSize:  valSize,
+	})
+	i.cachedBuf = append(i.cachedBuf, i.key...)
+}
+
+// IsLowerBound implements the block.DataBlockIterator interface.
+func (i *Iter) IsLowerBound(k []byte) bool {
+	// Note: we ignore HideObsoletePoints, but false negatives are allowed.
+	return i.cmp(i.firstUserKey, k) >= 0
+}
+
+// SeekGE implements internalIterator.SeekGE, as documented in the pebble
+// package.
+func (i *Iter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
+	if invariants.Enabled && i.IsDataInvalidated() {
+		panic(errors.AssertionFailedf("invalidated blockIter used"))
+	}
+	searchKey := key
+	if i.transforms.HasSyntheticPrefix() {
+		syntheticPrefix := i.transforms.SyntheticPrefix()
+		if !bytes.HasPrefix(key, syntheticPrefix) {
+			// The seek key is before or after the entire block of keys that start
+			// with SyntheticPrefix. To determine which, we need to compare against a
+			// valid key in the block. We use firstUserKey which has the synthetic
+			// prefix.
+			if i.cmp(i.firstUserKey, key) >= 0 {
+				return i.First()
+			}
+			// Set the offset to the end of the block to mimic the offset of an
+			// invalid iterator. This ensures a subsequent i.Prev() returns a valid
+			// result.
+			i.offset = i.restarts
+			i.nextOffset = i.restarts
+			return nil
+		}
+		// Stored keys do not contain the synthetic prefix, so strip it from
+		// the key used for the in-block search.
+		searchKey = key[len(syntheticPrefix):]
+	}
+
+	i.clearCache()
+	// Find the index of the smallest restart point whose key is > the key
+	// sought; index will be numRestarts if there is no such restart point.
+	i.offset = 0
+	var index int32
+
+	{
+		// NB: manually inlined sort.Seach is ~5% faster.
+		//
+		// Define f(-1) == false and f(n) == true.
+		// Invariant: f(index-1) == false, f(upper) == true.
+		upper := i.numRestarts
+		for index < upper {
+			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
+
+			// index ≤ h < upper
+			offset := decodeRestart(i.data[i.restarts+4*offsetInBlock(h):])
+			// For a restart point, there are 0 bytes shared with the previous key.
+			// The varint encoding of 0 occupies 1 byte.
+			ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
+
+			// Decode the key at that restart point, and compare it to the key
+			// sought. See the comment in readEntry for why we manually inline the
+			// varint decoding.
+			//
+			// v1 is the length in bytes of the restart point's (internal) key.
+			var v1 uint32
+			if a := *((*uint8)(ptr)); a < 128 {
+				v1 = uint32(a)
+				ptr = unsafe.Add(ptr, 1)
+			} else if a, b := a&0x7f, *((*uint8)(unsafe.Add(ptr, 1))); b < 128 {
+				v1 = uint32(b)<<7 | uint32(a)
+				ptr = unsafe.Add(ptr, 2)
+			} else if b, c := b&0x7f, *((*uint8)(unsafe.Add(ptr, 2))); c < 128 {
+				v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+				ptr = unsafe.Add(ptr, 3)
+			} else if c, d := c&0x7f, *((*uint8)(unsafe.Add(ptr, 3))); d < 128 {
+				v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+				ptr = unsafe.Add(ptr, 4)
+			} else {
+				d, e := d&0x7f, *((*uint8)(unsafe.Add(ptr, 4)))
+				v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+				ptr = unsafe.Add(ptr, 5)
+			}
+
+			// Skip the value-length varint; only the key is needed for the
+			// comparison.
+			if *((*uint8)(ptr)) < 128 {
+				ptr = unsafe.Add(ptr, 1)
+			} else if *((*uint8)(unsafe.Add(ptr, 1))) < 128 {
+				ptr = unsafe.Add(ptr, 2)
+			} else if *((*uint8)(unsafe.Add(ptr, 2))) < 128 {
+				ptr = unsafe.Add(ptr, 3)
+			} else if *((*uint8)(unsafe.Add(ptr, 3))) < 128 {
+				ptr = unsafe.Add(ptr, 4)
+			} else {
+				ptr = unsafe.Add(ptr, 5)
+			}
+
+			// Manually inlining part of base.DecodeInternalKey provides a 5-10%
+			// speedup on BlockIter benchmarks.
+			s := unsafe.Slice((*byte)(ptr), int(v1))
+			var k []byte
+			if n := len(s) - 8; n >= 0 {
+				k = s[:n:n]
+			}
+			// Else k is invalid, and left as nil
+
+			if i.cmp(searchKey, k) > 0 {
+				// The search key is greater than the user key at this restart point.
+				// Search beyond this restart point, since we are trying to find the
+				// first restart point with a user key >= the search key.
+				index = h + 1 // preserves f(i-1) == false
+			} else {
+				// k >= search key, so prune everything after index (since index
+				// satisfies the property we are looking for).
+				upper = h // preserves f(j) == true
+			}
+		}
+		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
+		// => answer is index.
+	}
+
+	// index is the first restart point with key >= search key. Define the keys
+	// between a restart point and the next restart point as belonging to that
+	// restart point.
+	//
+	// Since keys are strictly increasing, if index > 0 then the restart point
+	// at index-1 will be the first one that has some keys belonging to it that
+	// could be equal to the search key. If index == 0, then all keys in this
+	// block are larger than the key sought, and offset remains at zero.
+	if index > 0 {
+		i.offset = decodeRestart(i.data[i.restarts+4*offsetInBlock(index-1):])
+	}
+	i.readEntry()
+	hiddenPoint := i.decodeInternalKey(i.key)
+
+	// Iterate from that restart point to somewhere >= the key sought.
+	if !i.Valid() {
+		return nil
+	}
+
+	// A note on seeking in a block with a suffix replacement rule: even though
+	// the binary search above was conducted on keys without suffix replacement,
+	// Seek will still return the correct suffix replaced key. A binary
+	// search without suffix replacement will land on a key that is _less_ than
+	// the key the search would have landed on if all keys were already suffix
+	// replaced. Since Seek then conducts forward iteration to the first suffix
+	// replaced user key that is greater than or equal to the search key, the
+	// correct key is still returned.
+	//
+	// As an example, consider the following block with a restart interval of 1,
+	// with a replacement suffix of "4":
+	// - Pre-suffix replacement: apple@1, banana@3
+	// - Post-suffix replacement: apple@4, banana@4
+	//
+	// Suppose the client seeks with apple@3. Assuming suffixes sort in reverse
+	// chronological order (i.e. apple@1>apple@3), the binary search without
+	// suffix replacement would return apple@1. A binary search with suffix
+	// replacement would return banana@4. After beginning forward iteration from
+	// either returned restart point, forward iteration would
+	// always return the correct key, banana@4.
+	//
+	// Further, if the user searched with apple@0 (i.e. a suffix less than the
+	// pre replacement suffix) or with apple@5 (a suffix larger than the post
+	// replacement suffix), the binary search with or without suffix replacement
+	// would land on the same key, as we assume the following:
+	// (1) no two keys in the sst share the same prefix.
+	// (2) pebble.Compare(replacementSuffix,originalSuffix) > 0
+
+	i.maybeReplaceSuffix()
+
+	if !hiddenPoint && i.cmp(i.ikv.K.UserKey, key) >= 0 {
+		// Initialize i.lazyValue
+		if !i.lazyValueHandling.hasValuePrefix ||
+			i.ikv.K.Kind() != base.InternalKeyKindSet {
+			i.ikv.V = base.MakeInPlaceValue(i.val)
+		} else if i.lazyValueHandling.getValue == nil || block.ValuePrefix(i.val[0]).IsInPlaceValue() {
+			i.ikv.V = base.MakeInPlaceValue(i.val[1:])
+		} else {
+			i.ikv.V = i.lazyValueHandling.getValue.GetInternalValueForPrefixAndValueHandle(i.val)
+		}
+		return &i.ikv
+	}
+	// The restart-point entry was below the seek key (or hidden); step forward
+	// until a key >= key is found or the block is exhausted.
+	for i.Next(); i.Valid(); i.Next() {
+		if i.cmp(i.ikv.K.UserKey, key) >= 0 {
+			// i.Next() has already initialized i.ikv.LazyValue.
+			return &i.ikv
+		}
+	}
+	return nil
+}
+
+// SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
+// pebble package.
+func (i *Iter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *base.InternalKV {
+	// This should never be called as prefix iteration is handled by sstable.Iterator.
+	panic("pebble: SeekPrefixGE unimplemented")
+}
+
+// SeekLT implements internalIterator.SeekLT, as documented in the pebble
+// package.
+func (i *Iter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV {
+	if invariants.Enabled && i.IsDataInvalidated() {
+		panic(errors.AssertionFailedf("invalidated blockIter used"))
+	}
+	searchKey := key
+	if i.transforms.HasSyntheticPrefix() {
+		syntheticPrefix := i.transforms.SyntheticPrefix()
+		if !bytes.HasPrefix(key, syntheticPrefix) {
+			// The seek key is before or after the entire block of keys that start
+			// with SyntheticPrefix. To determine which, we need to compare against a
+			// valid key in the block. We use firstUserKey which has the synthetic
+			// prefix.
+			if i.cmp(i.firstUserKey, key) < 0 {
+				return i.Last()
+			}
+			// Set the offset to the beginning of the block to mimic an exhausted
+			// iterator that has conducted backward interation. This ensures a
+			// subsequent Next() call returns the first key in the block.
+			i.offset = -1
+			i.nextOffset = 0
+			return nil
+		}
+		// Stored keys do not contain the synthetic prefix, so strip it from
+		// the key used for the in-block search.
+		searchKey = key[len(syntheticPrefix):]
+	}
+
+	i.clearCache()
+	// Find the index of the smallest restart point whose key is >= the key
+	// sought; index will be numRestarts if there is no such restart point.
+	i.offset = 0
+	var index int32
+
+	{
+		// NB: manually inlined sort.Search is ~5% faster.
+		//
+		// Define f(-1) == false and f(n) == true.
+		// Invariant: f(index-1) == false, f(upper) == true.
+		upper := i.numRestarts
+		for index < upper {
+			h := int32(uint(index+upper) >> 1) // avoid overflow when computing h
+
+			// index ≤ h < upper
+			offset := decodeRestart(i.data[i.restarts+4*offsetInBlock(h):])
+			// For a restart point, there are 0 bytes shared with the previous key.
+			// The varint encoding of 0 occupies 1 byte.
+			ptr := unsafe.Pointer(uintptr(i.ptr) + uintptr(offset+1))
+
+			// Decode the key at that restart point, and compare it to the key
+			// sought. See the comment in readEntry for why we manually inline the
+			// varint decoding.
+			//
+			// v1 is the length in bytes of the restart point's (internal) key.
+			var v1 uint32
+			if a := *((*uint8)(ptr)); a < 128 {
+				v1 = uint32(a)
+				ptr = unsafe.Add(ptr, 1)
+			} else if a, b := a&0x7f, *((*uint8)(unsafe.Add(ptr, 1))); b < 128 {
+				v1 = uint32(b)<<7 | uint32(a)
+				ptr = unsafe.Add(ptr, 2)
+			} else if b, c := b&0x7f, *((*uint8)(unsafe.Add(ptr, 2))); c < 128 {
+				v1 = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+				ptr = unsafe.Add(ptr, 3)
+			} else if c, d := c&0x7f, *((*uint8)(unsafe.Add(ptr, 3))); d < 128 {
+				v1 = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+				ptr = unsafe.Add(ptr, 4)
+			} else {
+				d, e := d&0x7f, *((*uint8)(unsafe.Add(ptr, 4)))
+				v1 = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
+				ptr = unsafe.Add(ptr, 5)
+			}
+
+			// Skip the value-length varint; only the key is needed for the
+			// comparison.
+			if *((*uint8)(ptr)) < 128 {
+				ptr = unsafe.Add(ptr, 1)
+			} else if *((*uint8)(unsafe.Add(ptr, 1))) < 128 {
+				ptr = unsafe.Add(ptr, 2)
+			} else if *((*uint8)(unsafe.Add(ptr, 2))) < 128 {
+				ptr = unsafe.Add(ptr, 3)
+			} else if *((*uint8)(unsafe.Add(ptr, 3))) < 128 {
+				ptr = unsafe.Add(ptr, 4)
+			} else {
+				ptr = unsafe.Add(ptr, 5)
+			}
+
+			// Manually inlining part of base.DecodeInternalKey provides a 5-10%
+			// speedup on BlockIter benchmarks.
+			s := unsafe.Slice((*byte)(ptr), int(v1))
+			var k []byte
+			if n := len(s) - 8; n >= 0 {
+				k = s[:n:n]
+			}
+			// Else k is invalid, and left as nil
+
+			if i.cmp(searchKey, k) > 0 {
+				// The search key is greater than the user key at this restart point.
+				// Search beyond this restart point, since we are trying to find the
+				// first restart point with a user key >= the search key.
+				index = h + 1 // preserves f(i-1) == false
+			} else {
+				// k >= search key, so prune everything after index (since index
+				// satisfies the property we are looking for).
+				upper = h // preserves f(j) == true
+			}
+		}
+		// index == upper, f(index-1) == false, and f(upper) (= f(index)) == true
+		// => answer is index.
+	}
+
+	if index == 0 {
+		if i.transforms.HasSyntheticSuffix() {
+			// The binary search was conducted on keys without suffix replacement,
+			// implying the first key in the block may be less than the search key. To
+			// double check, get the first key in the block with suffix replacement
+			// and compare to the search key. Consider the following example: suppose
+			// the user searches with a@3, the first key in the block is a@2 and the
+			// block contains a suffix replacement rule of 4. Since a@3 sorts before
+			// a@2, the binary search would return index==0. Without conducting the
+			// suffix replacement, the SeekLT would incorrectly return nil. With
+			// suffix replacement though, a@4 should be returned as a@4 sorts before
+			// a@3.
+			ikv := i.First()
+			if i.cmp(ikv.K.UserKey, key) < 0 {
+				return ikv
+			}
+		}
+		// If index == 0 then all keys in this block are larger than the key
+		// sought, so there is no match.
+		i.offset = -1
+		i.nextOffset = 0
+		return nil
+	}
+
+	// INVARIANT: index > 0
+
+	// Ignoring suffix replacement, index is the first restart point with key >=
+	// search key. Define the keys between a restart point and the next restart
+	// point as belonging to that restart point. Note that index could be equal to
+	// i.numRestarts, i.e., we are past the last restart. Since keys are strictly
+	// increasing, then the restart point at index-1 will be the first one that
+	// has some keys belonging to it that are less than the search key.
+	//
+	// Next, we will search between the restart at index-1 and the restart point
+	// at index, for the first key >= key, and then on finding it, return
+	// i.Prev(). We need to know when we have hit the offset for index, since then
+	// we can stop searching. targetOffset encodes that offset for index.
+	targetOffset := i.restarts
+	i.offset = decodeRestart(i.data[i.restarts+4*offsetInBlock(index-1):])
+	if index < i.numRestarts {
+		targetOffset = decodeRestart(i.data[i.restarts+4*offsetInBlock(index):])
+
+		if i.transforms.HasSyntheticSuffix() {
+			// The binary search was conducted on keys without suffix replacement,
+			// implying the returned restart point (index) may be less than the search
+			// key, breaking the assumption described above.
+			//
+			// For example: consider this block with a replacement ts of 4, and
+			// restart interval of 1: - pre replacement: a@3,b@2,c@3 - post
+			// replacement: a@4,b@4,c@4
+			//
+			// Suppose the client calls SeekLT(b@3), SeekLT must return b@4.
+			//
+			// If the client calls SeekLT(b@3), the binary search would return b@2,
+			// the lowest key geq to b@3, pre-suffix replacement. Then, SeekLT will
+			// begin forward iteration from a@3, the previous restart point, to
+			// b{suffix}. The iteration stops when it encounters a key geq to the
+			// search key or if it reaches the upper bound. Without suffix
+			// replacement, we can assume that the upper bound of this forward
+			// iteration, b{suffix}, is greater than the search key, as implied by the
+			// binary search.
+			//
+			// If we naively hold this assumption with suffix replacement, the
+			// iteration would terminate at the upper bound, b@4, call i.Prev, and
+			// incorrectly return a@4. To correct for this, if the original returned
+			// index is less than the search key, shift our forward iteration to begin
+			// at index instead of index -1. With suffix replacement the key at index
+			// is guaranteed to be the highest restart point less than the seach key
+			// (i.e. the same property of index-1 for a block without suffix
+			// replacement). This property holds because of the invariant that a block
+			// with suffix replacement will not have two keys that share the same
+			// prefix. To consider the above example, binary searching with b@3 landed
+			// naively at a@3, but since b@4